读懂正则表达式就这么简单

<div class="post_item"><div class="digg">    <div class="diggit" onclick="DiggIt(3439076,120879,1)">     <span class="diggnum" id="digg_count_3439076">4</span>    </div>    <div class="clear"></div>        <div id="digg_tip_3439076" class="digg_tip"></div></div>      <div class="post_item_body">    <h3><a class="titlelnk" href="http://www.cnblogs.com/swq6413/p/3439076.html" target="_blank">分享完整的项目工程目录结构</a></h3>                       <p class="post_item_summary"><a href="http://www.cnblogs.com/swq6413/" target="_blank"><img width="48" height="48" class="pfs" src="http://pic.cnitblog.com/face/142964/20131116170946.png" alt=""/></a>    在项目开发过程中，如何有序的保存项目中的各类数据文件，建立一个分类清晰、方便管理的目录结构是非常重要的。 综合以前的项目和一些朋友的项目结构，我整理了一份我觉得还不错的项目目录结构。 在这里分享给大家，欢迎各位提出你宝贵的意见和建议。如果喜欢请“推荐”则个，感激万分！！ 整个目录设置到4级子目录，实...     </p>                  <div class="post_item_foot">                        <a href="http://www.cnblogs.com/swq6413/" class="lightblue">七少爷</a>     发布于 2013-11-23 15:48     <span class="article_comment"><a href="http://www.cnblogs.com/swq6413/p/3439076.html#commentform" title="2013-11-23 16:40" class="gray">        评论(4)</a></span><span class="article_view"><a href="http://www.cnblogs.com/swq6413/p/3439076.html" class="gray">阅读(206)</a></span></div></div><div class="clear"></div></div>

通过构造一个 Http 请求取到数据，并对数据进行相应处理得到关键信息；在过滤 Html 标签提取文章时，正则强大的威力就体现出来了。

正则的知识点也基本都用上了，比如 "\s \w+ . * ?"，还有捕获分组、零宽断言等等。喜欢的朋友可以试一试，自己看看是如何通过正则取到相应数据的。代码中的正则都很基本、简单，其意思与用法都已在上文中详细说明。

    /// <summary>
    /// Console entry point: downloads the cnblogs home page and parses its article list.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            string content = HttpUtility.HttpGetHtml();
            HttpUtility.GetArticles(content);
        }
    }

    /// <summary>
    /// Helpers for downloading the cnblogs home page and extracting article
    /// metadata from its HTML with regular expressions.
    /// </summary>
    internal class HttpUtility
    {
        // All patterns are built once and shared; Regex instances are immutable,
        // so reusing them across calls is safe and avoids re-parsing per article.
        // NOTE(review): several patterns rely on greedy ".*" plus backtracking and
        // therefore assume exactly the markup layout of a cnblogs "post_item" block.

        // One article block: from the "post_item" div up to (not including) its trailing "clear" div.
        private static readonly Regex PostItemRegex = new Regex(
            "<div class=\"post_item\">(?<content>.*?)(?=<div class=\"clear\">" + @"</div>\s*</div>)",
            RegexOptions.Singleline);

        // Recommendation ("digg") count inside the digg div.
        private static readonly Regex DiggRegex = new Regex(
            "<div class=\"digg\">.*<span.*>(?<digNum>.*)" + @"</span>" +
            ".*<div class=\"post_item_body\">",
            RegexOptions.Singleline);

        // Inner HTML of the <h3> that wraps the title link.
        private static readonly Regex TitleBlockRegex = new Regex(
            "<h3>(?<a>.*)</h3>",
            RegexOptions.Singleline);

        // href and text of the title link (applied to the <h3> inner HTML).
        private static readonly Regex TitleLinkRegex = new Regex(
            "<a\\s.*href=\"(?<href>.*?)\".*>(?<summary>.*)</a>",
            RegexOptions.Singleline);

        // The author's avatar <img> tag wrapped in a link.
        private static readonly Regex AuthorImageRegex = new Regex(
            "<a.*>(?<img><img[^>].*>)</a>",
            RegexOptions.Singleline);

        // The author's blog link (href + target attributes).
        private static readonly Regex AuthorLinkRegex = new Regex(
            "<a\\s*?href=\"(?<href>.*)\"\\s*?target=\"(?<target>.*?)\">.*</a>",
            RegexOptions.Singleline);

        // Inner HTML of the summary paragraph.
        private static readonly Regex SummaryBlockRegex = new Regex(
            "<p class=\"post_item_summary\">(?<summary>.*)</p>",
            RegexOptions.Singleline);

        // Everything that follows the first closing </a> (applied to the summary inner HTML).
        private static readonly Regex IntroductionRegex = new Regex(
            "(?<indroduct>(?<=</a>).*)",
            RegexOptions.Singleline);

        // Publisher name and the publish-time text that follows it in the footer.
        private static readonly Regex PublisherRegex = new Regex(
            "<div class=\"post_item_foot\">\\s*<a.*?>(?<publishName>.*)</a>(?<publishTime>.*)<span class=\"article_comment\">",
            RegexOptions.Singleline);

        // Text of the comment-count link.
        private static readonly Regex CommentRegex = new Regex(
            "<span class=\"article_comment\"><a.*>(?<comment>.*)</a></span><span class=\"article_view\">",
            RegexOptions.Singleline);

        // Text of the read-count link.
        private static readonly Regex ReadRegex = new Regex(
            "<span\\s*class=\"article_view\"><a.*>(?<readNum>.*)</a>",
            RegexOptions.Singleline);

        /// <summary>
        /// Downloads the first page of http://www.cnblogs.com/ and returns the
        /// response body decoded as UTF-8.
        /// </summary>
        /// <returns>The HTML of the page.</returns>
        public static string HttpGetHtml()
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create("http://www.cnblogs.com/");
            request.Accept = "text/plain, */*; q=0.01";
            request.Method = "GET";
            request.Headers.Add("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
            request.ContentLength = 0;
            request.Host = "www.cnblogs.com";
            request.UserAgent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.1.3.5000 Chrome/26.0.1410.43 Safari/537.1";

            // Dispose the response, its stream and the reader deterministically so
            // the underlying connection is released even if reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts every "post_item" block from the given HTML and converts each
        /// one into an <see cref="Article"/>.
        /// </summary>
        /// <param name="htmlString">HTML of the article list page.</param>
        /// <returns>
        /// The parsed articles in document order; an empty list when no block matches.
        /// A field whose pattern does not match is set to an empty string.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
        public static List<Article> GetArticles(string htmlString)
        {
            if (htmlString == null)
            {
                throw new ArgumentNullException("htmlString");
            }

            List<Article> articleList = new List<Article>();

            // Matches() yields an empty collection when nothing matches, so no
            // separate IsMatch() pass over the whole page is needed.
            foreach (Match item in PostItemRegex.Matches(htmlString))
            {
                string itemHtml = item.Value;
                Article article = new Article();

                // Recommendation count.
                article.DiggNum = DiggRegex.Match(itemHtml).Groups["digNum"].Value;

                // Title: first isolate the <h3>, then read the link inside it.
                string titleHtml = TitleBlockRegex.Match(itemHtml).Groups["a"].Value;
                Match titleLink = TitleLinkRegex.Match(titleHtml);
                article.AritcleUrl = titleLink.Groups["href"].Value;
                article.AritcleTitle = titleLink.Groups["summary"].Value;

                // Author avatar (the whole <img> tag).
                article.AuthorImg = AuthorImageRegex.Match(itemHtml).Groups["img"].Value;

                // Author blog URL.
                article.AuthorUrl = AuthorLinkRegex.Match(itemHtml).Groups["href"].Value;

                // Introduction: take the summary paragraph, then the text after the avatar link.
                string summaryHtml = SummaryBlockRegex.Match(itemHtml).Groups["summary"].Value;
                article.AritcleInto = IntroductionRegex.Match(summaryHtml).Groups["indroduct"].Value;

                // Publisher name and publish time share one match.
                Match publisher = PublisherRegex.Match(itemHtml);
                article.Author = publisher.Groups["publishName"].Value;
                article.PublishTime = publisher.Groups["publishTime"].Value.Trim();

                // Comment count text.
                article.CommentNum = CommentRegex.Match(itemHtml).Groups["comment"].Value;

                // Read count text.
                article.ReadNum = ReadRegex.Match(itemHtml).Groups["readNum"].Value;

                articleList.Add(article);
            }

            return articleList;
        }

        /// <summary>
        /// Removes line feeds, tabs and carriage returns from the given HTML and
        /// replaces every double quote with a single quote.
        /// </summary>
        /// <param name="htmlString">The text to clean.</param>
        /// <returns>The cleaned text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="htmlString"/> is null.</exception>
        public static string ClearSpecialTag(string htmlString)
        {
            if (htmlString == null)
            {
                throw new ArgumentNullException("htmlString");
            }

            // These are literal character replacements, so plain string.Replace
            // is sufficient; no regular-expression engine is needed.
            return htmlString
                .Replace("\n", "")
                .Replace("\t", "")
                .Replace("\r", "")
                .Replace("\"", "'");
        }
    }

    /// <summary>
    /// Metadata of one article as scraped from the list page. All values are
    /// kept as the raw text captured from the HTML.
    /// </summary>
    public class Article
    {
        /// <summary>
        /// Article title.
        /// </summary>
        public string AritcleTitle { get; set; }

        /// <summary>
        /// Article URL.
        /// </summary>
        public string AritcleUrl { get; set; }

        /// <summary>
        /// Article introduction.
        /// </summary>
        public string AritcleInto { get; set; }

        /// <summary>
        /// Author name.
        /// </summary>
        public string Author { get; set; }

        /// <summary>
        /// Author blog URL.
        /// </summary>
        public string AuthorUrl { get; set; }

        /// <summary>
        /// Author avatar image.
        /// </summary>
        public string AuthorImg { get; set; }

        /// <summary>
        /// Publish time.
        /// </summary>
        public string PublishTime { get; set; }

        /// <summary>
        /// Recommendation count.
        /// </summary>
        public string DiggNum { get; set; }

        /// <summary>
        /// Comment count.
        /// </summary>
        public string CommentNum { get; set; }

        /// <summary>
        /// Read count.
        /// </summary>
        public string ReadNum { get; set; }
    }

正则部分可能写得不是很完美，但至少也匹配出来了。另外因为自己也是刚接触正则，所以只能写出这种比较简单的正则，还望大家海涵~~

五总结

　　正则其实并不难，了解每个符号的意思后，自己马上动手试一试多写几次自然就明白了，正则是出了名的坑多，随便少写了个点就匹配不到数据了，我也踩了很多坑，踩着踩着就踩出经验了。

本文也只是对正则做了很基本的介绍，还有很多正则的字符没有介绍，只是写了比较常用的一些。如有错误之处，还望在评论中指出，我会马上修改。

如果您觉得本文有给您带来一点收获，不妨点个推荐，为我的付出支持一下，谢谢~

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。

二 正则字符简单介绍

三 正则进阶

四 正则实战

五 总结

二正则字符简单介绍

三正则进阶

四正则实战

五总结