C#版正文抽取所需正則全集

以下是易爾譯科技整理的、在正文抽取(正文提取)中常用的C#版正則表達式,歡迎大家補充。

#region 相關正則表達式

 /// <summary>
 /// 去掉所有html標籤
 /// </summary>
 private static readonly Regex FilterAll = new Regex(
 @"(/[([^=]*)(=[^/]]*)?/][/s/S]*?/[//1/])|(?<lj>(?=[^/u4E00-/u9FA5/uFE30-/uFFA0,."");])<a/s+[^>]*>[^<]{2,}</a>(?=[^/u4E00-/u9FA5/uFE30-/uFFA0,."");]))|(?<Style><style[/s/S]+?/style>)|(?<select><select[/s/S]+?/select>)|(?<Script><script[/s/S]*?/script>)|(?<Explein></!/-/-[/s/S]*?/-/->)|(?<li><li(/s+[^>]+)?>[/s/S]*?/li>)|(?<Html></?/s*[^> ]+(/s*[^=>]+?=['""]?[^""']+?['""]?)*?[^/[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>/#[a-z0-9]{6})|(?<Space>/s+)|(/&/#/d+/;)",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase); //(?<Link><a[/s/S]*?</a>)|
 //(?<Style><style[/s/S]+?/style>)|(?<select><select[/s/S]+?/select>)|(?<Script><script[/s/S]*?/script>)|(?<Explein></!/-/-[/s/S]*?/-/->)|(?<li><li(/s+[^>]+)?>[/s/S]*?/li>)|(?<Html></?/s*[^> ]+(/s*[^=>]+?=['""]?[^""']+?['""]?)*?[^/[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>/#[a-z0-9]{6})|(?<Space>/s+)

 /// <summary>
 /// 找出title標籤
 /// </summary>
 private static readonly Regex FindTitle = new Regex(
 @"</s*/?title/s*>",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出title標籤內容
 /// </summary>
 private static readonly Regex FindTitleContent = new Regex(
 @"</s*/?title/s*>(?<Content>[/s/S]*?)</s*/?title/s*>",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出h 和Strong標籤
 /// </summary>
 private static readonly Regex FindHStrong = new Regex(
 @"</s*/?h/s*>|</s*/?strong/s*>",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出p 和br標籤
 /// </summary>
 private static readonly Regex FindPB = new Regex(
 @"</s*/?p/s*>|</s*br/s*/?>|</s*/?tr/s*>",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出nbsp標籤
 /// </summary>
 private static readonly Regex FindNbsp = new Regex(
 @"&nbsp",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出結尾標籤
 /// </summary>
 private static readonly Regex FindS = new Regex(
 @"(?<Content>[/s/S]*?)/$",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出是否爲標準句
 /// </summary>
 private static readonly Regex IsSen = new Regex(
 @"[,.,。!!;;::……??《》“”""]",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出是否爲垃圾句[strong][h]標籤過多的
 /// </summary>
 private static readonly Regex IsWs = new Regex(
 @"/[/(h/)/]",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出是否爲垃圾句冒號和·-過多的
 /// </summary>
 private static readonly Regex IsWsM = new Regex(
 @"/[·]|[-]|[::]",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出是否爲BBS特徵
 /// </summary>
 private static readonly Regex IsBbsInfo = new Regex(
 @"第[^樓]{1,50}樓|Powered/s*/?by[/s/S]*?Dvbbs|Powered/s*/?by[/s/S]*?Discuz",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);
 
 /// <summary>
 /// 取KEYWORD
 /// </summary>
 private static readonly Regex mKeyWord = new Regex(
 @"<meta/s*name/s*=/s*['""]?keywords['""]?/s*content/s*=/s*['""]?(?<KeyWords>[^'"">]*)['""]?[^>]*>|<meta/s*content/s*=/s*['""]?(?<KeyWords>[^'"">]*)['""]?/s*name/s*=/s*['""]?keywords['""]?/s*[^>]*>
",RegexOptions.ExplicitCapture| RegexOptions.Multiline| RegexOptions.IgnoreCase);

 /// <summary>
 /// 取DESCRIPTION
 /// </summary>
 private static readonly Regex mDescription = new Regex(
 @"<meta/s*name/s*=/s*['""]?description['""]?/s*content/s*=/s*['""]?(?<description>[^'"">]*)['""]?[^>]*>|<meta/s*content/s*=/s*['""]?(?<description>[^'"">]*)['""]?/s*name/s*=/s*['""]?description['""]?/s*[^>]*>
",RegexOptions.ExplicitCapture| RegexOptions.Multiline| RegexOptions.IgnoreCase);
 
 /// <summary>
 /// 取Tags
 /// </summary>
 private static readonly Regex mTag = new Regex(
 @"<meta/s*name/s*=/s*['""]?tagwords['""]?/s*content/s*=/s*['""]?(?<tagwords>[^'"">]*)['""]?[^>]*>|<meta/s*content/s*=/s*['""]?(?<tagwords>[^'"">]*)['""]?/s*name/s*=/s*['""]?tagwords['""]?/s*[^>]*>
", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出是否爲垃圾句:後字符號過少,:號前無“說”字,:號後無"關於"
 /// </summary>
 private static readonly Regex IsWsMM = new Regex(
 @"^[^說/s]{0,8}?[::].{0,10}$",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出spider寫入的url標記
 /// </summary>
 private static readonly Regex txtUrl = new Regex(
 @"當前URL爲:http://(?<URL>.*)",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 /// <summary>
 /// 找出spider寫入的錨點描述標記
 /// </summary>
 private static readonly Regex txtDescription = new Regex(
 @"當前鏈接描述爲:(?<Describe>.*)",
 RegexOptions.ExplicitCapture
 | RegexOptions.Multiline
 | RegexOptions.IgnoreCase);

 ///// <summary>
 ///// 取需要a標籤
 ///// </summary>
 //private static readonly Regex cleanFirst = new Regex(
 // @"([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])(?<Robbish1><a\s+[^>]*>)[^<]{1,6}(?<Robbish2></a>)([\u4E00-\u9FA5]|[\uFE30-\uFFA0]|[,."");])", RegexOptions.ExplicitCapture | RegexOptions.Multiline | RegexOptions.IgnoreCase);

 #endregion

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章