該類的功能相當於瀏覽器對當前頁面內鏈接地址的解析功能。
主要方法是 字符串的操作,正則表達式的匹配和替換。
比如說當前頁爲:http://www.test.com/list/page.aspx?id=12,其頁面內鏈接爲:
BranchUrl | Result |
/default.aspx?id=14 | http://www.test.com/default.aspx?id=14 |
../details.aspx?id=4 | http://www.test.com/details.aspx?id=4 |
dete.aspx | http://www.test.com/list/dete.aspx |
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Resolves a link found inside a page against the address of that page,
/// using string operations and regular expressions.
/// </summary>
/// <remarks>
/// The resolution is heuristic: a trailing base segment is treated as a file
/// name only when it contains a dot, otherwise it is treated as a directory.
/// Only http, https, ftp, rtsp and mms links are recognised as absolute.
/// </remarks>
public class Utility
{
    // BranchUrl values starting with one of these schemes are already absolute.
    private static readonly Regex AbsoluteUrlPattern =
        new Regex(@"^(http|https|ftp|rtsp|mms)://", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // "scheme://host" prefix of a URL; group 1 is the scheme including its colon.
    private static readonly Regex AuthorityPattern =
        new Regex(@"^([^/]*:)//[^/]*", RegexOptions.Compiled);

    // A final "/name.ext..." segment (anything after the dot, e.g. a query string, is included).
    private static readonly Regex TrailingFilePattern =
        new Regex(@"/[^\./]+\.[^/]+$", RegexOptions.Compiled);

    // A URL that consists of scheme and host only, without a trailing slash.
    private static readonly Regex HostOnlyPattern =
        new Regex("(http|https|ftp|rtsp|mms)://[^/]+$", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Combines a page address with a link found on that page. An absolute
    /// <paramref name="BranchUrl"/> is returned as is (trimmed); otherwise it is
    /// attached to <paramref name="BaseUrl"/> in the appropriate way.
    /// </summary>
    /// <param name="BaseUrl">Full address of the current page, e.g. http://www.test.com/list/page.aspx</param>
    /// <param name="BranchUrl">Link found in the page, e.g. ../test.aspx</param>
    /// <returns>The combined URL.</returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="BranchUrl"/> is null, or <paramref name="BaseUrl"/> is null
    /// while <paramref name="BranchUrl"/> is not absolute.
    /// </exception>
    public static string StickUrl(string BaseUrl, string BranchUrl)
    {
        if (BranchUrl == null)
        {
            throw new ArgumentNullException("BranchUrl");
        }

        // Surrounding whitespace is never part of a link; trimming once here keeps
        // every branch below consistent.
        BranchUrl = BranchUrl.Trim();

        if (AbsoluteUrlPattern.IsMatch(BranchUrl))
        {
            return BranchUrl;
        }

        if (BaseUrl == null)
        {
            throw new ArgumentNullException("BaseUrl");
        }

        BaseUrl = BaseUrl.Replace("\\", "/");
        BranchUrl = BranchUrl.Replace("\\", "/");

        // Protocol-relative link ("//host/path"): only the scheme comes from the base.
        // Without a scheme on the base there is nothing to borrow, so fall through.
        if (BranchUrl.StartsWith("//", StringComparison.Ordinal))
        {
            Match baseAuthority = AuthorityPattern.Match(BaseUrl);
            if (baseAuthority.Success)
            {
                return baseAuthority.Groups[1].Value + BranchUrl;
            }
        }

        // Root-relative link, e.g. "/test.aspx".
        if (BranchUrl.StartsWith("/", StringComparison.Ordinal))
        {
            return GetLastUrl(BaseUrl, BranchUrl);
        }

        // Plain relative link, e.g. "test.aspx".
        if (!BranchUrl.StartsWith("../", StringComparison.Ordinal))
        {
            return UrlPlus(BaseUrl, BranchUrl);
        }

        // Parent-relative link, e.g. "../test.aspx". The "scheme://host" part is set
        // aside so that neither the file-name pattern nor the parent steps can eat it.
        Match authority = AuthorityPattern.Match(BaseUrl);
        string root = authority.Success ? authority.Value : string.Empty;
        string path = BaseUrl.Substring(root.Length);

        if (path.EndsWith("/", StringComparison.Ordinal))
        {
            path = path.TrimEnd('/');
        }
        else
        {
            // Drop a trailing "page.aspx?..." so that path names the directory.
            path = TrailingFilePattern.Replace(path, string.Empty);
        }

        // One directory is removed per leading "../". Once no slash is left the
        // path stays put, i.e. the result never climbs above the site root.
        while (BranchUrl.StartsWith("../", StringComparison.Ordinal))
        {
            BranchUrl = BranchUrl.Substring(3);

            int lastSlash = path.LastIndexOf('/');
            if (lastSlash >= 0)
            {
                path = path.Substring(0, lastSlash);
            }
        }

        return root + path + "/" + BranchUrl;
    }

    /// <summary>
    /// Resolves a root-relative link such as "/test.aspx" against the host of the current page.
    /// </summary>
    /// <param name="BaseUrl">Address of the current page, e.g. http://www.test.com/list/page.aspx</param>
    /// <param name="BranchUrl">Root-relative link, e.g. "/test.aspx"</param>
    /// <returns>Scheme (when present) and host of <paramref name="BaseUrl"/> followed by the link.</returns>
    private static string GetLastUrl(string BaseUrl, string BranchUrl)
    {
        // Remove the leading root marker(s); a single "/" is re-inserted below.
        BranchUrl = BranchUrl.TrimStart('/');

        string scheme = string.Empty;   // e.g. "http:"
        string remainder = BaseUrl;     // e.g. "www.test.com/list/page.aspx"

        int doubleSlash = BaseUrl.IndexOf("//", StringComparison.Ordinal);
        if (doubleSlash > 0)
        {
            scheme = BaseUrl.Substring(0, doubleSlash);
            remainder = BaseUrl.Substring(doubleSlash + 2);
        }

        // The host ends at the first path, query or fragment delimiter.
        string host = remainder;
        int hostEnd = remainder.IndexOfAny(new char[] { '/', '?', '#' });
        if (hostEnd > 0)
        {
            host = remainder.Substring(0, hostEnd);
        }

        if (scheme.Length > 0)
        {
            return scheme + "//" + host + "/" + BranchUrl;
        }

        return host + "/" + BranchUrl;
    }

    /// <summary>
    /// Resolves a plain relative link such as "test.aspx" against the directory of the current page.
    /// </summary>
    /// <param name="front">Address of the current page, e.g. http://www.test.com/list/page.aspx</param>
    /// <param name="tail">Relative link, e.g. "test.aspx"</param>
    /// <returns><paramref name="tail"/> appended to the directory part of <paramref name="front"/>.</returns>
    private static string UrlPlus(string front, string tail)
    {
        // Already a directory: append directly so no doubled slash is produced.
        if (front.EndsWith("/", StringComparison.Ordinal))
        {
            return front + tail;
        }

        // Scheme and host only. Checked before the file-name pattern because
        // "/www.test.com" would otherwise look like a file name with an extension.
        if (HostOnlyPattern.IsMatch(front))
        {
            return front + "/" + tail;
        }

        // Replace a trailing "page.aspx?..." with the link. When the last segment has
        // no dot it is treated as a directory and Replace leaves front untouched.
        return TrailingFilePattern.Replace(front, string.Empty) + "/" + tail;
    }
}
主要功能借鑑於:dotNETCMSv1.0sp5 CMS。源碼下載地址:http://www.51aspx.com/CV/dotNETCMS10sp5
在數據採集的時候,在文章列表頁中匹配出文章內容頁的完整路徑。