根據當前頁面url匹配出頁面內鏈接地址的完整路徑

該類的功能相當於瀏覽器對當前頁面內鏈接地址的解析功能

主要方法是 字符串的操作,正則表達式的匹配和替換。

比如說當前頁爲:http://www.test.com/list/page.aspx?id=12,其頁面內鏈接爲:

BranchUrl Result
/default.aspx?id=14 http://www.test.com/default.aspx?id=14
../details.aspx?id=4 http://www.test.com/details.aspx?id=4
dete.aspx http://www.test.com/list/dete.aspx
 
 
 
 
 
該類C#代碼:
using System;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Resolves links found inside a page against the URL of that page, roughly
/// the way a browser does, using plain string operations and regular expressions.
/// </summary>
public class Utility
{
    // Link that already starts with one of the supported schemes.
    private static readonly Regex AbsoluteUrlPattern =
        new Regex(@"^(http|https|ftp|rtsp|mms)://", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Base URL that ends right after the host, e.g. "http://www.test.com".
    private static readonly Regex HostOnlyPattern =
        new Regex("(http|https|ftp|rtsp|mms)://[^/]+$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Base URL that is the host followed by a single slash, e.g. "http://www.test.com/".
    private static readonly Regex HostWithSlashPattern =
        new Regex("(http|https|ftp|rtsp|mms)://[^/]+/$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Base URL with a scheme that ends in a slash, e.g. "http://www.test.com/list/".
    private static readonly Regex TrailingSlashPattern =
        new Regex("(http|https|ftp|rtsp|mms)://.+/$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Last segment without any dot, e.g. ".../list" (treated as a directory).
    private static readonly Regex ExtensionlessSegmentPattern =
        new Regex(@"/[^/\.]+$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    // Last segment that looks like a file name, e.g. ".../page.aspx" or ".../page.aspx?id=12".
    private static readonly Regex FileSegmentPattern =
        new Regex(@"/[^/\.]+\.[^/]+$", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Resolves <paramref name="BranchUrl"/> against <paramref name="BaseUrl"/>.
    /// An absolute link (http, https, ftp, rtsp or mms) is returned unchanged;
    /// a root-relative link ("/x.aspx") is attached to the host of the base URL;
    /// a link starting with one or more "../" climbs that many directories
    /// (never above the site root); any other link is appended to the
    /// directory of the base URL.
    /// </summary>
    /// <param name="BaseUrl">Full URL of the current page, e.g. http://www.test.com/list/page.aspx </param>
    /// <param name="BranchUrl">Link found in the page, e.g. ../test.aspx</param>
    /// <returns>The resolved URL.</returns>
    public static string StickUrl(string BaseUrl, string BranchUrl)
    {
        // An absolute link needs no resolution.
        if (AbsoluteUrlPattern.IsMatch(BranchUrl))
        {
            return BranchUrl;
        }

        // Treat back-slashes as path separators.
        BaseUrl = BaseUrl.Replace("\\", "/");
        BranchUrl = BranchUrl.Replace("\\", "/");

        // Root-relative link, e.g. "/test.aspx". The trimmed value is both tested
        // and forwarded so that surrounding whitespace cannot leak into the result.
        string trimmedBranch = BranchUrl.Trim();
        if (trimmedBranch.StartsWith("/", StringComparison.Ordinal))
        {
            return GetLastUrl(BaseUrl, trimmedBranch);
        }

        // Plain relative link, e.g. "test.aspx" or "sub/test.aspx".
        if (!BranchUrl.StartsWith("../", StringComparison.Ordinal))
        {
            return UrlPlus(BaseUrl, BranchUrl);
        }

        // Parent-relative link: start from the directory of the current page and
        // climb one level per leading "../". Looping on "starts with" (instead of
        // "contains") guarantees termination even when "../" also occurs further
        // inside the link.
        BaseUrl = GetBaseDirectory(BaseUrl);
        while (BranchUrl.StartsWith("../", StringComparison.Ordinal))
        {
            BranchUrl = BranchUrl.Substring(3);
            BaseUrl = RemoveLastPathSegment(BaseUrl);
        }
        return BaseUrl + "/" + BranchUrl;
    }

    /// <summary>
    /// Returns the directory part of a page URL without a trailing slash:
    /// trailing slashes are removed, a trailing file-like segment is dropped,
    /// and a URL that has no path after the host is returned unchanged.
    /// </summary>
    /// <param name="baseUrl">URL of the current page.</param>
    /// <returns>The URL of the containing directory.</returns>
    private static string GetBaseDirectory(string baseUrl)
    {
        if (baseUrl.EndsWith("/", StringComparison.Ordinal))
        {
            return baseUrl.TrimEnd('/');
        }

        // Only strip a file name when there really is a path; otherwise the
        // file pattern would mistake "/www.test.com" for a file and eat the host.
        if (IndexOfLastPathSlash(baseUrl) >= 0 && FileSegmentPattern.IsMatch(baseUrl))
        {
            return FileSegmentPattern.Replace(baseUrl, "");
        }

        return baseUrl;
    }

    /// <summary>
    /// Removes the last path segment of <paramref name="url"/>; when the URL has
    /// no path left (only scheme and host) it is returned unchanged so that
    /// climbing stops at the site root.
    /// </summary>
    /// <param name="url">URL without a trailing slash.</param>
    /// <returns>The URL one directory level up, or the same URL at the root.</returns>
    private static string RemoveLastPathSegment(string url)
    {
        int lastSlash = IndexOfLastPathSlash(url);
        return lastSlash >= 0 ? url.Substring(0, lastSlash) : url;
    }

    /// <summary>
    /// Finds the last "/" that belongs to the path of <paramref name="url"/>,
    /// ignoring the two slashes of a "scheme://" prefix.
    /// </summary>
    /// <param name="url">URL to inspect.</param>
    /// <returns>The index of the last path slash, or -1 when there is none.</returns>
    private static int IndexOfLastPathSlash(string url)
    {
        int schemeEnd = url.IndexOf("://", StringComparison.Ordinal);
        int hostStart = schemeEnd >= 0 ? schemeEnd + 3 : 0;
        int lastSlash = url.LastIndexOf('/');
        return lastSlash >= hostStart ? lastSlash : -1;
    }

    /// <summary>
    /// Resolves a root-relative link such as "/test.aspx" by attaching it to the
    /// scheme and host of the current page.
    /// </summary>
    /// <param name="BaseUrl">URL of the current page, e.g. http://www.test.com/list/page.aspx</param>
    /// <param name="BranchUrl">Root-relative link, e.g. "/test.aspx"</param>
    /// <returns>Scheme (when present) and host of the base URL followed by the link.</returns>
    private static string GetLastUrl(string BaseUrl, string BranchUrl)
    {
        // Drop the leading root marker; it is re-added when joining below.
        BranchUrl = BranchUrl.TrimStart('/');

        string scheme = "";
        string host = BaseUrl;

        // Split "http://www.test.com/list/page.aspx" into "http:" and the rest.
        int separator = BaseUrl.IndexOf("//", StringComparison.Ordinal);
        if (separator > 0)
        {
            scheme = BaseUrl.Substring(0, separator);
            host = BaseUrl.Substring(separator + 2);
        }

        // Keep only the host: everything before the first slash of the remainder.
        int pathStart = host.IndexOf('/');
        if (pathStart > 0)
        {
            host = host.Substring(0, pathStart);
        }

        if (scheme != string.Empty)
        {
            return scheme + "//" + host + "/" + BranchUrl;
        }
        return host + "/" + BranchUrl;
    }

    /// <summary>
    /// Resolves a plain relative link such as "test.aspx" by appending it to the
    /// directory of the current page.
    /// </summary>
    /// <param name="front">URL of the current page, e.g. http://www.test.com/list/page.aspx </param>
    /// <param name="tail">Relative link, e.g. "test.aspx"</param>
    /// <returns>The directory of <paramref name="front"/> joined with <paramref name="tail"/>.</returns>
    private static string UrlPlus(string front, string tail)
    {
        // Host only: must be tested first so the host is never taken for a file name.
        if (HostOnlyPattern.IsMatch(front))
        {
            return front + "/" + tail;
        }

        // Already ends with a slash: append directly.
        if (HostWithSlashPattern.IsMatch(front) || TrailingSlashPattern.IsMatch(front))
        {
            return front + tail;
        }

        // Last segment has no dot: treat it as a directory.
        if (ExtensionlessSegmentPattern.IsMatch(front))
        {
            return front + "/" + tail;
        }

        // Last segment looks like a file: replace it with the link.
        if (FileSegmentPattern.IsMatch(front))
        {
            return FileSegmentPattern.Replace(front, "") + "/" + tail;
        }

        return front + "/" + tail;
    }
}

 主要功能借鑑於:dotNETCMSv1.0sp5 CMS。源碼下載地址:http://www.51aspx.com/CV/dotNETCMS10sp5

在數據採集的時候,在文章列表頁中匹配出文章內容頁的完整路徑。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章