根据当前页面url匹配出页面内链接地址的完整路径

该类的功能相当于浏览器对当前页面内链接地址的解析功能

主要方法是 字符串的操作,正则表达式的匹配和替换。

比如说当前页为:http://www.test.com/list/page.aspx?id=12,其页面内链接及解析结果为:

BranchUrl Result
/default.aspx?id=14 http://www.test.com/default.aspx?id=14
../details.aspx?id=4 http://www.test.com/details.aspx?id=4
dete.aspx http://www.test.com/list/dete.aspx
 
 
 
 
 
该类C#代码:
using System;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;

public class Utility
{
    /// <summary>
    /// Matches a branch URL that already starts with one of the supported schemes.
    /// </summary>
    private static readonly Regex AbsoluteUrlPattern = new Regex(
        @"^(http|https|ftp|rtsp|mms)://", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Matches the leading "scheme://" part of a base URL.
    /// </summary>
    private static readonly Regex SchemePrefixPattern = new Regex(
        @"^[a-z][a-z0-9+.\-]*://", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Matches a trailing "/name.ext" segment (any query string included) that is treated as a file name.
    /// </summary>
    private static readonly Regex TrailingFilePattern = new Regex(
        @"/[^\./]+\.[^/]+$", RegexOptions.Compiled);

    /// <summary>
    /// Resolves a link found inside a page against the URL of that page.
    /// </summary>
    /// <remarks>
    /// Resolution rules, applied in this order after trimming whitespace and turning "\" into "/":
    /// <list type="bullet">
    /// <item><description>A link starting with http, https, ftp, rtsp or mms followed by "://" is returned as is (trimmed).</description></item>
    /// <item><description>A link starting with "//" receives the scheme of the base URL; it is returned unchanged when the base URL has no scheme.</description></item>
    /// <item><description>A link starting with "/" is appended to the scheme and host of the base URL.</description></item>
    /// <item><description>Every leading "../" removes one directory from the base URL, never going above the host.</description></item>
    /// <item><description>Anything else is appended to the directory of the base URL.</description></item>
    /// </list>
    /// The directory of the base URL is found heuristically: a final segment of the form "name.ext"
    /// is dropped as a file name, while a final segment without a dot is kept as a directory.
    /// </remarks>
    /// <param name="BaseUrl">URL of the current page, e.g. http://www.test.com/list/page.aspx </param>
    /// <param name="BranchUrl">Link found in the page, e.g. ../test.aspx</param>
    /// <returns>The resolved URL.</returns>
    /// <exception cref="ArgumentNullException">
    /// BranchUrl is null, or BaseUrl is null while BranchUrl is not already an absolute URL.
    /// </exception>
    public static string StickUrl(string BaseUrl, string BranchUrl)
    {
        if (BranchUrl == null)
        {
            throw new ArgumentNullException("BranchUrl");
        }

        string branch = BranchUrl.Trim();
        if (AbsoluteUrlPattern.IsMatch(branch))
        {
            return branch;
        }

        // The base URL is only needed for relative links, so it is validated after the absolute check.
        if (BaseUrl == null)
        {
            throw new ArgumentNullException("BaseUrl");
        }

        string baseUrl = BaseUrl.Trim().Replace("\\", "/");
        branch = branch.Replace("\\", "/");

        // Split the base URL into "scheme://host" (root) and the remaining path.
        // Match.Length is 0 when the base URL carries no scheme.
        int schemeLength = SchemePrefixPattern.Match(baseUrl).Length;
        int pathStart = baseUrl.IndexOf('/', schemeLength);
        string root = pathStart < 0 ? baseUrl : baseUrl.Substring(0, pathStart);

        // Protocol-relative link: "//host/path" only borrows the scheme ("http:") of the base URL.
        if (branch.StartsWith("//", StringComparison.Ordinal))
        {
            return schemeLength > 0 ? baseUrl.Substring(0, schemeLength - 2) + branch : branch;
        }

        // Root-relative link: "/test.aspx" replaces the whole path of the base URL.
        if (branch.StartsWith("/", StringComparison.Ordinal))
        {
            return root + branch;
        }

        string directory = GetDirectoryPath(baseUrl.Substring(root.Length));

        // Walk up one directory per leading "../". Only the path part is shortened,
        // so the scheme and host can never be consumed.
        while (branch.StartsWith("../", StringComparison.Ordinal))
        {
            branch = branch.Substring(3);
            int lastSlash = directory.LastIndexOf('/');
            directory = lastSlash < 0 ? string.Empty : directory.Substring(0, lastSlash);
        }

        return root + directory + "/" + branch;
    }

    /// <summary>
    /// Reduces the path part of a base URL to its directory, without a trailing slash.
    /// </summary>
    /// <param name="path">Path part of the base URL: empty, or starting with "/".</param>
    /// <returns>
    /// The path without its final "/" when it ends with one; otherwise the path without a trailing
    /// "/name.ext" segment; otherwise the path unchanged.
    /// </returns>
    private static string GetDirectoryPath(string path)
    {
        if (path.EndsWith("/", StringComparison.Ordinal))
        {
            return path.Substring(0, path.Length - 1);
        }

        return TrailingFilePattern.Replace(path, string.Empty);
    }
}

 主要功能借鉴于:dotNETCMSv1.0sp5 CMS。源码下载地址:http://www.51aspx.com/CV/dotNETCMS10sp5

在数据采集的时候,在文章列表页中匹配出文章内容页的完整路径。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章