正則表達式過濾html

在項目中會經常用正則表達式過濾html,比如得到Body裏面的內容,獲取網頁中的img,a標籤,或者得到純文本等等。

下面的Demo 實現對Html的過濾

主要用到的命名空間:

1、System.Text.RegularExpressions; // 正則表達式

2、System.IO;   // IO流

3、System.Net; // 網絡請求(WebRequest / WebResponse)

第一步:搭建簡易前臺頁面

    <form id="form1" runat="server">
    <%-- Control IDs and OnClick handler names below must match the members used in the code-behind. --%>
    <div>
        目標源地址:<asp:TextBox ID="TextBox1" runat="server"></asp:TextBox></div>
    <br />
    <asp:TextBox runat="server" TextMode="MultiLine" Width="500px" Height="500px" ID="tbResult"></asp:TextBox>
    <br />
    <asp:Button ID="btnRetrieveAll" runat="server" Text="搜索整個Html源碼" OnClick="btnRetrieveAll_Click" />
    <asp:Button ID="btnRetrievePureTxt" runat="server" Text="搜索純文本" OnClick="btnRetrievePureText_Click" />
    <asp:Button ID="btnRetrieveLink" runat="server" Text="搜索鏈接標籤" OnClick="btnRetrievelink_Click" />
    <asp:Button ID="btnRetrieveImg" runat="server" Text="搜索圖片標籤" 
        OnClick="btnRetrieveImage_Click" />
    <asp:Button ID="btnRetriveScript" runat="server" Text="搜索腳本" 
        OnClick="btnRetrieveSriptCode_Click" />
    </form>  

第二步:定義類級變量

        // Text shown in the result box whenever the page could not be retrieved.
        const string MsgPageRetrieveFailed = "對不起,網頁運行失敗!";
        // Starts out true; GetWholeHtmlCode sets it to false when the download fails.
        bool flgPageRetrieved = true;
        // Address of the page to download; click handlers assign it from TextBox1.Text.
        string strUrl = string.Empty;
        // Value returned by the most recent GetWholeHtmlCode call.
        string strWholeHtml = string.Empty;

第三步:根據目標源取目標html源碼

/// <summary>
        /// Downloads the page at <paramref name="url"/> with HttpWebRequest and returns
        /// its body decoded as UTF-8.
        /// When the download fails, flgPageRetrieved is set to false and the returned
        /// string holds a failure message (or the exception message) instead of HTML.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The page HTML on success; otherwise a description of the failure.</returns>
        public string GetWholeHtmlCode(string url)
        {
            string strHtml = string.Empty;
            try
            {
                // Use the caller-supplied address rather than the strUrl field, so the
                // method honours its own parameter.
                HttpWebRequest wrqContent = (HttpWebRequest)WebRequest.Create(url);
                wrqContent.Timeout = 300000; // milliseconds (5 minutes)

                using (HttpWebResponse wrpContent = (HttpWebResponse)wrqContent.GetResponse())
                {
                    if (wrpContent.StatusCode != HttpStatusCode.OK)
                    {
                        // Keep the failure message: the body is deliberately not read here,
                        // otherwise it would overwrite the message.
                        flgPageRetrieved = false;
                        strHtml = "對不起,網頁運行失敗";
                    }
                    else
                    {
                        using (StreamReader strReader = new StreamReader(wrpContent.GetResponseStream(), Encoding.UTF8))
                        {
                            strHtml = strReader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Best-effort by design: any failure (bad address, network error, HTTP
                // error status) is reported through the flag instead of propagating.
                flgPageRetrieved = false;
                strHtml = e.Message;
            }
            return strHtml;
        }

目標URL源html碼

        /// <summary>
        /// Click handler that downloads the page whose address is typed into TextBox1
        /// and shows its complete HTML source in tbResult, or the failure message when
        /// the download did not succeed.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event arguments.</param>
        protected void btnRetrieveAll_Click(object sender, EventArgs e)
        {
            strUrl = TextBox1.Text;
            strWholeHtml = this.GetWholeHtmlCode(strUrl);

            // The flag is read only after the download attempt above has updated it.
            tbResult.Text = flgPageRetrieved ? strWholeHtml : MsgPageRetrieveFailed;
        }

Html源純文本

        /// <summary>
        /// Click handler that downloads the page whose address is typed into TextBox1,
        /// takes its body element and shows that markup with every tag removed in
        /// tbResult. Shows the failure message when the download did not succeed.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event arguments.</param>
        protected void btnRetrievePureText_Click(object sender, EventArgs e)
        {
            // Read the address here, as btnRetrieveAll_Click does, so this handler does
            // not depend on another handler having filled strUrl first.
            strUrl = TextBox1.Text;
            strWholeHtml = this.GetWholeHtmlCode(strUrl);
            if (!flgPageRetrieved)
            {
                tbResult.Text = MsgPageRetrieveFailed;
                return;
            }

            // [\s\S] matches any character including newlines, exactly like the former
            // (\w|\W) alternation, but without recording one capture per character.
            string strRegexBody = @"<body[^>]*>[\s\S]*?</body[^>]*>";
            string strRegexTag = @"<[^>]*>";

            // When the page has no body element the match is empty and so is the output.
            Match matchBody = Regex.Match(strWholeHtml, strRegexBody, RegexOptions.IgnoreCase);
            string strPureText = Regex.Replace(matchBody.Value, strRegexTag, string.Empty, RegexOptions.IgnoreCase);
            tbResult.Text = strPureText;
        }

獲取腳本代碼

 /// <summary>
        /// Click handler that downloads the page whose address is typed into TextBox1
        /// and lists the content of every script element in tbResult, one element
        /// after another separated by CRLF. Shows the failure message when the
        /// download did not succeed.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event arguments.</param>
        protected void btnRetrieveSriptCode_Click(object sender, EventArgs e)
        {
            // Read the address here, as btnRetrieveAll_Click does, so this handler does
            // not depend on another handler having filled strUrl first.
            strUrl = TextBox1.Text;
            strWholeHtml = this.GetWholeHtmlCode(strUrl);
            if (!flgPageRetrieved)
            {
                tbResult.Text = MsgPageRetrieveFailed;
                return;
            }

            // Group 1 captures the text between the opening and closing script tags.
            // Taking the group, instead of stripping "<...>" from the whole match, keeps
            // script code such as "a < b && c > d" intact.
            string strRegexScript = @"<script[^>]*>([\s\S]*?)</script[^>]*>";
            MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexScript, RegexOptions.IgnoreCase);
            StringBuilder strbScriptList = new StringBuilder();
            foreach (Match matchSingleScript in matchList)
            {
                strbScriptList.Append(matchSingleScript.Groups[1].Value).Append("\r\n");
            }
            tbResult.Text = strbScriptList.ToString();
        }

獲取圖片img

 /// <summary>
        /// Click handler that downloads the page whose address is typed into TextBox1
        /// and lists every img tag found in it in tbResult, one tag per line.
        /// Shows the failure message when the download did not succeed.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event arguments.</param>
        protected void btnRetrieveImage_Click(object sender, EventArgs e)
        {
            // Read the address here, as btnRetrieveAll_Click does, so this handler does
            // not depend on another handler having filled strUrl first.
            strUrl = TextBox1.Text;
            strWholeHtml = this.GetWholeHtmlCode(strUrl);
            if (!flgPageRetrieved)
            {
                tbResult.Text = MsgPageRetrieveFailed;
                return;
            }

            // (?is): case-insensitive, and '.' also matches newlines so a tag that is
            // wrapped across several lines is still found.
            string strRegexImg = @"(?is)<img.*?>";
            MatchCollection matchList = Regex.Matches(strWholeHtml, strRegexImg, RegexOptions.IgnoreCase);
            StringBuilder strbImageList = new StringBuilder();
            foreach (Match matchSingleImage in matchList)
            {
                strbImageList.Append(matchSingleImage.Value).Append("\r\n");
            }
            tbResult.Text = strbImageList.ToString();
        }

html鏈接

  /// <summary>
        /// Click handler that downloads the page whose address is typed into TextBox1
        /// and lists every anchor opening tag found in it in tbResult, one tag per
        /// line. Shows the failure message when the download did not succeed.
        /// </summary>
        /// <param name="sender">Event source.</param>
        /// <param name="e">Event arguments.</param>
        protected void btnRetrievelink_Click(object sender, EventArgs e)
        {
            strUrl = TextBox1.Text;
            strWholeHtml = this.GetWholeHtmlCode(strUrl);

            if (!flgPageRetrieved)
            {
                tbResult.Text = MsgPageRetrieveFailed;
                return;
            }

            // (?is): case-insensitive, and '.' also matches newlines.
            const string anchorPattern = @"(?is)<a .*?>";
            MatchCollection anchors = Regex.Matches(strWholeHtml, anchorPattern, RegexOptions.IgnoreCase);

            StringBuilder output = new StringBuilder();
            for (int i = 0; i < anchors.Count; i++)
            {
                output.Append(anchors[i].Value).Append("\r\n");
            }
            tbResult.Text = output.ToString();
        }

這個Demo能滿足大多數的過濾Html 需求。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章