C#實現網頁內容正文抓取

思路:
1、抓取遠程網頁源碼,這裏要實現自動判斷網頁編碼,否則有可能抓到亂碼。我是先看應答的http頭的charset,一般這個很準,但像csdn的新聞比較變態,http應答頭裏的charset和網頁的meta裏聲明的charset不一致,所以我手工加了一下判斷,如果不一致再在內存流裏用網頁聲明的編碼讀取一遍源碼
2、把網頁分割成幾大塊。試用了一下tidy的.net包裝及HtmlParse的.net版本,都不太好用。於是我自己寫了個算法,可以把網頁裏的div塊,td塊等都提取出來,支持嵌套的情況。一般只提取div的文字塊兒就行了。
3、把漢字少於200的文本塊去了,一般少於200字的文本塊不會是正文,即便是正文,一般來說也不會有太多的價值,我直接去掉。(注意:下面示例代碼裏實際使用的閾值是300。)
4、因爲div支持嵌套,所以剩下的文本塊有可能是重複的,一個是另一個的父節點,所以要把最裏層的文本塊找出來。最裏層的文本塊肯定是漢字最多、而其它文本最少的,所以要計算出剩餘文本塊中漢字佔所有字符比例最高的文本塊,基本上它就是正文的文本塊了。當然有的網頁正文裏也可能還有div的文本塊,這時候可能會判斷錯誤,但只要正文嵌套的Div文本塊的漢字少於200字,我的算法還是能準確提取正文文本塊的。這一步我寫了一個自定義的比較方法,傳遞給List的Sort方法。
5、把<p><br>等標籤替換成特殊佔位符[p][br]等,因爲最終的正文需要保留段落和回車換行等格式。這一步用正則實現。
6、把最後剩下的文本塊的html標籤去掉,我用正則過濾的。
7、把[p]替換成回車換行加倆空格,把[br]替換成回車換行,這步也用正則。到此,正文提取完畢

主要代碼:

/// <summary>
/// Helpers for downloading a web page and extracting the block of markup that most
/// likely holds the main (Chinese-language) article text.
/// </summary>
public class GetMainContentHelper
{
    // Minimum number of Chinese characters a block must contain to be considered.
    // NOTE(review): the accompanying description talks about 200 characters, but the code
    // has always filtered at 300 - the value is kept as-is; confirm which one is intended.
    private const int MinChineseChars = 300;

    // Matches a single character in the U+4E00..U+9FA5 range.
    private static readonly Regex ChineseCharRegex = new Regex("[\u4e00-\u9fa5]");

    // Captures the charset name declared by a <meta> element, for both the
    // http-equiv form (content="text/html; charset=xxx") and the short charset="xxx" form.
    private static readonly Regex MetaCharsetRegex =
        new Regex(@"<meta\b[^>]*?charset\s*=\s*[""']?\s*([\w.:\-]+)", RegexOptions.IgnoreCase);

    // Matches opening <p ...> and <br ...> tags only; the word boundary keeps <pre>, <param>
    // and similar from being treated as paragraphs, and [^<>] stops at the tag's own '>'.
    private static readonly Regex ParagraphOrBreakRegex =
        new Regex(@"<(p|br)\b[^<>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    // Everything that is stripped from the selected block: links, style/select/script
    // elements, comments, list items, remaining tags, entities, colour codes and whitespace.
    private static readonly Regex MarkupNoiseRegex =
        new Regex(
            @"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?<lj>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])<a\s+[^>]*>[^<]{2,}</a>(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?<Style><style[\s\S]+?/style>)|(?<select><select[\s\S]+?/select>)|(?<Script><script[\s\S]*?/script>)|(?<Explein><\!\-\-[\s\S]*?\-\->)|(?<li><li(\s+[^>]+)?>[\s\S]*?/li>)|(?<Html></?\s*[^> ]+(\s*[^=>]+?=['""]?[^""']+?['""]?)*?[^\[<]*>)|(?<Other>&[a-zA-Z]+;)|(?<Other2>\#[a-z0-9]{6})|(?<Space>\s+)|(\&\#\d+\;)",
            RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex ParagraphPlaceholderRegex =
        new Regex("\\[p]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    private static readonly Regex BreakPlaceholderRegex =
        new Regex("\\[br]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Orders two text fragments by the share of Chinese characters they contain.
    /// </summary>
    /// <param name="x">First fragment; may be null.</param>
    /// <param name="y">Second fragment; may be null.</param>
    /// <returns>
    /// A negative value when <paramref name="x"/> sorts before <paramref name="y"/>, zero when
    /// they are equal, a positive value otherwise. Null sorts before any non-null string;
    /// fragments with the same ratio are ordered by ordinal string comparison.
    /// </returns>
    public static int CompareDinosByChineseLength(string x, string y)
    {
        if (x == null)
        {
            return y == null ? 0 : -1;
        }

        if (y == null)
        {
            return 1;
        }

        int byRatio = ChineseRatio(x).CompareTo(ChineseRatio(y));
        if (byRatio != 0)
        {
            return byRatio;
        }

        // Ordinal tie-break: independent of the current culture and cheap on large strings.
        return string.CompareOrdinal(x, y);
    }

    /// <summary>
    /// Extracts every element with the given tag name from a piece of HTML, including
    /// nested ones (typically used for containers such as div or td).
    /// </summary>
    /// <param name="input">HTML source to scan.</param>
    /// <param name="tag">Tag name without angle brackets, e.g. "div". Matched case-insensitively.</param>
    /// <returns>
    /// The full markup of each matched element (opening tag through closing tag), in the
    /// order the closing tags appear, so inner elements come before their parents.
    /// An empty list when <paramref name="input"/> or <paramref name="tag"/> is null or empty.
    /// </returns>
    public static List<string> GetTags(string input, string tag)
    {
        List<string> blocks = new List<string>();
        if (string.IsNullOrEmpty(input) || string.IsNullOrEmpty(tag))
        {
            return blocks;
        }

        string openPrefix = "<" + tag;
        string closePrefix = "</" + tag;
        Stack<int> openTagStarts = new Stack<int>();

        // Index of the most recent '<' that has not been closed by a '>' yet, or -1.
        int markupStart = -1;

        for (int i = 0; i < input.Length; i++)
        {
            char current = input[i];
            if (current == '<')
            {
                markupStart = i;
            }
            else if (current == '>' && markupStart >= 0)
            {
                if (IsTagToken(input, markupStart, i, openPrefix))
                {
                    openTagStarts.Push(markupStart);
                }
                else if (IsTagToken(input, markupStart, i, closePrefix) && openTagStarts.Count > 0)
                {
                    int blockStart = openTagStarts.Pop();
                    blocks.Add(input.Substring(blockStart, i - blockStart + 1));
                }

                // Each '<' is consumed by exactly one '>' so a stray '>' in text
                // cannot re-process the previous tag.
                markupStart = -1;
            }
        }

        return blocks;
    }

    /// <summary>
    /// Downloads the page at <paramref name="url"/> and decodes it to text, choosing the
    /// encoding from the HTTP response first and from the page's meta declaration when
    /// the two disagree.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The decoded page source, or an empty string when the response status is not OK or the
    /// download fails (failures are written to <see cref="Trace"/> and not rethrown).
    /// </returns>
    public static string getDataFromUrl(string url)
    {
        string str = string.Empty;
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);

        // Request headers / options.
        request.AllowAutoRedirect = true;
        request.AllowWriteStreamBuffering = true;
        request.Referer = "";
        request.Timeout = 10 * 1000;
        request.UserAgent = "";

        HttpWebResponse response = null;
        try
        {
            response = (HttpWebResponse)request.GetResponse();
            if (response.StatusCode == HttpStatusCode.OK)
            {
                // First guess: the charset announced by the HTTP response.
                string characterSet = response.CharacterSet;
                if (characterSet == "ISO-8859-1")
                {
                    characterSet = "gb2312";
                }

                // Null, empty or unknown charset names fall back to the system default
                // instead of aborting the whole download.
                Encoding encode = TryGetEncoding(characterSet);
                if (encode == null)
                {
                    encode = Encoding.Default;
                }

                // Keep the raw bytes so the page can be decoded a second time if needed.
                byte[] raw = ReadAllBytes(response.GetResponseStream());
                str = Decode(raw, encode);

                // If the page itself declares a different charset, trust the page and decode again.
                Match meta = MetaCharsetRegex.Match(str);
                if (meta.Success)
                {
                    string declaredCharset = meta.Groups[1].Value;
                    if (!string.Equals(declaredCharset, characterSet, StringComparison.OrdinalIgnoreCase))
                    {
                        Encoding declaredEncoding = TryGetEncoding(declaredCharset);
                        if (declaredEncoding != null)
                        {
                            str = Decode(raw, declaredEncoding);
                        }
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort download: log and return whatever was decoded so far.
            Trace.TraceError(ex.ToString());
        }
        finally
        {
            if (response != null)
            {
                response.Close();
            }
        }

        return str;
    }

    /// <summary>
    /// Extracts the main article text from a page's HTML source.
    /// </summary>
    /// <param name="input">HTML source of the page.</param>
    /// <returns>
    /// The text of the div with the highest share of Chinese characters among the divs that
    /// contain enough Chinese characters, with paragraph and line-break tags turned into
    /// line breaks and all other markup removed; an empty string when no div qualifies.
    /// </returns>
    public static string GetMainContent(string input)
    {
        // 1. Collect every div in the page (nested ones included).
        List<string> candidates = GetTags(input, "div");

        // 2 + 3. Skip divs with too few Chinese characters and keep the one that ranks
        // highest under the same ordering CompareDinosByChineseLength defines.
        string best = null;
        float bestRatio = 0f;
        foreach (string block in candidates)
        {
            int chineseCount = ChineseCharRegex.Matches(block).Count;
            if (chineseCount < MinChineseChars)
            {
                continue;
            }

            float ratio = (float)chineseCount / (float)block.Length;
            int order = best == null ? 1 : ratio.CompareTo(bestRatio);
            if (order == 0)
            {
                order = string.CompareOrdinal(block, best);
            }

            if (order > 0)
            {
                best = block;
                bestRatio = ratio;
            }
        }

        if (best == null)
        {
            return "";
        }

        // 4. Turn <p> and <br> into the placeholders [p] and [br] so they survive tag removal.
        string text = ParagraphOrBreakRegex.Replace(best, "[$1]");

        // 5. Strip the remaining markup.
        text = MarkupNoiseRegex.Replace(text, "");

        // 6. Turn the placeholders into line breaks.
        text = ParagraphPlaceholderRegex.Replace(text, "\r\n ");
        text = BreakPlaceholderRegex.Replace(text, "\r\n");
        return text;
    }

    // Share of Chinese characters in the text; 0 for an empty string (avoids 0/0 = NaN).
    private static float ChineseRatio(string text)
    {
        if (text.Length == 0)
        {
            return 0f;
        }

        return (float)ChineseCharRegex.Matches(text).Count / (float)text.Length;
    }

    // True when input[start..end] (a '<' ... '>' token) begins with prefix, ignoring case,
    // and the tag name ends right after it (next character is '>', '/' or whitespace).
    private static bool IsTagToken(string input, int start, int end, string prefix)
    {
        int afterName = start + prefix.Length;
        if (afterName > end)
        {
            return false;
        }

        if (string.Compare(input, start, prefix, 0, prefix.Length, StringComparison.OrdinalIgnoreCase) != 0)
        {
            return false;
        }

        char next = input[afterName];
        return next == '>' || next == '/' || char.IsWhiteSpace(next);
    }

    // Resolves a charset name to an Encoding; returns null when the name is null, empty or not supported.
    private static Encoding TryGetEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    // Reads the stream to its end, closes it and returns the bytes; a null stream yields an empty array.
    private static byte[] ReadAllBytes(Stream source)
    {
        if (source == null)
        {
            return new byte[0];
        }

        using (source)
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[4096];
            int read = source.Read(chunk, 0, chunk.Length);
            while (read > 0)
            {
                buffer.Write(chunk, 0, read);
                read = source.Read(chunk, 0, chunk.Length);
            }

            return buffer.ToArray();
        }
    }

    // Decodes the bytes with the given encoding through a StreamReader, as the original code did.
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (MemoryStream stream = new MemoryStream(raw, false))
        using (StreamReader reader = new StreamReader(stream, encoding))
        {
            return reader.ReadToEnd();
        }
    }
}


發佈了168 篇原創文章 · 獲贊 9 · 訪問量 23萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章