C# 使用 HTML 解析器 NSoup 爬取小說

閒着沒事,想試一試爬取一些小說,看了園子裏很多前輩寫的文章,很受啓發。

說下我的思路:查看文章網頁鏈接 ----> 後臺遠程抓取 HTML 代碼 ----> 分析所需數據結構 ----> 提取所需信息

在這其中則免不了對html的一些操作。

方法很多種,具體移步前輩文章:https://www.cnblogs.com/cang12138/p/7464226.html?utm_source=debugrun&utm_medium=referral

在這裏我貼出我自己測試過的代碼,以此記錄一下

using NSoup;
using NSoup.Nodes;
using NSoup.Select;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web.Mvc;

namespace PaChongDemo.Controllers
{
    public class HomeController : Controller
    {
        // URLs of the pages to crawl; Index() passes each one to NSoup() in order.
        string[] urlArray = new string[] { "https://www.ddxsku.com/files/article/html/23/23024/index.html", "https://www.ddxsku.com/files/article/html/2/2739/index.html" };

        /// <summary>
        /// Crawls every URL in <c>urlArray</c>, one after another, and then
        /// returns the default view for this action.
        /// </summary>
        /// <returns>The view result produced by <c>View()</c>.</returns>
        public ActionResult Index()
        {
            // Walk the configured URLs in declaration order.
            for (int i = 0; i < urlArray.Length; i++)
            {
                NSoup(urlArray[i]);
            }

            return View();
        }

        /// <summary>
        /// Sends <paramref name="postDataStr"/> to <paramref name="Url"/> as a
        /// form-encoded HTTP POST and returns the response body decoded as UTF-8.
        /// </summary>
        /// <remarks>
        /// Despite its name this method issues a POST request; the name is kept
        /// so that existing callers continue to compile.
        /// </remarks>
        /// <param name="Url">Absolute URL the request is sent to.</param>
        /// <param name="postDataStr">
        /// Request body (for example <c>a=1&amp;b=2</c>). <c>null</c> is treated as an empty body.
        /// </param>
        /// <returns>The response body as text.</returns>
        public string HttpGet(string Url, string postDataStr)
        {
            // Encode the body exactly once so Content-Length always matches the bytes
            // that are actually sent. Previously the length was measured in UTF-8 while
            // the body was written as gb2312, which disagree for non-ASCII text.
            byte[] payload = Encoding.GetEncoding("gb2312").GetBytes(postDataStr ?? string.Empty);

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.ContentLength = payload.Length;
            request.CookieContainer = new CookieContainer();

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(payload, 0, payload.Length);
            }

            // using blocks guarantee the response and its streams are released
            // even when reading the body throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding("utf-8")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Appends <paramref name="content"/> to the text file
        /// <c>&lt;path&gt;/&lt;name&gt;.txt</c>, creating the directory and the file
        /// when they do not exist yet.
        /// </summary>
        /// <param name="content">Text to write; it is followed by a blank line.</param>
        /// <param name="name">File name without extension.</param>
        /// <param name="path">Directory that holds the file.</param>
        public void Novel(string content, string name, string path)
        {
            // CreateDirectory does nothing when the directory already exists.
            Directory.CreateDirectory(path);

            // Build the file path in one place so the first write and every later
            // append target the same file. The old append branch omitted the
            // separator and wrote to "<path><name>.txt" instead.
            string filePath = Path.Combine(path, name + ".txt");

            // append: true creates the file if it is missing, otherwise adds to the end.
            using (StreamWriter writer = new StreamWriter(filePath, true))
            {
                writer.WriteLine(content + "\r\n");
            }
        }


        /// <summary>
        /// 使用HtmlagilityPack方式解析
        /// </summary>
        /// <param name="Url"></param>
        //public void HtmlagilityPack(string Url = "")
        //{
        //    HtmlWeb webClient = new HtmlWeb();
        //    webClient.OverrideEncoding = Encoding.GetEncoding("utf-8");//編碼,這裏網上有些很多寫法都不正確
        //    HtmlDocument doc = webClient.Load(Url);
        //    HtmlNodeCollection anchors = doc.DocumentNode.SelectNodes("//class[@article_texttitleb]");

        //    string sss = "";
        //    foreach (var htmlNode in anchors)
        //    {
        //        int indexnum = anchors.IndexOf(htmlNode);
        //        sss += htmlNode.InnerHtml;
        //    }
        //}

        /// <summary>
        /// Downloads the index page at <paramref name="Url"/> with NSoup, follows every
        /// link found inside elements of class <c>at</c>, and saves the text of each linked
        /// page's <c>#contents</c> element to a text file named after the link text.
        /// </summary>
        /// <remarks>
        /// Files are written under <c>/Content/&lt;page title&gt;</c>. The title and the link
        /// text are both passed through <c>KillBadChar</c> before being used in a path.
        /// Links that are empty, cannot be resolved to an http/https URL, or whose page has
        /// no <c>#contents</c> element are skipped.
        /// </remarks>
        /// <param name="Url">Absolute URL of the index page.</param>
        public void NSoup(string Url = "")
        {
            Document doc = NSoupClient.Connect(Url).Get();

            // Sanitise the page title as well: characters such as ':' '?' '|' are not
            // valid in a directory name and would make path mapping/creation fail.
            string bookTitle = KillBadChar(doc.GetElementsByTag("title").Text).Trim();
            string path = Server.MapPath("/Content/" + bookTitle);

            Elements cataLog = doc.GetElementsByClass("at");
            Document docChild = NSoupClient.Parse(cataLog.ToString());
            Elements chapterLinks = docChild.GetElementsByTag("a");

            // Base address used to resolve relative chapter links.
            System.Uri indexUri = new System.Uri(Url);

            foreach (var link in chapterLinks)
            {
                string href = link.Attr("href").Trim();
                System.Uri chapterUri;
                if (href.Length == 0 || !System.Uri.TryCreate(indexUri, href, out chapterUri))
                {
                    continue; // nothing to fetch
                }

                if (chapterUri.Scheme != System.Uri.UriSchemeHttp && chapterUri.Scheme != System.Uri.UriSchemeHttps)
                {
                    continue; // e.g. "javascript:" or "mailto:" links
                }

                Document chapterDoc = NSoupClient.Connect(chapterUri.AbsoluteUri).Get();
                Element conTent = chapterDoc.GetElementById("contents");
                if (conTent == null)
                {
                    continue; // page has no content container; skip instead of crashing
                }

                string chapterTitle = link.Text();
                Novel(conTent.Text(), KillBadChar(chapterTitle), path);
            }
        }



        // Matches each character that KillBadChar strips: : ; / \ | , * ? " < >
        // Built once and reused instead of being re-parsed on every call.
        private static readonly Regex BadCharPattern =
            new Regex(@"[:;/\\|,*?""<>]", RegexOptions.Compiled);

        /// <summary>
        /// Removes the characters <c>: ; / \ | , * ? " &lt; &gt;</c> from
        /// <paramref name="charStr"/> so the result can be used as a file name.
        /// </summary>
        /// <param name="charStr">Text to clean; <c>null</c> is treated as empty.</param>
        /// <returns>
        /// The input with every listed character removed, or an empty string when the
        /// input is <c>null</c> or empty.
        /// </returns>
        public string KillBadChar(string charStr)
        {
            if (string.IsNullOrEmpty(charStr))
            {
                return string.Empty;
            }

            return BadCharPattern.Replace(charStr, "");
        }
    }
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章