.NET 抓取網頁數據

關鍵點

1、正則匹配有效數據

2、多線程併發獲取數據

代碼展示

 /// <summary>
 /// Downloads the page at a URI as an HTML string and extracts regex
 /// capture-group values from HTML text.
 /// </summary>
 class Crawler
 {
     /// <summary>
     /// Address of the page to download.
     /// </summary>
     public string Uri { get; set; }

     /// <summary>
     /// Creates a crawler for the given URI.
     /// </summary>
     /// <param name="uri">Address stored in <see cref="Uri"/>.</param>
     public Crawler(string uri)
     {
         this.Uri = uri;
     }

     /// <summary>
     /// Downloads the HTML at <see cref="Uri"/>, decoded as UTF-8.
     /// The misspelled name is kept so existing callers keep compiling;
     /// <see cref="DownloadHtml"/> is the correctly spelled equivalent.
     /// </summary>
     /// <returns>
     /// The downloaded text, or <c>null</c> if the download threw. In that
     /// case the exception message is written to the console.
     /// </returns>
     public string DownlodHtml()
     {
         using (WebClient client = new WebClient())
         {
             client.Encoding = Encoding.UTF8;
             try
             {
                 return client.DownloadString(this.Uri);
             }
             catch (Exception e)
             {
                 // Best-effort download: report the failure and let the
                 // caller see null instead of an exception.
                 Console.WriteLine(e.Message);
                 return null;
             }
         }
     }

     /// <summary>
     /// Correctly spelled alias of <see cref="DownlodHtml"/>.
     /// </summary>
     /// <returns>Whatever <see cref="DownlodHtml"/> returns.</returns>
     public string DownloadHtml()
     {
         return DownlodHtml();
     }

     /// <summary>
     /// Collects the value of one capture group from every match of a
     /// regular expression in the given HTML.
     /// </summary>
     /// <param name="regx">Regular expression pattern to match.</param>
     /// <param name="html">
     /// Text to search. May be <c>null</c>, which is what
     /// <see cref="DownlodHtml"/> returns after a failed download.
     /// </param>
     /// <param name="i">Index of the capture group whose value is taken from each match.</param>
     /// <returns>
     /// One string per match, in match order, or <c>null</c> when
     /// <paramref name="html"/> is <c>null</c> or nothing matched.
     /// </returns>
     public List<string> GetData(string regx, string html, int i)
     {
         // A failed download yields null; Regex.Matches would throw on it,
         // so report "no data" the same way as the no-match case below.
         if (html == null)
         {
             return null;
         }

         MatchCollection matches = Regex.Matches(html, regx);
         if (matches.Count == 0)
         {
             return null;
         }

         List<string> values = new List<string>(matches.Count);
         foreach (Match match in matches)
         {
             values.Add(match.Groups[i].Value);
         }
         return values;
     }
 }
 /// <summary>
 /// Work item meant to be queued on the thread pool: it processes one URI
 /// and then sets a <see cref="ManualResetEvent"/> so that a thread waiting
 /// on that event knows the work has finished.
 /// </summary>
 class MultiThreadCrawler
 {
     private string _uri;
     // Set when a callback finishes, to notify one or more waiting threads.
     private ManualResetEvent _doneEvent;

     /// <summary>
     /// Creates a work item for one URI.
     /// </summary>
     /// <param name="uri">Address passed to the data-fetching routine.</param>
     /// <param name="doneEvent">Event that each callback sets when it finishes.</param>
     public MultiThreadCrawler(string uri, ManualResetEvent doneEvent)
     {
         _uri = uri;
         _doneEvent = doneEvent;
     }

     /// <summary>
     /// Thread-pool callback: runs AutoGetData for the stored URI, then
     /// signals the done event.
     /// </summary>
     /// <param name="threadContext">State object supplied by the thread pool; not used.</param>
     public void ThreadPoolCallback(Object threadContext)
     {
         try
         {
             // NOTE(review): AutoGetData is not defined in the visible part
             // of this class; confirm where it comes from.
             AutoGetData(_uri);
         }
         finally
         {
             // Signal in finally so a failure in the work above does not
             // skip the Set() call and leave a waiting thread blocked.
             _doneEvent.Set();
         }
     }

     /// <summary>
     /// Thread-pool callback: runs AutoGetDataByPageNum for the stored URI,
     /// then signals the done event.
     /// </summary>
     /// <param name="state">State object supplied by the thread pool; not used.</param>
     public void ThreadPoolCallBackGetOnce(Object state)
     {
         try
         {
             // NOTE(review): AutoGetDataByPageNum is not defined in the
             // visible part of this class; confirm where it comes from.
             AutoGetDataByPageNum(_uri);
         }
         finally
         {
             // Signal in finally so a failure in the work above does not
             // skip the Set() call and leave a waiting thread blocked.
             _doneEvent.Set();
         }
     }
 }

參考資料
1、https://msdn.microsoft.com/zh-CN/library/3dasc8as.aspx


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章