關鍵點
1、使用正則表達式匹配有效數據
2、多線程併發獲取數據
代碼展示
/// <summary>
/// Downloads the HTML behind a URI and extracts values from HTML text
/// with a regular expression.
/// </summary>
class Crawler
{
    /// <summary>
    /// URI of the page to download.
    /// </summary>
    public string Uri { get; set; }

    /// <summary>
    /// Creates an instance for the given URI.
    /// </summary>
    /// <param name="uri">URI stored in <see cref="Uri"/>.</param>
    public Crawler(string uri)
    {
        this.Uri = uri;
    }

    /// <summary>
    /// Downloads the HTML at <see cref="Uri"/> as a UTF-8 string.
    /// </summary>
    /// <returns>
    /// The downloaded text, or <c>null</c> when the download throws; in that
    /// case the exception message is written to the console.
    /// </returns>
    public string DownlodHtml()
    {
        using (WebClient wc = new WebClient())
        {
            wc.Encoding = Encoding.UTF8;
            try
            {
                return wc.DownloadString(this.Uri);
            }
            catch (Exception e)
            {
                // Best-effort download: report the failure and let the caller see null.
                Console.WriteLine(e.Message);
                return null;
            }
        }
    }

    /// <summary>
    /// Collects the value of one capture group from every match of a pattern.
    /// </summary>
    /// <param name="regx">Regular expression pattern to apply.</param>
    /// <param name="html">
    /// Text to search. May be <c>null</c> (for example a failed
    /// <see cref="DownlodHtml"/> result), in which case <c>null</c> is returned.
    /// </param>
    /// <param name="i">Index of the capture group whose value is taken from each match.</param>
    /// <returns>
    /// One entry per match, in match order, or <c>null</c> when
    /// <paramref name="html"/> is <c>null</c> or nothing matches.
    /// </returns>
    public List<string> GetData(string regx, string html, int i)
    {
        // DownlodHtml() reports failure as null; treat that as "no data"
        // instead of letting Regex.Matches throw ArgumentNullException.
        if (html == null)
        {
            return null;
        }

        MatchCollection matches = Regex.Matches(html, regx);
        if (matches.Count == 0)
        {
            return null;
        }

        List<string> values = new List<string>(matches.Count);
        foreach (Match match in matches)
        {
            values.Add(match.Groups[i].Value);
        }
        return values;
    }
}
/// <summary>
/// Pairs a URI with a completion event. Each callback processes the URI and
/// then signals the event. The <c>(Object)</c> callback signatures suggest use
/// with the thread pool (NOTE(review): confirm against the caller).
/// </summary>
class MultiThreadCrawler
{
    // URI passed to the crawl routines.
    private string _uri;

    // Event used to notify one or more waiting threads that the work has finished.
    private ManualResetEvent _doneEvent;

    /// <summary>
    /// Stores the URI to process and the event to signal afterwards.
    /// </summary>
    /// <param name="uri">URI passed to the crawl routines.</param>
    /// <param name="doneEvent">Event set once a callback has finished.</param>
    public MultiThreadCrawler(string uri, ManualResetEvent doneEvent)
    {
        this._doneEvent = doneEvent;
        this._uri = uri;
    }

    /// <summary>
    /// Runs <c>AutoGetData</c> on the stored URI, then signals completion.
    /// </summary>
    /// <param name="threadContext">Not used by this method.</param>
    public void ThreadPoolCallback(Object threadContext)
    {
        AutoGetData(this._uri);
        SignalDone();
    }

    /// <summary>
    /// Runs <c>AutoGetDataByPageNum</c> on the stored URI, then signals completion.
    /// </summary>
    /// <param name="state">Not used by this method.</param>
    public void ThreadPoolCallBackGetOnce(Object state)
    {
        AutoGetDataByPageNum(this._uri);
        SignalDone();
    }

    // Puts the completion event into the signaled state so waiters can proceed.
    private void SignalDone()
    {
        this._doneEvent.Set();
    }
}
參考資料
1、https://msdn.microsoft.com/zh-CN/library/3dasc8as.aspx