使用c#實現爬蟲技術

這是我的第一個爬蟲項目,也是我第一次接觸c# 窗體程序。
我的需求:頁面中有音頻文件,但是它是單個下載的,用戶需要一個一個地去點擊下載按鈕進行下載。我的目的:根據用戶的需求篩選出相關的數據,然後拿到頁面上用戶篩選的數據,實現批量下載,將文件下載並存放到用戶本地文件夾中,最後對下載下來的這些文件進行播放。
主要用到的插件有:CefSharp HtmlAgilityPack
將瀏覽器頁面嵌入到winForm中

將web頁面嵌入到winForm的界面中

// Form Load handler: initializes CefSharp, creates the embedded Chromium browser,
// docks it inside pnlTop and wires up the two FrameLoadEnd handlers.
private void Form1_Load(object sender, EventArgs e)
        {
            CefSettings settings = new CefSettings();
            Cef.Initialize(settings);

            // The URL must be an ordinary C# string literal delimited by straight
            // double quotes; typographic quotes do not compile.
            webbrowser = new ChromiumWebBrowser("要嵌入的web地址");
            webbrowser.Dock = DockStyle.Fill;
            this.pnlTop.Controls.Add(webbrowser);

            // Both handlers run on every FrameLoadEnd event, in subscription order.
            webbrowser.FrameLoadEnd += Webbrowser_FrameLoadEnd; // per-URL page handling
            webbrowser.FrameLoadEnd += SetCookie;               // cookie collection
        }

下面是獲取web頁面的url地址做相應的操作

// FrameLoadEnd handler: once the main frame has finished loading, decides what
// to do based on the exact URL of that frame. Sub-frames are ignored.
private void Webbrowser_FrameLoadEnd(object sender, FrameLoadEndEventArgs e)
        {
            if (!e.Frame.IsMain)
            {
                return;
            }

            if (e.Frame.Url == "頁面的url地址(不同的地址處理不同的事情)")
            {
                string listPage = "想要跳轉的頁面地址";
                string js = "window.location.href='" + listPage + "';";
                // Executing this script in the page makes the browser navigate to listPage.
                this.webbrowser.ExecuteScriptAsync(js);
                return;
            }

            if (e.Frame.Url == "url1")
            {
                // The page source is fetched asynchronously; the continuation runs
                // once it is available.
                e.Frame.GetSourceAsync().ContinueWith(task =>
                {
                    string html = task.Result; // the captured page markup
                    SavaProcess(html);
                });
                return;
            }

            if (e.Frame.Url == "url2")
            {
                e.Frame.GetSourceAsync().ContinueWith(task =>
                {
                    string htmlDom = task.Result;

                    // Load the markup into HtmlAgilityPack so it can be queried
                    // with XPath, similar to working with a DOM.
                    var doc = new HtmlDocument();
                    doc.LoadHtml(htmlDom);

                    request requoption = new request();
                    requoption.Method = "POST";

                    // The XPath below depends on the actual structure of the captured
                    // page; it targets the pager cell that carries the total page count.
                    var pageTr = doc.DocumentNode.SelectNodes(@"/html[1]/body[1]/div[3]/table[1]/tbody[1]/tr[@class='forPage']/td[1]/div[1]/div[1]");

                    // SelectNodes returns null (not an empty collection) when nothing
                    // matches, so check for null before reading Count.
                    if (pageTr != null && pageTr.Count > 0)
                    {
                        // All <span> descendants of the first matched node; this may
                        // itself be null when the node contains no <span>.
                        var spanNodes = pageTr[0].SelectNodes(@".//span");
                    }
                });
                return;
            }
        }

設置cookie方法

// FrameLoadEnd handler: asks the global CefSharp cookie manager to visit every
// cookie, routing each one to Visitor_SendCookie through a CookieVisitor.
private void SetCookie(object sender, CefSharp.FrameLoadEndEventArgs e)
        {
            var globalCookies = CefSharp.Cef.GetGlobalCookieManager();

            var cookieVisitor = new CookieVisitor();
            cookieVisitor.SendCookie += Visitor_SendCookie;

            globalCookies.VisitAllCookies(cookieVisitor);
        }

/// <summary>
        /// Stores a visited cookie in the shared <c>cookies</c> dictionary, keyed by
        /// "domain^name" (leading dots stripped from the domain). An existing entry
        /// with the same key is overwritten.
        /// </summary>
        /// <param name="obj">The CefSharp cookie being visited.</param>
        private void Visitor_SendCookie(CefSharp.Cookie obj)
        {
            string cacheKey = obj.Domain.TrimStart('.') + "^" + obj.Name;
            string cookieValue = obj.Value;

            lock (lockObject)
            {
                // The indexer adds the entry when the key is new and replaces it otherwise.
                cookies[cacheKey] = cookieValue;
            }
        }

/// <summary>
        /// Converts the entries of the shared <c>cookies</c> dictionary into a
        /// <see cref="CookieCollection"/> of <see cref="System.Net.Cookie"/> objects.
        /// </summary>
        /// <returns>
        /// A new collection with one cookie per usable dictionary entry. Entries whose
        /// key is not of the form "domain^name", or whose name is empty, are skipped.
        /// </returns>
        private CookieCollection GetCookieCollection()
        {
            lock (lockObject)
            {
                CookieCollection cookieCollection = new CookieCollection();
                foreach (var keyValue in cookies)
                {
                    // Keys are built as domain + "^" + name. Split only at the FIRST '^'
                    // so that a cookie name which itself contains '^' is kept intact.
                    string[] parts = keyValue.Key.Split(new[] { '^' }, 2);
                    if (parts.Length != 2 || string.IsNullOrEmpty(parts[1]))
                    {
                        // System.Net.Cookie.Name rejects an empty name; skip the entry
                        // rather than abort the whole conversion.
                        continue;
                    }

                    System.Net.Cookie cookie = new System.Net.Cookie();
                    cookie.Domain = parts[0];
                    cookie.Name = parts[1];
                    cookie.Value = keyValue.Value;
                    cookieCollection.Add(cookie);
                }
                return cookieCollection;
            }
        }

下面是已經拿到音頻文件的地址了,然後請求下載地址下載文件

/// <summary>
        /// Downloads the resource at <paramref name="url"/> with an HTTP GET and saves it
        /// as <paramref name="fileName"/> in the "AudioList\AMR" folder under the
        /// application base directory.
        /// </summary>
        /// <param name="url">Address of the file to download.</param>
        /// <param name="fileName">File name to use inside the local AudioList\AMR folder.</param>
        /// <param name="data">
        /// Record describing the download; its <c>LocalPath</c> is set to the saved file's
        /// path once the download has completed successfully.
        /// </param>
        /// <remarks>
        /// This method does not throw: any failure is written to the trace output and the
        /// method returns with <c>data.LocalPath</c> left unchanged.
        /// </remarks>
        public void HttpWebRequestGet(Uri url, string fileName, DataModel data)
        {
            try
            {
                HttpWebRequest AudioReq = (HttpWebRequest)WebRequest.Create(url);
                AudioReq.Method = "GET";
                AudioReq.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8";
                AudioReq.KeepAlive = true;
                AudioReq.Referer = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
                AudioReq.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36";
                // Let the framework send Accept-Encoding and decode the body itself.
                // Setting the header by hand without this would write a gzip/deflate
                // compressed body to disk unmodified.
                AudioReq.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                AudioReq.Headers.Set("Accept-Language", "zh-CN,zh;q=0.9");
                AudioReq.Headers.Set("Upgrade-Insecure-Requests", "1");
                AudioReq.Headers.Set("Cookie", "JSESSIONID=" + JSESSIONID + ";rememberPass=1;userAccount=" + uid + ";#pwd=" + pwd + ";loginByTwoCode=0");
                AudioReq.ContentType = "application/x-www-form-urlencoded";

                // CreateDirectory is a no-op when the directory already exists.
                string path = System.AppDomain.CurrentDomain.BaseDirectory + @"AudioList\AMR";
                System.IO.Directory.CreateDirectory(path);

                var localAmrnb = path + "\\" + fileName;

                // The response is requested before the local file is created, so a failed
                // request does not leave an empty file behind. The using blocks release the
                // response, its stream and the file even when copying fails part-way.
                using (HttpWebResponse rsp = (HttpWebResponse)AudioReq.GetResponse())
                using (var responseStream = rsp.GetResponseStream())
                using (FileStream fs = new FileStream(localAmrnb, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                {
                    responseStream.CopyTo(fs);
                }

                // Publish the path only after the whole body has been written.
                data.LocalPath = localAmrnb;
            }
            catch (Exception ex)
            {
                // Still best-effort (no rethrow), but the failure is recorded instead of
                // being discarded.
                System.Diagnostics.Trace.TraceError("HttpWebRequestGet failed for " + url + ": " + ex);
            }
        }

c#序列化數據並寫入文件
// Serialize the collected records to JSON and overwrite the data file with the result.
// NOTE(review): the generic arguments were missing from the listing; the element type is
// assumed to be DataModel, the record type used by the download and playback code — confirm.
List<DataModel> dataList = new List<DataModel>();
// The using block closes and disposes the writer even if serialization or the write throws.
using (System.IO.StreamWriter file1 = new System.IO.StreamWriter(DownloadDataPath, false))
{
    file1.Write(new JavaScriptSerializer().Serialize(dataList));
}
從文件中讀取數據並反序列化
// Read the previously stored JSON (the writer puts everything on a single line).
using (System.IO.StreamReader sr = new System.IO.StreamReader(DownloadDataPath, Encoding.UTF8))
{
    string line = sr.ReadLine();
    if (line != null)
    {
        oldData = line;
    }
}

// Deserialize BEFORE reopening the file for writing: opening the writer truncates the
// file, so a deserialization failure after that point would lose the stored data.
// NOTE(review): the generic arguments were missing from the listing; the element type is
// assumed to be DataModel — confirm.
List<DataModel> oldDataList = null;
if (!string.IsNullOrWhiteSpace(oldData))
{
    oldDataList = new JavaScriptSerializer().Deserialize<List<DataModel>>(oldData);
}
if (oldDataList == null)
{
    // Empty file or a JSON "null": treat as no previous records.
    oldDataList = new List<DataModel>();
}

// The previously stored records are appended after the records already in dataList.
dataList.AddRange(oldDataList);

using (System.IO.StreamWriter file2 = new System.IO.StreamWriter(DownloadDataPath, false))
{
    file2.Write(new JavaScriptSerializer().Serialize(dataList));
}

下面向窗體中添加MediaPlayer播放器
首先添加引用如下圖所示:
在這裏插入圖片描述
其次將mediaPlayer組件添加到工具箱中,菜單欄:工具—>選擇工具箱選項,添加如下組件
在這裏插入圖片描述
添加完之後就可以在工具箱中將組件直接拖到界面上了,

具體實現播放的代碼如下所示

	// Rebuilds the player's current playlist from oldDataList; always returns true.
	public Boolean getMediaPlayData()
{
    this.playMedia.currentPlaylist.clear();

    // Queue every record's local file, in list order.
    foreach (var record in oldDataList)
    {
        var mediaItem = playMedia.newMedia(record.LocalPath);
        this.playMedia.currentPlaylist.appendItem(mediaItem);
    }

    return true;
}
/// <summary>
/// Click handler for the "query and play" button: rebuilds the playlist and,
/// if that succeeds, starts playback with auto-start on and shuffle off.
/// </summary>
 private void button1_Click(object sender, EventArgs e)
 {
     bool playlistReady = getMediaPlayData();
     if (!playlistReady)
     {
         return;
     }

     var playerSettings = this.playMedia.settings;
     playerSettings.autoStart = true;
     playerSettings.setMode("shuffle", false);

     this.playMedia.Ctlcontrols.play();
 }
// PlayStateChange handler: when the player reports the Ready state, start
// playback again so that the next file plays.
private void wmp_PlayStateChange(object sender, AxWMPLib._WMPOCXEvents_PlayStateChangeEvent e)
{
    var currentState = (WMPLib.WMPPlayState)e.newState;
    if (currentState != WMPLib.WMPPlayState.wmppsReady)
    {
        return;
    }

    playMedia.Ctlcontrols.play();
}

以上不是完整的代碼。
總體來說把大致的過程和用到的一些技術記錄下來,加深記憶。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章