爬蟲之IP代理

       使用爬蟲爬取數據的時候,很容易出現IP被鎖定、返回403、要求輸入驗證碼等情況。一般出現此類情況時,就會使用IP代理,通過頻繁更換IP來實現數據的高頻爬取和併發爬取。

      現在的做法是增加一個代理IP服務,持續抓取幾個免費的代理IP平臺,大致流程如下:

以上圖片來源於網絡.

其中代理的網站大致有:

鏈接:http://note.youdao.com/noteshare?id=96531d191709330d79d66088323619e0&sub=39E37FA9387C46C18B5AD3D07B096437


鏈接:http://note.youdao.com/noteshare?id=2f241bbf336978b86f89a1a268ac9478&sub=6290BEA38B3C47FEAF1B931A0E275341


para.URL = "http://www.xicidaili.com/nn/1"; // 西刺
para.URL = "http://ip84.com/dlgn"; // IP巴士
para.URL = "http://www.ip3366.net/free/?stype=1"; // 雲代理
para.URL = "http://www.iphai.com/free/ng"; // IP海
para.URL = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip"; // 66ip
proxy_url = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'

以上是代理的常用接口和平臺

//服務的核心方法,啓動一個監聽,然後訪問這個網址的都會 對已獲取的代理IP序列進行檢查,並找到最新能用的返回給API接口

            // Prefix the HTTP listener binds to; also shown as the console window title.
            string url = "http://*:10086/";
            Console.Title = url;

            // Start the HTTP front end. StartLisener() spawns its own background
            // thread, so this task completes as soon as that thread is launched.
            Task.Run(() => new HttpListenerServer(url).StartLisener());
            // Fetch proxy IPs: one task per source site.
            Task.Run(() => new Free89().Start());
            Task.Run(() => new gitProxy().Start());
            Task.Run(() => new IpAear().Start()); // unusable: times out far too often
            Task.Run(() => new kuaidailiGaoNi().Start());
            Task.Run(() => new kuaidailiPuTong().Start());
            Task.Run(() => new Liunian().Start()); // still usable
            Task.Run(() => new NiMingGet().Start());
            Task.Run(() => new qiyunProxy().Start());
            Task.Run(() => new Xici().Start());
            Task.Run(() => new XiciGaoni().Start()); // usable, but low quality
            Task.Run(() => new XiciPutong().Start());
            // Block the main thread so the process stays alive until Enter is pressed.
            Console.ReadLine();

HttpListenerServer

/// <summary>
/// Minimal HTTP front end for the proxy pool. A request without both the
/// <c>ip</c> and <c>action</c> query parameters receives the next proxy taken
/// from <c>Currentobject.GetQueue()</c>; a request with both is dispatched to
/// <see cref="ParameterProcess"/>.
/// </summary>
public class HttpListenerServer
{
    /// <summary>
    /// How many times the proxy queue is polled before the request is answered with "False".
    /// </summary>
    private const int MaxQueueAttempts = 6;

    /// <summary>
    /// Pause after each poll that found the queue empty, in milliseconds.
    /// </summary>
    private const int QueuePollDelayMs = 1000;

    /// <summary>
    /// The underlying listener.
    /// </summary>
    private readonly HttpListener listerner;

    /// <summary>
    /// Background thread that accepts incoming requests.
    /// </summary>
    private Thread thread;

    /// <summary>
    /// Creates the listener and registers the prefix it will serve.
    /// Nothing is bound until <see cref="StartLisener"/> is called.
    /// </summary>
    /// <param name="url">URI prefix to listen on.</param>
    public HttpListenerServer(string url = "http://*:10086/")
    {
        listerner = new HttpListener();
        listerner.AuthenticationSchemes = AuthenticationSchemes.Anonymous;
        listerner.Prefixes.Add(url);
    }

    /// <summary>
    /// Starts listening and launches the background accept loop.
    /// </summary>
    /// <remarks>
    /// The listener is started on the calling thread so that a bind failure is
    /// logged and surfaced to the caller instead of terminating the process from
    /// a background thread, and so the success message is only printed when the
    /// listener is really up.
    /// </remarks>
    public void StartLisener()
    {
        try
        {
            listerner.Start();
        }
        catch (Exception ex)
        {
            LogError(ex);
            throw;
        }

        thread = new Thread(AcceptLoop);
        thread.IsBackground = true;
        thread.Start();
        Console.WriteLine("代理服務器開啓成功!");
    }

    /// <summary>
    /// Handles one request and writes a plain-text answer.
    /// </summary>
    /// <param name="httpContext">Context of the request being served.</param>
    public void Process(HttpListenerContext httpContext)
    {
        try
        {
            httpContext.Response.StatusCode = 200;
            // Disposing the writer closes the output stream, so every byte of the
            // answer - including the "false" fallback - must be written while
            // this single writer is still open.
            using (StreamWriter writer = new StreamWriter(httpContext.Response.OutputStream))
            {
                try
                {
                    string ip = httpContext.Request.QueryString["ip"];
                    string action = httpContext.Request.QueryString["action"];
                    if (string.IsNullOrEmpty(ip) || string.IsNullOrEmpty(action))
                    {
                        WriteNextProxy(writer);
                    }
                    else
                    {
                        ParameterProcess(ip, action, writer);
                    }
                }
                catch (Exception ex)
                {
                    LogError(ex);
                    writer.WriteLine("false");
                }
            }
        }
        catch (Exception ex)
        {
            // The response itself could not be delivered (for example the client
            // went away); there is nobody left to answer, so just record it.
            LogError(ex);
        }
    }

    /// <summary>
    /// Executes a management action on a proxy address.
    /// </summary>
    /// <param name="ip">Proxy address the action applies to (not used by the current actions).</param>
    /// <param name="action">Action name; only "del" is recognised.</param>
    /// <param name="writer">Writer the textual result is written to.</param>
    public void ParameterProcess(string ip, string action, StreamWriter writer)
    {
        switch (action)
        {
            case "del":
                // NOTE(review): "del" is acknowledged but nothing is removed from
                // the pool here - confirm whether a removal call is missing.
                writer.WriteLine("true");
                break;
            default:
                // Unknown or missing action: answer explicitly rather than with an empty body.
                writer.WriteLine("false");
                break;
        }
    }

    /// <summary>
    /// Accepts requests until the listener stops, serving each on its own thread.
    /// </summary>
    private void AcceptLoop()
    {
        while (true)
        {
            HttpListenerContext httpListenerContext;
            try
            {
                httpListenerContext = listerner.GetContext();
            }
            catch (Exception ex)
            {
                LogError(ex);
                if (!listerner.IsListening)
                {
                    break;
                }
                continue;
            }

            HttpListenerContext accepted = httpListenerContext;
            new Thread(() => Process(accepted)).Start();
        }
    }

    /// <summary>
    /// Writes the next proxy from the queue, polling while the queue is empty;
    /// writes "False" when no proxy became available.
    /// </summary>
    private static void WriteNextProxy(StreamWriter writer)
    {
        for (int attempt = 0; attempt < MaxQueueAttempts; attempt++)
        {
            var queue = Currentobject.GetQueue();
            if (queue != null)
            {
                writer.WriteLine(queue.IP);
                return;
            }
            Thread.Sleep(QueuePollDelayMs);
        }

        // Kept as false.ToString() ("False") so existing clients see the same text as before.
        writer.WriteLine(false.ToString());
    }

    /// <summary>
    /// Writes a timestamped error line to the console.
    /// </summary>
    private static void LogError(Exception ex)
    {
        Console.WriteLine("{0}> 接口異常:{1}", DateTime.Now.ToString("s"), ex.Message);
    }
}

以上是核心方法。

下面是一個簡單的西刺代理爬取的核心方法:

using HtmlAgilityPack;
using HttpRequestCore;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ProxyIPPool
{
    /// <summary>
    /// Crawler for the Xici (xicidaili.com) free proxy list. Downloads the list
    /// page, extracts address/port pairs from the <c>ip_list</c> table, verifies
    /// each new proxy and stores the ones that pass.
    /// </summary>
    public class Xici : IProxyAction
    {
        /// <summary>Page the proxy table is read from.</summary>
        private const string ListUrl = "https://www.xicidaili.com/";

        /// <summary>Source label passed to the proxy check.</summary>
        private const string SourceName = "西刺";

        /// <summary>Only rows with exactly this many cells are treated as proxy rows.</summary>
        private const int ExpectedCellCount = 8;

        /// <summary>Longest time to wait for one proxy verification before moving on, in milliseconds.</summary>
        private const int VerifyTimeoutMs = 4 * 1000;

        /// <summary>Pause between two crawls.</summary>
        private static readonly TimeSpan CrawlInterval = TimeSpan.FromMinutes(20);

        /// <summary>
        /// Crawls the list forever, sleeping between rounds. A failed round is
        /// logged and does not stop the loop.
        /// </summary>
        public void Start()
        {
            while (true)
            {
                try
                {
                    CrawlOnce();
                }
                catch (Exception ex)
                {
                    Log("crawl failed", ex);
                }
                Thread.Sleep(CrawlInterval);
            }
        }

        /// <summary>
        /// Downloads the list page once and processes every row of the proxy table.
        /// </summary>
        private static void CrawlOnce()
        {
            RequestInfo requestInfo = new RequestInfo(ListUrl, HttpMethod.GET, new HttpDefaultConfig() { UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36" });
            requestInfo.Headers.Add("Cookie", "__cfduid=dbc0747d4f20e880b1f3fbeddd7ee7f9b1518096123; Hm_lvt_24b7d5cc1b26f24f256b6869b069278e=1518096226; yjs_id=4b3a274b8bb0be2202978cee06964563; yd_cookie=5bc973fb-f37b-425614221fbda95de6a441f2298ff2543cf1; UM_distinctid=1654c7c2cf02a0-09d94a3256c3ad-514d2f1f-144000-1654c7c2cf35bb; CNZZDATA1254651946=1879619778-1534585157-%7C1534585157; Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586531; Hm_lpvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586545");
            var html = HttpCore.Execute(requestInfo);

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            var table = doc.DocumentNode.SelectSingleNode("//table[@id='ip_list']");
            if (table == null)
            {
                Console.WriteLine("{0}> [{1}] proxy table 'ip_list' not found", DateTime.Now.ToString("s"), SourceName);
                return;
            }

            // ".//tr" is relative to the table; a bare "//tr" would search the
            // whole document regardless of the node it is called on.
            var rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                return;
            }

            foreach (HtmlNode row in rows)
            {
                try
                {
                    ProxyIP proxy = ParseRow(row);
                    // Skip non-proxy rows and proxies that are already known.
                    if (proxy == null || Currentobject.IsExit(proxy))
                    {
                        continue;
                    }
                    VerifyAndStore(proxy);
                }
                catch (Exception ex)
                {
                    Log("row skipped", ex);
                }
            }
        }

        /// <summary>
        /// Builds a <c>ProxyIP</c> from a table row: cell 1 is read as the address
        /// and cell 2 as the port. Returns null for rows that do not have the
        /// expected cell count (such as header rows) or whose port is not a number.
        /// </summary>
        private static ProxyIP ParseRow(HtmlNode row)
        {
            var cells = row.SelectNodes("td");
            if (cells == null || cells.Count != ExpectedCellCount)
            {
                return null;
            }

            string ip = cells[1].InnerText.Trim();
            int port;
            if (!int.TryParse(cells[2].InnerText.Trim(), out port))
            {
                return null;
            }

            return new ProxyIP() { IP = string.Format("{0}:{1}", ip, port), IPAddress = ip, Port = port, CreateTime = DateTime.Now, State = ProxyIPState.未驗證 };
        }

        /// <summary>
        /// Verifies the proxy on a worker task and stores it when the check
        /// succeeds. Waits at most <see cref="VerifyTimeoutMs"/> so one slow proxy
        /// cannot stall the crawl; a check that overruns keeps running in the
        /// background and may still store the proxy later.
        /// </summary>
        private static void VerifyAndStore(ProxyIP proxy)
        {
            Task verification = Task.Run(() =>
            {
                try
                {
                    if (Currentobject.CheckProxyIp(proxy, SourceName))
                    {
                        Currentobject.AddOrUpdate(proxy);
                    }
                }
                catch (Exception ex)
                {
                    Log("verification failed", ex);
                }
            });
            verification.Wait(VerifyTimeoutMs);
        }

        /// <summary>
        /// Writes a timestamped diagnostic line to the console.
        /// </summary>
        private static void Log(string what, Exception ex)
        {
            Console.WriteLine("{0}> [{1}] {2}: {3}", DateTime.Now.ToString("s"), SourceName, what, ex.Message);
        }
    }
}

 

以上就是 代理IP的核心。其他都是根據這些擴展而來。

 

 

 

 

 

 

 

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章