在爬蟲爬取數據的時候,很容易出現IP被鎖定、403、驗證碼等情況。出現此類情況時,一般會使用IP代理,透過大量且頻繁地更換IP,來實現數據的頻繁爬取和併發爬取。
現在的做法是增加一個這樣的服務,持續抓取幾個免費的IP代理平臺,大致也是這個流程:
以上圖片來源於網絡.
其中代理的網站大致有:
鏈接:http://note.youdao.com/noteshare?id=96531d191709330d79d66088323619e0&sub=39E37FA9387C46C18B5AD3D07B096437
鏈接:http://note.youdao.com/noteshare?id=2f241bbf336978b86f89a1a268ac9478&sub=6290BEA38B3C47FEAF1B931A0E275341
para.URL = "http://www.xicidaili.com/nn/1"; // Xici proxy list (xicidaili)
para.URL = "http://ip84.com/dlgn"; // "IP Bus" (ip84)
para.URL = "http://www.ip3366.net/free/?stype=1"; // "Cloud Proxy" (ip3366)
para.URL = "http://www.iphai.com/free/ng"; // "IP Hai" (iphai)
para.URL = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=2&api=66ip"; // 66ip
proxy_url = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'
以上是代理的常用接口和平臺
// Core of the service: start an HTTP listener. Per the author, every request to this address
// checks the queue of proxy IPs collected so far and returns the newest usable one to the API caller.
string url = "http://*:10086/";
Console.Title = url;
Task.Run(() => new HttpListenerServer(url).StartLisener());
// Start the proxy-IP collectors, one task per source.
Task.Run(() => new Free89().Start());
Task.Run(() => new gitProxy().Start());
Task.Run(() => new IpAear().Start());// unusable: times out far too often
Task.Run(() => new kuaidailiGaoNi().Start());
Task.Run(() => new kuaidailiPuTong().Start());
Task.Run(() => new Liunian().Start());// still usable
Task.Run(() => new NiMingGet().Start());
Task.Run(() => new qiyunProxy().Start());
Task.Run(() => new Xici().Start());
Task.Run(() => new XiciGaoni().Start());// usable, but low quality
Task.Run(() => new XiciPutong().Start());
// Block the main thread until Enter is pressed so the tasks started above keep running.
Console.ReadLine();
HttpListenerServer
/// <summary>
/// Minimal HTTP front end for the proxy pool. A request without both the <c>ip</c> and
/// <c>action</c> query parameters is answered with the next proxy taken from
/// <c>Currentobject.GetQueue()</c>; a request that carries both is routed to
/// <see cref="ParameterProcess"/>.
/// </summary>
public class HttpListenerServer
{
    /// <summary>
    /// Number of times the pool is polled for a proxy before the request is answered with "False".
    /// </summary>
    private const int MaxQueueAttempts = 6;

    /// <summary>
    /// Pause between two polls of the pool, in milliseconds.
    /// </summary>
    private const int QueueRetryDelayMs = 1000;

    /// <summary>
    /// The underlying listener.
    /// </summary>
    private readonly HttpListener listerner;

    /// <summary>
    /// Background thread that runs the accept loop.
    /// </summary>
    private Thread thread;

    /// <summary>
    /// Creates the server and registers the prefix it will listen on. Nothing is opened
    /// until <see cref="StartLisener"/> is called.
    /// </summary>
    /// <param name="url">HttpListener URI prefix, for example <c>http://*:10086/</c>.</param>
    public HttpListenerServer(string url = "http://*:10086/")
    {
        listerner = new HttpListener();
        listerner.AuthenticationSchemes = AuthenticationSchemes.Anonymous;
        listerner.Prefixes.Add(url);
    }

    /// <summary>
    /// Starts the listener and then the background accept loop.
    /// </summary>
    /// <remarks>
    /// The listener is started on the calling thread so that a start-up failure is logged and
    /// surfaced to the caller, and so that the success message is printed only once the
    /// listener is really accepting requests.
    /// </remarks>
    public void StartLisener()
    {
        try
        {
            listerner.Start();
        }
        catch (Exception ex)
        {
            Console.WriteLine("{0}> 代理服務器開啓失敗:{1}", DateTime.Now.ToString("s"), ex.Message);
            throw;
        }

        thread = new Thread(AcceptLoop);
        thread.IsBackground = true;
        thread.Start();
        Console.WriteLine("代理服務器開啓成功!");
    }

    /// <summary>
    /// Accepts requests until the listener stops and hands each one to its own thread.
    /// </summary>
    private void AcceptLoop()
    {
        while (true)
        {
            HttpListenerContext httpListenerContext;
            try
            {
                // Blocks until a request arrives.
                httpListenerContext = listerner.GetContext();
            }
            catch (ObjectDisposedException)
            {
                // The listener was closed; nothing more to accept.
                break;
            }
            catch (HttpListenerException ex)
            {
                Console.WriteLine("{0}> 接口異常:{1}", DateTime.Now.ToString("s"), ex.Message);
                if (!listerner.IsListening)
                {
                    break;
                }
                continue;
            }

            // One thread per request: Process may block for several seconds while it waits for a proxy.
            new Thread(() => Process(httpListenerContext)).Start();
        }
    }

    /// <summary>
    /// Handles one request and always tries to answer it with HTTP 200.
    /// </summary>
    /// <remarks>
    /// The reply is built in memory first. If building it fails, the body is the single line
    /// "false" and no partial output is sent. If the reply cannot be written to the client,
    /// the failure is logged and the response is aborted.
    /// </remarks>
    /// <param name="httpContext">Context of the request to answer.</param>
    public void Process(HttpListenerContext httpContext)
    {
        if (httpContext == null)
        {
            return;
        }

        byte[] payload;
        try
        {
            payload = BuildPayload(httpContext.Request);
        }
        catch (Exception ex)
        {
            Console.WriteLine("{0}> 接口異常:{1}", DateTime.Now.ToString("s"), ex.Message);
            payload = EncodeLine("false");
        }

        try
        {
            httpContext.Response.StatusCode = 200;
            // Disposing the output stream flushes the body, mirroring how the reply was finished before.
            using (Stream output = httpContext.Response.OutputStream)
            {
                output.Write(payload, 0, payload.Length);
            }
        }
        catch (Exception ex)
        {
            // Typically the client went away; drop the connection instead of leaving it half open.
            Console.WriteLine("{0}> 接口異常:{1}", DateTime.Now.ToString("s"), ex.Message);
            httpContext.Response.Abort();
        }
    }

    /// <summary>
    /// Parameter handling for requests that carry both <c>ip</c> and <c>action</c>.
    /// </summary>
    /// <param name="ip">Value of the <c>ip</c> query parameter; not used by any action yet.</param>
    /// <param name="action">Requested action. Only "del" is recognised.</param>
    /// <param name="writer">Writer that receives the reply body.</param>
    public void ParameterProcess(string ip, string action, StreamWriter writer)
    {
        // NOTE(review): "del" is acknowledged with "true", but nothing here removes the proxy
        // from the pool. Any other action writes nothing, so the reply body is empty.
        if (action == "del")
        {
            writer.WriteLine("true");
        }
    }

    /// <summary>
    /// Builds the complete reply body for a request as bytes.
    /// </summary>
    private byte[] BuildPayload(HttpListenerRequest request)
    {
        string ip = request.QueryString["ip"];
        string action = request.QueryString["action"];

        using (MemoryStream buffer = new MemoryStream())
        {
            // Same default StreamWriter encoding as when the response stream was written directly.
            using (StreamWriter writer = new StreamWriter(buffer))
            {
                if (string.IsNullOrEmpty(ip) || string.IsNullOrEmpty(action))
                {
                    WriteNextProxy(writer);
                }
                else
                {
                    ParameterProcess(ip, action, writer);
                }
            }

            // ToArray is still valid after the writer has closed the buffer.
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Writes the "ip:port" of the next proxy in the pool, or "False" when none shows up in time.
    /// </summary>
    private static void WriteNextProxy(StreamWriter writer)
    {
        for (int attempt = 1; attempt <= MaxQueueAttempts; attempt++)
        {
            var queue = Currentobject.GetQueue();
            if (queue != null)
            {
                writer.WriteLine(queue.IP);
                return;
            }

            // No point in sleeping after the last attempt; answer immediately instead.
            if (attempt < MaxQueueAttempts)
            {
                Thread.Sleep(QueueRetryDelayMs);
            }
        }

        writer.WriteLine(false.ToString());
    }

    /// <summary>
    /// Encodes a single text line exactly the way a StreamWriter with default settings would.
    /// </summary>
    private static byte[] EncodeLine(string text)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            using (StreamWriter writer = new StreamWriter(buffer))
            {
                writer.WriteLine(text);
            }

            return buffer.ToArray();
        }
    }
}
以上是核心方法。
下面是一個簡單的西刺代理爬取的核心方法:
using HtmlAgilityPack;
using HttpRequestCore;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
namespace ProxyIPPool
{
    /// <summary>
    /// Collector for the Xici (xicidaili.com) free proxy list. Every 20 minutes it downloads the
    /// page, reads the rows of the <c>ip_list</c> table, and hands each proxy that is not yet
    /// known to <c>Currentobject</c> for checking and storage.
    /// </summary>
    public class Xici : IProxyAction
    {
        /// <summary>Source label passed to <c>Currentobject.CheckProxyIp</c>.</summary>
        private const string SourceName = "西刺";

        /// <summary>Page that carries the proxy table.</summary>
        private const string ListUrl = "https://www.xicidaili.com/";

        /// <summary>Number of cells a data row must have; rows with any other count are skipped.</summary>
        private const int ExpectedColumnCount = 8;

        /// <summary>How long the crawl waits for one proxy check before moving to the next row.</summary>
        private const int VerifyTimeoutMs = 4 * 1000;

        /// <summary>Pause between two crawls of the page.</summary>
        private static readonly TimeSpan CrawlInterval = TimeSpan.FromMinutes(20);

        /// <summary>
        /// Runs the crawl loop forever. A failed crawl is logged and retried after the usual pause,
        /// so this method never returns and never throws.
        /// </summary>
        public void Start()
        {
            while (true)
            {
                try
                {
                    CrawlOnce();
                }
                catch (Exception ex)
                {
                    Console.WriteLine("{0}> 西刺抓取異常:{1}", DateTime.Now.ToString("s"), ex.Message);
                }

                Thread.Sleep(CrawlInterval);
            }
        }

        /// <summary>
        /// Downloads the page once and processes every row of the proxy table.
        /// </summary>
        private static void CrawlOnce()
        {
            RequestInfo requestInfo = new RequestInfo(ListUrl, HttpMethod.GET, new HttpDefaultConfig() { UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36" });
            requestInfo.Headers.Add("Cookie", "__cfduid=dbc0747d4f20e880b1f3fbeddd7ee7f9b1518096123; Hm_lvt_24b7d5cc1b26f24f256b6869b069278e=1518096226; yjs_id=4b3a274b8bb0be2202978cee06964563; yd_cookie=5bc973fb-f37b-425614221fbda95de6a441f2298ff2543cf1; UM_distinctid=1654c7c2cf02a0-09d94a3256c3ad-514d2f1f-144000-1654c7c2cf35bb; CNZZDATA1254651946=1879619778-1534585157-%7C1534585157; Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586531; Hm_lpvt_8ccd0ef22095c2eebfe4cd6187dea829=1534586545");

            var html = HttpCore.Execute(requestInfo);
            if (string.IsNullOrEmpty(html))
            {
                return;
            }

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            var table = doc.DocumentNode.SelectSingleNode("//table[@id='ip_list']");
            if (table == null)
            {
                Console.WriteLine("{0}> 西刺頁面未找到 ip_list 表格", DateTime.Now.ToString("s"));
                return;
            }

            // ".//tr" keeps the search inside the table; a bare "//tr" would start again at the document root.
            var rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                // HtmlAgilityPack returns null, not an empty collection, when nothing matches.
                return;
            }

            foreach (var row in rows)
            {
                try
                {
                    ProxyIP proxy = ParseRow(row);
                    // Skip rows that are not proxies and proxies that are already known.
                    if (proxy == null || Currentobject.IsExit(proxy))
                    {
                        continue;
                    }

                    VerifyAndStore(proxy);
                }
                catch (Exception ex)
                {
                    // One bad row must not stop the remaining rows from being processed.
                    Console.WriteLine("{0}> 西刺抓取異常:{1}", DateTime.Now.ToString("s"), ex.Message);
                }
            }
        }

        /// <summary>
        /// Turns one table row into an unverified <c>ProxyIP</c>, or returns null when the row is
        /// not a data row (no td cells, unexpected cell count, empty address or non-numeric port).
        /// </summary>
        private static ProxyIP ParseRow(HtmlNode row)
        {
            var cells = row.SelectNodes("td");
            if (cells == null || cells.Count != ExpectedColumnCount)
            {
                return null;
            }

            string ip = cells[1].InnerText.Trim();
            int port;
            if (ip.Length == 0 || !int.TryParse(cells[2].InnerText, out port))
            {
                return null;
            }

            return new ProxyIP() { IP = string.Format("{0}:{1}", ip, port), IPAddress = ip, Port = port, CreateTime = DateTime.Now, State = ProxyIPState.未驗證 };
        }

        /// <summary>
        /// Checks the proxy on a worker task and stores it when the check succeeds.
        /// </summary>
        /// <remarks>
        /// The caller waits at most <see cref="VerifyTimeoutMs"/> for the check. A slower check is
        /// not cancelled; it keeps running and may still store the proxy later.
        /// </remarks>
        private static void VerifyAndStore(ProxyIP proxy)
        {
            Task check = Task.Run(() =>
            {
                try
                {
                    if (Currentobject.CheckProxyIp(proxy, SourceName))
                    {
                        Currentobject.AddOrUpdate(proxy);
                    }
                }
                catch (Exception ex)
                {
                    Console.WriteLine("{0}> 西刺驗證異常:{1}", DateTime.Now.ToString("s"), ex.Message);
                }
            });

            check.Wait(VerifyTimeoutMs);
        }
    }
}
以上就是 代理IP的核心。其他都是根據這些擴展而來。