scrapy簡單的反爬蟲方法總結
1.設置user_agent
@簡單設置方法-在setting中設置ua_list,並在
在setting中設置user_agent,
ua_list = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0'
]
在spider文件中 from … import ua_list
在類中放置headers
headers = {'User-Agent': random.choice(ua_list)}
但是header只在類中獲取的時候random.choice一次,以後Request中均沒有再隨機改變過,所以需要放在函數中去影響request_headers
def parse(self, response):
    """Parse one job-list page: follow every job link, then paginate.

    NOTE: a User-Agent header must be attached to every Request, otherwise
    the site redirects the crawl (anti-bot measure).
    """
    job_links = response.xpath('//a[@class="position_link"]/@href').extract()
    for link in job_links:
        # Re-pick a random UA for every single request; choosing it once at
        # class-definition time would reuse one UA for the whole crawl.
        self.headers['User-Agent'] = random.choice(ua_list)
        yield scrapy.Request(url=link, callback=self.parse_job, headers=self.headers)
    # Follow the "next page" link and recurse into parse, forming the
    # pagination loop.
    next_url = response.xpath('//a[@class="page_no"][contains(text(),"下一頁")]/@href').extract_first("")
    if next_url:
        yield scrapy.Request(url=next_url, headers=self.headers, callback=self.parse)
        # NOTE(review): time.sleep blocks Scrapy's async engine; the
        # DOWNLOAD_DELAY setting is the proper throttle. Kept to preserve
        # the original pacing behavior.
        time.sleep(2)
@在middleware中設置headers,middleware是spider、downlader、scheduler與engine之間的鉤子框架,利用中間件進行requeste和response之間的處理
1.在setting中 from fake_useragent import UserAgent,並設置好UserAgent。
ua = UserAgent()
UA = ua.random
2.在from lagou_spider.settings import UA,
class RandomUserAgentMiddleware(object):
    """Downloader middleware that overrides the spider's User-Agent header.

    NOTE(review): despite the name, the UA is fixed once at import time —
    settings compute ``UA = UserAgent().random`` a single time — so every
    request gets the same value. To truly randomize per request, call
    fake_useragent inside process_request instead.
    """

    def __init__(self):
        # UA is imported from the project settings module.
        self.useragent = UA

    def process_request(self, request, spider):
        # setdefault only fills in User-Agent when the request did not
        # already set one, so explicit per-request headers win.
        # (Removed the unused local variable the original assigned here.)
        request.headers.setdefault('User-Agent', self.useragent)
3.在settings中開啓:
DOWNLOADER_MIDDLEWARES = {'lagou_spider.middlewares.RandomUserAgentMiddleware': 1}
2.設置proxy
@首先先定製爬蟲爬取proxy先,切記傳給scrapy的proxy格式
return 'http://{0}:{1}'.format(ip, port)
proxy爬蟲:
import requests
from bs4 import BeautifulSoup
import pymysql
import time
class GetIP(object):
    """Harvest free proxies from xicidaili.com, validate them with a live
    request, persist them in MySQL, and hand a random working proxy back
    to the Scrapy proxy middleware."""

    url = r'http://www.xicidaili.com/wn/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:54.0) Gecko/20100101 Firefox/54.0'}

    def get_ip(self, next_page='/wn/1'):
        """Recursively walk the listing pages, harvesting IPs from each."""
        next_page_url = r'http://www.xicidaili.com' + str(next_page)
        self.process_page_info(next_page_url)
        response = requests.get(next_page_url, headers=self.header).text
        soup = BeautifulSoup(response, 'html.parser')
        # BUG FIX: attrs must be a dict {'class': 'next_page'}; the original
        # passed a set, which does not filter by class as intended.
        next_link = soup.find('a', attrs={'class': 'next_page'})
        # BUG FIX: guard against a missing link — the original indexed
        # ['href'] unconditionally and crashed (TypeError) on the last page.
        if next_link is not None and next_link.get('href'):
            self.get_ip(next_link['href'])

    def process_page_info(self, url):
        """Extract (ip, port, type) rows from one page and store every row
        that passes the live connectivity check."""
        response = requests.get(url, headers=self.header).text
        soup = BeautifulSoup(response, 'html.parser')
        conn, cur = self.connect_db()
        # BUG FIX: attrs as dict, not set (same issue as in get_ip).
        ip_info_list = soup.find_all('tr', attrs={'class': 'odd'})
        for ip_info in ip_info_list:
            cells = ip_info.find_all('td')
            ip = cells[1].text
            port = cells[2].text
            ty = cells[5].text
            para = (ip, port, ty)
            print(para)
            # BUG FIX: the original passed the builtin `type` instead of the
            # scraped protocol string `ty`.
            if self.check_proxy(ip, port, ty):
                time.sleep(1)  # throttle between validations
                self.insert_db(cur, conn, para)

    def check_proxy(self, ip, port, type):
        """Return True iff the proxy answers a GET with HTTP 200.

        `type` is the protocol key ('HTTP'/'HTTPS'); the name shadows the
        builtin but is kept for interface compatibility.
        """
        proxy = str(ip) + ':' + str(port)
        proxy_dict = {type: proxy}
        try:
            # BUG FIX: `url` was an undefined name here — probe the class
            # URL instead. A timeout plus exception handling is essential:
            # most free proxies are dead and would otherwise crash the run.
            resp = requests.get(self.url, headers=self.header,
                                proxies=proxy_dict, timeout=5)
        except requests.RequestException:
            print('False')
            return False
        if resp.status_code == 200:
            print('True')
            return True
        print('False')
        return False

    def connect_db(self):
        """Open a MySQL connection to the `lagou` DB; return (conn, cursor)."""
        conn = pymysql.connect(host='localhost', user='root',
                               passwd='zhangxinwoaini', charset='utf8',
                               db='lagou')
        cur = conn.cursor()
        return conn, cur

    def insert_db(self, cur, conn, para):
        """Upsert one (ip, port, type) row into ip_db (parameterized SQL)."""
        insert_sql = '''
insert into ip_db(ip,port,type)VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE ip=VALUES (ip)
        '''
        cur.execute(insert_sql, para)
        conn.commit()

    def get_ip_from_db(self):
        """Pick one random stored proxy and return it as 'http://ip:port',
        or None when the table is empty."""
        conn, cur = self.connect_db()
        get_ip_sql = '''
select ip,port,type from ip_db ORDER by rand() limit 1
        '''
        cur.execute(get_ip_sql)
        for ip_info in cur.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            # BUG FIX: '{1)' was a malformed format field that raises
            # ValueError at runtime; closed the brace.
            return 'http://{0}:{1}'.format(ip, port)
@在middleware中添加proxy
class RandomProxyMiddleware(object):
    """Downloader middleware that routes every request through a random
    proxy pulled from the ip_db table."""

    def process_request(self, request, spider):
        # Fetch one random "http://ip:port" entry and attach it so the
        # downloader tunnels this request through it.
        proxy_url = GetIP().get_ip_from_db()
        request.meta["proxy"] = proxy_url
@在settings中加入該middleware
DOWNLOADER_MIDDLEWARES = {
'lagou_spider.middlewares.RandomUserAgentMiddleware': 1,
'lagou_spider.middlewares.RandomProxyMiddleware': 2,
}
3.設置延遲下載,在settings中設置延時
DOWNLOAD_DELAY = 1
4.使用打碼平臺,人工打碼和自己識別