Summary:
Write the ordinary Scrapy project first, then rework it.
Using selenium
1. In the spider file, create a browser object with selenium.
2. Rewrite the downloader middleware (process_request in the code below) so that, for the target section pages, it hands Scrapy a new response built from the page the browser sees after rendering, instead of the raw page Scrapy would download itself.
3. The page keeps loading more data as it is scrolled down, so the browser also executes a short piece of js that scrolls to the bottom before the page source is read.
4. Uncomment the middleware entries in settings so they take effect.
The proxy pool and the UA pool are also implemented in the middleware; pay attention to which class each custom middleware class needs to inherit from.
Rewriting it as a distributed crawl
Change the base class: class WangyiproSpider(RedisSpider):
Comment out start_urls and add redis_key = 'wangyi'
The settings and the way the crawl is started are the same as in the previous post (see the sketch below for seeding the start URL).
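As a quick reminder of that start procedure, here is a minimal sketch of seeding the start URL with redis-py from any machine; the Redis host, port and db index are assumptions and must match the settings.py the spiders use.

import redis

# Connect to the same Redis server the spiders use (host/port assumed here)
r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)

# RedisSpider pops its start URLs from the list named by redis_key,
# so pushing one URL onto 'wangyi' wakes up every waiting spider process
r.lpush('wangyi', 'https://news.163.com/')

Each machine then starts its own copy with the usual scrapy crawl wangyipro, and they all pull requests from the shared Redis queue.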
Complete code (spider):
import scrapy
import re
from selenium import webdriver
from wangyi.items import WangyiItem
from scrapy_redis.spiders import RedisSpider


class WangyiproSpider(RedisSpider):
    name = 'wangyipro'
    # allowed_domains = ['www.wangyi.com']
    # start_urls = ['https://news.163.com/']
    redis_key = 'wangyi'

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # selenium: create one browser instance when the spider starts, quit it when the spider closes
        self.bro = webdriver.Chrome(executable_path='C:/Users/GHL/Desktop/分析/firstdemo/chromedriver')

    def close(self, spider):
        print('spider finished')
        self.bro.quit()

    def parse(self, response):
        # Pick the section tabs we want: domestic, international, military, aviation
        lis = response.xpath('//div[@class="ns_area list"]/ul/li')
        # indexs = [3, 4, 6, 7]
        indexs = [3]
        li_list = []
        for index in indexs:
            li_list.append(lis[index])
        # Get each section's link and text
        for li in li_list:
            url = li.xpath('./a/@href').extract_first()
            title = li.xpath('./a/text()').extract_first()
            # print(url + ':' + title)  # quick check
            # Request each section page to get its article list
            yield scrapy.Request(url=url, callback=self.parseSecond, meta={'title': title})

    def parseSecond(self, response):
        # print(response.body)  # debug: confirm the Selenium-rendered page reached the spider
        div_li = response.xpath('//div[contains(@class,"data_row news_article clearfix")]')
        # print(len(div_li))  # 68 with the Selenium middleware enabled
        # div_list = response.xpath('//div[@class="data_row news_article clearfix"]')
        # print(len(div_list))  # 0 without the middleware: the article list is loaded dynamically,
        # so the plain download sees none of it. A real browser does get the data, hence the
        # Selenium browser instance plus the downloader middleware that swaps in the rendered page.
        for div in div_li:
            head = div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            url = div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()
            # imgurl = div.xpath('./a/img/@src')  # returns a Selector, so use extract_first()
            imgurl = div.xpath('./a/img/@src').extract_first()
            tags = div.xpath('.//div[@class="news_tag"]//text()').extract()
            new_tags = [re.sub(r'\s', '', tag) for tag in tags]
            tags = ",".join(new_tags)
            # print(head, url, imgurl, tags)
            title = response.meta['title']
            item = WangyiItem()
            item['head'] = head
            item['url'] = url
            item['imgurl'] = imgurl
            item['tags'] = tags
            item['title'] = title
            yield scrapy.Request(url=url, callback=self.getContent, meta={'item': item})

    def getContent(self, response):
        item = response.meta['item']
        content_list = response.xpath('//div[@class="post_text"]/p/text()').extract()
        content = ''.join(content_list)
        item['content'] = content
        yield item
Middleware (middlewares.py):
from scrapy import signals
import time
import random
from scrapy.http import HtmlResponse
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class WangyiDownloaderMiddleware(object):
    # Selenium integration: render the four section pages in a real browser
    def process_request(self, request, spider):
        # request: the intercepted request object
        # spider: the spider instance (gives access to spider.bro, the shared browser)
        if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/', 'http://war.163.com/',
                           'http://news.163.com/air/']:
            spider.bro.get(url=request.url)
            # Scroll to the bottom of the page and wait 5 seconds for the dynamic data to finish loading
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            spider.bro.execute_script(js)
            time.sleep(5)
            page_text = spider.bro.page_source
            # print(page_text)  # at this point the full, rendered page data is available
            # Hand Scrapy a new response object built from the rendered page
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            # Returning None lets Scrapy download every other request normally
            return None
# UA pool
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
class RandomUserAgent(UserAgentMiddleware):
    def process_request(self, request, spider):
        ua = random.choice(user_agent_list)
        request.headers.setdefault('User-Agent', ua)
# Proxy pool
class Proxy(object):
    def process_request(self, request, spider):
        # Check whether the intercepted request uses http or https
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # the request's scheme
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip
PROXY_http = [
'151.106.8.236:8820',
'46.167.206.116:8985',
'113.160.145.185:8955'
]
PROXY_https = [
'111.198.154.116:9030'
]
items.py:
import scrapy


class WangyiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    head = scrapy.Field()
    url = scrapy.Field()
    imgurl = scrapy.Field()
    tags = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
settings.py:
BOT_NAME = 'wangyi'
SPIDER_MODULES = ['wangyi.spiders']
NEWSPIDER_MODULE = 'wangyi.spiders'
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyi (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DOWNLOADER_MIDDLEWARES = {
    'wangyi.middlewares.WangyiDownloaderMiddleware': 543,
    'wangyi.middlewares.RandomUserAgent': 542,
    # 'wangyi.middlewares.Proxy': 541,  # disabled: the free proxy IPs found online would not connect
}
ITEM_PIPELINES = {
    # 'wangyi.pipelines.WangyiPipeline': 300,  # comment out when running distributed via redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Use the scrapy_redis dedup filter so all machines share one fingerprint set
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy_redis scheduler (shared request queue in Redis)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing: if a machine fails, the crawl resumes from where it stopped
SCHEDULER_PERSIST = True
# Point the spiders (which may run on other machines) at the shared Redis server;
# REDIS_HOST is set as in the previous post, only the port is shown here
REDIS_PORT = 6379
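With RedisPipeline enabled, the scraped items end up in Redis instead of a local pipeline. A minimal sketch for peeking at them, assuming the scrapy_redis default items key of '<spider name>:items' and the same Redis server as above:

import json
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)

# scrapy_redis serializes each item to JSON and pushes it onto this list by default
for raw in r.lrange('wangyipro:items', 0, 9):
    item = json.loads(raw)
    print(item.get('head'), item.get('url'))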