Web scraping — distributed crawling of NetEase News with RedisSpider: proxy pool, UA pool, Selenium for dynamically loaded data, distributed setup

Summary:

Write the spider as an ordinary Scrapy project first, then convert it to the distributed version.

Using Selenium

1. In the spider file, create a browser instance with Selenium.
2. Then override the downloader middleware's process_response method, so that the response the downloader fetched is replaced with the full page content rendered by the browser instance.
3. The page keeps loading more data as you scroll down, so also have the browser execute a short piece of JavaScript that scrolls to the bottom of the page (see the sketch after this list).
4. Uncomment the middleware entries in settings.py so they take effect.
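
For step 3, the middleware below scrolls once and then sleeps for a fixed 5 seconds. A slightly more robust variant keeps scrolling until the page height stops growing; this is a minimal sketch (the helper scroll_to_bottom and its parameters are my own, not part of the original code):

import time

def scroll_to_bottom(bro, pause=2, max_rounds=10):
    # bro is the Selenium browser instance (spider.bro in the middleware below)
    last_height = bro.execute_script('return document.body.scrollHeight')
    for _ in range(max_rounds):
        bro.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)  # give the lazily loaded news cards time to render
        new_height = bro.execute_script('return document.body.scrollHeight')
        if new_height == last_height:  # nothing new appeared, we reached the real bottom
            break
        last_height = new_height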

Both the proxy pool and the UA pool are implemented as downloader middlewares; pay attention to which base class each custom class must inherit (in middlewares.py below, RandomUserAgent inherits UserAgentMiddleware, while Proxy only needs object).

Converting to a distributed crawler

Change the base class to class WangyiproSpider(RedisSpider), comment out start_urls, and add redis_key = 'wangyi'.
Note that the settings are configured the same way as in the previous post, and the crawler is started the same way.
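
A reminder of the start-up procedure (a sketch; the exact file locations depend on your setup): run the spider on every crawler node, then push the start URL into the Redis list named by redis_key:

scrapy runspider wangyipro.py          # from the project's spiders directory, on each node
lpush wangyi https://news.163.com/     # inside redis-cli on the Redis server, to seed the queue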

Full code:

wangyipro.py

import scrapy
import re

from selenium import webdriver
from wangyi.items import WangyiItem
from scrapy_redis.spiders import RedisSpider

class WangyiproSpider(RedisSpider):
    name = 'wangyipro'
    # allowed_domains = ['www.wangyi.com']
    # start_urls = ['https://news.163.com/']
    redis_key ='wangyi'


    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Selenium: create one browser instance when the spider starts, close it when the spider ends
        # (executable_path is the Selenium 3 style; Selenium 4 passes the driver path via a Service object)
        self.bro = webdriver.Chrome(executable_path='C:/Users/GHL/Desktop/分析/firstdemo/chromedriver')

    def closed(self, reason):
        # Called automatically when the spider closes
        print('spider finished')
        self.bro.quit()



    def parse(self, response):
        # Loop over the section tabs and pick out: domestic, international, military, aviation
        lis=response.xpath('//div[@class="ns_area list"]/ul/li')
        # indexs=[3,4,6,7]
        indexs=[3]
        li_list=[]
        for index in indexs:
            li_list.append(lis[index])

        # Get each tab's link and title text
        for li in li_list:
            url=li.xpath('./a/@href').extract_first()
            title=li.xpath('./a/text()').extract_first()
            # print(url + ':' + title)  # quick check

            # With the section URL in hand, issue another request to fetch that page's data
            yield scrapy.Request(url=url,callback=self.parseSecond,meta={'title':title})

    def parseSecond(self,response):
        print(response.body)
        div_li= response.xpath('//div[contains(@class,"data_row news_article clearfix")]')

        # print(len(div_li))  # 68

        # div_list = response.xpath('//div[@class="data_row news_article clearfix"]')
        # print(len(div_list))  # the exact class match just would not work, for whatever reason
        # Quick check:
        # print(len(div_list)) ----- 0? The page data is loaded dynamically:
        # a real browser can fetch it, so instantiate a browser object with Selenium,
        # let it issue the request, then override the downloader middleware's process_response
        # method to replace the response's page data with the browser-rendered content.

        for div in div_li:
            head=div.xpath('.//div[@class="news_title"]/h3/a/text()').extract_first()
            url=div.xpath('.//div[@class="news_title"]/h3/a/@href').extract_first()

            # imgurl = div.xpath('./a/img/@src')  # returns a Selector; extract_first() below gets the string
            imgurl = div.xpath('./a/img/@src').extract_first()

            tags=div.xpath('.//div[@class="news_tag"]//text()').extract()
            new_tags = [re.sub(r'\s', '', tag) for tag in tags]  # strip whitespace from each tag
            tags = ",".join(new_tags)

            # print(head,url,imgurl,tags)
            title = response.meta['title']
            item = WangyiItem()

            item['head'] = head
            item['url'] = url
            item['imgurl'] = imgurl
            item['tags'] = tags
            item['title'] = title

            yield scrapy.Request(url=url, callback=self.getContent, meta={'item': item})

    def getContent(self,response):
        item = response.meta['item']

        content_list = response.xpath('//div[@class="post_text"]/p/text()').extract()
        content = ''.join(content_list)
        item['content'] = content
        yield item

middlewares.py


from scrapy import signals
import time
from scrapy.http import HtmlResponse
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware  # scrapy.contrib was removed in newer Scrapy versions
import random

class WangyiDownloaderMiddleware(object):

    # Selenium integration: swap in the browser-rendered page for the four section pages
    def process_response(self, request, response, spider):

        # request: the request object that produced this response
        # response: the response object from the downloader
        # spider: the spider instance
        if request.url in ['http://news.163.com/domestic/', 'http://news.163.com/world/', 'http://war.163.com/',
                           'http://news.163.com/air/']:
            spider.bro.get(url=request.url)

            # Scroll to the bottom of the page, then wait 5 seconds for the dynamic data to finish loading
            js = 'window.scrollTo(0,document.body.scrollHeight)'
            spider.bro.execute_script(js)
            time.sleep(5)

            page_text = spider.bro.page_source

            # print(page_text)  # at this point the full page data is available

            # Return a new response object built from the browser-rendered source
            return HtmlResponse(url=spider.bro.current_url, body=page_text, encoding='utf-8', request=request)
        else:
            return response


# UA pool
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
    "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
    "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
    "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
    "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]


class RandomUserAgent(UserAgentMiddleware):
    # Attach a random User-Agent from the pool to every outgoing request
    def process_request(self, request, spider):
        ua = random.choice(user_agent_list)
        request.headers.setdefault('User-Agent', ua)


# Proxy pool
class Proxy(object):

    def process_request(self, request, spider):
        # Check the scheme of the intercepted request's URL (http or https)
        # request.url looks like: http://www.xxx.com
        h = request.url.split(':')[0]  # the request's scheme
        if h == 'https':
            ip = random.choice(PROXY_https)
            request.meta['proxy'] = 'https://' + ip
        else:
            ip = random.choice(PROXY_http)
            request.meta['proxy'] = 'http://' + ip


PROXY_http = [
    '151.106.8.236:8820',
    '46.167.206.116:8985',
    '113.160.145.185:8955'
]
PROXY_https = [
    '111.198.154.116:9030'
]
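
Aside, not part of middlewares.py: the settings below comment out the Proxy middleware because the free proxy IPs found online could not connect. A quick liveness check before putting an IP into the pool helps filter dead ones out; a minimal standalone sketch using the requests library:

import requests

def proxy_alive(ip_port, scheme='http', timeout=5):
    # Returns True if the proxy answers a simple request within the timeout
    proxies = {scheme: scheme + '://' + ip_port}
    try:
        resp = requests.get(scheme + '://httpbin.org/ip', proxies=proxies, timeout=timeout)
        return resp.status_code == 200
    except requests.RequestException:
        return False

# Example: keep only the proxies that still respond
# PROXY_http = [ip for ip in PROXY_http if proxy_alive(ip)]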

items.py

import scrapy


class WangyiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    head = scrapy.Field()
    url = scrapy.Field()
    imgurl = scrapy.Field()
    tags = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()

settings.py

BOT_NAME = 'wangyi'

SPIDER_MODULES = ['wangyi.spiders']
NEWSPIDER_MODULE = 'wangyi.spiders'

# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyi (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False


DOWNLOADER_MIDDLEWARES = {
   'wangyi.middlewares.WangyiDownloaderMiddleware': 543,
   'wangyi.middlewares.RandomUserAgent': 542,

   # 'wangyi.middlewares.Proxy': 541,  # commented out: the free proxy IPs found online could not connect
}



ITEM_PIPELINES = {
   # 'wangyi.pipelines.WangyiPipeline': 300,  # comment out the local pipeline when running distributed with Redis
    'scrapy_redis.pipelines.RedisPipeline': 400
}



# Use the scrapy_redis dupefilter so request fingerprints are shared across all nodes
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the scrapy_redis scheduler
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Allow pausing/resuming: if a machine fails, the crawl resumes from where it stopped
SCHEDULER_PERSIST = True

# Configure the Redis server, so the spider files can run on other machines.

REDIS_PORT = 6379
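
Only the port appears above; when the spiders run on machines other than the Redis server, scrapy-redis also needs the server's address. An assumed addition, not from the original post (the value is a placeholder):

# REDIS_HOST = '127.0.0.1'  # replace with the IP of the machine running redis-server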

Reference:

https://www.cnblogs.com/foremostxl/p/10098086.html
