Python crawlers with the Scrapy framework

Scrapy framework: advanced topics

Follow the WeChat official account "輕鬆學編程" to learn more.

1、Create a script to launch the spider

Create a start.py file in the project directory and add the following code:

# From now on, running start.py is enough to launch the spider
import scrapy.cmdline

def main():
    # 'mytencent' is the spider name in the current project
    scrapy.cmdline.execute(['scrapy', 'crawl', 'mytencent'])

if __name__ == '__main__':
    main()
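
Note that start.py has to be run from the project root (the directory containing scrapy.cfg) so that Scrapy can find the project settings; running python start.py there is equivalent to running scrapy crawl mytencent on the command line.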

2、Crawl multiple pages automatically

In mytencent.py under the spiders folder, make the MytencentSpider class inherit from CrawlSpider and then add crawl rules:

import scrapy
from tencent.items import TencentItem

from scrapy.spiders import CrawlSpider, Rule  # crawl rules
from scrapy.linkextractors import LinkExtractor  # link extraction

# the spider class inherits from CrawlSpider
class MytencentSpider(CrawlSpider):
    name = 'mytencent'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php?keywords=&tid=0&start=10#a']

    # add a crawl rule: any URL matching the regex r"start=(\d+)#a" will be followed
    rules = (Rule(LinkExtractor(allow=(r"start=(\d+)#a",)), callback='get_parse', follow=True),)

    # the callback must NOT be named parse(); CrawlSpider uses parse() internally
    def get_parse(self, response):
        jobList = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')

        for job in jobList:
            # create a fresh storage item for each job posting
            item = TencentItem()
            # .extract() returns a list of strings; take the first match
            jobName = job.xpath('./td[1]/a/text()').extract()[0]
            jobType = job.xpath('./td[2]/text()').extract()[0]
            item['jobName'] = jobName
            item['jobType'] = jobType
            
            yield item
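
The spider imports TencentItem from tencent.items; for reference, a minimal items.py consistent with the fields used above could look like the following sketch (the field names are taken from the spider, everything else is assumed):

import scrapy

class TencentItem(scrapy.Item):
    # fields referenced by the spider above
    jobName = scrapy.Field()
    jobType = scrapy.Field()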

3、Build requests with the framework's built-in Request()

In mysina.py under the spiders folder:

import scrapy
from scrapy.spiders import CrawlSpider, Rule  # crawl rules
from scrapy.linkextractors import LinkExtractor  # link extraction
# SinaItem is used in get_article(); adjust the package name to match your project
from sina.items import SinaItem

class MysinaSpider(CrawlSpider):
    name = 'mysina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']
    # set the crawl rules; this is an iterable, so multiple rules can be defined
    rules = [Rule(LinkExtractor(allow=(r"index_(\d+)\.shtml",)), callback='get_parse', follow=True)]

    def get_parse(self, response):
        newsList = response.xpath('//ul[@class="list_009"]/li')
        for news in newsList:
            # news headline
            title = news.xpath('./a/text()').extract()[0]
            # news timestamp
            newsTime = news.xpath('./span/text()').extract()[0]
            # print('***********', title, '****', newsTime)
            # URL of the article body
            contentsUrl = news.xpath('./a/@href').extract()[0]
            # build the request with the framework's own Request() and pass data via meta
            '''
            scrapy.Request() parameters include:
            url,
            callback=None,  # callback function
            meta=None,      # data passed along to the callback
            '''
            request = scrapy.Request(url=contentsUrl, callback=self.get_article)
            # meta is a dict used to pass data to the callback; it only travels one level, with this request
            request.meta['title'] = title
            request.meta['newsTime'] = newsTime

            yield request

    def get_article(self,response):
        contents = response.xpath('//div[@id="article"]//text()')
        # article body text
        newsContent = ""
        for content in contents:
            newsContent += content.extract().strip() + '\n'
        print('***** article body *****', newsContent, '***** article body *****')
        item = SinaItem()
        # retrieve the values passed via meta
        item['title'] = response.meta['title']
        item['newsTime'] = response.meta['newsTime']
        
        item['newsContent'] = newsContent

        yield item
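
The get_article() callback above fills a SinaItem; a matching items.py sketch for those fields (assuming the project package is called sina, as in the import added at the top of the spider) might be:

import scrapy

class SinaItem(scrapy.Item):
    title = scrapy.Field()
    newsTime = scrapy.Field()
    newsContent = scrapy.Field()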

4、Template for saving into a MySQL database

Create the database and table in MySQL first, then write the following code in pipelines.py:

import pymysql

class TencentPipeline(object):
    def __init__(self):
        # database connection
        self.conn = None
        # cursor
        self.cur = None

    # called once, when the spider is opened
    def open_spider(self,spider):
        self.conn = pymysql.connect(host='127.0.0.1',
                                    user='root',
                                    password="123456",
                                    database='tjob',  # database name: tjob
                                    port=3306,
                                    charset='utf8')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        cols, values = zip(*item.items())
        sql = "INSERT INTO `%s`(%s) VALUES (%s)" % ('tencentjob',  # table name: tencentjob
                                                    ','.join(cols),
                                                    ','.join(['%s'] * len(values)))

        self.cur.execute(sql, values)

        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

Enable the pipeline in settings.py:

ITEM_PIPELINES = {
   'tencent.pipelines.TencentPipeline': 300,
}
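
For reference, the database and table this pipeline writes to could be created once with a sketch like the one below (the column names follow the TencentItem fields; the types and lengths are assumptions, so adjust them to your data):

import pymysql

# one-off setup script: create the tjob database and the tencentjob table
conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', port=3306, charset='utf8')
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS tjob DEFAULT CHARACTER SET utf8")
cur.execute("""
    CREATE TABLE IF NOT EXISTS tjob.tencentjob (
        id INT AUTO_INCREMENT PRIMARY KEY,
        jobName VARCHAR(255),
        jobType VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
cur.close()
conn.close()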

5、Use middlewares for User-Agent and IP proxying

Add the following to middlewares.py:

from scrapy import signals
import random

# IP proxying
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# User-Agent proxying
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

from weixinsougou.settings import USER_AGENTS,PROXIES


class RandomUAMiddleware(UserAgentMiddleware):
    '''
    Middleware that sets a random User-Agent on each request
    '''
    def process_request(self, request, spider):
        '''
        Every request passes through process_request.
        :param request: the request
        :param spider: the spider instance
        :return:
        '''
        ua = random.choice(USER_AGENTS)
        request.headers.setdefault("User-Agent", ua)

class RandomIPMiddleware(HttpProxyMiddleware):
    '''
    Middleware that sets a random proxy IP on each request
    '''

    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = 'http://' + proxy['ip_port']
        

#class RandomCookieMiddleware(CookiesMiddleware):
#     '''
#     Random cookie pool
#     '''
# 
#     def process_request(self, request, spider):
#         cookie = random.choice(COOKIES)
#         request.cookies = cookie
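
If you later enable the commented-out RandomCookieMiddleware, it would also need from scrapy.downloadermiddlewares.cookies import CookiesMiddleware at the top of middlewares.py, plus a non-empty COOKIES list in settings.py (only an empty placeholder is defined below).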

Then add the following to settings.py:

# -*- coding: utf-8 -*-


# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Disable cookies (enabled by default)
COOKIES_ENABLED = False


# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# enable the custom middlewares
DOWNLOADER_MIDDLEWARES = {
   # 'weixinsougou.middlewares.WeixinsougouDownloaderMiddleware': 543,
   'weixinsougou.middlewares.RandomUAMiddleware': 543,
   'weixinsougou.middlewares.RandomIPMiddleware': 544,

}

# User-Agent pool
USER_AGENTS = [
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
]

# proxy IP pool
PROXIES = [
    {'ip_port': '171.38.85.93:8123'},
    {'ip_port': '113.67.227.143:8118'},
    {'ip_port': '101.236.19.165:8866'},
    {'ip_port': '101.236.21.22:8866'},
]

# cookie pool
COOKIES = []

# reactor thread pool size (default: 10)
REACTOR_THREADPOOL_MAXSIZE = 20

# concurrent requests (default: 16)
CONCURRENT_REQUESTS = 16

# number of items processed concurrently by the pipelines (default: 100)
CONCURRENT_ITEMS = 50

# maximum crawl depth (default: 0, i.e. no depth limit)
DEPTH_LIMIT = 4
# download timeout (seconds)
DOWNLOAD_TIMEOUT = 180

6、Distributed crawling with redis

https://blog.csdn.net/lm_is_dc/article/details/81866275

7、Deployment

https://blog.csdn.net/lm_is_dc/article/details/81869508

8、Manage spiders with gerapy

https://blog.csdn.net/lm_is_dc/article/details/81869508

Postscript

[Postscript] To help everyone learn programming with ease, I created the WeChat official account 【輕鬆學編程】. It has articles to help you pick up programming quickly, practical material to raise your programming level, and programming projects well suited to course design and similar assignments.

You can also add me on WeChat 【1257309054】 and I will add you to the group so we can learn from each other.
If this article helped you, buy me a coffee!

(QR code: official account)

(QR code: appreciation code)

Follow me, and let's grow together~~
