A Weibo keyword crawler in Scrapy (results written to MongoDB)

The fields scraped are (a sample stored document follows the list):

  1. Search keyword
  2. Weibo ID (the poster's nickname)
  3. Post content
  4. Number of likes
  5. Number of reposts
  6. Number of comments
  7. Repost reason (for reposted posts)
  8. Post date
  9. ID of the repost source
  10. Likes on the original post
  11. Comments on the original post
  12. Reposts on the original post
  13. Incremental id stored in the database (can be ignored)
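
For reference, a stored document ends up looking roughly like this (the values are made-up placeholders):

{
    "keyWord": "羅志祥",
    "microBlogId": "some_user",
    "contentInfo": "example post content",
    "numOfSupport": "12",
    "numOfComments": "3",
    "numOfForwarding": "5",
    "forwardingReason": "example repost reason",
    "blogDate": "2020-05-01",
    "fromBlogId": "original_poster",
    "originalBlogSupportNum": "8322",
    "originalBlogCommentsNum": "333",
    "originalBlogForwardingNum": "2927",
    "id": 1
}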

In microBlogSpider.py under the spiders folder:

# -*- coding: utf-8 -*-
import scrapy
from scrapy import Spider, Request, FormRequest
from scrapy.selector import Selector
import datetime
import random
from blogSpider.items import microBlogItem
import json

class MicroblogspiderSpider(scrapy.Spider):
    name = 'microBlogSpider'
    allowed_domains = ['weibo.cn']
    search_url = 'https://weibo.cn/search/mblog'
    # crawl up to 100 result pages per day by default
    max_page = 100
    # paste the cookie string from your logged-in session here (see the note below the code)
    myCookie = 'xxxxxxxxxxx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }

    def start_requests(self):
        # Hard-coded search keyword and date range for this example
        keyword = '羅志祥'
        startTime = '2020-05-01'
        endTime = '2020-05-02'

        start_time = datetime.datetime.strptime(startTime, '%Y-%m-%d')
        end_time = datetime.datetime.strptime(endTime, '%Y-%m-%d')
        # print(start_time)
        # print(end_time)
        # Parse the raw cookie string into a dict (the trailing ';' means the
        # last split segment is empty, hence the [:-1])
        cookie = {}
        for pair in self.myCookie.split(';')[:-1]:
            key, _, value = pair.strip().partition('=')
            cookie[key] = value
        while start_time <= end_time:
            # The search endpoint takes dates as YYYYMMDD; query one day at a time
            day = start_time.strftime('%Y%m%d')
            url = '{url}?hideSearchFrame=&keyword={keyword}&starttime={st}&endtime={et}&sort=hot'.format(
                url=self.search_url, keyword=keyword, st=day, et=day)
            yield scrapy.FormRequest(
                url,
                callback=self.parse_index,
                cookies=cookie,
                headers=self.headers,
                meta={
                    'time': start_time,
                    'keyWord': keyword,
                }
            )
            start_time += datetime.timedelta(days=1)


    def parse_index(self, response):
        if not response.body:
            return
        dateTime = response.meta['time']
        keyWord = response.meta['keyWord']
        # print(dateTime)
        cookie = {}
        for pair in self.myCookie.split(';')[:-1]:
            key, _, value = pair.strip().partition('=')
            cookie[key] = value
        # Request page 1 of the results (see the pagination sketch below for the rest)
        url = '{url}&page={page}'.format(url=response.url, page=1)
        yield FormRequest(
            url,
            headers=self.headers,
            callback=self.getInfo,
            cookies=cookie,
            meta={
                'time': dateTime,
                'keyWord': keyWord,
            }
        )
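        # NOTE: only page 1 is requested above; max_page is defined but unused so far.
        # A sketch for walking every result page up to max_page (assuming the page
        # parameter keeps behaving the same) would replace the single request with:
        #
        #     for page in range(1, self.max_page + 1):
        #         url = '{url}&page={page}'.format(url=response.url, page=page)
        #         yield FormRequest(url, headers=self.headers, callback=self.getInfo,
        #                           cookies=cookie, meta={'time': dateTime, 'keyWord': keyWord})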

    def getInfo(self, response):
        pageInfo = Selector(response)

        # print(response.body.decode('utf-8'))
        # allDiv holds every div on the page that contains a Weibo post (their ids start with "M_")
        allDiv = pageInfo.xpath('//div[starts-with(@id,"M_")]')
        # print(allDiv)
        # print(len(allDiv))
        # Parse each post div in turn
        for div in allDiv:
            # flag marks a post that fails to parse and should be skipped (1 = skip)
            flag = 0
            microBlogInfo = microBlogItem()
            microBlogInfo['keyWord'] = response.meta['keyWord']
            microBlogInfo['blogDate'] = str(response.meta['time']).split(' ')[0]
            # 'a.nk' is the poster's profile link; its text is the nickname
            microBlogInfo['microBlogId'] = div.css('a.nk ::text').extract()[0]
            # Repost-related fields default to empty strings
            microBlogInfo['originalBlogSupportNum'] = ''
            microBlogInfo['originalBlogCommentsNum'] = ''
            microBlogInfo['originalBlogForwardingNum'] = ''
            microBlogInfo['forwardingReason'] = ''
            microBlogInfo['fromBlogId'] = ''
            # print(microBlogInfo['microBlogId'])
            divs = div.xpath('div')
            divNum = len(divs)
            # print(divNum)

            if divNum == 3:
                # Three inner divs: a repost that carries images
                # Some posts lack the repost-source link; mark those to be skipped
                try:
                    microBlogInfo['fromBlogId'] = div.xpath('div[1]/span[@class="cmt"]/a/text()').extract()[0]
                except IndexError:
                    flag = 1
                # print(microBlogInfo['fromBlogId'])
                if flag == 0:
                    contents = div.xpath('div[1]/span[@class="ctt"]/text()').extract()
                    microBlogInfo['contentInfo'] = ''
                    for content in contents:
                        content = "".join(content.split())
                        microBlogInfo['contentInfo'] += content
                    # Strip the leading colon left over from "nickname:content"
                    if microBlogInfo['contentInfo'].startswith(':'):
                        microBlogInfo['contentInfo'] = microBlogInfo['contentInfo'][1:]
                    # print(microBlogInfo['contentInfo'])
                    # Like and repost counts of the original post, e.g. ['贊[8322]', '原文轉發[2927]']
                    originalInfo = div.xpath('div[2]/span[@class="cmt"]/text()').extract()
                    oSNum = originalInfo[0][originalInfo[0].index('[')+1:originalInfo[0].index(']')]
                    oFNum = originalInfo[1][originalInfo[1].index('[')+1:originalInfo[1].index(']')]
                    # Comment count of the original post, e.g. ['原文評論[333]']
                    originalComment = div.xpath('div[2]/a[last()]/text()').extract()
                    oCNum = originalComment[0][originalComment[0].index('[') + 1:originalComment[0].index(']')]
                    # print(originalInfo)
                    # print(oSNum)
                    # print(oFNum)
                    # print(originalComment)
                    # print(oCNum)
                    microBlogInfo['originalBlogSupportNum'] = oSNum
                    microBlogInfo['originalBlogCommentsNum'] = oCNum
                    microBlogInfo['originalBlogForwardingNum'] = oFNum
                    lastDivInfo1 = div.xpath('div[3]/text()').extract()
                    lastDivInfo2 = div.xpath('div[3]/*/text()').extract()
                    microBlogInfo['forwardingReason'] = ''
                    for info1 in lastDivInfo1:
                        info1 = "".join(info1.split())
                        microBlogInfo['forwardingReason'] += info1
                    # print(microBlogInfo['forwardingReason'])
                    for info2 in lastDivInfo2:
                        info2 = "".join(info2.split())
                        if info2.startswith('贊['):
                            microBlogInfo['numOfSupport'] = info2[info2.index('[')+1:info2.index(']')]
                        elif info2.startswith('評論['):
                            microBlogInfo['numOfComments'] = info2[info2.index('[')+1:info2.index(']')]
                        elif info2.startswith('轉發['):
                            microBlogInfo['numOfForwarding'] = info2[info2.index('[')+1:info2.index(']')]
                        else:
                            continue
                    # print('Likes: ' + microBlogInfo['numOfSupport'])
                    # print('Comments: ' + microBlogInfo['numOfComments'])
                    # print('Reposts: ' + microBlogInfo['numOfForwarding'])
            elif divNum == 2:
                firstSpan = div.xpath('div[1]/span[1]/@class').extract()[0]
                # print(firstSpan)
                # class 'cmt' on the first span means a repost without images
                if firstSpan == 'cmt':
                    try:
                        microBlogInfo['fromBlogId'] = div.xpath('div[1]/span[@class="cmt"]/a/text()').extract()[0]
                    except IndexError:
                        flag = 1
                    if flag == 0:
                        contents = div.xpath('div[1]/span[@class="ctt"]/text()').extract()
                        microBlogInfo['contentInfo'] = ''
                        for content in contents:
                            content = "".join(content.split())
                            microBlogInfo['contentInfo'] += content
                        # Strip the leading colon
                        if microBlogInfo['contentInfo'].startswith(':'):
                            microBlogInfo['contentInfo'] = microBlogInfo['contentInfo'][1:]

                        originalInfo = div.xpath('div[1]/span[@class="cmt"]/text()').extract()
                        supportText = originalInfo[-2]
                        forwardText = originalInfo[-1]
                        oSNum = supportText[supportText.index('[') + 1:supportText.index(']')]
                        oFNum = forwardText[forwardText.index('[') + 1:forwardText.index(']')]
                        # print(oSNum)
                        # print(oFNum)
                        originalComment = div.xpath('div[1]/a[last()]/text()').extract()
                        oCNum = originalComment[0][originalComment[0].index('[') + 1:originalComment[0].index(']')]
                        # print(oCNum)
                        microBlogInfo['originalBlogSupportNum'] = oSNum
                        microBlogInfo['originalBlogCommentsNum'] = oCNum
                        microBlogInfo['originalBlogForwardingNum'] = oFNum
                        lastDivInfo1 = div.xpath('div[2]/text()').extract()
                        lastDivInfo2 = div.xpath('div[2]/*/text()').extract()
                        microBlogInfo['forwardingReason'] = ''
                        for info1 in lastDivInfo1:
                            info1 = "".join(info1.split())
                            microBlogInfo['forwardingReason'] += info1
                        # print(microBlogInfo['forwardingReason'])
                        for info2 in lastDivInfo2:
                            info2 = "".join(info2.split())
                            if info2.startswith('贊['):
                                microBlogInfo['numOfSupport'] = info2[info2.index('[') + 1:info2.index(']')]
                            elif info2.startswith('評論['):
                                microBlogInfo['numOfComments'] = info2[info2.index('[') + 1:info2.index(']')]
                            elif info2.startswith('轉發['):
                                microBlogInfo['numOfForwarding'] = info2[info2.index('[') + 1:info2.index(']')]
                            else:
                                continue
                        # print('Likes: ' + microBlogInfo['numOfSupport'])
                        # print('Comments: ' + microBlogInfo['numOfComments'])
                        # print('Reposts: ' + microBlogInfo['numOfForwarding'])

                # class 'ctt' on the first span means an original post with images
                elif firstSpan == 'ctt':
                    contents = div.xpath('div[1]/span[@class="ctt"]/text()').extract()
                    microBlogInfo['contentInfo'] = ''
                    for content in contents:
                        content = "".join(content.split())
                        microBlogInfo['contentInfo'] += content
                    # Strip the leading colon
                    if microBlogInfo['contentInfo'].startswith(':'):
                        microBlogInfo['contentInfo'] = microBlogInfo['contentInfo'][1:]
                    lastDivInfo = div.xpath('div[2]/a/text()').extract()
                    # print(lastDivInfo)
                    for info in lastDivInfo:
                        info = "".join(info.split())
                        if info.startswith('贊['):
                            microBlogInfo['numOfSupport'] = info[info.index('[') + 1:info.index(']')]
                        elif info.startswith('評論['):
                            microBlogInfo['numOfComments'] = info[info.index('[') + 1:info.index(']')]
                        elif info.startswith('轉發['):
                            microBlogInfo['numOfForwarding'] = info[info.index('[') + 1:info.index(']')]
                        else:
                            continue
                    # print('Likes: ' + microBlogInfo['numOfSupport'])
                    # print('Comments: ' + microBlogInfo['numOfComments'])
                    # print('Reposts: ' + microBlogInfo['numOfForwarding'])
            # One inner div: an original post without images
            elif divNum == 1:
                contents = div.xpath('div/span[@class="ctt"]/text()').extract()
                microBlogInfo['contentInfo'] = ''
                for content in contents:
                    content = "".join(content.split())
                    microBlogInfo['contentInfo'] += content
                # Strip the leading colon
                if microBlogInfo['contentInfo'].startswith(':'):
                    microBlogInfo['contentInfo'] = microBlogInfo['contentInfo'][1:]
                # print(microBlogInfo['contentInfo'])
                lastDivInfo = div.xpath('div/a/text()').extract()
                # print(lastDivInfo)
                for info in lastDivInfo:
                    info = "".join(info.split())
                    if info.startswith('贊['):
                        microBlogInfo['numOfSupport'] = info[info.index('[') + 1:info.index(']')]
                    elif info.startswith('評論['):
                        microBlogInfo['numOfComments'] = info[info.index('[') + 1:info.index(']')]
                    elif info.startswith('轉發['):
                        microBlogInfo['numOfForwarding'] = info[info.index('[') + 1:info.index(']')]
                    else:
                        continue
                # print('Likes: ' + microBlogInfo['numOfSupport'])
                # print('Comments: ' + microBlogInfo['numOfComments'])
                # print('Reposts: ' + microBlogInfo['numOfForwarding'])
            # Only yield posts that parsed cleanly
            if flag == 0:
                yield microBlogInfo
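
As an aside, the repeated info[info.index('[') + 1:info.index(']')] slicing above could be factored into one small helper (a sketch; bracket_num is a name introduced here, not part of the original code):

import re

def bracket_num(text):
    '''Extract the number inside [...], e.g. '贊[8322]' -> '8322'.'''
    match = re.search(r'\[(\d+)\]', text)
    return match.group(1) if match else ''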

Note: set myCookie to the cookie string from your logged-in session. Log in on weibo.cn, press F12 to open the browser's developer tools, find a request to the site under the Network tab, then copy its Cookie header value into the spider. (The screenshot of these DevTools steps is omitted here.)
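The raw cookie string looks roughly like this (cookie names vary between accounts; the values below are placeholders). Note that the parsing loop in the spider drops the segment after the last ';', so keep the trailing semicolon:

myCookie = '_T_WM=12345abcde; SUB=_2A25placeholder; SSOLoginState=1588300000;'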
In items.py:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class microBlogItem(scrapy.Item):
    '''
    Fields for one scraped Weibo post
    '''
    collection = 'microBlogData'  # not a Field: the pipeline reads this as the MongoDB collection name
    keyWord = scrapy.Field()
    microBlogId = scrapy.Field()
    contentInfo = scrapy.Field()
    numOfSupport = scrapy.Field()
    numOfComments = scrapy.Field()
    numOfForwarding = scrapy.Field()
    forwardingReason = scrapy.Field()
    blogDate = scrapy.Field()
    fromBlogId = scrapy.Field()
    originalBlogSupportNum = scrapy.Field()
    originalBlogCommentsNum = scrapy.Field()
    originalBlogForwardingNum = scrapy.Field()
    id = scrapy.Field()

In pipelines.py:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class BlogspiderPipeline(object):

    def __init__(self):
        '''
        Initialize the MongoDB connection parameters
        '''
        self.mongoUrl = '127.0.0.1'
        self.mongoPort = 27017
        self.mongoDB = 'SpiderTest'

    def open_spider(self, spider):
        '''
        Connect to the database when the spider starts
        :param spider:
        :return:
        '''
        self.client = pymongo.MongoClient(self.mongoUrl, self.mongoPort)
        self.db = self.client[self.mongoDB]

    def process_item(self, item, spider):
        '''
        Write the item to the database, skipping exact duplicates
        :param item:
        :param spider:
        :return:
        '''
        # Treat a post as a duplicate when id, content and date all match.
        # count_documents replaces the cursor.count() call that pymongo 4 removed.
        exist = self.db[item.collection].count_documents({
            'microBlogId': item['microBlogId'],
            'contentInfo': item['contentInfo'],
            'blogDate': item['blogDate'],
        })
        if exist == 0:
            # Simple incremental id (can be ignored; MongoDB adds its own _id)
            item['id'] = self.db[item.collection].count_documents({}) + 1
            self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        '''
        Close the database connection when the spider closes
        :param spider:
        :return:
        '''
        self.client.close()
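
Also make sure the pipeline is enabled in settings.py. Assuming the project is named blogSpider (as the spider's imports suggest):

ITEM_PIPELINES = {
    'blogSpider.pipelines.BlogspiderPipeline': 300,
}

# Depending on the site's robots.txt you may also need:
ROBOTSTXT_OBEY = False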

Now just run the spider!
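
From the project root:

scrapy crawl microBlogSpider

To spot-check what landed in MongoDB afterwards (a quick pymongo snippet using the same defaults as the pipeline above):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['SpiderTest']['microBlogData']
print(collection.count_documents({}))   # total number of stored posts
for doc in collection.find().limit(3):  # peek at a few documents
    print(doc)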
