使用 scrapy 模擬登錄並爬取 51cto 文章

a51cto.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest
from cto.items import CtospiderItem

class CtoSpider(scrapy.Spider):
    """Spider that simulates a login to 51cto, then scrapes blog article
    titles and links from the author's article list, following pagination.

    Flow: start_requests -> cto_login (grab CSRF, POST credentials)
          -> after_login (verify login, yield items, follow next page).
    """
    name = '51cto'
    allowed_domains = ['51cto.com']

    def start_requests(self):
        """Fetch the login page first so the CSRF token and session cookies
        can be captured before submitting credentials."""
        urls = ['http://home.51cto.com/index']
        for url in urls:
            # cookiejar meta enables per-session cookie tracking in Scrapy.
            yield scrapy.Request(url, callback=self.cto_login, meta={'cookiejar': 1})

    def cto_login(self, response):
        """Extract the CSRF token from the login page and submit the login form."""
        # The CSRF token is carried in a hidden <input> with an obfuscated name.
        csrf = response.xpath("//input[@name='d1g0Smlta3o7DE0kJiU8OQM3WTMjXhtDJCp8JC0qADhPH2YbGT5dHw==']/@value").extract_first()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://blog.51cto.com',
            'Content-Type': 'application/x-www-form-urlencoded',
        }
        # Debug aid: uncomment to inspect the extracted CSRF token.
        # self.logger.info("csrf value: %s" % csrf)
        yield FormRequest.from_response(response,
                                        url='https://blog.51cto.com/linuxliu?type=1',
                                        headers=headers,
                                        meta={'cookiejar': response.meta['cookiejar']},
                                        formdata={
                                                  # NOTE: the '0' must be a string, otherwise Scrapy
                                                  # raises; this flag means "remember me for 10 days".
                                                  'LoginForm[rememberMe]': '0',
                                                  'LoginForm[username]': '****',
                                                  'LoginForm[password]': '****',
                                                  '_csrf': csrf,
                                                  },
                                        callback=self.after_login,
                                        # dont_click avoids simulating a submit-button click.
                                        dont_click=True,
                                        )

    def after_login(self, response):
        """Verify the login succeeded, yield one item per article, and
        follow the pagination link recursively."""
        # Check a page element that only appears when logged in.
        home_page = response.xpath("//a[@class='con']/text()").extract()
        if 'wx5c789cd76c3af' in home_page:
            self.logger.info('我的博客')
        else:
            self.logger.error('登錄失敗')

        for row in response.css("ul.artical-list li"):
            # BUG FIX: the item was never instantiated in the original code,
            # which raised NameError on the first assignment below.
            item = CtospiderItem()
            item['title_url'] = row.css("a.tit::attr(href)").extract_first()
            # Guard against a missing title so .strip() never hits None.
            title = row.css("a.tit::text").extract_first()
            item['title'] = title.strip() if title else ''
            # fullname is "[title](url)" — markdown link syntax, so clicking
            # the name opens the linked article.
            item['fullname'] = '[' + item['title'] + ']' + '(' + item['title_url'] + ')'
            yield item

        # Follow the next page, keeping the logged-in cookie session and
        # resolving possibly-relative hrefs against the current URL.
        next_page = response.css('li.next a::attr(href)').extract_first()
        # self.logger.info("next page link: %s" % next_page)
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.after_login,
                                 meta={'cookiejar': response.meta['cookiejar']})
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CtospiderItem(scrapy.Item):
    """Container for one scraped 51cto blog article entry."""
    # title: plain article title text
    title = scrapy.Field()
    # title_url: href of the article link
    title_url = scrapy.Field()
    # fullname: markdown-formatted "[title](url)" link
    fullname = scrapy.Field()

 

 

scrapy crawl 51cto -o cto.csv
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章