Scrapy Study Notes: Lesson 4

Task: crawl all first-level categories, second-level categories, and the individual news articles linked from the ifeng.com navigation page.

ifeng.com navigation page:
First-level headings: [screenshot]
Second-level headings: [screenshot]
News links: [screenshot]
News article titles: [screenshot]

Implementation: the spider project

1. items.py: define the data fields to be scraped

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class IfengprojectItem(scrapy.Item):

    # First-level heading and its URL
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # Second-level heading and its URL
    secondTitle = scrapy.Field()
    secondUrls = scrapy.Field()

    # News link and the directory where the article will be saved
    newsUrls = scrapy.Field()
    newsFileName = scrapy.Field()

    # News headline, body text, and publish time
    newsHead = scrapy.Field()
    newsContent = scrapy.Field()
    newsPublicTime = scrapy.Field()
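
Scrapy items behave like dicts over the declared fields, so the definition can be sanity-checked without running the spider. A minimal sketch, run from the project root so that iFengProject is importable (the sample values below are made up):

from iFengProject.items import IfengprojectItem

item = IfengprojectItem()
item['parentTitle'] = '资讯'                      # hypothetical sample value
item['parentUrls'] = 'http://news.ifeng.com/'     # hypothetical sample value
print(dict(item))
# {'parentTitle': '资讯', 'parentUrls': 'http://news.ifeng.com/'}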

2. ifeng.py: the spider that performs the crawl

# -*- coding: utf-8 -*-
import scrapy
import os
from iFengProject.items import IfengprojectItem

class IfengSpider(scrapy.Spider):
    name = 'ifeng'
    allowed_domains = ['ifeng.com']
    start_urls = ['http://www.ifeng.com/daohang/']

    def parse(self, response):
        
        items = []

        # First-level headings and their URLs
        parentUrls = response.xpath('//div[@class = "col3"]/h2/a/@href').extract()
        parentTitle = response.xpath('//div[@class = "col3"]/h2/a/text()').extract()

        # Second-level headings and their URLs
        secondUrls = response.xpath('//ul[@class = "clearfix"]/li/a/@href').extract()
        secondTitle = response.xpath('//ul[@class = "clearfix"]/li/a/text()').extract()

        # Walk through every first-level category
        for i in range(0, len(parentTitle)):
            # Directory path for this first-level category
            parentFileName = "./数据/"+parentTitle[i]

            # Create the directory if it does not exist
            if(not os.path.exists(parentFileName)):
                os.makedirs(parentFileName)

            # Walk through every second-level category
            for j in range(0, len(secondTitle)):
                item = IfengprojectItem()

                # Save the parent category's title and URL
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # Check whether the sub-category URL starts with its parent category's URL.
                # Some second-level links do not start with the parent URL, so part of the data is discarded.
                if_belong = secondUrls[j].startswith(item['parentUrls'])

                if(if_belong):
                    secondFileName = parentFileName + '/' + secondTitle[j]
                    # Create the second-level directory if it does not exist
                    if(not os.path.exists(secondFileName)):
                        os.makedirs(secondFileName)
                    
                    item['secondUrls'] = secondUrls[j]
                    item['secondTitle'] = secondTitle[j]
                    item['newsFileName'] = secondFileName
                    # Debug log: append each created second-level directory path to a fixed file
                    filename = 'secondFileName.html'
                    with open(filename, 'a+', encoding='utf-8') as f:
                        f.write(item['newsFileName'] + "  **   ")
                    items.append(item)
        
        for item in items:
            yield scrapy.Request(url=item['secondUrls'], meta={'meta_1': item}, callback=self.second_parse)

    
    # Follow each second-level URL and extract the news links under it
    def second_parse(self, response):
        # Retrieve the item passed along via meta
        meta_1 = response.meta['meta_1']
        
        # Extract the list of news links in this sub-category
        xpathStr = "//div[@class='juti_list']/h3/a/@href"
        xpathStr += " | " +"//div[@class='box_list clearfix']/h2/a/@href"
        newsUrls = response.xpath(xpathStr).extract()
        
        items=[]
        for i in range(0, len(newsUrls)):
            # Keep only links that start with the parent category URL and end with .shtml
            if_belong = newsUrls[i].endswith('.shtml') and newsUrls[i].startswith(meta_1['parentUrls'])

            if(if_belong):
                item = IfengprojectItem()
                item['parentUrls'] = meta_1['parentUrls']
                item['parentTitle'] = meta_1['parentTitle']
                item['secondUrls'] = meta_1['secondUrls']
                item['secondTitle'] = meta_1['secondTitle']
                item['newsFileName'] = meta_1['newsFileName']
                item['newsUrls'] = newsUrls[i]
                items.append(item)

        for item in items:
            yield scrapy.Request(url=item['newsUrls'], meta={'meta_2':item}, callback=self.news_parse)

    def news_parse(self, response):
        # Parse the article page itself: headline, body paragraphs and publish time
        item = response.meta['meta_2']
        content = ""
        head = response.xpath("//title/text()")[0].extract()
        content_list = response.xpath('//div[@id="main_content"]/p/text() | //div[@id="yc_con_txt"]/p/text()').extract()
        # The publish time sits in different elements depending on the page template
        timeXpath = "//span[@class='ss01']/text() | //div[@class='yc_tit']/p/span/text()"
        if response.xpath(timeXpath):
            newsPublicTime = response.xpath(timeXpath)[0].extract()
        else:
            newsPublicTime = "时间未统计出来"  # fallback when no publish time is found
        
        for each in content_list:
            content += each
        
        item['newsHead'] = head
        item['newsContent'] = content
        item['newsPublicTime'] = newsPublicTime

        yield item
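
The XPath expressions used in parse() can be tested without hitting the live site, for example with Scrapy's Selector on a hand-written fragment that mimics the navigation page's markup. The HTML below is a made-up example, not the real page source:

from scrapy.selector import Selector

html = '''
<div class="col3">
  <h2><a href="http://news.ifeng.com/">资讯</a></h2>
  <ul class="clearfix">
    <li><a href="http://news.ifeng.com/world/">国际</a></li>
  </ul>
</div>
'''

sel = Selector(text=html)
print(sel.xpath('//div[@class = "col3"]/h2/a/@href').extract())      # ['http://news.ifeng.com/']
print(sel.xpath('//div[@class = "col3"]/h2/a/text()').extract())     # ['资讯']
print(sel.xpath('//ul[@class = "clearfix"]/li/a/@href').extract())   # ['http://news.ifeng.com/world/']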

3. pipelines.py: save the scraped data to disk

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

class IfengprojectPipeline(object):
    def process_item(self, item, spider):
        head = item['newsHead']
        newsPublicTime = item['newsPublicTime']
        # Build a file name from publish time + headline, replacing characters
        # that are not allowed in file names
        filename = newsPublicTime + head.rstrip()
        pattern = r'[\\/:*?"<>|\r\n]+'
        filename = re.sub(pattern, '-', filename) + ".txt"
        # Write the article body into the directory created by the spider
        with open(item['newsFileName'] + '/' + filename, "w", encoding='utf-8') as fp:
            fp.write(item['newsContent'])
        return item
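
For reference, the file-name sanitization step can be tried in isolation. The headline below is a made-up example containing characters that are not allowed in Windows file names:

import re

pattern = r'[\\/:*?"<>|\r\n]+'
filename = '2019年1月1日 标题: 测试?新闻'           # hypothetical headline
print(re.sub(pattern, '-', filename) + '.txt')      # 2019年1月1日 标题- 测试-新闻.txt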

4. settings.py: project configuration, enable the item pipeline that handles the scraped data

# Enable the item pipeline
ITEM_PIPELINES = {
   'iFengProject.pipelines.IfengprojectPipeline': 300,
}
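
With the pipeline registered, the crawl can be started from the project root with the standard Scrapy command line (the spider name 'ifeng' comes from the name attribute in ifeng.py):

scrapy crawl ifeng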

Results: crawl output

[Screenshots of the crawl results: the generated category folders and saved article files]
Note: the HTML of the individual news pages is not uniform, and the parsing rules set up here are limited and do not cover every news link, so some of the folders/files end up empty.
