Financial news scraping with Scrapy in practice (Eastmoney 東方財富網)

Let's start with the BeautifulSoup version:

import requests
from bs4 import BeautifulSoup

link_head = 'http://finance.eastmoney.com/news/cywjh_'
link_end = '.html'
hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

# crawl the first 3 pages of the news list
for i in range(1, 4):
    link = link_head + str(i) + link_end
    r = requests.get(link, headers=hd)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    topic_list = soup.find_all('div', class_='text')
    for each in topic_list:
        # headline and article link
        title = each.find('p', class_='title')
        print(title.a.text.strip())
        print(each.a['href'])
        # article summary
        content = each.find('p', class_='info')
        print(content.text.strip())
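For reference, the list markup these selectors target looks roughly like this (a simplified sketch inferred from the code above, not the site's exact HTML):

<div class="text">
    <p class="title"><a href="http://finance.eastmoney.com/a/....html">headline</a></p>
    <p class="info">article summary ...</p>
</div>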

Now the Scrapy version (with the full procedure written up for future readers).

Open a command prompt (cmd) and cd into a directory of your choice; I used the desktop:

cd C:\Users\Heisenberg\Desktop

Then run:

scrapy startproject financeSpider

A financeSpider project folder now appears on the desktop.
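The generated layout is the standard Scrapy scaffold and should look roughly like this (middlewares.py may or may not be present depending on your Scrapy version):

financeSpider/
    scrapy.cfg
    financeSpider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py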

Next, open items.py and define the item fields. After editing, it looks like this:

import scrapy


class FinancespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()

Then enter the following in cmd:

scrapy genspider finance finance.eastmoney.com
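This creates a skeleton spiders/finance.py. Depending on your Scrapy version it looks roughly like this (note that start_urls initially points at the domain root; we change it below):

# -*- coding: utf-8 -*-
import scrapy


class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/']

    def parse(self, response):
        pass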

Edit the generated spider, finance.py, to do the page parsing. The modified code is as follows.

First, the plain (single-level) version:

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from financeSpider.items import FinancespiderItem
# import FinancespiderItem from this project's financeSpider.items

class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html']
    url_head = 'http://finance.eastmoney.com/news/cywjh_'
    url_end = '.html'

    def start_requests(self):
        # build the URLs of the first 3 list pages
        for i in range(1, 4):
            url = self.url_head + str(i) + self.url_end
            print('Current page:', url)
            # send a Request for each news list page
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all('div', class_='text')
        for i in range(len(title_list)):
            # wrap the data in a FinancespiderItem object (dict-like)
            item = FinancespiderItem()
            title = title_list[i].find('p', class_='title')
            title = title.a.text.strip()
            link = title_list[i].a['href']
            content = title_list[i].find('p', class_='info')
            content = content.text.strip()
            item['title'] = title
            item['link'] = link
            item['content'] = content
            yield item
            # in the parallel version, a Request is sent for each article link
            # instead, passing the item along via meta:
            # yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)

    # def parse2(self, response):
    #     # receive the item passed via meta
    #     item = response.meta['item']
    #     # parse and extract the article content
    #     soup = BeautifulSoup(response.text, "lxml")
    #     content = soup.find('p', class_='info')
    #     content = content.text.strip()
    #     content = content.replace('\n', " ")
    #     item['content'] = content
    #     # return the item to the item pipeline
    #     yield item

Now the parallel (two-level) version:

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from financeSpider.items import FinancespiderItem
# import FinancespiderItem from this project's financeSpider.items

class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html']
    url_head = 'http://finance.eastmoney.com/news/cywjh_'
    url_end = '.html'

    def start_requests(self):
        # build the URLs of the first 3 list pages
        for i in range(1, 4):
            url = self.url_head + str(i) + self.url_end
            print('Current page:', url)
            # send a Request for each news list page
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all('div', class_='text')
        for i in range(len(title_list)):
            # wrap the data in a FinancespiderItem object (dict-like)
            item = FinancespiderItem()
            title = title_list[i].find('p', class_='title')
            title = title.a.text.strip()
            link = title_list[i].a['href']
            item['title'] = title
            item['link'] = link
            # follow the article link, passing the item along with the request
            yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        # receive the item passed via meta
        item = response.meta['item']
        # parse and extract the article content
        soup = BeautifulSoup(response.text, "lxml")
        content = soup.find('div', class_='b-review')
        content = content.text.strip()
        item['content'] = content
        # return the item to the item pipeline
        yield item
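A side note on passing the item between callbacks: meta={'item': item} works fine, but on Scrapy 1.7 or newer, cb_kwargs is the recommended mechanism. A minimal sketch of the same hand-off:

# in parse(): pass the partially filled item as a keyword argument
yield scrapy.Request(url=link, cb_kwargs={'item': item}, callback=self.parse2)

# parse2 then receives it as a normal parameter instead of via response.meta
def parse2(self, response, item):
    ...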
        
        

Edit pipelines.py, where the data gets stored. After editing:

class FinancespiderPipeline(object):
    # remember to escape backslashes ('\') in Windows paths
    file_path = 'C:\\Users\\Heisenberg\\Desktop\\financeSpider\\result.txt'

    def __init__(self):
        self.article = open(self.file_path, "w", encoding="utf-8")

    # the pipeline's processing method, called once for every item yielded by the spider
    def process_item(self, item, spider):
        title = item['title']
        link = item['link']
        content = item['content']
        output = title + '\n' + link + '\n' + content + '\n'
        print('output:', output)
        self.article.write(output)
        return item

    # close the output file when the spider finishes
    def close_spider(self, spider):
        self.article.close()

Then be sure to uncomment ITEM_PIPELINES in settings.py.
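After uncommenting, the block in settings.py should look like this (the default priority value of 300 is fine here):

ITEM_PIPELINES = {
    'financeSpider.pipelines.FinancespiderPipeline': 300,
}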

Once all of the above is done, run this in cmd:

scrapy crawl finance
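(As an aside, if you only need the raw items rather than a custom text format, Scrapy's built-in feed export can replace the pipeline entirely, e.g. scrapy crawl finance -o result.json.)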

The results are shown below.

One thing to notice: in the parallel version the items are not written to the file in page order; the order is effectively random, determined by when each response happens to come back.
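If the output order matters to you, one option is to trade speed for order by limiting concurrency in settings.py, for example with the two settings below; even then, Scrapy's scheduler does not strictly guarantee ordering.

CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 0.25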

Parallel version output:

Plain version output:

Download link for the Scrapy project files: financeSpider
