First, let's look at the BeautifulSoup version:
import requests
from bs4 import BeautifulSoup

link_head = 'http://finance.eastmoney.com/news/cywjh_'
link_end = '.html'
hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}

for i in range(1, 4):
    link = link_head + str(i) + link_end
    r = requests.get(link, headers=hd)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, 'lxml')
    topic_list = soup.find_all('div', class_='text')
    for each in topic_list:
        title = each.find('p', class_='title')
        print(title.a.text.strip())
        print(each.a['href'])
        content = each.find('p', class_='info')
        print(content.text.strip())
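One caveat: find() returns None when a tag is missing, so a list entry that doesn't match the expected structure would crash the loop with an AttributeError. A minimal defensive variant of the inner loop (my own addition, not part of the original code):

for each in topic_list:
    title = each.find('p', class_='title')
    info = each.find('p', class_='info')
    if title is None or title.a is None or each.a is None or info is None:
        continue  # skip entries that don't match the expected structure
    print(title.a.text.strip())
    print(each.a['href'])
    print(info.text.strip())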
Scrapy version (a detailed step-by-step summary, to make it easier for others to learn)
Open a command prompt (cmd) and switch to a directory of your choice; I switch to the desktop:
cd C:\Users\Heisenberg\Desktop
Then run the command:
scrapy startproject financeSpider
The project folder then appears on the desktop.
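For reference, startproject generates the standard Scrapy layout, roughly like this (the exact files can vary slightly between Scrapy versions):

financeSpider/
    scrapy.cfg            # deploy/config file
    financeSpider/        # the project's Python package
        __init__.py
        items.py          # item definitions (edited below)
        middlewares.py
        pipelines.py      # item pipelines (edited below)
        settings.py       # project settings (edited below)
        spiders/          # spiders go here
            __init__.py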
Next, open items.py and define the fields; after editing it looks like this:
import scrapy


class FinancespiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    content = scrapy.Field()
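An Item behaves like a dict with a fixed set of keys, so assigning to a field you did not declare raises a KeyError, which catches typos early. A quick illustration (not part of the project code):

item = FinancespiderItem()
item['title'] = 'example'   # fine: 'title' is declared above
# item['titel'] = 'oops'    # KeyError: undeclared field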
Then, in cmd, enter:
scrapy genspider finance finance.eastmoney.com
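This creates financeSpider/spiders/finance.py with a skeleton roughly like the following (the exact boilerplate varies slightly between Scrapy versions; genspider fills in name, allowed_domains, and a start URL derived from the domain):

# -*- coding: utf-8 -*-
import scrapy


class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/']

    def parse(self, response):
        pass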
Edit the spider finance.py to do the page parsing; the modified code is below.
First, the plain (sequential) version:
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from financeSpider.items import FinancespiderItem
# import FinancespiderItem from this project's financeSpider.items


class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html']
    url_head = 'http://finance.eastmoney.com/news/cywjh_'
    url_end = '.html'

    def start_requests(self):
        # build the URLs of the first 3 list pages
        for i in range(1, 4):
            url = self.url_head + str(i) + self.url_end
            print('Current page:', url)
            # send a Request for each news-list page
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all('div', class_='text')
        for each in title_list:
            # wrap the data in a FinancespiderItem (a dict-like object)
            item = FinancespiderItem()
            title = each.find('p', class_='title')
            title = title.a.text.strip()
            link = each.a['href']
            content = each.find('p', class_='info')
            content = content.text.strip()
            item['title'] = title
            item['link'] = link
            item['content'] = content
            yield item
            # To fetch the full article instead, follow the link and pass the
            # item along via meta (this is what the parallel version does):
            # yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)

    # def parse2(self, response):
    #     # receive the item passed via meta
    #     item = response.meta['item']
    #     # parse and extract the article content
    #     soup = BeautifulSoup(response.text, 'lxml')
    #     content = soup.find('div', class_='b-review')
    #     content = content.text.strip()
    #     content = content.replace('\n', ' ')
    #     item['content'] = content
    #     # return the item to the item pipeline
    #     yield item
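To experiment with the parsing logic interactively before wiring it into the spider, Scrapy's shell is convenient (a general tip, not a step from the original write-up):

scrapy shell "http://finance.eastmoney.com/news/cywjh_1.html"

Inside the shell, response is already populated, so you can run the same BeautifulSoup(response.text, 'lxml') calls and inspect what the selectors return.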
Now for the parallel version:
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from financeSpider.items import FinancespiderItem
# import FinancespiderItem from this project's financeSpider.items


class FinanceSpider(scrapy.Spider):
    name = 'finance'
    allowed_domains = ['finance.eastmoney.com']
    start_urls = ['http://finance.eastmoney.com/news/cywjh_1.html']
    url_head = 'http://finance.eastmoney.com/news/cywjh_'
    url_end = '.html'

    def start_requests(self):
        # build the URLs of the first 3 list pages
        for i in range(1, 4):
            url = self.url_head + str(i) + self.url_end
            print('Current page:', url)
            # send a Request for each news-list page
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        title_list = soup.find_all('div', class_='text')
        for each in title_list:
            # wrap the data in a FinancespiderItem (a dict-like object)
            item = FinancespiderItem()
            title = each.find('p', class_='title')
            title = title.a.text.strip()
            link = each.a['href']
            item['title'] = title
            item['link'] = link
            # follow the article link, passing the half-filled item along via meta
            yield scrapy.Request(url=link, meta={'item': item}, callback=self.parse2)

    def parse2(self, response):
        # receive the item passed via meta
        item = response.meta['item']
        # parse and extract the article content
        soup = BeautifulSoup(response.text, 'lxml')
        content = soup.find('div', class_='b-review')
        content = content.text.strip()
        item['content'] = content
        # hand the finished item to the item pipeline
        yield item
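A side note on the meta hand-off: since Scrapy 1.7, cb_kwargs is the recommended way to pass values to a callback, because meta is also used internally by middlewares. A sketch of the same hand-off (my adaptation, not from the original post):

            yield scrapy.Request(url=link, cb_kwargs={'item': item}, callback=self.parse2)

    def parse2(self, response, item):
        # 'item' arrives as a keyword argument instead of via response.meta
        ...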
Next, edit the storage file pipelines.py; after modification:
class FinancespiderPipeline(object):
    # remember to escape backslashes '\' in Windows paths
    file_path = 'C:\\Users\\Heisenberg\\Desktop\\financeSpider\\result.txt'

    def __init__(self):
        self.article = open(self.file_path, 'w', encoding='utf-8')

    # the pipeline's processing method, called once per item
    def process_item(self, item, spider):
        title = item['title']
        link = item['link']
        content = item['content']
        output = title + '\n' + link + '\n' + content + '\n'
        self.article.write(output)
        return item
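One caveat: opening the file in __init__ means it is never explicitly closed. Scrapy pipelines provide open_spider/close_spider hooks for exactly this; a tidier variant (my suggestion, same behavior otherwise):

class FinancespiderPipeline(object):
    file_path = 'C:\\Users\\Heisenberg\\Desktop\\financeSpider\\result.txt'

    def open_spider(self, spider):
        # opened when the spider starts
        self.article = open(self.file_path, 'w', encoding='utf-8')

    def close_spider(self, spider):
        # closed (and flushed) when the spider finishes
        self.article.close()

    def process_item(self, item, spider):
        self.article.write(item['title'] + '\n' + item['link'] + '\n'
                           + item['content'] + '\n')
        return item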
Then be sure to uncomment ITEM_PIPELINES in settings.py, otherwise the pipeline will never run.
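The generated settings.py already contains the entry, commented out; after uncommenting it should look like this (the number is the pipeline's priority; lower values run first):

ITEM_PIPELINES = {
    'financeSpider.pipelines.FinancespiderPipeline': 300,
}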
Once all of the above is done, run this in cmd:
scrapy crawl finance
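As an aside, for quick experiments Scrapy's built-in feed export can dump items without any custom pipeline, e.g.:

scrapy crawl finance -o result.json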
The results are as follows.
One thing I noticed: in the parallel version, items are not written to the file in page order; the order is effectively random, determined by when each page happens to finish downloading.
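If order matters more than speed, concurrency can be throttled in settings.py; note that even then Scrapy does not strictly guarantee FIFO ordering, so treat this as a mitigation, not a fix:

CONCURRENT_REQUESTS = 1   # default is 16; this largely serializes downloads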
Parallel version: (screenshot of result.txt)
Plain version: (screenshot of result.txt)
Download link for the Scrapy project files: financeSpider