python爬蟲框架scrapy學習第八課
目標爬取文章,實現文本和圖片數據存儲
文本數據以json文件存儲
文本數據存儲在mongodb數據庫中
圖片保存在本地
爬取地址:伯樂在線文章
爬蟲實例
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class JobboleItem(scrapy.Item):
    """Container for the data scraped from a single article page.

    Every attribute is declared as a plain ``scrapy.Field()`` with no
    processors attached; the spider and the pipelines decide what is stored
    in each one.
    """

    title = scrapy.Field()             # article title
    create_date = scrapy.Field()       # publication date
    url = scrapy.Field()               # link to the article
    url_object_id = scrapy.Field()     # MD5 hex digest of the url
    front_image_url = scrapy.Field()   # url of the cover image
    front_image_path = scrapy.Field()  # storage path of the cover image
    praise_nums = scrapy.Field()       # number of up-votes
    fav_nums = scrapy.Field()          # number of bookmarks
    comment_nums = scrapy.Field()      # number of comments
    tag = scrapy.Field()               # tags
    # The article body ("content") field is intentionally not declared.
# -*- coding: utf-8 -*-
import scrapy
from urllib.parse import urljoin
from jobBole.items import JobboleItem
import re
import hashlib
import datetime
def get_md5(md5str):
    """Return the hexadecimal MD5 digest of *md5str*.

    The digest is used as a fixed-length fingerprint of a URL, not for any
    security purpose.

    :param md5str: text to hash. ``str`` values are encoded as UTF-8 first;
        ``bytes`` values are hashed as they are.
    :return: the 32-character lowercase hex digest.
    """
    # Only text needs encoding; bytes can be fed to hashlib directly.
    if isinstance(md5str, str):
        md5str = md5str.encode("utf-8")
    return hashlib.md5(md5str).hexdigest()
class BoleSpider(scrapy.Spider):
    """Crawl the article list pages and yield one ``JobboleItem`` per article.

    ``parse`` handles a list page: it schedules every article found on it
    and then the next list page. ``parse_detail`` handles an article page
    and builds the item.
    """

    name = 'bole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    # The next list page is followed only while the current page number is
    # below this value.
    max_pages = 3

    @staticmethod
    def _first_int(text, default=0):
        """Return the first run of digits in *text* as an int, else *default*."""
        found = re.search(r"\d+", text or "")
        return int(found.group()) if found else default

    def parse(self, response):
        """Schedule the articles of a list page, then the next list page.

        1. Collect the url of every article on the list page and hand it to
           Scrapy; the response is parsed by ``parse_detail``.
        2. Collect the url of the next list page and hand it to Scrapy; the
           response is parsed by this method again.

        :param response: response of a list page.
        :return: generator of ``scrapy.Request`` objects.
        """
        post_nodes = response.css("#archive .floated-thumb .post-thumb a")
        for post_node in post_nodes:
            image_url = post_node.css("img::attr(src)").extract_first("")
            post_url = post_node.css("::attr(href)").extract_first("")
            if not post_url:
                # Joining an empty href would just re-request the list page.
                continue
            # urljoin leaves absolute urls untouched and resolves relative
            # ones against the page url. An empty thumbnail url stays empty
            # instead of turning into the list page url.
            front_image_url = urljoin(response.url, image_url) if image_url else ""
            yield scrapy.Request(
                url=urljoin(response.url, post_url),
                meta={"front_image_url": front_image_url},
                callback=self.parse_detail,
            )

        next_url = response.css(".next.page-numbers::attr(href)").extract_first("")
        curr_page = self._first_int(
            response.xpath('//span[@class="page-numbers current"]/text()').extract_first(""),
            default=None,
        )
        # When the current page number cannot be read, stop paginating rather
        # than crawl without a limit.
        if next_url and curr_page is not None and curr_page < self.max_pages:
            yield scrapy.Request(url=urljoin(response.url, next_url), callback=self.parse)

    def parse_detail(self, response):
        """Extract the fields of one article page into a ``JobboleItem``.

        :param response: response of an article page; ``response.meta`` may
            carry ``front_image_url`` set by ``parse``.
        :return: generator yielding a single ``JobboleItem``.
        """
        front_image_url = response.meta.get("front_image_url", "")
        title = response.xpath('//div[@class="entry-header"]/h1/text()').extract_first()

        # The first whitespace-separated token of the meta paragraph is
        # taken as the date.
        meta_text = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract_first("")
        meta_tokens = meta_text.split()
        raw_date = meta_tokens[0] if meta_tokens else ""
        try:
            create_date = datetime.datetime.strptime(raw_date, '%Y/%m/%d').date()
        except ValueError:
            # Missing or unparsable date: fall back to today.
            create_date = datetime.datetime.now().date()

        # Links containing "評論" are dropped from the tag list.
        tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
        tag = ",".join(element for element in tag_list if "評論" not in element)

        # Each counter is the first integer in the node text, 0 when absent.
        praise_nums = self._first_int(
            response.xpath('//span[contains(@class, "vote-post-up")]/h10/text()').extract_first("")
        )
        fav_nums = self._first_int(
            response.xpath('//span[contains(@class, "bookmark-btn")]/text()').extract_first("")
        )
        comment_nums = self._first_int(
            response.xpath('//a[@href="#article-comment"]/span/text()').extract_first("")
        )

        article_item = JobboleItem()
        # MD5 turns the url into a fixed-length identifier.
        article_item['url_object_id'] = get_md5(response.url)
        article_item['title'] = title
        article_item['url'] = response.url
        article_item['create_date'] = str(create_date)
        # Stored as a list because the image pipeline iterates over it.
        article_item['front_image_url'] = [front_image_url] if front_image_url else []
        article_item['praise_nums'] = praise_nums
        article_item['fav_nums'] = fav_nums
        article_item['comment_nums'] = comment_nums
        article_item['tag'] = tag
        yield article_item
# Pipelines enabled for the project, keyed by import path. The integer is
# the pipeline's order value (per the Scrapy docs, lower values run first).
ITEM_PIPELINES = {
    'jobBole.pipelines.JobbolePipeline': 300,
    'jobBole.pipelines.ArticleImagePipeline': 301,
    'jobBole.pipelines.MongoDBTwistedPipline': 302,
}

# Directory where downloaded images are stored. Raw string so that the
# backslashes of the Windows path are never read as escape sequences.
IMAGES_STORE = r'D:\SunWork\python\jobBole'

# MongoDB connection settings read by the MongoDB pipeline.
MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'bole'
MONGODB_SHEETNAME = 'bolePaper'
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
from scrapy.pipelines.images import ImagesPipeline
import codecs
import json
import pymongo
from scrapy.conf import settings
class JobbolePipeline(object):
    """Write every item to ``article.json`` as one JSON object per line."""

    def __init__(self):
        # The file is created (or truncated) in the current working
        # directory. newline="\n" writes line endings exactly as given,
        # without platform translation.
        self.file = open("article.json", "w", encoding="utf-8", newline="\n")

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and pass it on unchanged.

        :param item: the scraped item; converted with ``dict(item)``.
        :param spider: the spider that produced the item (unused).
        :return: the same *item*, for the next pipeline.
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        """Close the output file.

        ``close_spider`` is the hook name Scrapy calls on item pipelines
        when the spider finishes.
        """
        self.file.close()

    # Old method name kept so existing callers or signal hookups still work.
    spider_closed = close_spider
class ArticleImagePipeline(ImagesPipeline):
    """Download the cover image of an item and record where it was stored."""

    def get_media_requests(self, item, info):
        """Yield one download request per non-empty url in ``front_image_url``.

        :param item: the scraped item; ``front_image_url`` is expected to be
            a list of urls and may be missing or empty.
        :param info: pipeline bookkeeping object supplied by the caller
            (unused here).
        """
        for image_url in item.get('front_image_url') or []:
            # An empty url cannot be turned into a request.
            if image_url:
                yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Store the path of the first successful download on the item.

        :param results: iterable of ``(ok, value)`` pairs; for a successful
            download ``value['path']`` holds the stored file path.
        :param item: the item the downloads belong to.
        :param info: pipeline bookkeeping object supplied by the caller
            (unused here).
        :return: *item* with ``front_image_path`` always set, to "" when no
            download succeeded.
        """
        stored_paths = [value['path'] for ok, value in results if ok]
        item['front_image_path'] = stored_paths[0] if stored_paths else ""
        return item
class MongoDBTwistedPipline(object):
    """Store every item as a document in a MongoDB collection.

    Connection parameters are read from the ``MONGODB_*`` settings. The
    insert is performed directly inside ``process_item``.
    """

    def __init__(self):
        host = settings["MONGODB_HOST"]            # server host
        port = settings["MONGODB_PORT"]            # server port
        dbname = settings["MONGODB_DBNAME"]        # database name
        sheetname = settings["MONGODB_SHEETNAME"]  # collection name
        # The client is kept on the instance so it can be closed later.
        self.client = pymongo.MongoClient(host=host, port=port)
        mydb = self.client[dbname]
        self.post = mydb[sheetname]

    def process_item(self, item, spider):
        """Insert *item* into the collection and pass it on unchanged.

        :param item: the scraped item; converted with ``dict(item)``.
        :param spider: the spider that produced the item (unused).
        :return: the same *item*, for the next pipeline.
        """
        data = dict(item)
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        self.post.insert_one(data)
        return item

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.client.close()
注:代碼來源https://www.cnblogs.com/zhaof/p/7173094.html。在此基礎上進行部分修改。