from AticleSpider.items import JobBoleAricleItem, ArticleItemLoader
如果不使用 item_loader，則需要手動提取每個字段並填充 item：
def parse_detail(self, response):
    """Build a JobBoleAricleItem from an article page without an ItemLoader.

    Each field is pulled out of ``response`` with an explicit XPath query,
    cleaned up by hand and stored on the item.

    Args:
        response: response for the article detail page. The cover image URL
            is read from ``response.meta["front_image_url"]`` (empty string
            when the key is absent).

    Raises:
        IndexError: if one of the XPath queries matches nothing, because the
            first match is taken with ``extract()[0]``.
    """

    def _first_int(text):
        # Take the first run of digits. The earlier pattern r'.*(\d+).*' was
        # greedy, so the leading ``.*`` swallowed every digit but the last
        # one and "12" was read as 2.
        found = re.search(r'\d+', text)
        return int(found.group()) if found else 0

    article_item = JobBoleAricleItem()
    # Cover image URL; it was referenced below without ever being defined.
    front_image_url = response.meta.get("front_image_url", "")

    # Title
    title = response.xpath('//div[@class="entry-header"]/h1/text()').extract()[0]
    # Publication date text, trimmed and with the " ·" separator removed
    create_date = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/text()').extract()[0].strip().replace(' ·', "")
    # Number of up-votes (kept as the extracted text)
    praise_nums = response.xpath('//span[contains(@class,"vote-post-up")]/h10/text()').extract()[0]
    # Number of bookmarks
    fav_nums = _first_int(response.xpath('//span[contains(@class,"bookmark-btn")]/text()').extract()[0])
    # Number of comments
    comment_nums = _first_int(response.xpath('//a[@href="#article-comment"]/span/text()').extract()[0])
    # Article body
    content = response.xpath('//div[@class="entry"]').extract()[0]
    # Tags: drop the link whose text is the comment counter
    tag_list = response.xpath('//p[@class="entry-meta-hide-on-mobile"]/a/text()').extract()
    tag_list = [element for element in tag_list if not element.strip().endswith("評論")]
    tags = ",".join(tag_list)

    # Parse the date text. strptime (string -> datetime) is required here;
    # strftime on a string always raised and forced the fallback branch.
    try:
        create_date = datetime.datetime.strptime(create_date, "%Y/%m/%d").date()
    except ValueError:
        # Unparseable text: fall back to today, as a date like the success path.
        create_date = datetime.datetime.now().date()

    # Fill in the item
    article_item["title"] = title
    article_item["url"] = response.url
    article_item["url_object_id"] = common.get_md5(response.url)
    article_item["create_date"] = create_date
    article_item["front_image_url"] = [front_image_url]
    article_item["praise_nums"] = praise_nums
    article_item["fav_nums"] = fav_nums
    article_item["comment_nums"] = comment_nums
    article_item["tags"] = tags
    article_item["content"] = content
    # NOTE(review): the item is built but never yielded or returned here, so
    # nothing leaves this callback -- confirm whether a ``yield article_item``
    # was lost from this snippet.
item 會傳到 pipelines 中去。
# Load the item through item_loader
front_image_url = response.meta.get("front_image_url", "") # article cover image
利用 item_loader。
item_loader 有三個方法：
- add_css()
- add_xpath()
- add_value()
def parse_detail(self, response):
    """Populate a JobBoleAricleItem through ArticleItemLoader and yield it.

    The page URL, its MD5 digest and the cover image URL are added as plain
    values; every other field is registered as an XPath query that the
    loader evaluates against ``response``.

    Args:
        response: response for the article detail page. The cover image URL
            is read from ``response.meta["front_image_url"]`` (empty string
            when the key is absent).

    Yields:
        The item produced by ``load_item()``.
    """
    # Cover image of the article
    cover_url = response.meta.get("front_image_url", "")
    loader = ArticleItemLoader(item=JobBoleAricleItem(), response=response)

    loader.add_value('url', response.url)
    loader.add_value('url_object_id', common.get_md5(response.url))

    # Fields registered before the cover image, in their original order
    for field_name, query in (
        ('title', '//div[@class="entry-header"]/h1/text()'),
        ('create_date', '//p[@class="entry-meta-hide-on-mobile"]/text()'),
    ):
        loader.add_xpath(field_name, query)

    loader.add_value('front_image_url', [cover_url])

    # Remaining XPath-backed fields
    for field_name, query in (
        ('praise_nums', '//span[contains(@class,"vote-post-up")]/h10/text()'),
        ('fav_nums', '//span[contains(@class,"bookmark-btn")]/text()'),
        ('comment_nums', '//a[@href="#article-comment"]/span/text()'),
        ('content', '//div[@class="entry"]'),
        ('tags', '//p[@class="entry-meta-hide-on-mobile"]/a/text()'),
    ):
        loader.add_xpath(field_name, query)

    # With the stock ItemLoader every loaded value would come back as a list
    yield loader.load_item()
但是單純通過 scrapy 提供的 item_loader 提取出來的數據都是 list 格式的，並不是我們需要的。
所以我們需要自定義 item_loader，並在我們的 item 類中處理出我們需要的值，通過使用：
from scrapy.loader.processors import MapCompose, TakeFirst, Join
MapCompose() 可傳遞多個函數來依次處理值，然後返回結果
def data_convert(value):
    """Convert scraped date text to a ``datetime.date``.

    The text is trimmed and the "·" separator removed before parsing, so a
    raw value such as ``"  2017/03/18 ·  "`` is accepted. Without that
    clean-up ``strptime`` rejected such text and the fallback was returned.

    Args:
        value: date text in ``%Y/%m/%d`` form, optionally surrounded by
            whitespace and a "·" separator.

    Returns:
        The parsed date, or today's date when ``value`` is not a string or
        does not match the expected format.
    """
    try:
        cleaned = value.strip().replace("·", "").strip()
        return datetime.datetime.strptime(cleaned, "%Y/%m/%d").date()
    except (AttributeError, TypeError, ValueError):
        # Non-string input or unparseable text: keep the best-effort fallback.
        return datetime.datetime.now().date()
def get_nums(values):
    """Return the first run of digits found in *values* as an int.

    The earlier pattern ``r'.*(\d+).*'`` was greedy: the leading ``.*``
    consumed every digit except the last, so ``"12"`` was returned as ``2``.
    Searching for ``\d+`` directly captures the whole number.

    Args:
        values: text that may contain a number, e.g. a counter label.

    Returns:
        The number as an int, or 0 when the text contains no digits.
    """
    found = re.search(r'\d+', values)
    if found is None:
        return 0
    return int(found.group())
def remove_comment_tags(value):
    """Blank out a tag entry that is really the comment counter.

    Args:
        value: one tag text extracted from the page.

    Returns:
        An empty string when *value* contains "評論", otherwise *value*
        unchanged.
    """
    return "" if "評論" in value else value


# Custom item_loader
def return_value(value):
    """Identity function: hand *value* back exactly as received."""
    return value
class ArticleItemLoader(ItemLoader):
    """ItemLoader subclass whose default output processor is ``TakeFirst()``."""

    # Default output processor used for fields that do not declare their own
    default_output_processor = TakeFirst()
class JobBoleAricleItem(scrapy.Item):
    """Item holding the fields scraped from one article page.

    Fields declare their own input/output processors where the raw extracted
    text needs converting.
    """

    # Pre-processing hook; MapCompose() accepts any functions, none are
    # supplied here.
    title = scrapy.Field(
        input_processor=MapCompose()
    )
    # Date text is converted by data_convert
    create_date = scrapy.Field(
        input_processor=MapCompose(data_convert),
    )
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    # A field-level output processor that returns the value as-is: the
    # loader's default output processor is not applied and the original
    # value is not modified.
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    # Counters are reduced to integers by get_nums
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    # The comment counter previously had no processor and stayed as raw
    # text; it now goes through get_nums like the other counters.
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums),
    )
    # Tags: the comment-counter entry is blanked, then the rest is joined
    # with commas.
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()