目錄結構
articleSpider.py
from scrapy.selector import Selector
from scrapy import Spider
from firstscrapy.items import FirstscrapyItem
class ArticleSpider(Spider):
    """Crawl a CSDN blogger's article list page.

    Yields one FirstscrapyItem carrying the cleaned article titles,
    URLs, read counts and publish times extracted from the page.
    """

    name = "csdn"
    # allowed_domains must hold bare domain names, not full URLs;
    # a scheme-prefixed URL here makes the offsite middleware drop
    # every request for this domain.
    allowed_domains = ["blog.csdn.net"]
    start_urls = ["https://blog.csdn.net/wei_zhen_dong"]

    def parse(self, response):
        # XPath copied from the Chrome dev tools for the article container.
        papers = response.xpath("//*[@id='mainBox']/main/div[2]/div[1]")
        for pap in papers:
            # Queries must start with ".//" to be relative to the selected
            # node; a bare "//" searches the whole document on every
            # iteration, returning the same global result each time.
            title = pap.xpath(".//h4/a/text()").re(".*")
            url = pap.xpath(".//h4/a/@href").extract()
            read_num = pap.xpath(".//div[1]/p[3]/span/span/text()").extract()
            time = pap.xpath(".//div[1]/p[1]/span/text()").extract()

            # Light cleaning: drop whitespace-only title fragments
            # produced by the .re('.*') extraction.
            title1 = [t for t in title if t.replace(" ", "") != ""]
            # The first two time entries are page-layout artifacts; skip them.
            time1 = time[2:]

            yield FirstscrapyItem(title=title1, url=url,
                                  read_num=read_num, time=time1)
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class FirstscrapyItem(scrapy.Item):
    """Item holding the data scraped from one CSDN article-list page."""

    # Fields mirror what ArticleSpider.parse extracts.
    title = scrapy.Field()     # cleaned article titles
    url = scrapy.Field()       # article URLs
    read_num = scrapy.Field()  # read counts
    time = scrapy.Field()      # publish times
    # (removed a dead trailing `pass`; the class body is non-empty)
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from opdata.opexcel import Operatingexcel
class FirstscrapyPipeline(object):
    """Pipeline that persists scraped items into an Excel workbook."""

    def __init__(self):
        # Helper that writes dict-shaped data into .xlsx files.
        self.op = Operatingexcel()

    def process_item(self, item, spider):
        """Write the item to the workbook, or drop it when it has no titles.

        Scrapy expects process_item to return the item or raise DropItem.
        The original returned a plain string on failure, which scrapy would
        forward to later pipelines as if it were the item.
        """
        if item['title']:
            # Forward slash works on every OS and avoids accidental
            # backslash escape sequences in the path literal.
            self.op.set_excel_dic(item, "data/csdn_data.xlsx", 0, 0)
            return item
        raise DropItem("item has no title data")
settings.py
在 settings.py 中加入以下代碼,用以啓用 pipelines
# Register the Excel-export pipeline. The integer (0-1000) is the
# execution order relative to other pipelines; lower values run first.
ITEM_PIPELINES = {"firstscrapy.pipelines.FirstscrapyPipeline": 300}
opexcel.py
自己寫的操作excel的類,只有兩個功能,有時間應該完善一下,我用的挺方便的,哈哈哈…
import xlrd
import xlwt
from xlutils.copy import copy
class Operatingexcel():
    """Minimal Excel helper: read a sheet as a dict or flat list, and
    write a dict or list back column-by-column."""

    def get_excel_dic(self, filename, sheetname):
        """Read worksheet *sheetname* of *filename* into a dict.

        Row 0 supplies the keys; every later non-empty cell is appended
        to the list stored under its column's header.
        """
        dic = {}
        # xlrd's second positional parameter is `logfile`, not a mode
        # string, so 'r' must not be passed there.
        data = xlrd.open_workbook(filename, encoding_override='utf-8')
        table = data.sheet_by_name(sheetname)
        header = table.row_values(0)
        for i in range(1, table.nrows):
            row = table.row_values(i)
            for y in range(len(header)):
                if row[y] != "":
                    dic.setdefault(header[y], []).append(row[y])
        return dic

    def get_excel_list(self, filename, sheetname):
        """Read every non-empty cell of *sheetname* into a flat list,
        in row-major order, limited to the width of the header row."""
        result = []
        data = xlrd.open_workbook(filename, encoding_override='utf-8')
        table = data.sheet_by_name(sheetname)
        width = len(table.row_values(0))
        for y in range(table.nrows):
            row = table.row_values(y)
            for x in range(width):
                if row[x] != "":
                    result.append(row[x])
        return result

    def set_excel_dic(self, dic, filename, sheet_index, start_r):
        """Write *dic* into *filename*, one column per key.

        Each column holds the key in row 0 followed by its values;
        sheet_index selects the worksheet and start_r the first column.
        """
        x = start_r
        for k in dic.keys():
            column = [k] + [v for v in dic[k]]
            self.set_excel_list(column, filename, sheet_index, x)
            x = x + 1

    def set_excel_list(self, list, filename, sheet_index, start_r):
        """Write the values in *list* down column *start_r* of worksheet
        *sheet_index*, skipping values that are blank after stripping.

        The workbook is read with xlrd, copied into a writable xlwt
        workbook, modified, and saved back to the same file.
        (Parameter name `list` shadows the builtin but is kept for
        backward compatibility with keyword callers.)
        """
        r_xls = xlrd.open_workbook(filename)
        # Convert the read-only xlrd workbook into a writable xlwt one.
        excel = copy(r_xls)
        table = excel.get_sheet(sheet_index)
        for y in range(len(list)):
            # Bug fix: the original called .split(), which returns a
            # token *list* — that never equals "" (so the guard was
            # always true) and a list, not a string, was written to
            # each cell. .strip() is the intended call.
            text = str(list[y]).strip()
            if text != "":
                table.write(y, start_r, text)
        excel.save(filename)
運行結果
評論和點讚的數據幾乎沒有,這裏就不進行獲取了…