目標:爬我本人的csdn博客的文章、鏈接等。
首先創建好爬蟲項目
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class MywebItem(scrapy.Item):
    """Container for the metadata scraped from one CSDN article-list page.

    Each field holds a parallel list (one entry per article on the page);
    the MySQL pipeline zips them together by index.
    """

    title = scrapy.Field()        # article titles (raw text nodes)
    articletype = scrapy.Field()  # article category label, e.g. "原创"
    date = scrapy.Field()         # publication dates
    Link = scrapy.Field()         # absolute URLs of the articles
    read = scrapy.Field()         # read counts (declared but not yet populated)
    comment = scrapy.Field()      # comment counts (declared but not yet populated)
創建模板文件 myspider.py
myspider.py
# -*- coding: utf-8 -*-
import scrapy
from myweb.items import MywebItem
from scrapy.http import Request
class MyspiderSpider(scrapy.Spider):
    """Crawl my CSDN blog's article-list pages and yield one MywebItem per page.

    Each yielded item carries parallel lists (title / articletype / date / Link)
    extracted from a single list page; the downstream pipeline pairs the
    entries up by index.
    """

    name = 'myspider'
    allowed_domains = ['blog.csdn.net']
    # CSDN rejects Scrapy's default User-Agent, so every request sends a
    # browser UA explicitly.
    _HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}
    # List pages are numbered 1..last_page. The original code hard-coded
    # pages 1 and 2 only, so 2 remains the default; raise it to crawl more.
    last_page = 2
    _URL_TMPL = 'https://blog.csdn.net/weixin_43614688/article/list/{}?'
    start_urls = ['https://blog.csdn.net/weixin_43614688/article/list/1?']

    def start_requests(self):
        # Overridden so the very first request already carries the browser UA.
        yield Request(self._URL_TMPL.format(1), headers=self._HEADERS)

    def parse(self, response):
        item = MywebItem()
        item['title'] = response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]//h4[@class=""]/a/text()').extract()
        item['articletype'] = response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]//h4[@class=""]/a/span/text()').extract()
        item['date'] = response.xpath('//div[@class="info-box d-flex align-content-center"]//span[@class="date"]/text()').extract()
        item['Link'] = response.xpath('//div[@class="article-item-box csdn-tracking-statistics"]//h4[@class=""]/a/@href').extract()
        yield item
        # Follow the next list page instead of re-yielding a hard-coded page-2
        # request from every response (the original relied on Scrapy's dupe
        # filter to stop that loop).
        page = int(response.url.rstrip('?').rsplit('/', 1)[-1])
        if page < self.last_page:
            yield Request(self._URL_TMPL.format(page + 1),
                          callback=self.parse,
                          headers=self._HEADERS)
# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
class MywebPipeline(object):
    """Persist scraped CSDN article metadata into the MySQL table `bank.myblog`."""

    def __init__(self):
        # NOTE(review): credentials are hard-coded; consider moving them to
        # settings.py if this pipeline is ever reused.
        self.conn = pymysql.connect(host='localhost', user='root', passwd='lkm', db='bank')
        self.cursor = self.conn.cursor(cursor=pymysql.cursors.DictCursor)

    def process_item(self, item, spider):
        """Insert one row per scraped link; returns the item unchanged.

        The title xpath in the spider yields two text nodes per article
        (leading whitespace + the real title), hence the ``2*j + 1`` index —
        TODO confirm against the spider's extraction.
        """
        # Parameterized query: the original built SQL by string concatenation,
        # which is open to SQL injection and crashes on titles containing
        # quotes (the very failure described in the debugging notes below).
        sql = "insert into myblog(title, articletype, date, link) values(%s, %s, %s, %s)"
        for j in range(len(item["Link"])):
            self.cursor.execute(sql, (
                item['title'][2 * j + 1],
                item['articletype'][j],
                item['date'][j],
                item['Link'][j],
            ))
        return item

    def close_spider(self, spider):
        # Commit once at shutdown (original behaviour), then release both the
        # cursor and the connection — the original leaked the connection.
        self.conn.commit()
        self.cursor.close()
        self.conn.close()
在 settings.py 文件中進行相應的配置
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the MySQL pipeline; 300 is its execution-order priority (0-1000,
# lower runs first).
ITEM_PIPELINES = {
    'myweb.pipelines.MywebPipeline': 300,
}
mysql中創建好數據表之後就可以開始運行程序了。
調試:
在調試時可能會遇到一個比較隱晦的報錯信息, ERROR: Error processing, 這時只要按照下面的提示找出報錯位置進行檢查即可。
因爲創建表時title字段的定義是varchar(30),有的title超過了這個長度,導致運行到一半程序終止,這時需要修改定義:
alter table myblog modify title varchar(100);
在排除錯誤之後,程序正常運行
select * from myblog;