First, create the project; for the project setup steps, see my earlier post: https://blog.csdn.net/zhouchen1998/article/details/81328858
One point worth stressing: Scrapy does asynchronous I/O and handles many HTTP requests at once, so writing every chapter into a single txt file in chapter order is possible but somewhat fiddly. Since the goal here is just to get familiar with Scrapy and its crawling ability, I simply save each chapter as its own txt file (there are of course many ways to merge them afterwards; one is sketched below).
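As an illustration only, a minimal merge script might look like the following. It assumes the chapter files are named after their titles and that each title carries a number such as 第12章; that naming pattern is an assumption about this particular novel, so adjust the regex for other books.

import os
import re

STORE = r"D:\聖墟"  # the same directory the pipeline writes to

def chapter_key(filename):
    # pull the chapter number out of names like "第12章xxx.txt";
    # files without a recognizable number sort last
    m = re.search(r"第(\d+)章", filename)
    return int(m.group(1)) if m else float("inf")

# concatenate the chapters in numeric order into one book file;
# the encoding must match whatever the pipeline wrote with
with open(os.path.join(STORE, "book.txt"), "w", encoding="utf-8") as book:
    for name in sorted(os.listdir(STORE), key=chapter_key):
        if name.endswith(".txt") and name != "book.txt":
            with open(os.path.join(STORE, name), encoding="utf-8") as chapter:
                book.write(chapter.read() + "\n")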
Open the project in PyCharm and define a spider, TTSpider, inheriting from scrapy.Spider. The item was fleshed out gradually while writing the spider; the final set of fields is shown below.
items.py
import scrapy

class TiantianshuwuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # link of each chapter
    link_url = scrapy.Field()
    # title of each chapter
    dir_name = scrapy.Field()
    # text content of each chapter
    dir_content = scrapy.Field()
    # js file that holds each chapter's content
    content_js_url = scrapy.Field()
settings.py
BOT_NAME = 'tiantianshuwu'
SPIDER_MODULES = ['tiantianshuwu.spiders']
NEWSPIDER_MODULE = 'tiantianshuwu.spiders'
ITEM_PIPELINES = {
'tiantianshuwu.pipelines.TiantianshuwuPipeline': 300,
}
# Directory where each chapter's txt file is stored
STORE = r"D:\聖墟"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0.25
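DOWNLOAD_DELAY = 0.25 is a fixed pause between requests. As an optional alternative (not used in this project), Scrapy's AutoThrottle extension can adapt the delay to the server's response times; enabling it in settings.py would look like this:

# Optional alternative: let Scrapy adjust the delay dynamically
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0.25        # initial delay
AUTOTHROTTLE_MAX_DELAY = 10.0          # ceiling under high latency
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average parallel requests per server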
Now for the spider itself (detailed comments inline).
TTSpider.py
import scrapy
from tiantianshuwu.items import TiantianshuwuItem

class TTSpider(scrapy.Spider):
    name = "tianshu"

    def __init__(self):
        super().__init__()
        # base URL prepended to every relative chapter link
        self.server_link = 'http://www.ttshu.com'
        # restrict crawling to this domain
        self.allowed_domains = ['www.ttshu.com']
        # starting URL (the novel's table of contents)
        self.start_url = "http://www.ttshu.com/html/content/18424482.html"

    def start_requests(self):
        yield scrapy.Request(url=self.start_url, callback=self.parse1)

    # parse out the link of every chapter
    def parse1(self, response):
        items = []
        # locate the <a> tags that hold the chapter links; their href
        # attributes form a list of relative URLs, one per chapter
        chapter_urls = response.xpath(r'//td[@bgcolor="#F6F6F6"]/a/@href').extract()
        # wrap each link in an item
        for index in range(len(chapter_urls)):
            item = TiantianshuwuItem()
            item["link_url"] = self.server_link + chapter_urls[index]
            items.append(item)
        # issue a request for each chapter link
        for item in items:
            yield scrapy.Request(url=item['link_url'], meta={"data": item}, callback=self.parse2)

    def parse2(self, response):
        # recover the item passed along in meta
        item = response.meta['data']
        # the chapter title sits in the <h1> tag
        item['dir_name'] = response.xpath(r'//h1/text()').extract()[0]
        # location of the js file that carries the chapter text
        item['content_js_url'] = self.server_link + response.xpath(r'//p/script/@src').extract()[0]
        # request the js file
        yield scrapy.Request(url=item['content_js_url'], meta={"data": item}, callback=self.parse3)

    # strip the decoded js string of its document.write() wrapper and of
    # the html tags/entities standing in for plain characters
    def solve_text(self, content):
        content = content.replace("document.write('", "")
        content = content.replace("' ;", "")
        content = content.replace(")", " ")
        content = content.replace("</br>", "\n")
        content = content.replace("<br />", "\n")
        content = content.replace("<br><br>", "\n")
        # "&nbsp;" entities become plain spaces
        content = content.replace("&nbsp;", " ")
        return content

    def parse3(self, response):
        item = response.meta["data"]
        # decode the js body (the site serves gb2312) and clean up the text
        item['dir_content'] = self.solve_text(response.body.decode('gb2312', 'ignore'))
        yield item
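To make the cleanup concrete, here is what solve_text does to a made-up payload of the kind those js files contain (the payload is invented for illustration; real files are much longer):

raw = "document.write('第一章&nbsp;荒村<br /><br />黑夜來臨。' ;"
print(TTSpider().solve_text(raw))
# prints:
# 第一章 荒村
#
# 黑夜來臨。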
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from tiantianshuwu import settings
import os

class TiantianshuwuPipeline(object):
    def process_item(self, item, spider):
        # create the storage directory if it does not exist yet
        if not os.path.exists(settings.STORE):
            os.makedirs(settings.STORE)
        # write each chapter to its own txt file
        with open(settings.STORE + '\\' + item['dir_name'].strip() + ".txt", 'w') as f:
            f.write(item['dir_content'])
        return item
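Importing the settings module directly works, but the idiomatic way to hand a setting to a pipeline is Scrapy's from_crawler hook. A minimal sketch of that variant (same behaviour, just different wiring):

import os

class TiantianshuwuPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # read STORE from the project settings instead of importing the module
        return cls(store=crawler.settings.get('STORE'))

    def __init__(self, store):
        self.store = store

    def process_item(self, item, spider):
        os.makedirs(self.store, exist_ok=True)
        with open(os.path.join(self.store, item['dir_name'].strip() + ".txt"), 'w') as f:
            f.write(item['dir_content'])
        return item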
As the screenshot shows, all 1161 chapters were crawled successfully.
Opening one of the files, the text is readable, but some characters are still not handled correctly; a possible fix is sketched below.
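One likely cause, though I have not verified it against this site, is encoding: gb2312 lacks many characters that its superset gbk covers, and open() without an explicit encoding falls back to the Windows locale codec. Two small changes would rule that out (file_path below is shorthand for the txt path the pipeline builds):

# in parse3: gbk is a superset of gb2312, so it decodes more characters
item['dir_content'] = self.solve_text(response.body.decode('gbk', 'ignore'))

# in the pipeline: write utf-8 explicitly instead of the locale default
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(item['dir_content'])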