CentOS 7 安裝 Scrapy
yum install gcc libffi-devel openssl-devel libxml2 libxslt-devel libxml2-devel python-devel -y
安裝lxml,再安裝scrapy
pip install lxml
pip install scrapy
Scrapy是一個爲了爬取網站數據,提取結構性數據而編寫的應用框架。 可以應用在包括數據挖掘,信息處理或存儲歷史數據等一系列的程序中。
一、新建一個項目。
scrapy startproject quote
scrapy genspider quotes quotes.toscrape.com
scrapy crawl quotes
將parse中的pass修改。就可以執行操作了。
class QuotesSpider(scrapy.Spider):
    """First, minimal version of the spider: fetch the start page and
    dump its raw HTML so we can confirm the site is reachable."""

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # No extraction yet — just show the downloaded page source.
        print(response.text)
就可以打印抓取網站的源代碼。
二、用 scrapy shell quotes.toscrape.com 調試選擇器。
In [1]: response
Out[1]: <200 http://quotes.toscrape.com>
In [2]: quotes = response.css('.quote')
In [4]: quotes[0]
Out[4]: <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype="h'>
查看quotes的內容
In [6]: quotes[0].css('.text::text').extract()
Out[6]: ['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
取出所有的tag
In [13]: quotes[0].css('.tags .tag::text').extract()
Out[13]: ['change', 'deep-thoughts', 'thinking', 'world']
注:`::text` 是 Scrapy 對 CSS 選擇器的擴展語法,表示取該節點下的文本內容,這是 Scrapy 特有的。`extract_first()` 返回第一個結果,而 `extract()` 返回全部結果的列表。
三、開始抓取。
首先修改 items.py 的QuoteItem類。這個類有例子。將想要的內容聲明下。class QuoteItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
text = scrapy.Field()
author = scrapy.Field()
tags = scrapy.Field()
再修改 quotes.py,利用剛纔在 shell 裏演示的選擇器,把想要的內容抓取下來。
def parse(self, response):
    """Yield one QuoteItem per quote block found on the page."""
    for quote in response.css('.quote'):
        item = QuoteItem()
        # Single values use extract_first(); the tag list keeps extract().
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item
這樣就可以把每條名言的內容、作者、標籤都拿到手了。
# -*- coding: utf-8 -*-
import scrapy
from quote.items import QuoteItem
class QuotesSpider(scrapy.Spider):
    """Spider for quotes.toscrape.com: yields a QuoteItem per quote and
    follows the pagination link until the last page."""

    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        """Extract every quote on the page, then schedule the next page.

        Yields QuoteItem instances and, when a next-page link exists,
        one scrapy.Request back into this same callback.
        """
        for quote in response.css('.quote'):
            item = QuoteItem()
            item['text'] = quote.css('.text::text').extract_first()
            item['author'] = quote.css('.author::text').extract_first()
            item['tags'] = quote.css('.tags .tag::text').extract()
            yield item

        # Bug fix: on the last page '.pager .next a' matches nothing, so
        # extract_first() returns None and response.urljoin(None) raises,
        # killing the crawl.  Only follow the link when it is present.
        # (Also renamed the local from 'next' to avoid shadowing the builtin.)
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(url=response.urljoin(next_page), callback=self.parse)
MONGO_DB = 'quotes'

# Enable both pipelines; the lower number runs first, so items are
# length-limited by QuotePipeline before MongoPipeline stores them.
# NOTE(review): the original post's "ITEM_PIPELINES = {" line was lost
# in extraction and is restored here.  MongoPipeline also reads a
# MONGO_URI setting — confirm it is defined in settings.py as well.
ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,
    'quote.pipelines.MongoPipeline': 400,
}
# -*- coding: utf-8 -*-
import pymongo
from scrapy.exceptions import DropItem
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
class QuotePipeline(object):
    """Pipeline that truncates over-long quote text and drops empty items."""

    def __init__(self):
        # Maximum number of characters kept from the quote text.
        self.limit = 150

    def process_item(self, item, spider):
        """Truncate item['text'] to self.limit chars (appending '...'),
        or drop the item entirely when the text is missing/empty.

        Bug fixes vs. the original post:
        - Scrapy calls process_item(item, spider); the missing 'spider'
          parameter caused a TypeError on every item.
        - DropItem must be *raised*, not returned — returning it would
          pass the exception object on to the next pipeline stage.
        """
        if item['text']:
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')
class MongoPipeline(object):
    """Pipeline that stores every item in MongoDB, one collection per
    item class name."""

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MONGO_URI / MONGO_DB settings."""
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Bug fix: Scrapy calls open_spider(spider); the missing
        # parameter made every crawl fail with a TypeError on startup.
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert the item into a collection named after its class."""
        name = item.__class__.__name__
        # insert() is deprecated (removed in pymongo 4); use insert_one().
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()