Install Scrapy on CentOS 7
First install the build dependencies:
yum install gcc libffi-devel openssl-devel libxml2 libxslt-devel libxml2-devel python-devel -y
Then install lxml, followed by Scrapy:
pip install lxml
pip install scrapy
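If everything installed correctly, the scrapy command is now on your PATH; a quick way to verify is:
scrapy version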
Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a wide range of programs for tasks such as data mining, information processing, and archiving historical data.
1. Create a new project.
scrapy startproject quote
scrapy genspider quotes quotes.toscrape.com
scrapy crawl quotes
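The first command creates the project skeleton, the second generates a spider stub, and the third runs the spider. After the first two, the layout should look roughly like this (file names can vary slightly between Scrapy versions):
quote/
    scrapy.cfg
    quote/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            quotes.py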
Replace the pass placeholder in the generated parse method and the spider can actually do something:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        print(response.text)
Running the spider now prints the HTML source of the crawled page.
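2. Explore the response with scrapy shell.
The interactive session below comes from Scrapy's shell, which you can start against the same page with:
scrapy shell 'http://quotes.toscrape.com/'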
In [1]: response
Out[1]: <200 http://quotes.toscrape.com>
In [2]: quotes = response.css('.quote')
In [4]: quotes[0]
Out[4]: <Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' quote ')]" data='<div class="quote" itemscope itemtype="h'>
Look at the text of the first quote:
In [6]: quotes[0].css('.text::text').extract()
Out[6]: ['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”']
Extract all of its tags:
In [13]: quotes[0].css('.tags .tag::text').extract()
Out[13]: ['change', 'deep-thoughts', 'thinking', 'world']
Note: in Scrapy's CSS selectors, ::text is a Scrapy-specific pseudo-element that selects the text content of the matched elements. .extract_first() returns only the first matching result, while .extract() returns all of them.
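For example, continuing the session, .extract_first() on the same tags selector would return just the first tag (the expected output follows from Out[13] above):
In [14]: quotes[0].css('.tags .tag::text').extract_first()
Out[14]: 'change'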
3. Start scraping.
First edit the QuoteItem class in items.py. The generated file already shows an example; declare a field for each piece of data you want:
import scrapy

class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
Next edit quotes.py, using the selectors we just tried in the shell to extract the desired content:
    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item
That captures the text, author, and tags of every quote. The complete spider, with pagination added, looks like this:
# -*- coding: utf-8 -*-
import scrapy
from quote.items import QuoteItem

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        quotes = response.css('.quote')
        for quote in quotes:
            item = QuoteItem()
            text = quote.css('.text::text').extract_first()
            author = quote.css('.author::text').extract_first()
            tags = quote.css('.tags .tag::text').extract()
            item['text'] = text
            item['author'] = author
            item['tags'] = tags
            yield item
        # follow the "Next" link; the last page has none, so guard
        # against a None href before building the request
        next_page = response.css('.pager .next a::attr(href)').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
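A quick end-to-end check is Scrapy's built-in feed export, which writes every yielded item to a file:
scrapy crawl quotes -o quotes.json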
To store the items in MongoDB, add the connection settings and register the pipelines in settings.py. (The original only shows MONGO_DB; the MONGO_URI value below is an assumption and should point at your MongoDB instance.)
MONGO_URI = 'localhost'
MONGO_DB = 'quotes'
ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,
    'quote.pipelines.MongoPipeline': 400,
}
The lower priority number runs first, so QuotePipeline truncates each quote before MongoPipeline stores it.
pipelines.py then defines the two pipelines: one that truncates overly long quotes, and one that writes items to MongoDB (this needs the pymongo package: pip install pymongo).
# -*- coding: utf-8 -*-
import pymongo
from scrapy.exceptions import DropItem

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class QuotePipeline(object):
    def __init__(self):
        self.limit = 150

    def process_item(self, item, spider):
        if item['text']:
            # cut quotes longer than 150 characters down to size
            if len(item['text']) > self.limit:
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            # DropItem must be raised, not returned, to discard the item
            raise DropItem('Missing Text')


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the MongoDB settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # use the item class name ('QuoteItem') as the collection name
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
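After running scrapy crawl quotes again, the items should appear in MongoDB. Assuming a local mongod and the settings above, you can inspect them from the mongo shell (the collection is named QuoteItem because MongoPipeline uses the item class name):
$ mongo
> use quotes
> db.QuoteItem.findOne()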