Scrapy Framework Study Notes (Part 1)


Usage steps:

1. First, create your Item file, which defines the fields (data types) of the content to be scraped.

2. Next, create your spider file, which crawls the pages and extracts Items.

3. Finally, create your pipeline file, which receives the Items passed on by the spider and processes them, for example exporting them to a file or saving them to a database.
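For orientation, a project created with scrapy startproject mypro has roughly this layout (the spider file hjspider.py from later in these notes is added by hand under spiders/):

mypro/
    scrapy.cfg              # project configuration file
    mypro/
        __init__.py
        items.py            # step 1: Item definitions
        pipelines.py        # step 3: pipelines that process the Items
        settings.py         # project settings, where pipelines are registered
        spiders/
            __init__.py
            hjspider.py     # step 2: the spider itself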

Problems encountered:

1. The Chinese text scraped and written to the output file came out entirely as unicode escape sequences. At first I assumed it was a character-encoding problem, but after trying various encodings it turned out not to be: the mistake was writing out the whole list returned by the selector instead of a single element; writing list[0] solved it.

2. Don't forget to register the pipeline in settings.py.
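For the mypro project shown later in these notes, the registration looks roughly like this (older Scrapy versions, as used here, take a list of pipeline class paths; newer versions take a dict with an order value):

# settings.py
ITEM_PIPELINES = ['mypro.pipelines.MyproPipeline']

# in newer Scrapy versions the setting is a dict mapping the path to an order value (0-1000):
# ITEM_PIPELINES = {'mypro.pipelines.MyproPipeline': 300}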

Below are a few examples of recursive crawling:

Method 1: put both the Item and Request objects into the items list and return it, letting the framework work out for itself whether each element is an Item or a Request.

from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
# SlyyItem is assumed to be defined in this project's items.py

class SlyySpider(BaseSpider):
    name = "a"
    allowed_domains = [".com"]
    start_urls = ["****"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        h3 = hxs.select('''*****''').extract()
        h3_unicode = "".join(h3)
        t1 = hxs.select('''****''').extract()
        items.append(SlyyItem(head=h3_unicode, url=response.url))
        for url in hxs.select('''***''').extract():
            items.append(Request(url, callback=self.parse))
        return items

Method 2: use yield to handle Items and Requests separately.

class SlyySpider(BaseSpider):
    name = "slyy2"
    allowed_domains = ["***"]
    start_urls = ["***"]
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()
        h3_unicode = "".join(h3)
        yield SlyyItem(head=h3_unicode, url=response.url)
        for url in hxs.select('''***''').extract():
            yield Request(url, callback=self.parse) 

Method 3:

Example 1:
class SlyySpider(BaseSpider):
    name = "slyy3"
    allowed_domains = ["***"]
    start_urls = ["***"]    
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        firstpost = hxs.select('''***''').extract()[0]
        items.append(self.make_requests_from_url(firstpost).replace(callback=self.parse_post))

        url2 = hxs.select('''***''').extract()[0]
        items.append(self.make_requests_from_url(url2))        
        return items
        
    def parse_post(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()[0]
        print h3
        item = SlyyItem()
        item['url'] = response.url
        item['head'] = h3
        return item

Example 2:
from scrapy.selector import HtmlXPathSelector

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []

    newurls = hxs.select('//a/@href').extract()
    validurls = []
    for url in newurls:
        # check whether the URL is valid (placeholder condition, replace with a real check)
        if True:
            validurls.append(url)
    items.extend([self.make_requests_from_url(url).replace(callback=self.parse) for url in validurls])

    # DmozItem is assumed to be defined in this project's items.py
    sites = hxs.select('//ul/li')
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)

    return items


CrawlSpider

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from sitemap.items import SitemapItem

class SitemapSpider(CrawlSpider):
    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        item = SitemapItem()
        x = HtmlXPathSelector(response)
        raw_urls = x.select("//a/@href").extract()
        urls = []
        for url in raw_urls:
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                urls.append(url)

        for url in urls:
            yield Request(url)

        item['url'] = response.url.encode('UTF-8')
        arr_keywords = x.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = arr_keywords[0].encode('UTF-8')
        arr_description = x.select("//meta[@name='description']/@content").extract()
        item['description'] = arr_description[0].encode('UTF-8')
        yield item

About rules: a rule defines a set of related links to be extracted and followed.

       The allow attribute gives the URL patterns that are allowed.

       The deny attribute gives the URL patterns that are excluded.

       The callback attribute names the callback function used for matched pages.

    rules = (

        # URLs matching this rule are not scraped for content; only the links on those pages are extracted (the URL is fictitious, replace it in actual use)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+'))),

        # URLs matching this rule are scraped for content via the callback (the URL is fictitious, replace it in actual use)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+')), callback="parse_item"),

    )
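As a hedged sketch of how such rules plug into a complete CrawlSpider (the class name, start URL, and the //title XPath below are placeholders of my own, and the MyproItem defined later in these notes is reused; note that CrawlSpider implements parse internally, so rule callbacks should use another name such as parse_item):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from mypro.items import MyproItem

class TestRuleSpider(CrawlSpider):
    name = "test_rule_spider"
    allowed_domains = ["test_url"]
    start_urls = ["http://test_url/test?page_index=1"]

    rules = (
        # follow listing pages, but do not scrape them
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?page_index=\d+',))),
        # scrape product pages with parse_item
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test\?product_id=\d+',)), callback='parse_item'),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = MyproItem()
        item['url'] = response.url
        item['title'] = hxs.select('//title/text()').extract()
        return item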

Below is a simple spider I wrote to crawl the Hujiang website:

items.py:

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class MyproItem(Item):
        # define the fields for your item here like:
        # name = Field()
        id = Field()
        th = Field()
        zh = Field()
        url = Field()
        title = Field()
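Items behave like Python dictionaries, so the fields defined above are set and read with item['field'] syntax. A quick sketch (the values here are made up):

item = MyproItem()
item['title'] = [u'Example title']          # extract() returns a list, so field values are lists
item['th'] = [u'First sentence', u'Second sentence']
print item['title'][0]                      # index into the list when writing a single string out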

pipelines.py:

class MyproPipeline(object):
    def __init__(self):
        self.file = open('th.txt', 'w')
        self.file2 = open('zh.txt', 'w')

    def process_item(self, item, spider):
        # only write out items where both language lists are present and aligned line-for-line
        if len(item['th']) > 0 and len(item['zh']) > 0:
            if len(item['th']) == len(item['zh']):
                self.file.write(item['title'][0].encode("utf-8") + '\n')
                for i in range(len(item['th'])):
                    self.file.write(item['th'][i].encode("utf-8") + '\n')
                self.file2.write(item['title'][0].encode("utf-8") + '\n')
                for i in range(len(item['zh'])):
                    self.file2.write(item['zh'][i].encode("utf-8") + '\n')
        return item
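The two output files are never closed explicitly. A minimal, hedged addition using the close_spider hook of Scrapy's item pipeline interface would be an extra method on the same class:

    def close_spider(self, spider):
        # called once when the spider finishes; flush and close the output files
        self.file.close()
        self.file2.close()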

hjspider.py:

from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from mypro.items import MyproItem
from scrapy.http import Request

class HjSpider(BaseSpider):
    name = "hujiang"
    allowed_domains = ["hujiang.com"]
    start_urls = [
        "http://th.hujiang.com/new/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        urls = []
        raw_title = hxs.select('//title').extract()
        raw_th = hxs.select("//div[@class='langs_en']/text()").extract()
        raw_zh = hxs.select("//div[@class='langs_cn']/text()").extract()
        items.append(MyproItem(title=raw_title, zh=raw_zh, th=raw_th))

        raw_urls = hxs.select('//a/@href').extract()
        for url in raw_urls:
            if 'http' not in url:
                if 'new' in url:
                    if url not in urls:
                        url = "http://th.hujiang.com" + url
                        #item = MyproItem()
                        #item['url'] = url
                        urls.append(url)
                        items.append(Request(url, callback=self.parse))
        return items
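To run the crawl, change into the project directory and invoke the spider by its name (this assumes the pipeline above has been registered in settings.py):

scrapy crawl hujiang

The pipeline then writes the paired sentences into th.txt and zh.txt in the directory the command is run from.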
        
        






