Scrapy Framework Study Notes (1)

Usage steps:

1. First, create your Item file, which defines the data types of the content to be scraped (a minimal sketch follows this list).

2. Next, create your spider file, which does the actual crawling and fills in the Items.

3. Finally, create your pipeline file, which receives the Items sent over by the spider and processes them there; you can export them to a file or store them in a database.
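
For example, here is a minimal sketch of what the SlyyItem used in the method examples below might look like (the field names head and url are taken from how SlyyItem is constructed there; everything else is an assumption):

from scrapy.item import Item, Field

class SlyyItem(Item):
    head = Field()  # heading text scraped from the page (assumed field)
    url = Field()   # url of the page the item came from (assumed field)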

Problems encountered:

1. Chinese text written to the output file came out entirely as unicode escape sequences. At first I thought it was a character-encoding problem, but after trying various encodings it turned out not to be: the real mistake was writing out the whole list returned by extract(). Writing list[0] (the first element) instead solved it.

2. Don't forget to register the pipeline in settings.py (see the snippet right after this list).
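
A minimal sketch of that registration, assuming the mypro project and MyproPipeline class used at the end of this post:

# settings.py
# Older Scrapy versions (the ones these notes are based on) take a plain list;
# newer versions expect a dict such as {'mypro.pipelines.MyproPipeline': 300}.
ITEM_PIPELINES = ['mypro.pipelines.MyproPipeline']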

Below are a few examples of recursive crawling:

Method 1: put both Item and Request objects into the items list and return it; the framework works out on its own whether each element is an Item or a Request.

class SlyySpider(BaseSpider):
    name = "a"
    allowed_domains = [".com"]
    start_urls = ["****"]
    def parse(self, response):
        hxs = HtmlXPathSelector(response)        
        items = []
        h3 = hxs.select('''*****''').extract()
        h3_unicode = "".join(h3)
        t1 = hxs.select('''****''').extract()
        items.append(SlyyItem(head=h3_unicode, url=response.url))
        for url in hxs.select('''***''').extract():
            items.append(Request(url, callback=self.parse))
        return items

Method 2: use yield to treat Items and Requests separately.

class SlyySpider(BaseSpider):
    name = "slyy2"
    allowed_domains = ["***"]
    start_urls = ["***"]
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()
        h3_unicode = "".join(h3)
        yield SlyyItem(head=h3_unicode, url=response.url)
        for url in hxs.select('''***''').extract():
            yield Request(url, callback=self.parse) 

Method 3:

Example 1:

class SlyySpider(BaseSpider):
    name = "slyy3"
    allowed_domains = ["***"]
    start_urls = ["***"]    
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        # request the first post, handing the response to a different callback
        firstpost = hxs.select('''***''').extract()[0]
        items.append(self.make_requests_from_url(firstpost).replace(callback=self.parse_post))

        # follow a second url with the default callback (parse)
        url2 = hxs.select('''***''').extract()[0]
        items.append(self.make_requests_from_url(url2))
        return items
        
    def parse_post(self, response):
        hxs = HtmlXPathSelector(response)
        h3 = hxs.select('''***''').extract()[0]
        print h3
        item = SlyyItem()
        item['url'] = response.url
        item['head'] = h3
        return item

Example 2:

from scrapy.selector import HtmlXPathSelector

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    items = []

    # follow links: collect candidate urls and keep only the valid ones
    newurls = hxs.select('//a/@href').extract()
    validurls = []
    for url in newurls:
        # check whether the URL is valid (placeholder condition)
        if True:
            validurls.append(url)
    items.extend([self.make_requests_from_url(url).replace(callback=self.parse) for url in validurls])

    # extract the items on the current page
    sites = hxs.select('//ul/li')
    for site in sites:
        item = DmozItem()
        item['title'] = site.select('a/text()').extract()
        item['link'] = site.select('a/@href').extract()
        item['desc'] = site.select('text()').extract()
        items.append(item)

    return items


CrawlSpider:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from sitemap.items import SitemapItem

class SitemapSpider(CrawlSpider):
    name = 'sitemap_spider'
    allowed_domains = ['qunar.com']
    start_urls = ['http://www.qunar.com/routes/']

    rules = (
        #Rule(SgmlLinkExtractor(allow=(r'http://www.qunar.com/routes/.*')), callback='parse'),
        #Rule(SgmlLinkExtractor(allow=('http:.*/routes/.*')), callback='parse'),
    )

    def parse(self, response):
        item = SitemapItem()
        x = HtmlXPathSelector(response)

        # follow every link on the page that belongs to the routes section
        raw_urls = x.select("//a/@href").extract()
        urls = []
        for url in raw_urls:
            if 'routes' in url:
                if 'http' not in url:
                    url = 'http://www.qunar.com' + url
                urls.append(url)

        for url in urls:
            yield Request(url)

        # extract the page url plus its meta keywords and description
        item['url'] = response.url.encode('UTF-8')
        arr_keywords = x.select("//meta[@name='keywords']/@content").extract()
        item['keywords'] = arr_keywords[0].encode('UTF-8')
        arr_description = x.select("//meta[@name='description']/@content").extract()
        item['description'] = arr_description[0].encode('UTF-8')
        yield item
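
Note: the Scrapy documentation warns against using parse as the callback in a CrawlSpider, because CrawlSpider uses parse internally to apply its rules. The example above gets away with it only because its rules are commented out; with active rules the callback should have another name, e.g. parse_item.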

About rules: a spider's rules define a set of related links to follow.

       the allow attribute lists the link patterns to follow

       the deny attribute lists the link patterns to exclude

       the callback attribute names the callback function for matching pages

    rules = (
        # URLs matching this rule are not scraped for content; the page is only used
        # to extract further links (the URL is made up, replace it in real use)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test?page_index=\d+'))),

        # URLs matching this rule have their content extracted by the parse_item
        # callback (the URL is made up, replace it in real use)
        Rule(SgmlLinkExtractor(allow=(r'http://test_url/test?product_id=\d+')), callback="parse_item"),
    )
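
To make these rules concrete, below is a minimal sketch of a CrawlSpider driven entirely by rules (the spider name, domain, URL patterns and XPaths are placeholders I made up; SitemapItem is reused from the qunar example above):

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from sitemap.items import SitemapItem

class RuleDemoSpider(CrawlSpider):
    name = "rule_demo"
    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/products/"]

    rules = (
        # listing pages: only followed for more links, no callback
        Rule(SgmlLinkExtractor(allow=(r'page_index=\d+',)), follow=True),
        # detail pages: handed to parse_item for extraction
        Rule(SgmlLinkExtractor(allow=(r'product_id=\d+',)), callback='parse_item'),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = SitemapItem()
        item['url'] = response.url
        item['keywords'] = hxs.select("//meta[@name='keywords']/@content").extract()
        item['description'] = hxs.select("//meta[@name='description']/@content").extract()
        return item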

Below is a simple spider I wrote to crawl the Hujiang site:

items.py:

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class MyproItem(Item):
    # define the fields for your item here like:
    # name = Field()
    id = Field()
    th = Field()
    zh = Field()
    url = Field()
    title = Field()

pipelines.py:

class MyproPipeline(object):
    def __init__(self):
        # one output file per language
        self.file = open('th.txt', 'w')
        self.file2 = open('zh.txt', 'w')

    def process_item(self, item, spider):
        # only keep pages where the two sentence lists have the same length,
        # so that line i of th.txt aligns with line i of zh.txt
        if len(item['th']) > 0 and len(item['zh']) > 0:
            if len(item['th']) == len(item['zh']):
                self.file.write(str(item['title'][0].encode("utf-8")) + '\n')
                for i in range(len(item['th'])):
                    self.file.write(str(item['th'][i].encode("utf-8")) + '\n')
                self.file2.write(str(item['title'][0].encode("utf-8")) + '\n')
                for i in range(len(item['zh'])):
                    self.file2.write(str(item['zh'][i].encode("utf-8")) + '\n')
        return item
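
One thing this pipeline does not do is close its two files. A minimal sketch of how that could be added, assuming your Scrapy version supports the close_spider pipeline hook (it is part of the documented item pipeline interface):

    def close_spider(self, spider):
        # called once when the spider finishes, flushing the buffers to disk
        self.file.close()
        self.file2.close()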

hjspider.py:

from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from mypro.items import MyproItem
from scrapy.http import Request

class HjSpider(BaseSpider):
    name = "hujiang"
    allowed_domains = ["hujiang.com"]
    start_urls = [
        "http://th.hujiang.com/new/"
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        urls = []
        # extract the page title and the two parallel sentence lists
        raw_title = hxs.select('//title').extract()
        raw_th = hxs.select("//div[@class='langs_en']/text()").extract()
        raw_zh = hxs.select("//div[@class='langs_cn']/text()").extract()
        items.append(MyproItem(title=raw_title, zh=raw_zh, th=raw_th))

        # follow site-relative links under /new/ that have not been queued from this page yet
        raw_urls = hxs.select('//a/@href').extract()
        for url in raw_urls:
            if 'http' not in url:
                if 'new' in url:
                    if url not in urls:
                        url = "http://th.hujiang.com" + url
                        urls.append(url)
                        items.append(Request(url, callback=self.parse))
        return items
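
With the pipeline registered in settings.py as shown earlier, this spider is run with scrapy crawl hujiang, and the aligned sentences end up in th.txt and zh.txt in the directory the command is run from.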
        
        






