pyspider採集例子

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-21 07:17:36
# Project: xdf
from pyspider.libs.base_handler import *
import re

def getId(url):
    """Return the identifier embedded in the last path segment of *url*.

    The identifier is the text that follows the final ``/`` and precedes
    the first ``.`` after it, for example::

        http://www.koolearn.com/product/c_12_34.html  ->  "c_12_34"
        http://wxlm.gaodun.com/Public/jsShow_last/tag/567  ->  "567"

    A *url* without any ``/`` is treated as being the last segment itself,
    and a *url* ending in ``/`` (or an empty string) yields ``""``.
    """
    # rpartition returns the whole string as the tail when no '/' exists,
    # so the first character is never dropped for slash-less input.
    last_segment = url.rpartition('/')[2]
    # Keep only what precedes the first dot (drops ".html" and the like).
    return last_segment.partition('.')[0]

class Handler(BaseHandler):
    """pyspider handler that collects course data starting from koolearn.com.

    Crawl flow:

    1. ``on_start`` requests the home page.
    2. ``index_page`` follows every link selected by ``.snavbx a``.
    3. ``list_page`` inspects each absolute link and hands recognised
       product URLs to the matching ``detail_page_*`` callback.
    4. Each ``detail_page_*`` callback returns a result dict with the keys
       ``edu_id``, ``name``, ``categoryName`` and ``price``.
    """

    crawl_config = {
    }

    # Ordered (compiled URL pattern, callback method name) pairs used by
    # list_page; the first pattern that matches a link decides its callback.
    # Raw strings keep ``\d`` a regex escape, and the dots are escaped so
    # they only match a literal '.'.
    _DETAIL_ROUTES = (
        (re.compile(r'http://www\.koolearn\.com/product/c_(\d+_\d+)\.html'),
         'detail_page_new'),
        (re.compile(r'http://www\.koolearn\.com/product/(\d+_\d+)\.html'),
         'detail_page_old'),
        (re.compile(r'http://wxlm\.gaodun\.com/Public/jsShow_last/tag/(\d+)'),
         'detail_page_gaodun'),
    )

    # Matches every character that is not a digit, a comma or a dot; used to
    # strip currency symbols and labels from the scraped price text.
    _PRICE_NOISE = re.compile(r'[^0-9,.]')

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point: request the koolearn home page."""
        self.crawl('http://www.koolearn.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Follow each ``.snavbx a`` link and treat the target as a list page."""
        for each in response.doc('.snavbx a').items():
            self.crawl(each.attr.href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def list_page(self, response):
        """Queue every recognised product link with its detail callback.

        Links that match none of the patterns in ``_DETAIL_ROUTES`` are
        ignored.
        """
        for each in response.doc('a[href^=http]').items():
            url = each.attr.href
            for pattern, callback_name in self._DETAIL_ROUTES:
                if pattern.match(url):
                    self.crawl(url, callback=getattr(self, callback_name))
                    break

    def _build_item(self, response, price_css, name_css, category_css):
        """Build the result dict shared by all ``detail_page_*`` callbacks.

        ``price_css``, ``name_css`` and ``category_css`` are the CSS
        selectors whose text supplies the respective fields. The price text
        is reduced to digits, commas and dots; ``edu_id`` is derived from
        ``response.url`` via ``getId``.
        """
        price_text = response.doc(price_css).text()
        return {
            "edu_id": getId(response.url),
            "name": response.doc(name_css).text(),
            "categoryName": response.doc(category_css).text(),
            "price": self._PRICE_NOISE.sub("", price_text),
        }

    # New-style detail page
    def detail_page_new(self, response):
        """Extract course data from a new-style koolearn product page."""
        return self._build_item(
            response,
            price_css='.p-price span',
            name_css='.p-content-head-title h1',
            category_css='.f1 a',
        )

    # Old-style detail page
    def detail_page_old(self, response):
        """Extract course data from an old-style koolearn product page."""
        return self._build_item(
            response,
            price_css='.pri_num',
            name_css='.add_box a:last-child',
            category_css='h1',
        )

    # Gaodun course page
    def detail_page_gaodun(self, response):
        """Extract course data from a Gaodun course page."""
        return self._build_item(
            response,
            price_css='div.m-c-tit span.d-price',
            name_css='h2.d-tit',
            category_css='.m-c-crumbs span:nth-child(2)',
        )


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章