#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-21 07:17:36
# Project: xdf
"""pyspider project that collects course records from koolearn.com.

The crawl starts at the koolearn home page, follows the navigation links,
and dispatches every recognised product URL (two koolearn layouts plus
Gaodun course pages) to the matching detail-page parser.  Each parser
returns a dict with the keys ``edu_id``, ``name``, ``categoryName`` and
``price``.
"""

from pyspider.libs.base_handler import *
import re

# Detail-page URL patterns.  Raw strings keep ``\d`` a regex escape rather
# than an (invalid) string escape, and literal dots are escaped so they no
# longer match arbitrary characters.  The patterns are deliberately not
# anchored at the end (``re.match`` semantics), so suffixes such as query
# strings are still accepted.
_NEW_DETAIL_URL = re.compile(
    r'http://www\.koolearn\.com/product/c_(\d+_\d+)\.html')
_OLD_DETAIL_URL = re.compile(
    r'http://www\.koolearn\.com/product/(\d+_\d+)\.html')
_GAODUN_DETAIL_URL = re.compile(
    r'http://wxlm\.gaodun\.com/Public/jsShow_last/tag/(\d+)')

# Everything that is not a digit, comma or dot is stripped from price text.
_NON_PRICE_CHARS = re.compile(r'[^0-9,.]')


def getId(url):
    """Return the record id embedded in *url*.

    The id is the last ``/``-separated segment of the URL, truncated at its
    first ``.`` (for example ``.../product/c_1_2.html`` yields ``c_1_2``).
    A URL without any ``/`` is treated as a single segment, and an empty
    string yields an empty id.
    """
    last_segment = url.rsplit('/', 1)[-1]
    return last_segment.split('.', 1)[0]


def _build_item(response, price_selector, name_selector, category_selector):
    """Build the result dict shared by all detail-page parsers.

    Only the CSS selectors differ between page layouts; the price clean-up
    and id extraction are identical for every layout.
    """
    raw_price = response.doc(price_selector).text()
    return {
        "edu_id": getId(response.url),
        "name": response.doc(name_selector).text(),
        "categoryName": response.doc(category_selector).text(),
        "price": _NON_PRICE_CHARS.sub("", raw_price),
    }


class Handler(BaseHandler):
    """Crawler entry points and page parsers for the ``xdf`` project."""

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the koolearn home page."""
        self.crawl('http://www.koolearn.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every link found under ``.snavbx`` as a listing page."""
        for each in response.doc('.snavbx a').items():
            self.crawl(each.attr.href, callback=self.list_page)

    @config(age=10 * 24 * 60 * 60)
    def list_page(self, response):
        """Queue recognised detail-page links with their matching parser.

        Links that match none of the known URL patterns are ignored.
        """
        for each in response.doc('a[href^=http]').items():
            url = each.attr.href
            if _NEW_DETAIL_URL.match(url):
                self.crawl(url, callback=self.detail_page_new)
            elif _OLD_DETAIL_URL.match(url):
                self.crawl(url, callback=self.detail_page_old)
            elif _GAODUN_DETAIL_URL.match(url):
                self.crawl(url, callback=self.detail_page_gaodun)

    # New-style detail page
    def detail_page_new(self, response):
        """Parse a new-layout koolearn product page into a result dict."""
        return _build_item(
            response,
            price_selector='.p-price span',
            name_selector='.p-content-head-title h1',
            category_selector='.f1 a',
        )

    # Old-style detail page
    def detail_page_old(self, response):
        """Parse an old-layout koolearn product page into a result dict."""
        return _build_item(
            response,
            price_selector='.pri_num',
            name_selector='.add_box a:last-child',
            category_selector='h1',
        )

    # Gaodun course page
    def detail_page_gaodun(self, response):
        """Parse a Gaodun course page into a result dict."""
        return _build_item(
            response,
            price_selector='div.m-c-tit span.d-price',
            name_selector='h2.d-tit',
            category_selector='.m-c-crumbs span:nth-child(2)',
        )
pyspider採集例子
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。