學完python正則表達式,最主要參考了CQC的博客:http://cuiqingcai.com/990.html。
功能:把段子的作者、發表時間、點贊數、內容和配圖都給匹配了出來。附加跳轉到前一頁,後一頁,某一頁,退出功能。
修改後代碼如下(20160220匹配成功):
#!/usr/bin/env python #-*-coding:utf-8 -*- __author__ = "PS" """ modified from CQC http://cuiqingcai.com/990.html python version : 2.7.9 """ import urllib import urllib2 import re import time class Scrapy_qiushibaike(): def __init__(self): self.pageIndex = 1 self.user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36' self.headers = {'User-Agent':self.user_agent} self.stories = [] self.enable = True def get_page(self,pageIndex): try: url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex) request = urllib2.Request(url,headers=self.headers) response = urllib2.urlopen(request) pageCode = response.read().decode('utf-8') return pageCode except urllib2.URLError,e: if hasattr(e, "reason"): print "connect to the web error",e.reason return None def get_page_items(self,pageIndex): page_code = self.get_page(pageIndex) if not page_code: print "response failure" return None pattern = re.compile('<div.*?article.*?'+ '<h2>(.*?)</h2>.*?' + '<div class="content">(.*?)' + '<!--(.*?)-->.*?'+ 'div>(.*?)class="stats".*?' 
+ 'class="number">(.*?)</i>', re.S) #item[0]:name,item[1]:content,item[2]:time,itme[3]:img,item[4]:support number items = re.findall(pattern,page_code) page_stories = [] for item in items: haveImg = re.search("img",item[3]) if haveImg: pattern_img = re.compile('<img src="(.*?)"') img_url = ''.join(re.findall(pattern_img,item[3])) else: img_url = 'no image' replaceBR = re.compile('<br/>') text = re.sub(replaceBR,"\n",item[1]) time_float = time.gmtime(float(item[2])) time_formated = time.strftime('%Y-%m-%d %H:%M:%S',time_float) author = item[0] support_number = item[4] page_stories.append([author.strip(), text.strip(),time_formated.strip(), img_url,support_number.strip()]) return page_stories def load_page(self): if self.enable == True: if len(self.stories) <= 2: page_stories = self.get_page_items(self.pageIndex) # add to global variable stories if page_stories: self.stories.append(page_stories) def get_one_page_story(self): self.load_page() for story in self.stories[0]: print "page%d\nauthor:%s\ntime:%s\nsupport_number:%s\n%s\n%s\n" %(self.pageIndex,story[0],story[2],story[4],story[1],story[3]) del self.stories[0] def start(self): while self.enable: self.get_one_page_story() input = raw_input("'n' -> next page, 'p' -> previous page, number -> that page, q -> quit,others -> current page:") if input == 'q': self.enable = False return None elif input == 'f': self.pageIndex += 1 elif input == 'b': self.pageIndex -= 1 elif input.isdigit(): self.pageIndex = int(input) print self.pageIndex if __name__ == '__main__': spider = Scrapy_qiushibaike() spider.start()