以下爲爬取百度貼吧帖子的案例。運行後會把帖子內的所有發言下載到一個 TXT 文件裏面。
詳細說明可以參考:http://cuiqingcai.com/993.html
# coding=utf-8
"""Download every post of a Baidu Tieba thread into a UTF-8 text file.

Run as a script: it asks for the thread id, whether to keep only the thread
starter's posts, and whether to write a separator line before each floor.
The module can also be imported (no prompts are issued on import) to use the
``Tool`` and ``BDTB`` classes directly. Works on Python 2 and Python 3.
"""
import io
import re
import urllib
from contextlib import closing

try:  # Python 2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.error
    from urllib.request import Request, urlopen
    from urllib.error import URLError

try:  # Python 2
    _read_line = raw_input
except NameError:  # Python 3
    _read_line = input


def _to_text(value):
    """Return *value* as a text (unicode) string.

    Byte strings are decoded as UTF-8 with undecodable bytes replaced, so the
    helper never raises on odd input such as localized OS error messages.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'replace')
    if isinstance(value, type(u"")):
        return value
    # str() yields bytes on Python 2 and text on Python 3; both are handled above.
    return _to_text(str(value))


# Helper that converts the HTML of a post body to plain text.
class Tool:
    """Strip HTML markup from a post while keeping its line structure."""

    # Remove <img> tags and runs of seven spaces.
    removeImg = re.compile('<img.*?>| {7}')
    # Remove hyperlink tags (the link text itself is kept).
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that are turned into a line break.
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cells are turned into a tab.
    replaceTD = re.compile('<td>')
    # A paragraph start becomes a line break followed by an indent.
    replacePara = re.compile('<p.*?>')
    # Single or double <br> becomes one line break.
    replaceBR = re.compile('<br><br>|<br>')
    # Any tag still left is dropped.
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        """Return *x* with HTML tags removed or replaced by whitespace.

        The substitutions are order-dependent: specific tags are rewritten
        first and the catch-all tag removal runs last. Leading and trailing
        whitespace is stripped from the result.
        """
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        return x.strip()


# Baidu Tieba crawler.
class BDTB:
    """Fetch the pages of one thread and write its posts to a text file."""

    # Patterns used to pull the title, page count and post bodies out of a page.
    _titlePattern = re.compile('<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
    _pageNumPattern = re.compile(
        '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
    _contentPattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseUrl, seeLZ, floorTag):
        """Set up the crawler.

        baseUrl  -- thread URL without query string.
        seeLZ    -- value sent as the ``see_lz`` query parameter
                    (the script passes '1' for thread-starter-only).
        floorTag -- the string '1' to write a separator line before each post.
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        # HTML-to-text converter.
        self.tool = Tool()
        # Output file object; opened by setFileTitle().
        self.file = None
        # Number of the next floor (post) to be written, starting at 1.
        self.floor = 1
        # File title used when the thread title cannot be extracted.
        self.defaultTitle = u"百度貼吧"
        self.floorTag = floorTag

    def getPage(self, pageNum):
        """Return the HTML of page *pageNum* as text, or None if the fetch fails."""
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            # closing() releases the connection on both Python 2 and 3.
            with closing(urlopen(Request(url))) as response:
                return response.read().decode('utf-8')
        except URLError as e:
            if hasattr(e, "reason"):
                print(u"連接百度貼吧失敗,錯誤原因" + u" " + _to_text(e.reason))
            return None

    def getTitle(self, page):
        """Return the thread title found in *page*, or None if there is none."""
        result = self._titlePattern.search(page)
        if result:
            return result.group(1).strip()
        return None

    def getPageNum(self, page):
        """Return the thread's page count (as a string) or None if not found."""
        result = self._pageNumPattern.search(page)
        if result:
            return result.group(1).strip()
        return None

    def getContent(self, page):
        """Return the posts of *page* as a list of UTF-8 encoded byte strings.

        Each post is converted to plain text and wrapped in a leading and a
        trailing newline.
        """
        return [("\n" + self.tool.replace(item) + "\n").encode('utf-8')
                for item in self._contentPattern.findall(page)]

    def setFileTitle(self, title):
        """Open '<title>.txt' for writing, using the default title if *title* is None."""
        name = title if title is not None else self.defaultTitle
        # io.open gives a UTF-8 text file with identical behaviour on Python 2 and 3.
        self.file = io.open(name + ".txt", "w+", encoding="utf-8")

    def writeData(self, contents):
        """Write every post in *contents* to the output file.

        When floorTag is '1' each post is preceded by a separator line that
        carries the floor number. The floor counter advances once per post.
        """
        for item in contents:
            if self.floorTag == '1':
                floorLine = u"\n" + _to_text(self.floor) + u"-----------------------------------------------------------------------------------------\n"
                self.file.write(floorLine)
            # getContent() yields UTF-8 bytes; the file is a text stream.
            self.file.write(_to_text(item))
            self.floor += 1

    def start(self):
        """Download the whole thread and write it to the output file.

        Prints progress messages. If the first page cannot be fetched or has
        no page count, a message is printed and no file is created.
        """
        indexPage = self.getPage(1)
        pageNum = self.getPageNum(indexPage) if indexPage is not None else None
        if pageNum is None:
            print(u"URL已失效,請重試")
            return
        title = self.getTitle(indexPage)
        try:
            # Inside the try so a filename that cannot be created is reported
            # like any other write error.
            self.setFileTitle(title)
            print(u"該帖子共有" + _to_text(pageNum) + u"頁")
            for i in range(1, int(pageNum) + 1):
                print(u"正在寫入第" + _to_text(i) + u"頁數據")
                # Page 1 is already in hand; do not download it a second time.
                page = indexPage if i == 1 else self.getPage(i)
                if page is None:
                    # getPage() has already reported the failure; skip the page.
                    continue
                self.writeData(self.getContent(page))
        except IOError as e:
            print(u"寫入異常,原因" + _to_text(e))
        finally:
            if self.file is not None:
                self.file.close()
            print(u"寫入任務完成")


if __name__ == "__main__":
    print(u"請輸入帖子代號")
    baseURL = 'http://tieba.baidu.com/p/' + _read_line(u'http://tieba.baidu.com/p/').strip()
    # print() encodes for the active console and supplies the trailing newline.
    print(u"是否只獲取樓主發言,是輸入1,否輸入0")
    seeLZ = _read_line().strip()
    print(u"是否寫入樓層信息,是輸入1,否輸入0")
    floorTag = _read_line().strip()
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()