# 參考 http://cuiqingcai.com/993.html 寫的第一個比較完整的 Python 爬蟲程序。對正則表達式還需要更深入的研究;本程序用於百度貼吧,但無法截取表情。如果想把所有頁面的內容都獲取下來並不難,稍加拓展即可。
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
#處理頁面標籤類
class Tool:
    """Turn a fragment of post HTML into plain text.

    The compiled patterns below are class attributes so they are built once.
    ``replace`` applies them in a fixed order; the order matters because the
    final catch-all pattern deletes every tag that is still present.
    """

    # <img ...> tags and runs of seven consecutive spaces are deleted.
    # (The pattern used to end with a stray "|", i.e. an extra empty
    # alternative that matched at every position without removing anything.)
    removeImg = re.compile(r'<img.*?>| {7}')
    # Anchor tags are deleted; the text between them is kept.
    # NOTE: '<a.*?>' matches any tag whose name starts with "a".
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Tags that are turned into a line break.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # A table cell opener becomes a tab.
    replaceTD = re.compile(r'<td>')
    # A paragraph opener becomes a newline followed by an indent.
    # NOTE: '<p.*?>' matches any tag whose name starts with "p".
    replacePara = re.compile(r'<p.*?>')
    # One <br>, or two in a row, become a single newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Whatever still looks like a tag is deleted.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return ``x`` with its HTML tags removed or converted to whitespace.

        :param x: HTML text of one post.
        :returns: the converted text with leading and trailing whitespace
            stripped.
        """
        # NOTE(review): the original comment on replacePara said "newline plus
        # two spaces" but the code inserts a newline plus ONE space; the
        # replacement is kept exactly as the code had it - confirm the intent.
        substitutions = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replacePara, "\n "),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        # strip() drops the leading/trailing whitespace the steps above leave.
        return x.strip()
#百度貼吧爬蟲類
class BDTB:
    """Downloader for one Baidu Tieba thread (Python 2 code: uses urllib2).

    The page is fetched with ``getPage``; ``getTitle``, ``getPageNum`` and
    ``getContent`` each fetch page 1 and pull data out of it with regular
    expressions. ``getContent`` writes the posts of page 1 to
    ``<thread title>.txt`` in the current directory.
    """

    # Compiled once instead of on every call.
    _titlePattern = re.compile('<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
    _pageNumPattern = re.compile('<li class="l_reply_num.*?</span>.*?<span class="red">(.*?)</span>', re.S)
    _postPattern = re.compile('<div id="post_content.*?>(.*?)</div>', re.S)
    # The title comes from the downloaded page and ends up in a file name, so
    # path separators, characters Windows forbids in names, and control
    # whitespace are replaced before the name is used.
    _unsafeNameChars = re.compile(r'[\\/:*?"<>|\r\n\t]')

    def __init__(self, baseUrl, seeLZ):
        """Remember the thread URL and the "see_lz" query flag.

        :param baseUrl: URL of the thread, without query string.
        :param seeLZ: value placed after ``?see_lz=`` in every request
            (the driver code passes 0 for all posts, 1 for the thread
            starter only).
        """
        self.baseURL = baseUrl
        self.seeLZ = '?see_lz=' + str(seeLZ)
        self.tool = Tool()
        self.file = None
        # File-name stem used by getContent when no title can be extracted.
        self.defaultTitle = u"百度貼吧"

    def _fileNameFor(self, title):
        """Return the output file name for ``title`` with unsafe characters replaced by "_"."""
        return self._unsafeNameChars.sub(u'_', title) + ".txt"

    def getPage(self, pageNum):
        """Download page ``pageNum`` of the thread.

        :returns: the page decoded from UTF-8, or None when urllib2 raises
            URLError (the reason, if any, is printed).
        """
        url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                return response.read().decode('utf-8')
            finally:
                # Release the connection even if read()/decode() fails.
                response.close()
        except urllib2.URLError as e:
            if hasattr(e, "reason"):
                print(u"連接百度貼吧失敗,錯誤原因" + u" %s" % (e.reason,))
            return None

    def getTitle(self):
        """Extract the thread title from page 1.

        Side effects: every matched title is printed, and a file named after
        it is (re)created containing the title as its only line.

        :returns: the first matched title, or None when the page could not
            be downloaded or contains no match.
        """
        page = self.getPage(1)
        if page is None:
            return None
        titles = self._titlePattern.findall(page)
        if not titles:
            return None
        for title in titles:
            encoded = title.encode('utf-8')
            print('標題:' + encoded + '\n')
            # "w+" truncates, so the file starts over with just the title.
            self.file = open(self._fileNameFor(title), "w+")
            try:
                self.file.write(encoded + '\n')
            finally:
                self.file.close()
        return titles[0]

    def getPageNum(self):
        """Extract the page count shown on page 1.

        :returns: the first match encoded as UTF-8, or None when the page
            could not be downloaded or contains no match.
        """
        page = self.getPage(1)
        if page is None:
            return None
        counts = self._pageNumPattern.findall(page)
        if not counts:
            return None
        return counts[0].encode('utf-8')

    def getContent(self):
        """Append every post found on page 1 to the thread's text file.

        Only page 1 is downloaded. The file name comes from ``getTitle()``
        (which also recreates the file with the title line); when no title is
        available ``self.defaultTitle`` is used instead. Nothing is written
        when the page cannot be downloaded.
        """
        page = self.getPage(1)
        if page is None:
            return
        items = self._postPattern.findall(page)
        title = self.getTitle()
        if title is None:
            title = self.defaultTitle
        # "a": append below the title line getTitle() has just written.
        self.file = open(self._fileNameFor(title), "a")
        try:
            for floor, item in enumerate(items, 1):
                self.file.write(str(floor) + '樓---------------------------------------\n')
                self.file.write(self.tool.replace(item.encode('utf-8')) + '\n\n')
                print('正在寫入第' + str(floor) + '樓...')
        finally:
            self.file.close()
# Thread to download. Alternative thread URLs that were left commented out:
#   http://tieba.baidu.com/p/3138733512
#   http://tieba.baidu.com/p/4399969515
#   http://tieba.baidu.com/p/4400019865
baseURL = 'http://tieba.baidu.com/p/4075653034'
# seeLZ: 0 shows every poster's replies, 1 shows only the thread starter's.
bdtb = BDTB(baseURL, seeLZ=0)
# Report the page count, then dump the posts to the text file.
print(''.join(('該貼吧總共有', str(bdtb.getPageNum()), '頁')))
bdtb.getContent()