# -*- coding:utf-8 -*-
# First reasonably complete Python crawler, based on http://cuiqingcai.com/993.html.
# Targets Baidu Tieba threads; emoticons are not captured. The regular expressions
# reward closer study; extending the script to fetch every page of a thread is
# straightforward.
import urllib
import urllib2
import re
#处理页面标签类
class Tool:
    """Strip or convert HTML markup in a post body, yielding plain text."""
    # Remove <img> tags and runs of 7 spaces.
    # Fix: the original pattern ended with a stray '|', an empty alternation
    # that matched the empty string at every position.
    removeImg = re.compile(r'<img.*?>| {7}')
    # Remove anchor tags (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Turn row / div / paragraph-end delimiters into newlines.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Turn table cells into tabs.
    replaceTD = re.compile(r'<td>')
    # Start each paragraph on a new, indented line.
    replacePara = re.compile(r'<p.*?>')
    # Collapse double or single <br> into a single newline.
    replaceBR = re.compile(r'<br><br>|<br>')
    # Drop any tag not handled above.
    removeExtraTag = re.compile(r'<.*?>')

    def replace(self, x):
        """Return *x* with HTML markup removed/converted and edges stripped."""
        x = self.removeImg.sub("", x)
        x = self.removeAddr.sub("", x)
        x = self.replaceLine.sub("\n", x)
        x = self.replaceTD.sub("\t", x)
        x = self.replacePara.sub("\n    ", x)
        x = self.replaceBR.sub("\n", x)
        x = self.removeExtraTag.sub("", x)
        # strip() drops leading/trailing whitespace left by the substitutions.
        return x.strip()
#百度贴吧爬虫类
class BDTB:
#初始化,传入基地址,是否只看楼主的参数
def __init__(self,baseUrl,seeLZ):
self.baseURL = baseUrl
self.seeLZ = '?see_lz='+str(seeLZ)
self.tool = Tool()
self.file = None
self.defaultTitle = u"百度贴吧"
#传入页码,获取该页帖子的代码
def getPage(self,pageNum):
try:
url = self.baseURL+ self.seeLZ + '&pn=' + str(pageNum)
request = urllib2.Request(url)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
#print content
return content
except urllib2.URLError, e:
if hasattr(e,"reason"):
print u"连接百度贴吧失败,错误原因",e.reason
return None
def getTitle(self):
page = self.getPage(1)
pattern = re.compile('<h3 class="core_title_txt pull-left text-overflow.*?>(.*?)</h3>',re.S)
#pattern = re.compile('<h1 class="core_title_txt.*?>(.*?)</h1>',re.S)
#pattern = re.compile('<h3.*?>(.*?)</h3>',re.S)
#_compile(pattern, flags).search(string)报错就修改,加str
#result = re.search(pattern,str(page))
#findall得到的result是list结构的
result = re.findall(pattern,page)
if result:
#print result.group(1) #测试输出
#return result.group(1).strip()
#print result.encode('utf-8')
"""
因为result是list,只有一维的
for i in range(len(result)):
for j in range(len(result[i])):
print result[i][j].encode('utf-8')
"""
for i in range(len(result)):
print '标题:'+ result[i].encode('utf-8') + '\n'
self.file = open(result[i]+ ".txt","w+")
self.file.writelines(result[i].encode('utf-8') + '\n')
self.file.close()
#print result
return result[0]
else:
return None
# 获取帖子一共有多少页
def getPageNum(self):
page = self.getPage(1)
pattern = re.compile('<li class="l_reply_num.*?</span>.*?<span class="red">(.*?)</span>',re.S)
#result = re.search(pattern,page)
result = re.findall(pattern,page)
if result:
#print result.group(1) #测试输出
#return result.group(1).strip()
#for i in range(len(result)):
# print result[i].encode('utf-8')
#print result
return result[0].encode('utf-8')
else:
return None
#获取每一层楼的内容,传入页面内容
def getContent(self):
page = self.getPage(1)
#pattern = re.compile('<div id="post_content_.*?>(.*?)</div>',re.S)
#l_post l_post_bright j_l_post clearfix
#pattern = re.compile('<div class="l_post l_post_bright j_l_post clearfix.*?>(.*?)</div>',re.S)
#pattern = re.compile('<div id="post_content_.*?class="d_post_content j_d_post_content.*?img class=.*?>(.*?)</dic>',re.S)
#pattern = re.compile('<div class="p_content ">(.*?)</div>',re.S)
pattern = re.compile('<div id="post_content.*?>(.*?)</div>',re.S)
items = re.findall(pattern,page)
#print len(items)
floor = 1
#a表示追加模式写入txt
self.file = open(self.getTitle()+ ".txt","a")
for i in range(len(items)):
#print floor,u"楼---------------------------------------\n"
self.file.write(str(floor)+ '楼---------------------------------------\n')
floor += 1
#print self.tool.replace(items[i].encode('utf-8'))
self.file.write(self.tool.replace(items[i].encode('utf-8')) + '\n\n')
print '正在写入第'+str(floor-1)+'楼...'
self.file.close()
#baseURL = 'http://tieba.baidu.com/p/3138733512'
#baseURL = 'http://tieba.baidu.com/p/4399969515'
#baseURL = 'http://tieba.baidu.com/p/4400019865'
baseURL = 'http://tieba.baidu.com/p/4075653034'
#下面参数0表示看所有信息,1表示只看楼主信息
bdtb = BDTB(baseURL,0)
#bdtb.getPage(1)
#print bdtb.getTitle()
print '该贴吧总共有'+ str(bdtb.getPageNum()) + '页'
bdtb.getContent()