利用requests、正則表達式和BeautifulSoup爬取貼吧用戶名、評論和圖片
定義工具類——去除爬取項多餘內容
class Tool():
    """Turn a scraped HTML fragment into plain text.

    Each class attribute is a compiled pattern for one clean-up step;
    ``replace`` applies them in a fixed order and strips the result.
    """

    # <img> tags, runs of seven spaces, and &nbsp; entities are dropped.
    # NOTE(review): the space before {7} and the &nbsp; alternative are
    # reconstructed. The previous pattern '<img.*?>|{7}| ' did not compile
    # (a bare {7} has nothing to repeat) -- confirm the intended pattern.
    removeImg = re.compile(r'<img.*?>| {7}|&nbsp;')
    # Opening and closing hyperlink tags (the link text itself is kept).
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Block-level tags that are turned into a newline.
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Table cells are turned into a tab.
    replaceTD = re.compile(r'<td>')
    # Line-break tags are turned into a newline. The doubled forms come
    # first so that they are matched as a pair rather than one at a time.
    replaceBR = re.compile(r'<br><br>|</br></br>|<br>|</br>')
    # Any tag still left after the steps above. The angle brackets matter:
    # a bare '.*?' is not limited to tags.
    removeExtraTag = re.compile(r'<.*?>')
    # Runs of newlines are collapsed into a single newline.
    removeNoneLine = re.compile(r'\n+')

    def replace(self, x):
        """Return *x* with markup removed and surrounding whitespace stripped.

        Args:
            x: HTML fragment as a string.

        Returns:
            The cleaned text. Tags listed in ``replaceLine``/``replaceBR``
            become newlines, ``<td>`` becomes a tab, every other tag is
            removed, and consecutive newlines are collapsed.
        """
        # Order matters: the specific substitutions must run before
        # removeExtraTag wipes out whatever tags remain.
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeNoneLine, "\n"),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        return x.strip()
帖子標題、頁數都可以在第一個鏈接頁面找到相關數據,直接用正則表達式獲取;評論同樣用正則表達式提取,圖片鏈接則改用BeautifulSoup解析
class Spider():
def __init__(self):
    """Set up the spider with its HTML clean-up helper."""
    # Scraped fragments are passed through this helper before printing.
    self.tool = Tool()
# 獲取源碼
def getSource(self, url):
    """Download *url* and return the response body as text.

    A User-Agent is chosen at random for every request so the site is less
    likely to recognise the crawler and restrict access.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``text`` attribute of the ``requests`` response.

    Raises:
        requests.exceptions.RequestException: if the request fails or
            does not complete within the timeout.
    """
    user_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
        ('Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ '
         '(KHTML, like Gecko) Element Browser 5.0'),
        'IBM WebExplorer /v0.94',
        'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
        'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
        ('Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 '
         '(KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25'),
        ('Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36'),
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
    ]
    # random.choice stays correct if entries are added or removed, unlike
    # a hard-coded index range.
    # The header name must be 'User-Agent' (hyphen); any other spelling is
    # sent as an unrelated header and the chosen value is not used.
    headers = {'User-Agent': random.choice(user_agents)}
    # Same 10-second timeout that saveImage uses, so a stalled connection
    # cannot hang the crawl indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text
# 獲取帖子標題
def getTitle(self, url):
    """Fetch the thread page at *url* and print its title.

    The title is taken from the first ``<h1 ... title ...>`` element and
    passed through ``self.tool.replace`` before printing.

    Args:
        url: Address of the thread's first page.

    Returns:
        None. The title, or a not-found notice, is written to stdout.
    """
    page = self.getSource(url)
    title_pattern = re.compile('<h1.*?title.*?>(.*?)</h1>', re.S)
    found = title_pattern.search(page)
    if found is None:
        # The page has no matching <h1>; report it instead of failing with
        # an AttributeError on None.
        print('未找到帖子標題')
        return
    print('這篇帖子標題爲:', self.tool.replace(found.group(1)))
# 獲取帖子總頁數
def getPageNumber(self, url):
    """Fetch the thread page at *url*, print and return its page count.

    The count is the text of the first ``<span class="red">`` that follows
    ``l_posts_num`` inside a ``<ul>``.

    Args:
        url: Address of the thread's first page.

    Returns:
        The page count exactly as captured from the page (a string).

    Raises:
        ValueError: if the page contains no page-count element.
    """
    page = self.getSource(url)
    count_pattern = re.compile('<ul.*?l_posts_num.*?<span class="red">(.*?)</span>', re.S)
    found = count_pattern.search(page)
    if found is None:
        # Fail with a message that names the problem rather than an
        # AttributeError on None.
        raise ValueError('page count not found in {}'.format(url))
    total = found.group(1)
    print('帖子共有', total, '頁')
    return total
def getContent(self, url):
    """Fetch the page at *url* and print every post found on it.

    Each match pairs the text of a ``p_author_name`` link with the body of
    the following ``post_content_*`` div. Floors are numbered from 1 in
    page order.

    Args:
        url: Address of the thread page to read.

    Returns:
        None. One block per post is written to stdout.
    """
    page = self.getSource(url)
    post_pattern = re.compile(
        '<a data-field.*?p_author_name.*?">(.*?)</a>.*?<div id="post_content_.*?>(.*?)</div>',
        re.S)
    for floor, (author, body) in enumerate(post_pattern.findall(page), start=1):
        # The author name can contain markup such as <img> tags, so it is
        # run through the same cleaner as the post body.
        author = self.tool.replace(author)
        body = self.tool.replace(body)
        print('\n', floor, '樓', '\n樓主:', author, '\n內容:', body)
        time.sleep(0.01)
# 獲取曬圖,清洗獲得鏈接並保存入list
def getImage(self, url):
    """Fetch the page at *url* and return the links of its posted images.

    The page is parsed with BeautifulSoup (``lxml`` parser) and the ``src``
    of every ``<img class="BDE_Image">`` is collected in document order.

    Args:
        url: Address of the thread page to read.

    Returns:
        A list of the ``src`` values found; empty when there are none.
    """
    markup = self.getSource(url)
    tree = BeautifulSoup(markup, 'lxml')
    links = []
    # class_ (with underscore) because `class` is a Python keyword.
    for tag in tree.find_all('img', class_="BDE_Image"):
        link = tag['src']
        print('發現一張圖,鏈接爲:', link)
        links.append(link)
    total = len(links)
    if total < 1:
        print('喏,沒有圖......')
    else:
        print('\n', '共曬圖', total, '張,厲害了我的哥!!!')
    return links
創建目錄進行圖片保存
# 創建目錄
def makeDir(self, path):
    """Create the directory *path* and remember it as ``self.path``.

    ``self.path`` is set to the stripped path whether or not the directory
    already existed.

    Args:
        path: Directory to create; surrounding whitespace is stripped.
            Missing parent directories are created as well.

    Returns:
        The stripped path when the directory was created, or ``False``
        when something already exists at that path.
    """
    self.path = path.strip()
    try:
        # makedirs also creates missing parents. Attempting the creation
        # directly avoids the gap between an existence check and mkdir.
        os.makedirs(self.path)
    except FileExistsError:
        print('名爲', self.path, '的文件夾已經存在...')
        return False
    print('正在創建名爲', self.path, '的文件夾')
    return self.path
def saveImage(self, detailURL, name):
    """Download one image and write it to ``<self.path>/<name>.jpg``.

    ``self.path`` must already be set (``makeDir`` assigns it).

    Args:
        detailURL: Address of the image to download.
        name: File name without extension.

    Returns:
        None. A success or failure message is printed; a failed download
        is skipped without raising.
    """
    try:
        response = requests.get(detailURL, timeout=10)
        # Treat HTTP error statuses as failures so an error page is not
        # written to disk as if it were an image.
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # RequestException covers connection errors, timeouts and HTTP
        # errors alike, so one bad image cannot abort the whole crawl.
        print('下載圖片失敗')
        return None
    fileName = os.path.join(self.path, name + '.jpg')
    # The with-block closes the file even if the write fails.
    with open(fileName, 'wb') as image_file:
        image_file.write(response.content)
    print('成功保存圖片', fileName)
獲取所有頁面——主要代碼邏輯
def getAllPage(self, siteURL='http://tieba.baidu.com/p/5862596971'):
    """Crawl every page of a thread: print posts and save images.

    For each page the posts are printed, a directory ``page<N>`` is
    created, and every image found is saved there as
    ``page<N>_num<M>.jpg``.

    Args:
        siteURL: Address of the thread's first page. Defaults to the
            thread this script was written for.

    Returns:
        None. Progress is printed; ``self.siteURL`` and ``self.url`` are
        set as a side effect.
    """
    self.siteURL = siteURL
    self.getTitle(self.siteURL)
    numbers = self.getPageNumber(self.siteURL)
    for page in range(1, int(numbers) + 1):
        # Pages are addressed with the ?pn=<page> query parameter.
        self.url = self.siteURL + '?pn=' + str(page)
        print('\n\n', '正準備獲取第', page, '頁的內容...')
        print('\n', '正準備獲取評論...')
        self.getContent(self.url)
        # One directory per page; makeDir also records it for saveImage.
        self.makeDir(path='page' + str(page))
        print('\n', '正準備獲取圖片...')
        images = self.getImage(self.url)
        print('\n', '正準備保存圖片...')
        for number, detailURL in enumerate(images, start=1):
            name = 'page' + str(page) + '_' + 'num' + str(number)
            self.saveImage(detailURL, name)
            # Short pause between downloads.
            time.sleep(0.1)
        print('\n\n', '完成第', page, u'頁')
    print('\n\n', '恭喜,圓滿成功!')
- 目前若用戶名中含有特殊符號(以圖片形式顯示),輸出的用戶名會直接帶上圖片鏈接,此問題尚在改良中
- 更多爬蟲代碼詳情參考GitHub