環境準備
搭建Python環境
略
安裝requests:
進入Python安裝目錄下的Scripts目錄(確保pip.exe文件存在,正常情況下存在)
進入cmd終端並進入該目錄,輸入命令"pip install requests"
安裝re(無需安裝)
re是Python標準庫自帶的模塊,直接import re即可;不需要也不應執行pip install re(標準庫模塊無法通過pip安裝,該命令通常會報錯)
爬取珍愛網信息代碼
import requests
import re
class Zhenghun(object):
    """Scrape profile anchors from a zhenai.com city listing page.

    The listing URL is built as ``<url><city>/nv``, i.e. the page listing
    women for the given city.
    """

    # Base address of the listing pages; the city slug is appended to it.
    url = "http://www.zhenai.com/zhenghun/"
    # One whole profile anchor: the album link, any further attributes, the
    # closing ">" and the link text up to "</a>". Compiled once for reuse.
    profile_pattern = re.compile(
        r'<a href="http://album.zhenai.com/u/[0-9]+"[^>]*[^<]+</a>'
    )
    # Seconds to wait for the server; without a timeout requests can block
    # forever on a stalled connection.
    timeout = 10

    def __init__(self, city):
        """Store the listing URL for *city* (a URL slug such as "chaoyang1")."""
        self.newurl = Zhenghun.url + city + "/nv"

    def getHtml(self):
        """Download the listing page and return it decoded as UTF-8 text.

        Raises:
            requests.exceptions.RequestException: on connection errors or
                when the server does not answer within ``timeout`` seconds.
            UnicodeDecodeError: if the body is not valid UTF-8.
        """
        response = requests.get(self.newurl, timeout=self.timeout)
        return response.content.decode("utf-8")

    def parse(self, html=None):
        """Return every profile anchor found in the listing page.

        Args:
            html: Page text to search. When omitted, the page is downloaded
                with ``getHtml()``; passing it allows parsing without
                touching the network.

        Returns:
            A list of the raw ``<a ...>name</a>`` strings, in page order.
        """
        if html is None:
            html = self.getHtml()
        return self.profile_pattern.findall(html)
def main():
    """Print the name and profile link of every match for a fixed set of cities."""
    # City slugs to crawl, visited in this order
    for city in ("dongcheng", "chaoyang1", "changping"):
        anchors = Zhenghun(city).parse()
        for anchor in anchors:
            # The href is the first double-quoted value inside the anchor
            link = anchor.split('"')[1]
            # The name runs from two characters past the last double quote
            # (skipping the quote and, presumably, the tag's closing ">")
            # up to the last "<", which opens "</a>"
            start = anchor.rfind('"') + 2
            end = anchor.rfind("<")
            name = anchor[start:end]
            # Name and link separated by three tabs
            print(name, link, sep="\t\t\t")
# Run the crawl only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
爬取結果(已遮擋網址,侵刪)
爬取Discuz論壇發帖和回帖代碼
代碼尚未優化
import requests
import re
# Scrape the opening posts and replies of a Discuz board
class Discuz(object):
    """Collect thread links from a Discuz board page and print each post's text.

    Pages are decoded as GBK. Post bodies are located with one regular
    expression, so posts wrapped in different markup are not picked up.
    """

    # Class-level fallback kept so ``Discuz.list2`` still resolves; __init__
    # gives every instance its own set so separate crawls do not mix.
    list2 = set()
    # Board page the crawl starts from.
    url = "https://www.discuz.net/forum-plugin-1.html"
    # Prefix put in front of the relative thread links found on the board.
    baseUrl = "https://www.discuz.net/"
    # Seconds to wait for a response; without a timeout requests can block
    # forever on a stalled connection.
    timeout = 10
    # Opening part of a thread anchor on the board page.
    threadPattern = re.compile(r'<a href="thread-[0-9]*-1-1.html" onclick')
    # Literal text every post match starts with.
    postPrefix = '999">'
    # A post match: from the literal prefix up to the next "</td>",
    # spanning line breaks.
    postPattern = re.compile(r'999">[\s\S]*?</td>')
    # Any <...> tag that sits on a single line.
    tagPattern = re.compile(r'<(.*?)>')

    def __init__(self):
        """Start from the default board page with an empty set of thread URLs."""
        self.firstUrl = Discuz.url
        self.list2 = set()

    def _fetch(self, url):
        """Download *url* and return its body decoded as GBK."""
        return requests.get(url, timeout=self.timeout).content.decode("gbk")

    def getFirstHtml(self):
        """Return the HTML of the board page (``self.firstUrl``)."""
        return self._fetch(self.firstUrl)

    def getPath(self, html=None):
        """Return the thread-anchor fragments found on the board page.

        Args:
            html: Board page text. When omitted, the page is downloaded with
                ``getFirstHtml()``.

        Returns:
            A list of strings like ``<a href="thread-123-1-1.html" onclick``.
        """
        if html is None:
            html = self.getFirstHtml()
        return self.threadPattern.findall(html)

    def getSecondHtml(self, html=None):
        """Build absolute thread URLs and store them in ``self.list2``.

        Uses this instance (its ``firstUrl`` and its own set) rather than
        constructing a second crawler.

        Args:
            html: Optional board page text, forwarded to ``getPath``.

        Returns:
            The set ``self.list2`` holding every thread URL collected so far.
        """
        for anchor in self.getPath(html):
            # The href is the first double-quoted value in the fragment
            self.list2.add(self.baseUrl + anchor.split('"')[1])
        return self.list2

    @classmethod
    def extractPosts(cls, html):
        """Return the cleaned text of every post found in a thread page.

        Each match has its leading prefix cut off, single-line tags removed,
        and spaces and newlines deleted. The prefix is removed by position,
        so a post whose own text contains ``99">`` is kept whole.

        Args:
            html: Thread page text.

        Returns:
            A list of post texts in page order.
        """
        posts = []
        for block in cls.postPattern.findall(html):
            body = block[len(cls.postPrefix):]
            text = cls.tagPattern.sub("", body)
            posts.append(text.replace(" ", "").replace("\n", ""))
        return posts

    def getThirdHtml(self, Urls):
        """Download every thread in *Urls* and print the text of each post.

        Args:
            Urls: Iterable of absolute thread URLs.
        """
        for finalUrl in Urls:
            for word in self.extractPosts(self._fetch(finalUrl)):
                print(word)
def main():
    """Collect the board's thread URLs, then print the posts of each thread."""
    crawler = Discuz()
    thread_urls = crawler.getSecondHtml()
    crawler.getThirdHtml(thread_urls)
# Run the crawl only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
爬取結果
由於該網站用來包裹發帖和回帖內容的標籤有多種,只用一個正則表達式會導致爬取結果不全;可以針對不同的標籤編寫多個正則表達式分別匹配,再合併結果