背景
用Pycharm編輯器Python3.x語言寫一個百度貼吧爬蟲程序
代碼如下:
import urllib.request
import urllib.parse


def loadPage(url):
    """Fetch *url* and return the raw response body as bytes."""
    # Send a browser-like User-Agent so the server does not reject the bot.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER "}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return response.read()


def writePage(html, filename):
    """Write *html* to the local file *filename*.

    html: the server response body as bytes (what loadPage returns).
    filename: path of the file to create/overwrite.

    BUG FIX: the file must be opened in binary mode ("wb") — urlopen().read()
    returns bytes, and writing bytes to a text-mode ("w") file raises TypeError.
    """
    with open(filename, "wb") as f:
        f.write(html)
    print("_" * 30)


def tiebaSpider(url, beginPage, endPage):
    """Crawl scheduler: build each page's URL and save the page to disk.

    url: front part of the tieba URL (already ends with the ?kw=... query).
    beginPage: first page number (1-based).
    endPage: last page number, inclusive.
    """
    for page in range(beginPage, endPage + 1):
        pn = (page - 1) * 50  # tieba paginates 50 posts per page
        filename = "第" + str(page) + "頁.html"
        # BUG FIX: the original built "&pn" + str(pn) — the missing "=" meant
        # the pn query parameter was never parsed, so every request fetched page 1.
        pageUrl = url + "&pn=" + str(pn)
        html = loadPage(pageUrl)
        writePage(html, filename)


if __name__ == "__main__":
    kw = input("請輸入需要爬取的貼吧名:")
    beginPage = int(input("請輸入起始頁:"))
    endPage = int(input("請輸入結束頁:"))
    url = "http://tieba.baidu.com/f?"
    key = urllib.parse.urlencode({"kw": kw})
    fullurl = url + key
    tiebaSpider(fullurl, beginPage, endPage)
程序運行:
查閱資料可知,urlopen 返回的 response.read() 得到的是 bytes(二進制數據),而以文本模式打開的文件無法寫入 bytes,會拋出 TypeError。將 writePage 方法中代碼 with open(filename,"w") as f : 改成二進制方式打開便可:with open(filename,"wb") as f :
運行結果如下: