1. Overall workflow
The overall idea: first crawl all of the links in the Baidu Zhidao search result pages, then follow those links and crawl the data on each page.
The core workflow is as follows:
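Sketched with the function names defined in the modules below, the pipeline is roughly:
links = getUrl(keyword, pageNum)              # stage 1: collect the result links
data_all = []
for url in links:
    record = getSingleData(url)               # stage 2: title, question, best answer
    if None not in record:                    # skip incomplete records
        data_all.append(record)
csv_write(keyword + '_Data.csv', data_all)    # stage 3: save to CSV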
Getting the URLs
This part crawls the Baidu Zhidao search pages for a keyword and collects the result links for later use.
# -*- coding: utf-8 -*-
"""
Created on Thu Nov 7 15:56:06 2019
@author: dennglei
"""
import re
import requests
from bs4 import BeautifulSoup

def getHTML(url, keyword, pn):
    # Build the search URL: the keyword plus pn, the result offset used for paging
    if keyword is not None and pn is not None:
        url = url + 'search?word=' + keyword + '&ie=gbk&site=-1&sites=0&date=0&pn=' + pn
    print(url)
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Baidu's pages are encoded in GB2312
        r.encoding = "GB2312"
        return r.text
    except requests.RequestException:
        return ""

def getSoup(page):
    soup = BeautifulSoup(page, 'html.parser')
    return soup

def fillResult(soup):
    # Collect every <a> whose href points at a question page
    results = []
    resultsTemp = []
    links = soup.find_all('a', {'href': re.compile('question/')})
    for a in links:
        link = a.attrs['href']
        resultsTemp.append(link)
    # Each question link shows up twice in the result page,
    # so keep only the odd indices to drop the duplicates
    for i in range(len(resultsTemp)):
        if i % 2 != 0:
            results.append(resultsTemp[i])
    return results

def printResult(results):
    for i in range(len(results)):
        print(i, results[i])

# Given a keyword and the number of result pages wanted,
# return the links of those results
def getUrl(keyword, pageNum):
    url = 'https://zhidao.baidu.com/'
    resultLinks = []
    for i in range(pageNum):
        pn = str(i * 10)  # each result page holds 10 entries, so pn steps by 10
        page = getHTML(url, keyword, pn)
        soup = getSoup(page)
        results = fillResult(soup)
        resultLinks += results
    return resultLinks
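For a quick test, this module can also be run on its own; the keyword and page count below are just example values:
if __name__ == '__main__':
    # example call: first 2 result pages (20 results) for one keyword
    links = getUrl('冰箱維修', 2)
    printResult(links)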
Getting the data from a single page
Using the links obtained from the previous function, crawl the concrete data on each page, namely the title, the question, and the best answer. If there is no best answer, the answer is set to empty (None), which makes the later filtering easy.
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 8 16:09:46 2019
@author: dennglei
"""
import re
import requests
from bs4 import BeautifulSoup

def getHTML_text(url):
    print(url)
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = "GB2312"
        return r.text
    except requests.RequestException:
        return ""

def getSoup_text(page):
    soup = BeautifulSoup(page, 'html.parser')
    return soup

def fillResult_text(soup):
    results = []
    results_title = ''
    results_question = ''
    # results_answer used to be a list; += with a string then appended every
    # single character as a separate element. Starting from an empty string
    # instead fixed that.
    results_answer = ''
    # Get the question title
    span1 = soup.find_all('span', {'class': re.compile('ask-title')})
    for s in span1:
        title = s.string
        results_title += title
    # Get the question body
    span2 = soup.find('span', {'class': re.compile('con-all')})
    # If the asker never edited the question body, it is identical to the title
    if span2 is None:
        results_question = results_title
    else:
        for child in span2.children:
            if child.string is not None:
                temp = str(child.string)   # cast to str
                textString = temp.strip()  # strip newlines and spaces
                if textString == '':       # skip empty strings
                    continue
                results_question += textString
    # Get the best answer
    div = soup.find('div', {'id': re.compile('best-content')})
    if div is None:
        results_answer = None  # no best answer; filtered out later
    else:
        for child in div.children:
            if child.string is not None:
                temp = str(child.string)   # cast to str
                textString = temp.strip()  # strip newlines and spaces
                if textString == '':       # skip empty strings
                    continue
                results_answer += textString
    results.append(results_title)
    results.append(results_question)
    results.append(results_answer)
    return results

def printResult_text(results):
    for i in range(len(results)):
        print(i, results[i])

# Given a link, return the data on that page: a list of three string elements,
# the title, the question, and the answer
def getSingleData(url):
    page = getHTML_text(url)
    soup = getSoup_text(page)
    resultsData = fillResult_text(soup)
    return resultsData
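A usage sketch, assuming the URL is one of the links returned by getUrl (the one below is only a placeholder):
if __name__ == '__main__':
    url = 'https://zhidao.baidu.com/question/...'  # placeholder; use a real link from getUrl
    data = getSingleData(url)
    printResult_text(data)  # prints title, question, answer in order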
Saving data and writing it to CSV
The files are saved into the current folder, which for me is the Spyder working directory.
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 9 13:46:43 2019
@author: dennglei
"""
import csv

# Write data to CSV
def csv_write(path, data):
    # The encoding must be UTF-8-sig or the file comes out garbled in Excel;
    # plain utf-8 produces mojibake
    with open(path, 'w', encoding='UTF-8-sig', newline='') as f:
        writer = csv.writer(f, dialect='excel')
        for row in data:
            writer.writerow(row)
    return True

# Read data from CSV
def csv_read(path):
    data = []
    with open(path, 'r', encoding='UTF-8-sig') as f:
        reader = csv.reader(f, dialect='excel')
        for row in reader:
            data.append(row)
    return data
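A minimal round-trip sketch; the file name and rows are made up for illustration:
if __name__ == '__main__':
    rows = [['title', 'question', 'answer'],
            ['t1', 'q1', 'a1']]
    csv_write('demo.csv', rows)   # demo.csv is a hypothetical file name
    print(csv_read('demo.csv'))   # reads back the same two rows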
Main routine
Two functions: one saves the crawled data and the other merges it. When saving, each page's record is checked for empty fields; if any field is None the record is skipped, so only complete records are stored.
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 9 15:54:28 2019
@author: dennglei
"""
from get_url import getUrl
from get_data import getSingleData
from save_data import csv_write
from save_data import csv_read

# Crawl the data for a keyword and a page count, then write it out
def spiderData(keyword, pageNum):
    # Get the links of the individual results found on Baidu Zhidao
    links = getUrl(keyword, pageNum)
    data_all = []
    fileName = keyword + '_Data.csv'
    for url in links:
        flag = 0
        singleData = getSingleData(url)
        # If any field of this record is None, skip it and do not add it
        for data in singleData:
            if data is None:
                flag = 1
        if flag == 1:
            continue
        else:
            data_all.append(singleData)
    # Write to the CSV file
    csv_write(fileName, data_all)

# Merge the CSV files
def mergeData(pathInput, pathOutput):
    newData = []
    for path in pathInput:
        p = path + '_Data.csv'
        data = csv_read(p)
        for line in data:
            newData.append(line)
    if csv_write(pathOutput, newData):
        print('merge Data successfully')
    else:
        print('merge Data failed')

# Entry point: appliance-repair keywords (microwave, air conditioner, fridge,
# TV, range hood, electric fan, water heater, computer, hair dryer, speakers,
# air purifier)
dataNeed = ['微波爐維修',
            '空調維修',
            '冰箱維修',
            '電視維修',
            '抽油煙機維修',
            '電風扇維修',
            '熱水器維修',
            '電腦維修',
            '吹風機維修',
            '音響維修',
            '空氣淨化器維修',
            ]
# Crawl the data for every keyword in dataNeed
for spi in dataNeed:
    spiderData(spi, 10)
# Merge all of the crawled data
pathOutput = 'all_data.csv'
mergeData(dataNeed, pathOutput)
Summary
This took three days of hands-on work; whenever I got stuck I searched Baidu, flipped through books, and read slides, and in the end I just about got it working. It is rough, but fortunately it more or less does the job. I am writing this down as a note to myself; please go easy on me, and suggestions from fellow learners are welcome so we can improve together.