Scraping data with Python in practice, and the pitfalls along the way

1. Overall workflow

The overall idea: first scrape the links from every Baidu Zhidao search result page, then follow those links and scrape the page data behind each one.
The core flow is as follows:

Flowchart: start → enter the keyword and page count → get all result-page URLs for the keyword → iterate over each URL → fetch that page's data → if the data is not empty, add it to data_all → once every link for the keyword has been processed, write to CSV → end.
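
In code, that control flow is roughly the following. This is only a minimal sketch; it uses the helper functions getUrl, getSingleData and csv_write that are defined section by section below, and is essentially what the spiderData function in the main routine does.

def crawl(keyword, page_num):
    data_all = []
    # collect every search-result link for this keyword
    for url in getUrl(keyword, page_num):
        single = getSingleData(url)         # [title, question, answer]
        if None in single:                  # incomplete record, skip it
            continue
        data_all.append(single)
    csv_write(keyword + '_Data.csv', data_all)  # persist to CSV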

Getting the URLs

This part scrapes the Baidu Zhidao search results and collects the result links, which will be used in the next step.

# -*- coding: utf-8 -*-
"""
Created on Thu Nov  7 15:56:06 2019

@author: dennglei
"""

import re
import requests
from bs4 import BeautifulSoup

def getHTML(url,keyword,pn):
    
    if keyword is not None and pn is not None:
        
        url = url + 'search?word=' + keyword + '&ie=gbk&site=-1&sites=0&date=0&pn=' + pn
    
        
    print(url)
    
    try:
        
        r = requests.get(url,timeout = 30)
        
        r.raise_for_status()
        # Baidu Zhidao pages are encoded in GB2312
        r.encoding = "GB2312"
        
        return r.text
    
    except Exception:
        
        return ""
    
    
def getSoup(page):
    
    soup = BeautifulSoup(page,'html.parser')
    
    return soup


def fillResult(soup):
    
    results = []
    
    resultsTemp = []
   
    links = soup.find_all('a',{'href':re.compile('question/')})
    
    for a in links:
        
        link = a.attrs['href']
        
        resultsTemp.append(link)
        
    # keep only every second (odd-indexed) link
    for i in range(len(resultsTemp)):
        
        if i%2 != 0:
            
            results.append(resultsTemp[i])
            
    return results


def printResult(results):
    
    for i in range(len(results)):
        
        print(i,results[i])
      

# Given a keyword and the number of Zhidao result pages to fetch, return the result links

def getUrl(keyword,pageNum):  
      
    url = 'https://zhidao.baidu.com/'

    resultLinks = []

    for i in range(pageNum):
        
        pn = str(i*10)
        
        page = getHTML(url,keyword,pn)
        
        soup = getSoup(page)
        
        results = fillResult(soup)
        
        resultLinks += results
        
    return resultLinks
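
A quick way to try this module on its own (the keyword and page count here are just examples):

# example: collect the links from the first two result pages of one keyword
links = getUrl('空調維修', 2)
printResult(links)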
     


Getting the data from a single page

Using the links returned by the previous function, scrape the actual data on each page: the title, the question and the best answer. If there is no best answer, the answer is set to None so it can be filtered out later.

# -*- coding: utf-8 -*-
"""
Created on Fri Nov  8 16:09:46 2019

@author: dennglei
"""

import re
import requests
from bs4 import BeautifulSoup

def getHTML_text(url):
          
    print(url)
    
    try:
        
        r = requests.get(url,timeout = 30)
        
        r.raise_for_status()
        # Baidu Zhidao pages are encoded in GB2312
        r.encoding = "GB2312"
        
        return r.text
    
    except Exception:
        
        return ""
    
def getSoup_text(page):
    
    soup = BeautifulSoup(page,'html.parser')
    
    return soup


def fillResult_text(soup):
    
    results = []
    
    results_title = ''
    
    results_question = ''
    
    results_answer = '' # results_answer used to be a list; += with a string added every single
    # character as a separate element, so the initial value was changed to a string instead
    
    # Get the question title
    
    span1 = soup.find_all('span',{'class':re.compile('ask-title')})
    
    for s in span1:
        
        title = s.string
        
        results_title += title
          
    # Get the question body
    
    span2 = soup.find('span',{'class':re.compile('con-all')})
   
    # if the asker never wrote a separate question body, the question equals the title
    if span2 is None:
        
        results_question = results_title
        
    else:
        
        for child in span2.children:
            
            if child.string is not None:
           
                temp = str(child.string)  # cast to a plain string
            
                textString = temp.strip()  # strip whitespace and newlines
            
                if textString == '':  # skip empty strings
                    
                    continue
                
                results_question += textString
                
    # Get the best answer
    
    div = soup.find('div',{'id':re.compile('best-content')})
    
    if div is None:
        
        results_answer = None
    else:
    
        for child in div.children:
               
            if child.string is not None:
                
                temp = str(child.string)  # cast to a plain string
              
                textString = temp.strip()  # strip whitespace and newlines
                
                if textString == '':  # skip empty strings
                    
                    continue
                
                results_answer += textString
                  
    results.append(results_title)
    
    results.append(results_question)
    
    results.append(results_answer)
    
    return results

def printResult_text(results):
    
    for i in range(len(results)):
        
        print(i,results[i])


# Given a link, return its data as a list of three string elements: title, question, answer
        
def getSingleData(url):

    resultsData = []

    page = getHTML_text(url)
    
    soup = getSoup_text(page)
    
    resultsData = fillResult_text(soup)
    
    return resultsData
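
A quick way to try it on a single page; the question id in the URL below is a placeholder, so substitute one of the links returned by getUrl:

# example call with a placeholder question id
data = getSingleData('https://zhidao.baidu.com/question/xxxxxxxx.html')
printResult_text(data)  # prints the title, question and best answer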
   

Saving the data and writing it to CSV

The files are saved to the current folder, which in my case is Spyder's working directory.

# -*- coding: utf-8 -*-
"""
Created on Sat Nov  9 13:46:43 2019

@author: dennglei
"""

import csv

# Write data to a CSV file

def csv_write(path,data):
    
    # the encoding must be UTF-8-sig; plain utf-8 produces garbled characters
    
    with open(path,'w',encoding = 'UTF-8-sig',newline = '') as f:
        
        writer = csv.writer(f,dialect = 'excel')
        
        for row in data:
            
            writer.writerow(row)
            
    return True



# Read data from a CSV file

def csv_read(path):
    
    data = []
    
    with open(path,'r',encoding='UTF-8-sig') as f:
        
        reader = csv.reader(f,dialect='excel')
        
        for row in reader:
            
            data.append(row)
            
    return data
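
A minimal round-trip check of the two helpers (the file name and rows are just examples):

# write two rows, then read them back
csv_write('demo.csv', [['title', 'question', 'answer'], ['t1', 'q1', 'a1']])
print(csv_read('demo.csv'))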


Main routine

Two functions: one saves the scraped data and the other merges the data. When saving, each page's data is checked for None values; incomplete records are skipped so that only complete data is stored.

# -*- coding: utf-8 -*-
"""
Created on Sat Nov  9 15:54:28 2019

@author: dennglei
"""
from get_url import getUrl
from get_data import getSingleData
from save_data import csv_write
from save_data import csv_read

# Fetch the data for a given keyword and page count, then write it out

def spiderData(keyword,pageNum):
    # get the links of the individual answer pages returned by the Baidu Zhidao search
    links = getUrl(keyword,pageNum)
    data_all = []
    fileName = keyword + '_Data.csv'
    for url in links:
        flag = 0
        singleData = getSingleData(url)
        # if any field of this record is None, mark it to be skipped
        for data in singleData:
            if data is None:
                flag = 1
        if flag == 1:
            continue
        else:
            data_all.append(singleData)
    # write to a CSV file
    csv_write(fileName,data_all)

# Merge the CSV files

def mergeData(pathInput,pathOutput):
    
    newData = []
    for path in pathInput:
        p = path + '_Data.csv'
        data = csv_read(p)
        for line in data:
            newData.append(line)
    if csv_write(pathOutput,newData):
        print('merge Data successfully')
    else:
        print("merge Data false")
    
# Entry point
    
dataNeed = ['微波爐維修',
            '空調維修',
            '冰箱維修',
            '電視維修',
            '抽油煙機維修',
            '電風扇維修',
            '熱水器維修',
            '電腦維修',
            '吹風機維修',
            '音響維修',
            '空氣淨化器維修',
            
        ]

# scrape the data for every keyword in dataNeed

for spi in dataNeed:
    
    spiderData(spi,10)

# merge all of the scraped data
    
pathOutput = 'all_data.csv'

mergeData(dataNeed,pathOutput)

Summary

This took three days of hands-on work; whenever I got stuck I searched Baidu, flipped through books and slides, and in the end I barely pulled it off. It is rough, but fortunately it more or less works. I am writing this down as a note to myself; please go easy on me, and suggestions are welcome so we can learn and improve together.
