前言
很久沒寫什麼代碼了,除了應付學校的期末大作業,一直都渾渾噩噩的。今天看到了Go語言的爬蟲,覺得還挺有趣的,手癢癢想試一試。不過好久沒寫go了,還是花了一點時間來回憶語法。
正文
寫了兩段代碼,打算比較一下python和go的爬蟲速度
目標網站: 性感美女圖
其實原本準備爬取豆瓣Top250,可是實力確實太菜,一開始用go爬取時忘記添加請求頭,直接被封了ip,沒辦法只好選擇這個反爬措施比較少的網站。另外這次主要是比較速度,所以只爬取了標題,沒有下載圖片(存這些圖片怕身體吃不消)。
1.python版本代碼
python版本:
import requests
from lxml import etree
import time
def get_title():
    """Fetch listing pages 1-10 and print every image title with a running number.

    Each page is requested with a browser-like User-Agent, decoded as UTF-8,
    and parsed with lxml. The text of the <a> inside each
    ``div.channel_list > ul > li`` is printed as ``"<n>: <title>"``.

    Raises:
        requests.RequestException: if a request times out, fails, or returns
            an HTTP error status.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Host": "www.meituba.com"
    }
    # A running counter keeps the numbering correct even when pages do not
    # all contain the same number of items.
    count = 0
    for page in range(1, 11):
        url = f"http://www.meituba.com/xinggan/list8{page}.html"
        # The timeout prevents a stalled server from hanging the benchmark.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        bs = etree.HTML(response.text)
        for item in bs.xpath('//div[@class="channel_list"]/ul/li'):
            texts = item.xpath('./a/text()')
            if not texts:
                # An <li> without link text has no title to report.
                continue
            count += 1
            print(f"{count}:", texts[0])
if __name__ == '__main__':
    # Optional proxy, currently disabled:
    # proxies = {'https': '183.185.185.64:9797'}
    # Time the whole crawl and report the elapsed wall-clock seconds.
    started_at = time.time()
    get_title()
    elapsed = time.time() - started_at
    print("花費的時間爲:", elapsed)
python爬取160張圖片的標題花了5.3秒
2.go基礎版本
package main
import (
"fmt"
"log"
"net/http"
"strconv"
"time"
"github.com/goquery"
)
// main fetches listing pages 1 through 10 one after another, prints the
// titles found on each, and finally reports the total elapsed time.
// Any request or parse failure aborts the run via log.Fatal.
func main() {
	// A single client is reused for every request; the timeout keeps a
	// stalled server from hanging the benchmark indefinitely.
	client := &http.Client{Timeout: 10 * time.Second}
	begin := time.Now()
	for page := 1; page <= 10; page++ {
		if err := printPageTitles(client, page); err != nil {
			log.Fatal(err)
		}
	}
	fmt.Println("花費時間爲:", time.Since(begin))
}

// printPageTitles downloads one listing page and prints the title attribute
// of the first <a> inside every ".imgc" element, numbered across pages.
// It lives in its own function so the deferred Body.Close runs once per page
// instead of piling up until main returns.
func printPageTitles(client *http.Client, page int) error {
	url := "http://www.meituba.com/xinggan/list8" + strconv.Itoa(page) + ".html"
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return fmt.Errorf("building request for %s: %w", url, err)
	}
	// No explicit Host header is set: net/http ignores a "Host" entry in
	// Header and derives the host from the URL, which is already correct.
	request.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36")

	response, err := client.Do(request)
	if err != nil {
		return err
	}
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		return fmt.Errorf("fetching %s: unexpected status %s", url, response.Status)
	}

	doc, err := goquery.NewDocumentFromReader(response.Body)
	if err != nil {
		return fmt.Errorf("parsing %s: %w", url, err)
	}
	doc.Find(".imgc").Each(func(index int, content *goquery.Selection) {
		// A missing title attribute yields an empty name, which is still
		// printed so the numbering stays aligned with the page layout.
		name, _ := content.Find("a").First().Attr("title")
		// The numbering assumes 16 items per page, as in the original.
		fmt.Printf("%d: %s\n", index+1+(page-1)*16, name)
	})
	return nil
}
發現go只花了4.7秒,比python快了0.6秒
3.go進階版本
然後進階一下,使用channel和goroutine試試能不能繼續提速
將請求網頁返回的頁面放進channel,然後一邊獲得頁面,一邊解析頁面
package main
import (
"fmt"
"io/ioutil"
"net/http"
"strings"
"time"
"github.com/goquery"
)
// httpClient is shared by every getBody call so connections can be reused;
// the timeout bounds how long a single page fetch may take.
var httpClient = &http.Client{Timeout: 10 * time.Second}

// getBody downloads url and sends the response body on chanWeb.
//
// Exactly one value is sent per call so a receiver can count results. When
// the request cannot be built, fails, returns a non-200 status, or the body
// cannot be read, the error is printed and an empty string is sent instead.
func getBody(url string, chanWeb chan string) {
	body, err := fetchBody(url)
	if err != nil {
		fmt.Println(err)
	}
	chanWeb <- body
}

// fetchBody performs the GET request and returns the whole body as a string.
func fetchBody(url string) (string, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", fmt.Errorf("building request for %s: %w", url, err)
	}
	// No explicit Host header is set: net/http ignores a "Host" entry in
	// Header and derives the host from the URL.
	request.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36")

	response, err := httpClient.Do(request)
	if err != nil {
		return "", err
	}
	// Closing the body lets the transport reuse the connection.
	defer response.Body.Close()
	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetching %s: unexpected status %s", url, response.Status)
	}

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return string(body), nil
}
// getTitle parses body as HTML and sends, for every ".imgc" element, the
// title attribute of its first <a> on chanR.
//
// If the document cannot be parsed the error is printed and nothing is sent.
func getTitle(body string, chanR chan string) {
	dom, err := goquery.NewDocumentFromReader(strings.NewReader(body))
	if err != nil {
		// There is no usable document after a parse failure, so stop here
		// rather than dereferencing it below.
		fmt.Println(err)
		return
	}
	dom.Find(".imgc").Each(func(i int, selection *goquery.Selection) {
		// The "exists" flag is ignored: an element without a title
		// attribute is reported as an empty string.
		title, _ := selection.Find("a").First().Attr("title")
		chanR <- title
	})
}
// main downloads the listing pages concurrently, parses each page as soon as
// it arrives, prints every title with a running number, and reports the
// total elapsed time.
func main() {
	const pageCount = 10

	begin := time.Now()
	baseURL := "http://www.meituba.com/xinggan/list8%d.html"
	chanWeb := make(chan string)
	chanR := make(chan string)

	// One fetcher goroutine per page. Pages are numbered 1..pageCount, the
	// same URLs the sequential versions request.
	for i := 1; i <= pageCount; i++ {
		go getBody(fmt.Sprintf(baseURL, i), chanWeb)
	}

	// Parse pages while the remaining downloads are still in flight. This
	// goroutine is the only sender on chanR, so it is the one that closes
	// it, after exactly pageCount bodies have been handled.
	go func() {
		defer close(chanR)
		for i := 0; i < pageCount; i++ {
			getTitle(<-chanWeb, chanR)
		}
	}()

	// Range over the channel itself (not over a single received string) so
	// every title is consumed until the parser closes chanR.
	count := 0
	for res := range chanR {
		count++
		fmt.Printf("%d:%s\n", count, res)
	}

	fmt.Println("花費的時間爲:", time.Since(begin))
}
更快了,大概只要4.3秒。
最後
所以go以速度快(高併發)出名還是有道理的,在數據量越來越大的時代,程序的處理速度也得跟上,習慣了使用簡單的python可遠遠不夠,還是應該繼續學習新知識。其實比起C語言來說,go還是簡單很多的,雖然沒有python那麼容易上手,但是熟悉之後會喜歡上的。