package main

import (
	"fmt"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
)

func main() {
	var start, end int
	fmt.Printf("Enter the start page (>= 1): ")
	fmt.Scan(&start)
	fmt.Printf("Enter the end page (>= start page): ")
	fmt.Scan(&end)
	DoWork(start, end) // worker function
}
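fmt.Scan above ignores both its error return and the sanity of the range. A minimal sketch of validated input (readPageRange is a hypothetical helper, not part of the original program):

func readPageRange() (start, end int, err error) {
	fmt.Print("Enter the start page (>= 1): ")
	if _, err = fmt.Scan(&start); err != nil {
		return
	}
	fmt.Print("Enter the end page (>= start page): ")
	if _, err = fmt.Scan(&end); err != nil {
		return
	}
	if start < 1 || end < start {
		err = fmt.Errorf("invalid page range: %d to %d", start, end)
	}
	return
}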
// DoWork launches one goroutine per page and waits for all of them to report back.
func DoWork(start, end int) {
	fmt.Printf("About to crawl the URLs of pages %d through %d\n", start, end)
	page := make(chan int)
	for i := start; i <= end; i++ {
		// crawl each page in its own goroutine
		go SpiderPage(i, page)
	}
	for i := start; i <= end; i++ {
		fmt.Printf("Page %d crawled\n", <-page)
	}
}
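The page channel is used purely as a completion signal: the second loop blocks until every goroutine has sent back its page number. A sync.WaitGroup expresses the same intent; a minimal sketch (it assumes "sync" is added to the import list, and SpiderPage would drop its channel parameter):

func DoWorkWait(start, end int) {
	var wg sync.WaitGroup
	for i := start; i <= end; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			// crawl page n here (SpiderPage without the channel argument)
		}(i)
	}
	wg.Wait() // blocks until every goroutine has called Done
}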
// SpiderPage crawls one listing page and extracts every joke linked from it.
func SpiderPage(i int, page chan int) {
	// The listing URLs follow a simple pattern, so the page number can be spliced in directly.
	url := "https://m.pengfu.com/xiaohua_" + strconv.Itoa(i) + ".html"
	fmt.Printf("Crawling page %d: %s\n", i, url)
	// Fetch the listing page.
	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("HttpGet err = ", err)
		return
	}
	// The links to the individual jokes follow a fixed pattern in the listing HTML.
	re := regexp.MustCompile(`<h1 class="f18"><a href="(?s:(.*?))"`)
	// Extract the joke URLs.
	joyUrls := re.FindAllStringSubmatch(result, -1)
	fileTitle := make([]string, 0)
	fileContent := make([]string, 0)
	// Fetch the title and body of every joke linked from this page.
	for _, data := range joyUrls {
		title, content, err := SpiderOneJoy(data[1])
		if err != nil {
			fmt.Println("SpiderOneJoy err = ", err)
			continue
		}
		fileTitle = append(fileTitle, title)       // collect the title
		fileContent = append(fileContent, content) // collect the body
	}
	// Write the collected jokes to a file.
	fmt.Println("title = ", fileTitle)
	fmt.Println("content = ", fileContent)
	StoreJoyToFile(i, fileTitle, fileContent)
	page <- i // signal that page i is done
}
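FindAllStringSubmatch returns one slice per match, with the full match at index 0 and each capture group after it, so data[1] above is the captured href. A quick way to check the pattern against a fragment (the HTML snippet below is a made-up example of the expected listing markup):

func exampleExtractLinks() {
	sample := `<h1 class="f18"><a href="https://m.pengfu.com/content_123.html" target="_blank">demo</a></h1>`
	re := regexp.MustCompile(`<h1 class="f18"><a href="(?s:(.*?))"`)
	for _, m := range re.FindAllStringSubmatch(sample, -1) {
		fmt.Println(m[1]) // prints only the captured URL
	}
}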
// HttpGet fetches a URL with a GET request and returns the body as a string.
func HttpGet(url string) (result string, err error) {
	resp, err1 := http.Get(url) // send the GET request
	if err1 != nil {
		err = err1
		return
	}
	defer resp.Body.Close()
	// Read the response body in 4 KB chunks.
	buf := make([]byte, 4*1024)
	for {
		n, _ := resp.Body.Read(buf)
		if n == 0 {
			break
		}
		result += string(buf[:n]) // accumulate what has been read so far
	}
	return
}
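Concatenating strings in a loop works, but the standard library can read the whole body in one call. An equivalent sketch (it assumes Go 1.16+ and "io" added to the import list, and also rejects non-200 responses):

func httpGetAll(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("unexpected status: %s", resp.Status)
	}
	body, err := io.ReadAll(resp.Body) // reads the entire response body
	if err != nil {
		return "", err
	}
	return string(body), nil
}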
// SpiderOneJoy crawls a single joke page and returns its title and body.
func SpiderOneJoy(url string) (title, content string, err error) {
	// Fetch the joke page.
	result, err1 := HttpGet(url)
	if err1 != nil {
		err = err1
		return
	}
	// The title sits inside the <title> tag.
	re1 := regexp.MustCompile(`<title>(?s:(.*?))</title>`)
	// Keep only the first match (the last argument of 1 limits the result to a single match).
	tmpTitle := re1.FindAllStringSubmatch(result, 1)
	for _, data := range tmpTitle {
		title = data[1]
		title = strings.Replace(title, "\t", "", -1)
		break
	}
	// The body sits inside a <div class="con-txt"> block.
	re2 := regexp.MustCompile(`<div class="con-txt">(?s:(.*?))</div`)
	// Extract the body and strip whitespace and <br /> tags.
	tmpContent := re2.FindAllStringSubmatch(result, -1)
	for _, data := range tmpContent {
		content = data[1]
		content = strings.Replace(content, "\t", "", -1)
		content = strings.Replace(content, "\n", "", -1)
		content = strings.Replace(content, "\r", "", -1)
		content = strings.Replace(content, "<br />", "", -1)
		break
	}
	return
}
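The chain of strings.Replace calls can be collapsed into a single strings.NewReplacer, which applies all substitutions in one pass; a minimal sketch using only packages already imported above:

var contentCleaner = strings.NewReplacer("\t", "", "\n", "", "\r", "", "<br />", "")

// Inside the loop, the four Replace calls on content become:
// content = contentCleaner.Replace(data[1])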
// StoreJoyToFile writes the collected titles and bodies of one listing page to <page>.txt.
func StoreJoyToFile(i int, fileTitle, fileContent []string) {
	// Create the output file.
	f, err := os.Create(strconv.Itoa(i) + ".txt")
	if err != nil {
		fmt.Println("os.Create err = ", err)
		return
	}
	defer f.Close()
	// Write one title/body pair per joke, separated by a divider line.
	n := len(fileTitle)
	for i := 0; i < n; i++ {
		f.WriteString(fileTitle[i] + "\n")
		f.WriteString(fileContent[i] + "\n")
		f.WriteString("\n=================================================================\n")
	}
}
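WriteString errors are silently dropped above, so a failed write would simply leave a truncated file. A variant that buffers the writes and surfaces the first error (a sketch; storeJoyToFileChecked is hypothetical and needs "bufio" added to the imports):

func storeJoyToFileChecked(i int, titles, contents []string) error {
	f, err := os.Create(strconv.Itoa(i) + ".txt")
	if err != nil {
		return err
	}
	defer f.Close()
	w := bufio.NewWriter(f)
	for j := range titles {
		fmt.Fprintln(w, titles[j])
		fmt.Fprintln(w, contents[j])
		fmt.Fprintln(w, "\n=================================================================")
	}
	return w.Flush() // Flush reports any write error that occurred while buffering
}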