1、Quick Start
只需三步,你就可以部署一個爬取 gocn 網站的所有新聞的爬蟲
第一步,你需要去 github 上生成一個自己的 token:Settings ——> Developer settings ——> Personal access tokens ——> Generate new token
然後,配置自己的環境變量 export GITHUB_TOKEN=(第一步生成的 token),或者將代碼中全局 Token 修改爲自己的 token:
var Token = GetValueFromEnv("GITHUB_TOKEN")
第二步,需要在本地安裝 redis,並且啓動程序之前需要先啓動本地 redis,端口使用默認端口 6379,因爲程序默認使用 redis 進行去重。redis 的安裝 可以參考 redis安裝
第三步,git clone 代碼倉庫,並且在後臺進程中運行爬蟲,每 6 個小時爬取當天新聞並進行 github 推送。
git clone https://github.com/lubanproj/crawl.git
cd crawl
go build -v
./crawl &
2、特性
- 支持每天定時爬取
- 支持分頁爬取
- 支持數據去重
- 支持 github 推送
3、展示效果
詳情效果可見:go_read
4、源碼分析
(1)爬取網站
// Crawl starts at url, follows every link whose href contains a
// /topics/<id> path, and pushes topics dated today to GitHub.
//
// A redis connection on the default local port (":6379") is used for
// de-duplication: topics that existTopic reports as already stored are
// skipped, and a topic is stored with saveDB only after it has been handled
// successfully. If the GitHub push fails the topic is NOT stored, so that a
// later run can retry it.
//
// The process exits via log.Fatalf when redis cannot be reached.
func Crawl(url string) {
	pattern := `/topics/\d+`
	collector := colly.NewCollector()

	collector.OnHTML("a[title]", func(e *colly.HTMLElement) {
		// Only follow links that point at a topic page.
		path := e.Attr("href")
		topic, ok := regexMatch(path, pattern)
		if ok {
			// Best effort: the error returned by Visit is deliberately ignored.
			e.Request.Visit(fmt.Sprintf("https://gocn.vip%s", topic))
		}
	})

	redisAddr := ":6379"
	conn, err := redis.Dial("tcp", redisAddr)
	if err != nil {
		log.Fatalf("get redis conn error : %v", err)
	}
	defer conn.Close()

	collector.OnRequest(func(r *colly.Request) {
		topic, ok := regexMatch(r.URL.Path, pattern)
		if ok {
			// Best effort: the error returned by Visit is deliberately ignored.
			r.Visit(fmt.Sprintf("https://gocn.vip%s", topic))
		}
	})

	collector.OnResponse(func(r *colly.Response) {
		topic := strings.Replace(r.Request.URL.Path, "/topics/", "", -1)
		isExist, err := existTopic(conn, topic)
		if err != nil {
			// Without a reliable answer from redis we cannot de-duplicate, so skip.
			log.Printf("check topic %q error : %v", topic, err)
			return
		}
		// The topic has already been crawled.
		if isExist == 1 {
			return
		}

		title, content, ok := parseContent(string(r.Body))
		titleAndContent := fmt.Sprintf("<h3>%s</h3>%s<hr>", title, content)
		fmt.Println("titleAndContent : ", titleAndContent)

		date := getDate(title)
		if curDay := time.Now().Format("2006-01-02"); curDay != date {
			// Only today's news is collected.
			return
		}

		if ok && content != "" && title != "" {
			if err := pushToGithub(titleAndContent, Token); err != nil {
				// Do not mark the topic as crawled; a later run must retry it.
				log.Printf("push topic %q to github error : %v", topic, err)
				return
			}
		}
		saveDB(conn, topic, date)
	})

	if err := collector.Visit(url); err != nil {
		log.Printf("visit %s error : %v", url, err)
	}
}
(2)正則表達式解析內容
// parseContent extracts the daily-news title and the ordered news list from
// the HTML of a topic page.
//
// The title is looked up first in a <p> element and, failing that, in an
// <h1>-<h9> style heading whose surrounding markup is then stripped. The
// returned bool is false only when no title could be found; the content
// (the <ol>...</ol> fragment) may still be empty when it is true.
func parseContent(body string) (string, string, bool) {
	const (
		paragraphTitle = `<p>GoCN(.|\n|\t)*每日新聞(.*?)</p>`
		headingTitle   = `<h[0-9]>GoCN(.|\n|\t)*每日新聞(.|\n|\t)*</h[0-9]>?`
		headingText    = `>(.|\n|\t)*每日新聞(.|\n|\t)*<`
		newsList       = `<ol>(.|\n|\t)*</ol>`
	)

	title, _ := regexMatch(body, paragraphTitle)
	if title == "" {
		// Fall back to a heading element and peel the tag delimiters off it.
		heading, _ := regexMatch(body, headingTitle)
		if heading == "" {
			return "", "", false
		}
		inner, _ := regexMatch(heading, headingText)
		inner = strings.Replace(inner, "<", "", 1)
		title = strings.Replace(inner, ">", "", 1)
	}

	list, _ := regexMatch(body, newsList)
	return title, list, true
}
(3) 推送 github
// pushToGithub inserts data into README.md of the lubanproj/go_read
// repository (branch master) and commits the result through the GitHub API,
// authenticating with token.
//
// data is placed directly after the first "<br>" marker of the current
// README.md; everything before and after the marker is preserved.
//
// It returns an error when data is empty, when README.md cannot be fetched or
// decoded, when README.md contains no "<br>" marker, or when the update call
// fails.
func pushToGithub(data, token string) error {
	if data == "" {
		return errors.New("params error")
	}

	ctx := context.Background()
	ts := oauth2.StaticTokenSource(
		&oauth2.Token{AccessToken: token},
	)
	tc := oauth2.NewClient(ctx, ts)
	client := github.NewClient(tc)

	c := "feat: add gocn news, date : " + time.Now().Format("2006-01-02")
	content := &github.RepositoryContentFileOptions{
		Message: &c,
		Committer: &github.CommitAuthor{
			Name:  github.String("lubanproj"),
			Email: github.String("[email protected]"),
			Login: github.String("lubanproj"),
		},
		Author: &github.CommitAuthor{
			Name:  github.String("lubanproj"),
			Email: github.String("[email protected]"),
			Login: github.String("lubanproj"),
		},
		Branch: github.String("master"),
	}

	op := &github.RepositoryContentGetOptions{}
	repo, _, _, er := client.Repositories.GetContents(ctx, "lubanproj", "go_read", "README.md", op)
	if er == nil && (repo == nil || repo.Content == nil) {
		// A nil file or nil content with a nil error must not be reported as success.
		er = errors.New("get README.md: empty content")
	}
	if er != nil {
		fmt.Println("get github repositories error, date: ", time.Now())
		return er
	}
	// The update must carry the blob SHA of the file it replaces.
	content.SHA = repo.SHA

	decodeBytes, err := base64.StdEncoding.DecodeString(*repo.Content)
	if err != nil {
		fmt.Println("decode repo error, ", err)
		return err
	}

	updated, err := insertAfterFirstBreak(string(decodeBytes), data)
	if err != nil {
		fmt.Println("README.md format error")
		return err
	}
	content.Content = []byte(updated)

	_, _, err = client.Repositories.UpdateFile(ctx, "lubanproj", "go_read", "README.md", content)
	if err != nil {
		fmt.Println("update README.md error, ", err)
		return err
	}
	return nil
}

// insertAfterFirstBreak returns readme with data inserted directly after its
// first "<br>" marker. It reports an error when readme contains no marker.
func insertAfterFirstBreak(readme, data string) (string, error) {
	// Split only once so that later "<br>" occurrences stay in the tail
	// instead of being dropped.
	parts := strings.SplitN(readme, "<br>", 2)
	if len(parts) != 2 {
		return "", errors.New("README.md format error")
	}
	return parts[0] + "<br>" + data + parts[1], nil
}