Golang image crawler

package main

import (
	"errors"
	"flag"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"

	"github.com/PuerkitoBio/goquery" // 解析html
)

var website = ""
var dir = ""
var file_size int64 = 200 // only download files larger than this many KB (the -fsize flag overrides this)

func loadUrl(uri string) (*goquery.Document, error) {
	resp, err_resp := http.Get(uri)
	if err_resp != nil {
		return nil, err_resp
	}
	defer resp.Body.Close()
	log.Printf("resp.Status %v", resp.Status)
	if resp.StatusCode != 200 {
		log.Printf("bad response from %v", uri) // Printf, not Fatalf: let the caller handle the error
		return nil, errors.New("request failed, code: " + resp.Status)
	}
	// NewDocumentFromReader avoids the deprecated NewDocumentFromResponse,
	// which would close the body a second time after the defer above
	return goquery.NewDocumentFromReader(resp.Body)
}
func getCatagreyUrls() []string {
	var urls []string
	doc, _ := loadUrl(website)
	if doc == nil {
		return nil
	}
	doc.Find(".pagelist > .thisclass").Each(func(i int, s *goquery.Selection) {
		pageTotal := s.Text()
		log.Printf("共%v頁", pageTotal)
		p_count, ee := strconv.Atoi(pageTotal)
		if ee == nil {
			var url string
			for i := 1; i < p_count; i++ {
				url = website + "/list_" + strconv.Itoa(i) + ".html" //網址信息
				urls = append(urls, url)
			}
		}
	})
	return urls
}

// parse a category page
func parseCatagrey(url string) {
	doc, err := loadUrl(url)
	if err != nil {
		log.Println(err) // don't abort the whole crawl on one failed page
	}
	if doc == nil {
		return
	}
	nodes := doc.Find(".w170img > a")
	if nodes.Length() == 0 { // Find never returns nil, so check the match count
		return
	}
	log.Printf("category page %v\t%v images on this page", url, nodes.Length())
	nodes.Each(func(i int, s *goquery.Selection) { // walk every matched link
		item_url, _ := s.Attr("href")
		log.Printf("item_url:%v", item_url)
		if item_url == "" || item_url == "#" {
			return
		}
		// normalize protocol-relative URLs before following the link
		if strings.HasPrefix(item_url, "//") {
			item_url = "https:" + item_url
		}
		}
		parseImgDetail(item_url, true)
	})
}

// parse the image URL from a detail page
func parseImgDetail(uri string, repeat bool) {
	uri = strings.Replace(uri, "/p/", website, -1) // rewrite the relative /p/ prefix to an absolute URL
	log.Printf("image detail page: %v", uri)
	doc, err_doc := loadUrl(uri)
	if err_doc != nil {
		log.Printf("%v, parse error: %v", uri, err_doc)
	}
	if doc == nil {
		return
	}
	imgs := doc.Find(".imagebody > p > a > img")
	log.Printf("image count: %v", imgs.Length())
	img_src, _ := imgs.Attr("src") // Attr reads the attribute of the first matched node
	if img_src == "" {
		return
	}
	go download(img_src) // download in a goroutine
	if !repeat {
		return
	}
	pageList := doc.Find(".pagelist > a")
	log.Printf("%v\t當前頁共%v頁", uri, pageList.Length())
	pageList.Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		text := s.Text()
		if href == "" || text == "" || text == "下一頁" || href == uri {
			return
		}
		parseImgDetail(href, false)
	})
}

// download one image
func download(img_url string) {
	log.Printf("image: %v", img_url)
	file_name := strings.Replace(img_url, "https://", dir, -1) // map the URL path into the save directory
	log.Printf("saving to: %v", file_name)
	// create the parent directories via a throwaway sibling dir, then remove it
	os.MkdirAll(file_name+"_", os.ModePerm)
	os.RemoveAll(file_name + "_")
	_, err_stat := os.Stat(file_name)
	if err_stat == nil {
		log.Printf("已存在:%v", file_name)
		return
	}
	f, err := os.Create(file_name)
	if err != nil {
		log.Println("failed to create file:", err)
		return
	}
	defer f.Close() // close when done

	resp, err := http.Get(img_url)
	if err != nil {
		log.Println("http.Get error:", err)
		return // without this return, resp would be nil below
	}
	defer resp.Body.Close() // close as soon as the response is in scope
	log.Printf("resp: %s", resp.Status)
	ctLen := resp.ContentLength / 1024
	log.Printf("image size: %vKB", ctLen)
	if file_size > 0 && ctLen <= file_size {
		log.Printf("file too small, <%vKB", file_size)
		return
	}
	body, err1 := ioutil.ReadAll(resp.Body)
	if err1 != nil {
		log.Println("failed to read response body")
		return
	}
	f.Write(body)
}

func main() {
	flag.StringVar(&website, "url", "", "site URL")
	flag.StringVar(&dir, "d", "", "save directory (defaults to the current directory)")
	flag.Int64Var(&file_size, "fsize", 0, "minimum file size in KB; 0 means no limit")

	flag.Parse() // must run before the flags are read
	if website == "" {
		log.Println("no URL set; pass one with -url")
		return
	}
	urls := getCatagreyUrls()
	for _, url := range urls {
		parseCatagrey(url)
	}
}
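
Running it looks roughly like this (the URL is a placeholder for the target site; the flags are the ones defined in main above):

	go run main.go -url https://example.com -d ./images -fsize 200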


Worker-pool version

This variant replaces the unbounded go download(...) goroutines with a fixed-size pool of workers, so the number of concurrent downloads stays capped.

package main

import (
	"errors"
	"flag"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"runtime"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery" // 解析html
)

// Job is the unit of work a worker executes
type Job interface {
	Do()
}

// Worker owns a job queue and runs jobs from it until told to quit
type Worker struct {
	JobQueue chan Job
	Quit     chan bool
}

func NewWorker() Worker {
	return Worker{
		JobQueue: make(chan Job),
		Quit:     make(chan bool),
	}
}

func (w Worker) Run(wq chan chan Job) {
	go func() {
		for {
			wq <- w.JobQueue // register as idle by handing our queue to the dispatcher
			select {
			case job := <-w.JobQueue:
				job.Do()
			case <-w.Quit:
				return
			}
		}
	}()
}

// WorkerPool dispatches queued jobs to idle workers
type WorkerPool struct {
	WorkerLen   int
	JobQueue    chan Job
	WorkerQueue chan chan Job
}

func NewWorkerPool(workerlen int) *WorkerPool {
	return &WorkerPool{
		WorkerLen:   workerlen,                      // number of worker goroutines to start
		JobQueue:    make(chan Job),                 // incoming job queue
		WorkerQueue: make(chan chan Job, workerlen), // buffered with one slot per worker
	}
}

func (wp *WorkerPool) Run() {
	log.Println("初始化worker")
	for i := 0; i < wp.WokerLen; i++ {
		worker := NewWorker()
		worker.Run(wp.WorkerQueue)
	}
	go func() {
		for {
			select {
			case job := <-wp.JobQueue: //讀取任務
				//嘗試獲取一個可用的worker作業通道 這將阻塞,直到一個worker空閒
				worker := <-wp.WorkerQueue
				//將任務分配給工人
				worker <- job
			}
		}
	}()
}
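
// --- Minimal usage sketch for the pool above. PrintJob and demoPool are
// illustrative names only and are never called by the crawler; three
// workers drain ten jobs, and a buffered done channel stands in for
// sync.WaitGroup so no extra import is needed. ---
type PrintJob struct {
	id   int
	done chan bool
}

func (p *PrintJob) Do() {
	log.Printf("job %d done", p.id)
	p.done <- true
}

func demoPool() {
	pool := NewWorkerPool(3) // three worker goroutines
	pool.Run()
	done := make(chan bool, 10) // buffered so workers never block on reporting
	for i := 0; i < 10; i++ {
		pool.JobQueue <- &PrintJob{id: i, done: done} // blocks until the dispatcher takes it
	}
	for i := 0; i < 10; i++ {
		<-done // wait until every job has run
	}
}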

var website = ""
var dir = ""
var file_size int64 = 0 // minimum file size in KB; 0 means no limit
var workpool *WorkerPool

func loadUrl(uri string) (*goquery.Document, error) {
	resp, err_resp := http.Get(uri)
	if err_resp != nil {
		return nil, err_resp
	}
	defer resp.Body.Close()
	log.Printf("resp.Status %v", resp.Status)
	if resp.StatusCode != 200 {
		log.Printf("bad response from %v", uri) // Printf, not Fatalf: let the caller handle the error
		return nil, errors.New("request failed, code: " + resp.Status)
	}
	return goquery.NewDocumentFromReader(resp.Body) // non-deprecated replacement for NewDocumentFromResponse
}
func getCatagreyUrls() []string {
	var urls []string
	doc, _ := loadUrl(website)
	if doc == nil {
		return nil
	}
	doc.Find(".pagelist > .thisclass").Each(func(i int, s *goquery.Selection) {
		pageTotal := s.Text()
		log.Printf("共%v頁", pageTotal)
		p_count, ee := strconv.Atoi(pageTotal)
		if ee == nil {
			var url string
			for i := 1; i < p_count; i++ {
				url = website + "/list_" + strconv.Itoa(i) + ".html" //網址信息
				urls = append(urls, url)
			}
		}
	})
	return urls
}

// parse a category page
func parseCatagrey(url string) {
	doc, err := loadUrl(url)
	if err != nil {
		log.Println(err) // don't abort the whole crawl on one failed page
	}
	if doc == nil {
		return
	}
	nodes := doc.Find(".w170img > a")
	if nodes.Length() == 0 { // Find never returns nil, so check the match count
		return
	}
	log.Printf("category page %v\t%v images on this page", url, nodes.Length())
	nodes.Each(func(i int, s *goquery.Selection) { // walk every matched link
		item_url, _ := s.Attr("href")
		log.Printf("item_url:%v", item_url)
		if item_url == "" || item_url == "#" {
			return
		}
		// normalize protocol-relative URLs before following the link
		if strings.HasPrefix(item_url, "//") {
			item_url = "https:" + item_url
		}
		parseImgDetail(item_url, true)
	})
}

// parse the image URL from a detail page
func parseImgDetail(uri string, repeat bool) {
	uri = strings.Replace(uri, "/p/", website, -1) // rewrite the relative /p/ prefix to an absolute URL
	log.Printf("image detail page: %v", uri)
	doc, err_doc := loadUrl(uri)
	if err_doc != nil {
		log.Printf("%v, parse error: %v", uri, err_doc)
	}
	if doc == nil {
		return
	}
	imgs := doc.Find(".imagebody > p > a > img")
	log.Printf("image count: %v", imgs.Length())
	img_src, _ := imgs.Attr("src") // Attr reads the attribute of the first matched node
	if img_src == "" {
		return
	}
	// go download(img_src) // version 1 spawned a goroutine here
	// now the download is handed to the worker pool instead
	sc := &DownloadJob{uri: img_src}
	workpool.JobQueue <- sc // enqueue; blocks until the dispatcher accepts it

	if !repeat {
		return
	}
	pageList := doc.Find(".pagelist > a")
	log.Printf("%v\t當前頁共%v頁", uri, pageList.Length())
	pageList.Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		text := s.Text()
		if href == "" || text == "" || text == "下一頁" || href == uri {
			return
		}
		parseImgDetail(href, false)
	})
}

// DownloadJob wraps an image URL as a pool task
type DownloadJob struct {
	uri string
}

// Do implements the Job interface
func (d *DownloadJob) Do() {
	download(d.uri)
}

// download one image
func download(img_url string) {
	log.Printf("image: %v", img_url)
	file_name := strings.Replace(img_url, "https://", dir, -1) // map the URL path into the save directory
	// create the parent directories via a throwaway sibling dir, then remove it
	os.MkdirAll(file_name+"_", os.ModePerm)
	os.RemoveAll(file_name + "_")
	_, err_stat := os.Stat(file_name)
	if err_stat == nil {
		log.Printf("已存在:%v", file_name)
		return
	}
	f, err := os.Create(file_name)
	if err != nil {
		log.Println("failed to create file:", err)
		return
	}
	defer f.Close() // close when done

	resp, err := http.Get(img_url)
	if err != nil {
		log.Println("http.Get error:", err)
		return // without this return, resp would be nil below
	}
	defer resp.Body.Close() // close as soon as the response is in scope
	log.Printf("resp: %s", resp.Status)
	ctLen := resp.ContentLength / 1024
	log.Printf("image size: %vKB", ctLen)
	if file_size > 0 && ctLen <= file_size {
		log.Printf("file too small, <%vKB", file_size)
		return
	}
	body, err1 := ioutil.ReadAll(resp.Body)
	if err1 != nil {
		log.Println("failed to read response body")
		return
	}
	f.Write(body)
	log.Printf("saved file: %v", file_name)
}

func main() {
	poolSize := 1000
	flag.StringVar(&website, "url", "", "site URL")
	flag.StringVar(&dir, "dir", "", "save directory (defaults to the current directory)")
	flag.Int64Var(&file_size, "file-size", 0, "minimum file size in KB; 0 means no limit")
	flag.IntVar(&poolSize, "pool", 1000, "worker pool size")
	flag.Parse() // must run before the flags are read
	if website == "" {
		log.Println("no URL set; pass one with -url")
		return
	}
	workpool = NewWorkerPool(poolSize)
	workpool.Run()
	urls := getCatagreyUrls()
	for _, url := range urls {
		parseCatagrey(url)
	}
	for { // block forever so the worker goroutines can keep running
		log.Printf("=========\nruntime.NumGoroutine(): %v", runtime.NumGoroutine())
		time.Sleep(5 * time.Second)
	}
}
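
To run the pool version (the URL is a placeholder for the target site; the flags are the ones defined in main above):

	go run main.go -url https://example.com -dir ./images -file-size 200 -pool 10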

File count after roughly four hours of crawling:

Because the target site blocks aggressive IPs, the pool was limited to 10 workers, so the run took quite a long time.

The approach: goquery parses each page and extracts the relevant links, those links are followed to collect the image URLs, and each image is then fetched with http.Get and saved to disk.
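
Condensed into a single sketch, that flow looks like the following (the URL and the selector are placeholders rather than the real site's, and error handling is trimmed):

package main

import (
	"io"
	"log"
	"net/http"
	"os"
	"path"

	"github.com/PuerkitoBio/goquery" // HTML parsing
)

func main() {
	// 1. fetch a listing page (placeholder URL)
	resp, err := http.Get("https://example.com/list_1.html")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// 2. parse the HTML with goquery
	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// 3. pull image URLs out of the matched nodes (placeholder selector)
	doc.Find(".imagebody img").Each(func(i int, s *goquery.Selection) {
		src, ok := s.Attr("src")
		if !ok {
			return
		}
		// 4. fetch the image and save it under its own base name
		img, err := http.Get(src)
		if err != nil {
			log.Println(err)
			return
		}
		defer img.Body.Close()
		f, err := os.Create(path.Base(src))
		if err != nil {
			log.Println(err)
			return
		}
		defer f.Close()
		io.Copy(f, img.Body)
	})
}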
