package main
import (
"errors"
"flag"
"io/ioutil"
"log"
"net/http"
"os"
"strconv"
"strings"
"github.com/PuerkitoBio/goquery" // 解析html
)
// Crawl configuration; all three are populated from CLI flags in main.
var website = ""            // root URL of the target site
var dir string = ""         // local directory downloaded images are saved under
var file_size int64 = 200   // threshold in KB: only files larger than this are downloaded (overridden by the -fsize flag)
// loadUrl fetches uri via HTTP GET and parses the response body as an
// HTML document. It returns an error for network failures and for
// non-200 status codes.
func loadUrl(uri string) (*goquery.Document, error) {
	resp, err_resp := http.Get(uri)
	if err_resp != nil {
		return nil, err_resp
	}
	defer resp.Body.Close()
	log.Printf("resp.Status %v", resp.Status)
	if resp.StatusCode != 200 {
		// Bug fix: this was log.Fatalf, which terminates the process and
		// made the error return below unreachable. Log and let the caller
		// decide how to handle the failure.
		log.Printf("訪問 異常 %v", uri)
		return nil, errors.New("訪問異常,code:" + resp.Status)
	}
	return goquery.NewDocumentFromResponse(resp)
}
// getCatagreyUrls reads the pagination element on the site's front page
// and builds the URL of every category list page
// (website/list_1.html … website/list_N.html).
func getCatagreyUrls() []string {
	var urls []string
	doc, _ := loadUrl(website)
	if doc == nil {
		return nil
	}
	doc.Find(".pagelist > .thisclass").Each(func(i int, s *goquery.Selection) {
		pageTotal := s.Text()
		log.Printf("共%v頁", pageTotal)
		p_count, ee := strconv.Atoi(pageTotal)
		if ee == nil {
			// Bug fix: the loop condition was `i < p_count`, which skipped
			// the last page; generate list_1 .. list_p_count inclusive.
			for i := 1; i <= p_count; i++ {
				url := website + "/list_" + strconv.Itoa(i) + ".html" // 網址信息
				urls = append(urls, url)
			}
		}
	})
	return urls
}
// parseCatagrey walks one category list page and hands every linked
// image-detail page to parseImgDetail.
func parseCatagrey(url string) {
	doc, err := loadUrl(url)
	if err != nil {
		// Bug fix: this was log.Fatal, which aborted the whole crawl when a
		// single category page failed to load; skip the page instead.
		log.Println(err)
		return
	}
	if doc == nil {
		return
	}
	// goquery's Find always returns a non-nil selection, so no nil check
	// is needed; an empty selection simply produces zero iterations.
	nodes := doc.Find(".w170img > a ")
	log.Printf("欄目分頁 %v\t當前頁共%v圖片", url, nodes.Length())
	nodes.Each(func(i int, s *goquery.Selection) { // iterate every matched link
		item_url, _ := s.Attr("href")
		log.Printf("item_url:%v", item_url)
		if item_url == "" || item_url == "#" {
			return
		}
		// Protocol-relative links ("//host/…") need an explicit scheme.
		if strings.HasPrefix(item_url, "//") {
			item_url = "https:" + item_url
		}
		parseImgDetail(item_url, true)
	})
}
// parseImgDetail extracts the image URL from one detail page and starts its
// download; when repeat is true it also follows the page's pagination links,
// each with repeat=false so the recursion stops after one level.
func parseImgDetail(uri string, repeat bool) {
	// NOTE(review): Replace with count -1 rewrites EVERY "/p/" occurrence in
	// the URL with the site root — presumably detail links start with "/p/";
	// a URL containing "/p/" elsewhere would be mangled. TODO confirm.
	uri = strings.Replace(uri, "/p/", website, -1)
	log.Printf("圖片瀏覽頁:%v", uri)
	doc, err_doc := loadUrl(uri)
	if err_doc != nil {
		log.Printf("%v,解析異常:%v", uri, err_doc)
	}
	if doc == nil {
		return
	}
	imgs := doc.Find(".imagebody > p > a > img")
	log.Printf("圖片數量 %v", imgs.Length())
	// Attr on a multi-element selection reads the attribute of the first match.
	img_src, _ := imgs.Attr("src")
	// imgs.Each(func(j int, t *goquery.Selection) {
	// img_src, _ := t.Attr("src")
	if img_src == "" {
		return
	}
	// Fire-and-forget download. NOTE(review): goroutines are unbounded and
	// nothing waits for them — downloads still running when main returns
	// are cut short.
	go download(img_src)
	if repeat == false {
		return
	}
	pageList := doc.Find(".pagelist > a")
	log.Printf("%v\t當前頁共%v頁", uri, pageList.Length())
	pageList.Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		text := s.Text()
		// Skip empty entries, the "next page" link, and self-references.
		if href == "" || text == "" || text == "下一頁" || href == uri {
			return
		}
		parseImgDetail(href, false)
	})
}
// download fetches img_url and stores it on disk under dir. Files that
// already exist, or whose Content-Length is at or below file_size KB
// (when file_size > 0), are skipped.
func download(img_url string) {
	log.Printf("圖片:%v", img_url)
	// The local path mirrors the URL path, rooted at dir.
	file_name := strings.Replace(img_url, "https://", dir, -1)
	log.Printf("保持文件:%v", file_name)
	// Create the parent directories: MkdirAll on file_name+"_" builds the
	// whole path, RemoveAll then deletes only the leaf placeholder dir.
	os.MkdirAll(file_name+"_", os.ModePerm)
	os.RemoveAll(file_name + "_")
	if _, err_stat := os.Stat(file_name); err_stat == nil {
		log.Printf("已存在:%v", file_name)
		return
	}
	resp, err := http.Get(img_url)
	if err != nil {
		// Bug fix: a return was missing here, so a failed GET fell through
		// and dereferenced a nil resp below.
		log.Println("http.get err", err)
		return
	}
	// Close the body as soon as this function ends, on every path.
	defer resp.Body.Close()
	log.Printf("resp: %s", resp.Status)
	ctLen := resp.ContentLength / 1024
	log.Printf("圖片大小 %v", ctLen)
	if file_size > 0 && ctLen <= file_size {
		log.Printf("文件太小<%v", file_size)
		return
	}
	// Bug fix: create the file only AFTER the size check. The old order
	// left an empty file behind for skipped images, which the os.Stat
	// check above then treated as "already downloaded" on the next run.
	f, err := os.Create(file_name)
	if err != nil {
		log.Panic("文件創建失敗", err)
		return
	}
	defer f.Close() // close the file when done
	body, err1 := ioutil.ReadAll(resp.Body)
	if err1 != nil {
		log.Println("讀取數據失敗")
	}
	f.Write(body)
}
// main wires up the CLI flags and sequentially crawls every category page.
func main() {
	flag.StringVar(&website, "url", "", "網址")
	flag.StringVar(&dir, "d", "", "保存目錄,默認當前目錄下")
	flag.Int64Var(&file_size, "fsize", 0, "文件大小 kb 默認0kb 表示不限制")
	flag.Parse() // flag variables are only populated after Parse runs

	if website == "" {
		log.Println("未設置網址,使用-url 傳參數")
		return
	}

	for _, catURL := range getCatagreyUrls() {
		parseCatagrey(catURL)
	}
}
下面是改用線程池方式執行的版本:
package main
import (
"errors"
"flag"
"io/ioutil"
"log"
"net/http"
"os"
"runtime"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery" // 解析html
)
// Job is the unit of work executed by a Worker; Do performs the work.
type Job interface {
	Do()
}
// Worker executes jobs one at a time from its own JobQueue.
// Sending on Quit asks the worker's loop to exit.
type Worker struct {
	JobQueue chan Job  // jobs dispatched to this worker
	Quit     chan bool // shutdown signal
}
// NewWorker builds a Worker with fresh, unbuffered job and quit channels.
func NewWorker() Worker {
	var w Worker
	w.JobQueue = make(chan Job)
	w.Quit = make(chan bool)
	return w
}
// Run starts the worker's processing loop in its own goroutine. Each
// iteration the worker re-registers its job channel on wq so the pool can
// dispatch the next job to it; a signal on Quit ends the loop.
func (w Worker) Run(wq chan chan Job) {
	go func() {
		for {
			// Advertise availability: the pool pulls this channel off wq
			// and sends the next job into it.
			wq <- w.JobQueue
			select {
			case job := <-w.JobQueue:
				job.Do()
			case <-w.Quit:
				return
			}
		}
	}()
}
// WorkerPool owns the shared job queue and the registry of idle workers.
// (NOTE(review): "WokerLen" is a typo for WorkerLen, kept because it is an
// exported field callers may reference.)
type WorkerPool struct {
	WokerLen int // number of worker goroutines to start
	JobQueue chan Job // incoming jobs from producers
	WorkerQueue chan chan Job // idle workers park their job channel here
}
// NewWorkerPool builds a pool sized for workerlen workers. The worker
// registry channel is buffered to workerlen so every worker can park its
// job channel without blocking; the job queue itself is unbuffered.
func NewWorkerPool(workerlen int) *WorkerPool {
	pool := new(WorkerPool)
	pool.WokerLen = workerlen
	pool.JobQueue = make(chan Job)
	pool.WorkerQueue = make(chan chan Job, workerlen)
	return pool
}
// Run launches WokerLen workers and a dispatcher goroutine that forwards
// each job from JobQueue to the next idle worker.
func (wp *WorkerPool) Run() {
	log.Println("初始化worker")
	for i := 0; i < wp.WokerLen; i++ {
		worker := NewWorker()
		worker.Run(wp.WorkerQueue)
	}
	go func() {
		// Idiom fix: a single-case select inside a for-loop is just a
		// blocking receive; range reads jobs until JobQueue is closed
		// (it never is, so the dispatcher runs for the process lifetime).
		for job := range wp.JobQueue {
			// Blocks until some worker is idle, then hand the job over.
			worker := <-wp.WorkerQueue
			worker <- job
		}
	}()
}
// Crawl configuration, populated from CLI flags in main.
var website = ""          // root URL of the target site
var dir string = ""       // local directory downloaded images are saved under
var file_size int64 = 0   // minimum size in KB a file must exceed to be downloaded (0 = no limit)
var workpool *WorkerPool  // shared worker pool that runs the download jobs
// loadUrl fetches uri via HTTP GET and parses the response body as an
// HTML document. It returns an error for network failures and for
// non-200 status codes.
func loadUrl(uri string) (*goquery.Document, error) {
	resp, err_resp := http.Get(uri)
	if err_resp != nil {
		return nil, err_resp
	}
	defer resp.Body.Close()
	log.Printf("resp.Status %v", resp.Status)
	if resp.StatusCode != 200 {
		// Bug fix: this was log.Fatalf, which terminates the process and
		// made the error return below unreachable. Log and let the caller
		// decide how to handle the failure.
		log.Printf("訪問 異常 %v", uri)
		return nil, errors.New("訪問異常,code:" + resp.Status)
	}
	return goquery.NewDocumentFromResponse(resp)
}
// getCatagreyUrls reads the pagination element on the site's front page
// and builds the URL of every category list page
// (website/list_1.html … website/list_N.html).
func getCatagreyUrls() []string {
	var urls []string
	doc, _ := loadUrl(website)
	if doc == nil {
		return nil
	}
	doc.Find(".pagelist > .thisclass").Each(func(i int, s *goquery.Selection) {
		pageTotal := s.Text()
		log.Printf("共%v頁", pageTotal)
		p_count, ee := strconv.Atoi(pageTotal)
		if ee == nil {
			// Bug fix: the loop condition was `i < p_count`, which skipped
			// the last page; generate list_1 .. list_p_count inclusive.
			for i := 1; i <= p_count; i++ {
				url := website + "/list_" + strconv.Itoa(i) + ".html" // 網址信息
				urls = append(urls, url)
			}
		}
	})
	return urls
}
// parseCatagrey walks one category list page and hands every linked
// image-detail page to parseImgDetail.
func parseCatagrey(url string) {
	doc, err := loadUrl(url)
	if err != nil {
		// Bug fix: this was log.Fatal, which aborted the whole crawl when a
		// single category page failed to load; skip the page instead.
		log.Println(err)
		return
	}
	if doc == nil {
		return
	}
	// goquery's Find always returns a non-nil selection, so no nil check
	// is needed; an empty selection simply produces zero iterations.
	nodes := doc.Find(".w170img > a ")
	log.Printf("欄目分頁 %v\t當前頁共%v圖片", url, nodes.Length())
	nodes.Each(func(i int, s *goquery.Selection) { // iterate every matched link
		item_url, _ := s.Attr("href")
		log.Printf("item_url:%v", item_url)
		if item_url == "" || item_url == "#" {
			return
		}
		// Protocol-relative links ("//host/…") need an explicit scheme.
		if strings.HasPrefix(item_url, "//") {
			item_url = "https:" + item_url
		}
		parseImgDetail(item_url, true)
	})
}
// parseImgDetail extracts the image URL from one detail page and enqueues a
// DownloadJob on the worker pool; when repeat is true it also follows the
// page's pagination links, each with repeat=false so recursion stops after
// one level.
func parseImgDetail(uri string, repeat bool) {
	// NOTE(review): Replace with count -1 rewrites EVERY "/p/" occurrence in
	// the URL with the site root — presumably detail links start with "/p/";
	// a URL containing "/p/" elsewhere would be mangled. TODO confirm.
	uri = strings.Replace(uri, "/p/", website, -1)
	log.Printf("圖片瀏覽頁:%v", uri)
	doc, err_doc := loadUrl(uri)
	if err_doc != nil {
		log.Printf("%v,解析異常:%v", uri, err_doc)
	}
	if doc == nil {
		return
	}
	imgs := doc.Find(".imagebody > p > a > img")
	log.Printf("圖片數量 %v", imgs.Length())
	// Attr on a multi-element selection reads the attribute of the first match.
	img_src, _ := imgs.Attr("src")
	if img_src == "" {
		return
	}
	// go download(img_src)
	// Worker-pool version: wrap the URL in a job and enqueue it; this send
	// blocks until the pool's dispatcher accepts the job.
	sc := &DownloadJob{uri: img_src}
	workpool.JobQueue <- sc
	if repeat == false {
		return
	}
	pageList := doc.Find(".pagelist > a")
	log.Printf("%v\t當前頁共%v頁", uri, pageList.Length())
	pageList.Each(func(i int, s *goquery.Selection) {
		href, _ := s.Attr("href")
		text := s.Text()
		// Skip empty entries, the "next page" link, and self-references.
		if href == "" || text == "" || text == "下一頁" || href == uri {
			return
		}
		parseImgDetail(href, false)
	})
}
// DownloadJob wraps a single image URL as a Job for the worker pool.
type DownloadJob struct {
	uri string // image URL to fetch
}
// Do implements the Job interface by downloading the wrapped image URL.
func (d *DownloadJob) Do() {
	download(d.uri)
}
// download fetches img_url and stores it on disk under dir. Files that
// already exist, or whose Content-Length is at or below file_size KB
// (when file_size > 0), are skipped.
func download(img_url string) {
	log.Printf("圖片:%v", img_url)
	// The local path mirrors the URL path, rooted at dir.
	file_name := strings.Replace(img_url, "https://", dir, -1)
	// Create the parent directories: MkdirAll on file_name+"_" builds the
	// whole path, RemoveAll then deletes only the leaf placeholder dir.
	os.MkdirAll(file_name+"_", os.ModePerm)
	os.RemoveAll(file_name + "_")
	if _, err_stat := os.Stat(file_name); err_stat == nil {
		log.Printf("已存在:%v", file_name)
		return
	}
	resp, err := http.Get(img_url)
	if err != nil {
		// Bug fix: a return was missing here, so a failed GET fell through
		// and dereferenced a nil resp below.
		log.Println("http.get err", err)
		return
	}
	// Close the body as soon as this function ends, on every path.
	defer resp.Body.Close()
	log.Printf("resp: %s", resp.Status)
	ctLen := resp.ContentLength / 1024
	log.Printf("圖片大小 %v", ctLen)
	if file_size > 0 && ctLen <= file_size {
		log.Printf("文件太小<%v", file_size)
		return
	}
	// Bug fix: create the file only AFTER the size check. The old order
	// left an empty file behind for skipped images, which the os.Stat
	// check above then treated as "already downloaded" on the next run.
	f, err := os.Create(file_name)
	if err != nil {
		log.Panic("文件創建失敗", err)
		return
	}
	defer f.Close() // close the file when done
	body, err1 := ioutil.ReadAll(resp.Body)
	if err1 != nil {
		log.Println("讀取數據失敗")
	}
	f.Write(body)
	log.Printf("保存文件:%v", file_name)
}
// main parses the CLI flags, starts the worker pool, crawls every category
// page, then blocks forever so the pool's goroutines can keep downloading.
func main() {
	var poolSize int = 1000 // typo fix: was poolSzie (local, safe to rename)
	flag.StringVar(&website, "url", "", "網址")
	flag.StringVar(&dir, "dir", "", "保存目錄,默認當前目錄下")
	flag.Int64Var(&file_size, "file-size", 0, "文件大小 kb 默認0 表示不限制")
	flag.IntVar(&poolSize, "pool", 1000, "線程池大小 默認1000")
	flag.Parse() // flag variables are only populated after Parse runs
	if website == "" {
		log.Println("未設置網址,使用-url 傳參數")
		return
	}
	workpool = NewWorkerPool(poolSize)
	workpool.Run()
	urls := getCatagreyUrls()
	for _, url := range urls {
		parseCatagrey(url)
	}
	for { // keep the process alive so pending downloads can finish
		// Bug fix: this was log.Println with a %v verb, which printed the
		// verb literally instead of the goroutine count; Printf formats it.
		log.Printf("=========\nruntime.NumGoroutine() :%v", runtime.NumGoroutine())
		time.Sleep(5 * time.Second)
	}
}
跑了4個小時,下載的文件數量如下。
由於目標網站有IP封鎖,所以只開設了10個線程,因此耗時比較長。
原理是使用goquery分析頁面,查找對應的鏈接地址,然後再訪問該地址獲取圖片鏈接,最後用http.Get獲取並保存。