go語言爬取圖片

一、前言

發現了一個網站 https://wall.alphacoders.com/ ,圖片質量挺高,正好最近在學習go,就用go下載圖片到本地
我記錄一下,以便以後查看,代碼肯定是有所不足,大家別笑話,看看就好。
[起始頁截圖 https://wall.alphacoders.com/finding_wallpapers.php ]
在這裏插入圖片描述

二、大概思路

1、https://wall.alphacoders.com/finding_wallpapers.php 作爲入口地址
2、根據入口地址獲取每個分類的title和href
3、處理每個分類:進入分類頁面,獲取到最後一頁的地址
4、知道最後一頁的地址,從第一頁循環到最後一頁,獲取每頁的數據
5、循環匹配每頁中原圖的地址(還是個頁面,並非圖片)
6、在原圖地址中獲取待下載的圖片地址:$('img.main-content').attr('src')
7、保存圖片到本地,如果已保存,則不重複保存(注意:如果程序中途結束,有些圖片是不完整的,不完整的圖片是不會再次下載的)
【注意】:如果網頁的標籤有所變動,代碼肯定是會報錯的
【延伸】:
如果 goquery 模塊不存在,cmd中執行 [ go get github.com/opesun/goquery ] 即可
這個html解析器好像不是很強,可以用正則匹配地址或者用其他的html解析器
這個可以優化一下,例如用 goroutine 併發下載

三、代碼

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	//"github.com/PuerkitoBio/goquery"	//這個模塊配置環境經常出錯就沒有選擇,我這裏顯示該模塊某些方法已棄用
	"github.com/opesun/goquery"
)

// DOMAIN is the site root that gets prepended to relative links scraped from
// the pages (category links and per-image detail pages).
const DOMAIN string = "https://wall.alphacoders.com/"

// main is the program entry point. As written it crawls every category that
// the entry page lists.
func main() {
	// To download only hand-picked categories, replace the start() call below
	// with customStart, passing title/URL pairs copied from log.txt, e.g.:
	//
	//	custom := [10][2]string{
	//		{"41276Women", "https://wall.alphacoders.com/by_category.php?id=33&name=Women+Wallpapers"},
	//		{"1545Game", "https://wall.alphacoders.com/by_category.php?id=14&name=Game+Wallpapers"}}
	//	customStart(custom)
	start()
}

// customStart downloads only the categories listed in custom.
//
// Each row is a {title, url} pair: the title is passed to createDir to name
// the category's directory and the url is handed to handleOneUrl as the
// category's first page. custom is a fixed-size array, so unused rows hold
// empty strings; rows without a url are skipped.
func customStart(custom [10][2]string) {
	fmt.Println("開始下載...")
	for _, entry := range custom {
		title, href := entry[0], entry[1]

		// len(entry) is always 2 for a [2]string, so an unused row has to be
		// recognised by its contents, not by its length.
		if href == "" {
			continue
		}

		dir := createDir(title) // directory named after the category
		handleOneUrl(href, dir) // crawl every page of this category
	}
	fmt.Println("結束下載!!!")
}

// start downloads every category found on the entry page.
//
// getUrls returns a fixed-size array of {title, url} rows; rows that were
// never filled in hold empty strings and are skipped. For each remaining row
// a directory named after the title is created and the category is crawled.
func start() {
	fmt.Println("開始下載...")
	titleToUrls := getUrls()

	for _, entry := range titleToUrls {
		title, href := entry[0], entry[1]

		// len(entry) is always 2 for a [2]string, so an unused row has to be
		// recognised by its contents, not by its length.
		if href == "" {
			continue
		}

		dir := createDir(title)
		handleOneUrl(href, dir)
	}
	fmt.Println("結束下載!!!")
}

// handleOneUrl crawls one category, including all of its paginated pages.
//
// url is the category's first page and dir is the directory the images are
// saved into. The function locates the link to the last page, splits it into
// a URL prefix ending in "=" and a page number, and passes both to
// handleOnePage. Failures are printed and the category is abandoned.
func handleOneUrl(url string, dir string) {
	doc, err := goquery.ParseUrl(url)
	if err != nil {
		fmt.Println("error to get url html:", err.Error())
		return
	}

	// Collect the <a> elements under <li> once. The loop is bounded by the
	// number of anchors (the elements actually indexed), not by the number of
	// <li> elements, so the pagination link cannot fall outside the scan.
	anchors := doc.Find("li").Find("a")
	count := anchors.Length()

	// Heuristic: the anchor that follows the "..." entry of the pagination
	// bar is taken to be the link to the last page.
	lastUrl := ""
	for i := 0; i < count-1; i++ {
		if strings.Contains(anchors.Eq(i).Text(), "...") {
			lastUrl = anchors.Eq(i + 1).Attr("href")
			break
		}
	}

	if lastUrl == "" {
		fmt.Println("Not Found Last Page:", url)
		return
	}

	// Split e.g. ".../by_resolution.php?w=2560&h=1440&page=474" into
	// ".../by_resolution.php?w=2560&h=1440&page=" and "474".
	lastEqIndex := strings.LastIndex(lastUrl, "=")
	if lastEqIndex == -1 {
		// Without "=" there is no page parameter to iterate over.
		fmt.Println("Not Found Last Page:", url)
		return
	}
	startUrl := lastUrl[:lastEqIndex+1]
	lastPage := lastUrl[lastEqIndex+1:]

	handleOnePage(startUrl, lastPage, dir)
}

// handleOnePage walks pages 1..lastPage of one category and downloads every
// image it finds.
//
// startUrl is the page URL up to and including the final "=", lastPage is the
// decimal number of the last page, and dir is the target directory. For each
// listing page it follows the per-image detail links (relative links such as
// "big.php?i=1079534"), reads the src of img.main-content on the detail page
// and hands that URL to saveFile. Errors are printed and the affected page or
// image is skipped.
func handleOnePage(startUrl string, lastPage string, dir string) {
	endPage, err := strconv.Atoi(lastPage)
	if err != nil {
		// A non-numeric page would otherwise silently become 0 and the whole
		// category would be skipped without any message.
		fmt.Println("LastPage error:", err.Error())
		return
	}

	for page := 1; page <= endPage; page++ {
		pageUrl := startUrl + strconv.Itoa(page)
		pageDoc, err := goquery.ParseUrl(pageUrl)
		if err != nil {
			fmt.Println("PageUrl error:", err.Error())
			continue
		}

		// Every thumbnail sits in a div.boxgrid; the i-th anchor below those
		// divs is treated as the link to the i-th image's detail page.
		boxes := pageDoc.Find("div.boxgrid")
		links := boxes.Find("a")
		total := boxes.Length()
		for i := 0; i < total; i++ {
			suffixUrl := links.Eq(i).Attr("href")
			if suffixUrl == "" {
				// No link for this slot; fetching DOMAIN alone is pointless.
				continue
			}
			detailUrl := DOMAIN + suffixUrl

			pageDetailDoc, err := goquery.ParseUrl(detailUrl)
			if err != nil {
				fmt.Println("PageDetailUrl error:", err.Error())
				continue
			}

			// e.g. https://images8.alphacoders.com/107/thumb-1920-1079562.jpg
			finalUrl := pageDetailDoc.Find("img.main-content").Attr("src")

			// The file name is whatever follows the last "/" of the URL.
			fileName := finalUrl[strings.LastIndex(finalUrl, "/")+1:]
			if fileName == "" {
				// An empty name would make savePath the directory itself,
				// which saveFile would misreport as an existing download.
				fmt.Println("PageDetailUrl error: no image url found in", detailUrl)
				continue
			}
			savePath := dir + "/" + fileName

			saveFile(finalUrl, savePath)
		}
	}
}

// saveFile downloads finalUrl and stores it at savePath.
//
// If savePath already exists the image is considered downloaded and nothing
// is fetched. The body is streamed into savePath + ".part" and renamed to
// savePath only after it has been written completely, so an interrupted or
// failed download never leaves a file that looks finished. Failures are
// printed; the function returns nothing.
func saveFile(finalUrl string, savePath string) {
	if _, err := os.Stat(savePath); err == nil {
		fmt.Printf("圖片 [ %s ] 已下載\n", finalUrl)
		return
	}

	// Fetch before touching the disk: a failed request must not create a
	// file, otherwise the existence check above would block any retry.
	resp, err := http.Get(finalUrl)
	if err != nil {
		fmt.Println("imageGet error:", err.Error())
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Do not store an error page under an image name.
		fmt.Println("imageGet error: unexpected status", resp.Status)
		return
	}

	tmpPath := savePath + ".part"
	file, err := os.Create(tmpPath) // truncates a leftover .part file
	if err != nil {
		fmt.Println("createFile error:", err.Error())
		return
	}

	_, err = io.Copy(file, resp.Body)
	// Close before renaming (required on Windows) and surface a close error
	// only when the copy itself succeeded.
	if closeErr := file.Close(); err == nil {
		err = closeErr
	}
	if err == nil {
		err = os.Rename(tmpPath, savePath)
	}
	if err != nil {
		// Best-effort cleanup; the original error is the one worth reporting.
		os.Remove(tmpPath)
		fmt.Println("saveFile error:", err.Error())
		return
	}

	fmt.Printf("圖片 [ %s ] 下載 [ %s ] 成功\n", finalUrl, savePath)
}

// createDir makes sure the directory ./images/<dirName> exists and returns
// its path. Missing parent directories are created as well. A failure is
// printed; the path is returned either way so callers keep their flow.
func createDir(dirName string) string {
	dir := "./images/" + dirName

	// MkdirAll succeeds without doing anything when the directory already
	// exists, so no separate existence check is needed.
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		fmt.Println("createDir error:", err.Error())
	}
	return dir
}

// getUrls scrapes the entry page and returns the categories as {title, url}
// rows.
//
// Titles have spaces and newlines removed; links that do not start with
// "http" are made absolute with DOMAIN; links that are empty or "#" are
// dropped. Every kept row is also written to log.txt. Rows are stored
// contiguously from index 0; rows beyond the number of categories found stay
// empty. If the entry page cannot be fetched the program exits with a
// non-zero status.
func getUrls() [100][2]string {
	url := "https://wall.alphacoders.com/finding_wallpapers.php" // entry page

	doc, err := goquery.ParseUrl(url)
	if err != nil {
		fmt.Println("SearchPage error:", err)
		os.Exit(1) // nothing can be crawled without the entry page
	}

	result := [100][2]string{}

	urls := doc.Find("a.list-group-item")
	length := urls.Length()
	next := 0 // next free row in result
	for i := 0; i < length; i++ {
		if next == len(result) {
			// The array is fixed-size; writing past it would panic.
			fmt.Println("SearchPage error: more than", len(result), "categories found, ignoring the rest")
			break
		}

		link := urls.Eq(i)
		href := link.Attr("href")
		if href == "" || href == "#" {
			continue
		}

		text := link.Text()
		text = strings.Replace(text, " ", "", -1)
		text = strings.Replace(text, "\n", "", -1)

		// Only a leading "http" marks an absolute URL; a relative link may
		// still contain "http" somewhere in its query string.
		if !strings.HasPrefix(href, "http") {
			href = DOMAIN + strings.TrimPrefix(href, "/")
		}

		logger(text+" - "+href, "log.txt")

		result[next][0] = text
		result[next][1] = href
		next++
	}
	return result
}

// logger appends content as one timestamped line to logFile, creating the
// file if it does not exist. The file is opened and closed on every call.
// If the file cannot be opened the program is terminated via log.Fatalln.
func logger(content string, logFile string) {
	// O_WRONLY is required: without an access mode the descriptor is
	// read-only on Unix and every write fails silently. The permission must
	// be the octal literal 0666, not decimal 666.
	file, err := os.OpenFile(logFile, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		log.Fatalln("fail to open log file:", err)
	}
	defer file.Close()

	// LstdFlags prefixes each line with date and time.
	log.New(file, "", log.LstdFlags).Println(content)
}

四、打印

控制檯 輸出的內容如下
在這裏插入圖片描述

五、日誌

log.txt 文件保存的內容如下

2020/05/20 12:31:14 MobileVersion - https://mobile.alphacoders.com
2020/05/20 12:31:14 NewestWallpapers - https://wall.alphacoders.com/newest_wallpapers.php
2020/05/20 12:31:14 FeaturedWallpapers - https://wall.alphacoders.com/featured.php
2020/05/20 12:31:14 CreatorWallpapers - https://wall.alphacoders.com/by_creator.php
2020/05/20 12:31:14 AuthorsAtWallpaperAbyss - https://wall.alphacoders.com/authors.php
2020/05/20 12:31:14 HDWallpapers - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1080
2020/05/20 12:31:14 UltraHD4kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160
2020/05/20 12:31:14 Retina5kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=5120&h=2880
2020/05/20 12:31:14 UltraHD8kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=7680&h=4320
2020/05/20 12:31:14 PopularWallpapers - https://wall.alphacoders.com/popular.php
2020/05/20 12:31:14 CCOWallpapers - https://wall.alphacoders.com/by_license.php?filter=4
2020/05/20 12:31:14 RandomWallpapers - https://wall.alphacoders.com/random.php
2020/05/20 12:31:14 RecentComments - https://wall.alphacoders.com/comments.php
2020/05/20 12:31:14 RecentCaptions - https://wall.alphacoders.com/captions.php
2020/05/20 12:31:14 HighestRated - https://wall.alphacoders.com/highest_rated.php
2020/05/20 12:31:14 MostViewed - https://wall.alphacoders.com/by_views.php
2020/05/20 12:31:14 MostFavorited - https://wall.alphacoders.com/by_favorites.php
2020/05/20 12:31:14 MostCommentedOn - https://wall.alphacoders.com/by_comments.php
2020/05/20 12:31:14 PopularCollections - https://alphacoders.com/collections
2020/05/20 12:31:14 ByTag - https://wall.alphacoders.com/all_tags.php
2020/05/20 12:31:14 ByColor - https://wall.alphacoders.com/by_color.php
2020/05/20 12:31:14 iPhone11 - https://mobile.alphacoders.com/by-device/540/iPhone-11-Wallpapers?ref=wa
2020/05/20 12:31:14 iPhoneX - https://mobile.alphacoders.com/by-device/450/iPhone-X-Wallpapers?ref=wa
2020/05/20 12:31:14 GalaxyNote10 - https://mobile.alphacoders.com/by-device/543/Galaxy-Note10-Wallpapers?ref=wa
2020/05/20 12:31:14 GooglePixel4 - https://mobile.alphacoders.com/by-device/551/Pixel-4-Wallpapers?ref=wa
2020/05/20 12:31:14 18300Abstract - https://wall.alphacoders.com/by_category.php?id=1&name=Abstract+Wallpapers
2020/05/20 12:31:14 53634Animal - https://wall.alphacoders.com/by_category.php?id=2&name=Animal+Wallpapers
2020/05/20 12:31:14 180969Anime - https://wall.alphacoders.com/by_category.php?id=3&name=Anime+Wallpapers
2020/05/20 12:31:14 20318Artistic - https://wall.alphacoders.com/by_category.php?id=4&name=Artistic+Wallpapers
2020/05/20 12:31:14 30116Celebrity - https://wall.alphacoders.com/by_category.php?id=7&name=Celebrity+Wallpapers
2020/05/20 12:31:14 24640Comics - https://wall.alphacoders.com/by_category.php?id=8&name=Comics+Wallpapers
2020/05/20 12:31:14 6087Dark - https://wall.alphacoders.com/by_category.php?id=9&name=Dark+Wallpapers
2020/05/20 12:31:14 61436Earth - https://wall.alphacoders.com/by_category.php?id=10&name=Earth+Wallpapers
2020/05/20 12:31:14 23407Fantasy - https://wall.alphacoders.com/by_category.php?id=11&name=Fantasy+Wallpapers
2020/05/20 12:31:14 17620Food - https://wall.alphacoders.com/by_category.php?id=12&name=Food+Wallpapers
2020/05/20 12:31:14 1545Game - https://wall.alphacoders.com/by_category.php?id=14&name=Game+Wallpapers
2020/05/20 12:31:14 7893Holiday - https://wall.alphacoders.com/by_category.php?id=15&name=Holiday+Wallpapers
2020/05/20 12:31:14 2398Humor - https://wall.alphacoders.com/by_category.php?id=13&name=Humor+Wallpapers
2020/05/20 12:31:14 33340ManMade - https://wall.alphacoders.com/by_category.php?id=16&name=Man+Made+Wallpapers
2020/05/20 12:31:14 523Men - https://wall.alphacoders.com/by_category.php?id=17&name=Men+Wallpapers
2020/05/20 12:31:14 10226Military - https://wall.alphacoders.com/by_category.php?id=18&name=Military+Wallpapers
2020/05/20 12:31:14 4026Misc - https://wall.alphacoders.com/by_category.php?id=19&name=Misc+Wallpapers
2020/05/20 12:31:14 45031Movie - https://wall.alphacoders.com/by_category.php?id=20&name=Movie+Wallpapers
2020/05/20 12:31:14 24587Music - https://wall.alphacoders.com/by_category.php?id=22&name=Music+Wallpapers
2020/05/20 12:31:14 17927Photography - https://wall.alphacoders.com/by_category.php?id=24&name=Photography+Wallpapers
2020/05/20 12:31:14 992Products - https://wall.alphacoders.com/by_category.php?id=25&name=Products+Wallpapers
2020/05/20 12:31:14 3168Religious - https://wall.alphacoders.com/by_category.php?id=26&name=Religious+Wallpapers
2020/05/20 12:31:14 17904SciFi - https://wall.alphacoders.com/by_category.php?id=27&name=Sci+Fi+Wallpapers
2020/05/20 12:31:14 14990Sports - https://wall.alphacoders.com/by_category.php?id=28&name=Sports+Wallpapers
2020/05/20 12:31:14 23041TVShow - https://wall.alphacoders.com/by_category.php?id=29&name=TV+Show+Wallpapers
2020/05/20 12:31:14 4204Technology - https://wall.alphacoders.com/by_category.php?id=30&name=Technology+Wallpapers
2020/05/20 12:31:14 67145Vehicles - https://wall.alphacoders.com/by_category.php?id=31&name=Vehicles+Wallpapers
2020/05/20 12:31:14 90168VideoGame - https://wall.alphacoders.com/by_category.php?id=32&name=Video+Game+Wallpapers
2020/05/20 12:31:14 1913Weapons - https://wall.alphacoders.com/by_category.php?id=34&name=Weapons+Wallpapers
2020/05/20 12:31:14 41289Women - https://wall.alphacoders.com/by_category.php?id=33&name=Women+Wallpapers
2020/05/20 12:31:14 68451280x960 - https://wall.alphacoders.com/by_resolution.php?w=1280&h=960 
2020/05/20 12:31:14 130221280x1024 - https://wall.alphacoders.com/by_resolution.php?w=1280&h=1024 
2020/05/20 12:31:14 41271281x961 - https://wall.alphacoders.com/by_resolution.php?w=1281&h=961 
2020/05/20 12:31:14 42381366x768 - https://wall.alphacoders.com/by_resolution.php?w=1366&h=768 
2020/05/20 12:31:14 60081440x900 - https://wall.alphacoders.com/by_resolution.php?w=1440&h=900 
2020/05/20 12:31:14 55191600x900 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=900 
2020/05/20 12:31:14 20321600x1000 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=1000 
2020/05/20 12:31:14 268361600x1200 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=1200 
2020/05/20 12:31:14 150881680x1050 - https://wall.alphacoders.com/by_resolution.php?w=1680&h=1050 
2020/05/20 12:31:14 1819081920x1080 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1080 
2020/05/20 12:31:14 1109811920x1200 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1200 
2020/05/20 12:31:14 76841920x1280 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1280 
2020/05/20 12:31:14 77931920x1440 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1440 
2020/05/20 12:31:14 28732000x1333 - https://wall.alphacoders.com/by_resolution.php?w=2000&h=1333 
2020/05/20 12:31:14 37412048x1152 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1152 
2020/05/20 12:31:14 131212048x1365 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1365 
2020/05/20 12:31:14 21762048x1366 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1366 
2020/05/20 12:31:14 29052048x1367 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1367 
2020/05/20 12:31:14 52452048x1536 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1536 
2020/05/20 12:31:14 142022560x1440 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1440 
2020/05/20 12:31:14 399912560x1600 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1600 
2020/05/20 12:31:14 20882560x1920 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1920 
2020/05/20 12:31:14 58902880x1800 - https://wall.alphacoders.com/by_resolution.php?w=2880&h=1800 
2020/05/20 12:31:14 52063000x2000 - https://wall.alphacoders.com/by_resolution.php?w=3000&h=2000 
2020/05/20 12:31:14 199733840x2160 - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160 
2020/05/20 12:31:14 50833840x2400 - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2400 
2020/05/20 12:31:14 18775184x3456 - https://wall.alphacoders.com/by_resolution.php?w=5184&h=3456 
2020/05/20 12:31:14 19555616x3744 - https://wall.alphacoders.com/by_resolution.php?w=5616&h=3744 
2020/05/20 12:31:14 21265760x3840 - https://wall.alphacoders.com/by_resolution.php?w=5760&h=3840 
2020/05/20 12:31:14 26816000x4000 - https://wall.alphacoders.com/by_resolution.php?w=6000&h=4000 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章