go语言爬取图片

一、前言

发现了一个网站 https://wall.alphacoders.com/ ,图片质量挺高,正好最近在学习go,就用go下载图片到本地
我记录一下,以便以后查看,代码肯定是有所不足,大家别笑话,看看就好。
[起始页截图 https://wall.alphacoders.com/finding_wallpapers.php ]
在这里插入图片描述

二、大概思路

1、https://wall.alphacoders.com/finding_wallpapers.php 作为入口地址
2、根据入口地址获取每个分类的title和href
3、处理每个分类:进入分类页面,获取到最后一页的地址
4、知道最后一页的地址,从第一页循环到最后一页,获取每页的数据
5、循环匹配每页中原图的地址(还是个页面,并非图片)
6、在原图页面中获取待下载的图片地址:$('img.main-content').attr('src')
7、保存图片到本地,如果已保存,则不重复保存(注意:如果程序中途结束,有些图片是不完整的,不完整的图片是不会再次下载的)
【注意】:如果网页的标签有所变动,代码肯定是会报错的
【延伸】:
如果 goquery 模块不存在,cmd中执行 [ go get github.com/opesun/goquery ] 即可
这个html解析器好像不是很强,可以用正则匹配地址或者用其他的html解析器
这个可以优化一下,用 goroutine 并发下载

三、代码

package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
	"os"
	"strconv"
	"strings"
	//"github.com/PuerkitoBio/goquery"	//这个模块配置环境经常出错就没有选择,我这里显示该模块某些方法已弃用
	"github.com/opesun/goquery"
)

// DOMAIN is the site root that relative hrefs found on crawled pages are
// appended to in order to form absolute URLs.
const DOMAIN string = "https://wall.alphacoders.com/"

// main is the program entry point. As written it crawls every category that
// is discovered on the entry page.
func main() {
	// Download everything.
	start()

	// To download only hand-picked categories instead, pass {title, url}
	// pairs (they can be copied from log.txt) to customStart:
	/*custom := [10][2]string{
		{"41276Women", "https://wall.alphacoders.com/by_category.php?id=33&name=Women+Wallpapers"},
		{"1545Game", "https://wall.alphacoders.com/by_category.php?id=14&name=Game+Wallpapers"}}
	customStart(custom)*/
}

// customStart downloads only the categories listed in custom.
//
// Each entry is a {title, url} pair: title is passed to createDir to obtain
// the target folder and url is the category's first page. custom is a
// fixed-size array, so unused slots hold empty strings; entries with an empty
// title or url are skipped.
func customStart(custom [10][2]string) {
	fmt.Println("开始下载...")
	for _, entry := range custom {
		title, href := entry[0], entry[1]

		// len(entry) is always 2 for a [2]string, so an unused slot can only
		// be detected by looking at the strings themselves.
		if title == "" || href == "" {
			continue
		}

		dir := createDir(title) // folder named after the category
		handleOneUrl(href, dir) // crawl this category, all pages included
	}
	fmt.Println("结束下载!!!")
}

// start downloads every category returned by getUrls.
//
// getUrls returns a fixed-size array, so unused slots hold empty strings;
// entries with an empty title or url are skipped. For every remaining entry
// a folder named after the title is created and the category is crawled.
func start() {
	fmt.Println("开始下载...")
	titleToUrls := getUrls() // {title, href} pairs

	for _, entry := range titleToUrls {
		title, href := entry[0], entry[1]

		// len(entry) is always 2 for a [2]string, so an unused slot can only
		// be detected by looking at the strings themselves.
		if title == "" || href == "" {
			continue
		}

		dir := createDir(title)
		handleOneUrl(href, dir)
	}
	fmt.Println("结束下载!!!")
}

// handleOneUrl crawls a single category, including all of its result pages.
//
// It loads the category page at url and looks for the link to the last result
// page, which is taken to be the anchor that follows the "..." anchor among
// the anchors found inside <li> elements. That link is split at its last "="
// into a URL prefix and a page number, and both are handed to handleOnePage
// together with dir, the folder the images are saved in. Failures are printed
// and the category is abandoned.
func handleOneUrl(url string, dir string) {
	doc, err := goquery.ParseUrl(url)
	if err != nil {
		fmt.Println("error to get url html:", err.Error())
		return
	}

	// Select the anchors once and iterate over their own count. Bounding the
	// loop by the number of <li> elements while indexing the anchor list can
	// stop before the "..." anchor is reached when the two counts differ.
	anchors := doc.Find("li").Find("a")
	count := anchors.Length()

	lastUrl := ""
	afterEllipsis := false // true once the "..." anchor has been seen
	for i := 0; i < count; i++ {
		link := anchors.Eq(i)
		if afterEllipsis {
			// The anchor right after "..." is considered the last page.
			lastUrl = link.Attr("href")
			break
		}
		if strings.Contains(link.Text(), "...") {
			afterEllipsis = true
		}
	}

	if lastUrl == "" {
		fmt.Println("Not Found Last Page:", url)
		return
	}

	// Split e.g. https://wall.alphacoders.com/by_resolution.php?w=2560&h=1440&page=474
	// into "https://wall.alphacoders.com/by_resolution.php?w=2560&h=1440&page=" and "474".
	lastEqIndex := strings.LastIndex(lastUrl, "=")
	if lastEqIndex == -1 {
		fmt.Println("Not Found Page Number:", lastUrl)
		return
	}
	startUrl := lastUrl[:lastEqIndex+1]
	lastPage := lastUrl[lastEqIndex+1:]

	handleOnePage(startUrl, lastPage, dir)
}

// handleOnePage walks result pages 1..lastPage of one category and downloads
// every image it finds.
//
// startUrl is the page URL without the page number (it ends in "="), lastPage
// is the decimal number of the last page, and dir is the folder the images
// are saved in. For each page, every div.boxgrid element contributes the href
// of its first anchor; that href (e.g. big.php?i=1079534) is appended to
// DOMAIN to get the detail page, whose img.main-content src is the image that
// is passed to saveFile. Pages or images that fail are printed and skipped.
func handleOnePage(startUrl string, lastPage string, dir string) {
	endPage, err := strconv.Atoi(lastPage)
	if err != nil {
		fmt.Println("LastPage error:", err.Error())
		return
	}

	for page := 1; page <= endPage; page++ {
		pageUrl := startUrl + strconv.Itoa(page)
		pageDoc, err := goquery.ParseUrl(pageUrl)
		if err != nil {
			// Page could not be loaded: report it and move on to the next one.
			fmt.Println("PageUrl error:", err.Error())
			continue
		}

		boxes := pageDoc.Find("div.boxgrid")
		count := boxes.Length()
		for j := 0; j < count; j++ {
			// Take the link from this very box rather than indexing a
			// flattened list of all anchors with the box index.
			suffixUrl := boxes.Eq(j).Find("a").Eq(0).Attr("href")
			if suffixUrl == "" {
				continue
			}
			detailUrl := DOMAIN + suffixUrl

			pageDetailDoc, err := goquery.ParseUrl(detailUrl)
			if err != nil {
				fmt.Println("PageDetailUrl error:", err.Error())
				continue
			}

			// e.g. https://images8.alphacoders.com/107/thumb-1920-1079562.jpg
			finalUrl := pageDetailDoc.Find("img.main-content").Attr("src")

			// File name is whatever follows the last "/": thumb-1920-1079562.jpg
			fileName := finalUrl[strings.LastIndex(finalUrl, "/")+1:]
			if fileName == "" {
				// Without a file name savePath would be the directory itself,
				// which saveFile would report as "already downloaded".
				fmt.Println("PageDetailUrl error: image not found in", detailUrl)
				continue
			}
			savePath := dir + "/" + fileName

			saveFile(finalUrl, savePath)
		}
	}
}

// saveFile downloads finalUrl and stores it at savePath.
//
// If savePath already exists the image is considered downloaded and nothing
// is fetched. Otherwise the response body is streamed into savePath+".part"
// and renamed to savePath only after it was written completely, so a failed
// or interrupted download does not leave a file at savePath that a later run
// would mistake for a finished image. Errors are printed, not returned.
func saveFile(finalUrl string, savePath string) {
	if _, err := os.Stat(savePath); err == nil {
		fmt.Printf("图片 [ %s ] 已下载\n", finalUrl)
		return
	}

	// Fetch first: creating the file before the request would leave an empty
	// file behind whenever the request fails.
	resp, err := http.Get(finalUrl)
	if err != nil {
		fmt.Println("imageGet error:", err.Error())
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Do not store an error page under an image file name.
		fmt.Printf("imageGet error: unexpected status %d for %s\n", resp.StatusCode, finalUrl)
		return
	}

	tmpPath := savePath + ".part"
	file, err := os.Create(tmpPath)
	if err != nil {
		fmt.Println("createFile error:", err.Error())
		return
	}

	_, saveErr := io.Copy(file, resp.Body)
	// Close before renaming; a close error also means the data may be incomplete.
	if closeErr := file.Close(); saveErr == nil {
		saveErr = closeErr
	}
	if saveErr == nil {
		saveErr = os.Rename(tmpPath, savePath)
	}
	if saveErr != nil {
		os.Remove(tmpPath) // best-effort cleanup of the partial file
		fmt.Println("imageSave error:", saveErr.Error())
		return
	}

	fmt.Printf("图片 [ %s ] 下载 [ %s ] 成功\n", finalUrl, savePath)
}

// createDir makes sure the folder ./images/<dirName> exists and returns its
// path. The folder holds the images of one category. Missing parent folders
// are created as well. A creation failure is printed; the path is returned
// regardless so the caller's flow is unchanged.
func createDir(dirName string) string {
	dir := "./images/" + dirName

	// MkdirAll is a no-op for an existing directory, so no prior Stat is needed.
	if err := os.MkdirAll(dir, os.ModePerm); err != nil {
		fmt.Println("createDir error:", err.Error())
	}
	return dir
}

// getUrls loads the entry page and returns the categories found on it as
// {title, href} pairs.
//
// Every a.list-group-item element is considered; anchors whose href is "#"
// are ignored. Titles have spaces and newlines removed, and hrefs that do not
// start with "http" are prefixed with DOMAIN. Each pair is also written to
// log.txt. Pairs are stored contiguously from index 0; unused trailing slots
// of the fixed-size result stay empty. If more pairs are found than the
// result can hold, the remainder is dropped with a message. If the entry page
// cannot be loaded the program exits with status 1.
func getUrls() [100][2]string {
	url := "https://wall.alphacoders.com/finding_wallpapers.php" // entry page

	doc, err := goquery.ParseUrl(url)
	if err != nil {
		fmt.Println("SearchPage error:", err)
		os.Exit(1) // non-zero: nothing can be done without the entry page
	}

	result := [100][2]string{}
	cleaner := strings.NewReplacer(" ", "", "\n", "")

	urls := doc.Find("a.list-group-item")
	length := urls.Length()
	next := 0 // next free slot in result
	for i := 0; i < length; i++ {
		item := urls.Eq(i)
		href := item.Attr("href")

		// Placeholder links carry no category.
		if href == "#" {
			continue
		}

		// Indexing result with i would panic once the page lists more
		// anchors than the array has slots.
		if next == len(result) {
			fmt.Println("SearchPage error: too many categories, ignoring the rest")
			break
		}

		text := cleaner.Replace(item.Text())

		// Relative link: make it absolute.
		if !strings.HasPrefix(href, "http") {
			href = DOMAIN + href
		}

		logger(text+" - "+href, "log.txt")

		result[next][0] = text
		result[next][1] = href
		next++
	}
	return result
}

// logger appends content, prefixed with the standard date and time, as one
// line to logFile. The file is created if it does not exist. If the file
// cannot be opened the program is terminated via log.Fatalln.
func logger(content string, logFile string) {
	// O_WRONLY is required: with only O_APPEND|O_CREATE the file is opened
	// read-only on Unix-like systems and every write fails. O_CREATE already
	// covers the "file does not exist" case. The mode must be octal 0666.
	file, err := os.OpenFile(logFile, os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		log.Fatalln("fail to open log file", err)
	}
	defer file.Close() // called once per entry, so release the handle each time

	log.New(file, "", log.LstdFlags).Println(content)
}

四、打印

控制台 输出的内容如下
在这里插入图片描述

五、日志

log.txt 文件保存的内容如下

2020/05/20 12:31:14 MobileVersion - https://mobile.alphacoders.com
2020/05/20 12:31:14 NewestWallpapers - https://wall.alphacoders.com/newest_wallpapers.php
2020/05/20 12:31:14 FeaturedWallpapers - https://wall.alphacoders.com/featured.php
2020/05/20 12:31:14 CreatorWallpapers - https://wall.alphacoders.com/by_creator.php
2020/05/20 12:31:14 AuthorsAtWallpaperAbyss - https://wall.alphacoders.com/authors.php
2020/05/20 12:31:14 HDWallpapers - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1080
2020/05/20 12:31:14 UltraHD4kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160
2020/05/20 12:31:14 Retina5kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=5120&h=2880
2020/05/20 12:31:14 UltraHD8kWallpapers - https://wall.alphacoders.com/by_resolution.php?w=7680&h=4320
2020/05/20 12:31:14 PopularWallpapers - https://wall.alphacoders.com/popular.php
2020/05/20 12:31:14 CCOWallpapers - https://wall.alphacoders.com/by_license.php?filter=4
2020/05/20 12:31:14 RandomWallpapers - https://wall.alphacoders.com/random.php
2020/05/20 12:31:14 RecentComments - https://wall.alphacoders.com/comments.php
2020/05/20 12:31:14 RecentCaptions - https://wall.alphacoders.com/captions.php
2020/05/20 12:31:14 HighestRated - https://wall.alphacoders.com/highest_rated.php
2020/05/20 12:31:14 MostViewed - https://wall.alphacoders.com/by_views.php
2020/05/20 12:31:14 MostFavorited - https://wall.alphacoders.com/by_favorites.php
2020/05/20 12:31:14 MostCommentedOn - https://wall.alphacoders.com/by_comments.php
2020/05/20 12:31:14 PopularCollections - https://alphacoders.com/collections
2020/05/20 12:31:14 ByTag - https://wall.alphacoders.com/all_tags.php
2020/05/20 12:31:14 ByColor - https://wall.alphacoders.com/by_color.php
2020/05/20 12:31:14 iPhone11 - https://mobile.alphacoders.com/by-device/540/iPhone-11-Wallpapers?ref=wa
2020/05/20 12:31:14 iPhoneX - https://mobile.alphacoders.com/by-device/450/iPhone-X-Wallpapers?ref=wa
2020/05/20 12:31:14 GalaxyNote10 - https://mobile.alphacoders.com/by-device/543/Galaxy-Note10-Wallpapers?ref=wa
2020/05/20 12:31:14 GooglePixel4 - https://mobile.alphacoders.com/by-device/551/Pixel-4-Wallpapers?ref=wa
2020/05/20 12:31:14 18300Abstract - https://wall.alphacoders.com/by_category.php?id=1&name=Abstract+Wallpapers
2020/05/20 12:31:14 53634Animal - https://wall.alphacoders.com/by_category.php?id=2&name=Animal+Wallpapers
2020/05/20 12:31:14 180969Anime - https://wall.alphacoders.com/by_category.php?id=3&name=Anime+Wallpapers
2020/05/20 12:31:14 20318Artistic - https://wall.alphacoders.com/by_category.php?id=4&name=Artistic+Wallpapers
2020/05/20 12:31:14 30116Celebrity - https://wall.alphacoders.com/by_category.php?id=7&name=Celebrity+Wallpapers
2020/05/20 12:31:14 24640Comics - https://wall.alphacoders.com/by_category.php?id=8&name=Comics+Wallpapers
2020/05/20 12:31:14 6087Dark - https://wall.alphacoders.com/by_category.php?id=9&name=Dark+Wallpapers
2020/05/20 12:31:14 61436Earth - https://wall.alphacoders.com/by_category.php?id=10&name=Earth+Wallpapers
2020/05/20 12:31:14 23407Fantasy - https://wall.alphacoders.com/by_category.php?id=11&name=Fantasy+Wallpapers
2020/05/20 12:31:14 17620Food - https://wall.alphacoders.com/by_category.php?id=12&name=Food+Wallpapers
2020/05/20 12:31:14 1545Game - https://wall.alphacoders.com/by_category.php?id=14&name=Game+Wallpapers
2020/05/20 12:31:14 7893Holiday - https://wall.alphacoders.com/by_category.php?id=15&name=Holiday+Wallpapers
2020/05/20 12:31:14 2398Humor - https://wall.alphacoders.com/by_category.php?id=13&name=Humor+Wallpapers
2020/05/20 12:31:14 33340ManMade - https://wall.alphacoders.com/by_category.php?id=16&name=Man+Made+Wallpapers
2020/05/20 12:31:14 523Men - https://wall.alphacoders.com/by_category.php?id=17&name=Men+Wallpapers
2020/05/20 12:31:14 10226Military - https://wall.alphacoders.com/by_category.php?id=18&name=Military+Wallpapers
2020/05/20 12:31:14 4026Misc - https://wall.alphacoders.com/by_category.php?id=19&name=Misc+Wallpapers
2020/05/20 12:31:14 45031Movie - https://wall.alphacoders.com/by_category.php?id=20&name=Movie+Wallpapers
2020/05/20 12:31:14 24587Music - https://wall.alphacoders.com/by_category.php?id=22&name=Music+Wallpapers
2020/05/20 12:31:14 17927Photography - https://wall.alphacoders.com/by_category.php?id=24&name=Photography+Wallpapers
2020/05/20 12:31:14 992Products - https://wall.alphacoders.com/by_category.php?id=25&name=Products+Wallpapers
2020/05/20 12:31:14 3168Religious - https://wall.alphacoders.com/by_category.php?id=26&name=Religious+Wallpapers
2020/05/20 12:31:14 17904SciFi - https://wall.alphacoders.com/by_category.php?id=27&name=Sci+Fi+Wallpapers
2020/05/20 12:31:14 14990Sports - https://wall.alphacoders.com/by_category.php?id=28&name=Sports+Wallpapers
2020/05/20 12:31:14 23041TVShow - https://wall.alphacoders.com/by_category.php?id=29&name=TV+Show+Wallpapers
2020/05/20 12:31:14 4204Technology - https://wall.alphacoders.com/by_category.php?id=30&name=Technology+Wallpapers
2020/05/20 12:31:14 67145Vehicles - https://wall.alphacoders.com/by_category.php?id=31&name=Vehicles+Wallpapers
2020/05/20 12:31:14 90168VideoGame - https://wall.alphacoders.com/by_category.php?id=32&name=Video+Game+Wallpapers
2020/05/20 12:31:14 1913Weapons - https://wall.alphacoders.com/by_category.php?id=34&name=Weapons+Wallpapers
2020/05/20 12:31:14 41289Women - https://wall.alphacoders.com/by_category.php?id=33&name=Women+Wallpapers
2020/05/20 12:31:14 68451280x960 - https://wall.alphacoders.com/by_resolution.php?w=1280&h=960 
2020/05/20 12:31:14 130221280x1024 - https://wall.alphacoders.com/by_resolution.php?w=1280&h=1024 
2020/05/20 12:31:14 41271281x961 - https://wall.alphacoders.com/by_resolution.php?w=1281&h=961 
2020/05/20 12:31:14 42381366x768 - https://wall.alphacoders.com/by_resolution.php?w=1366&h=768 
2020/05/20 12:31:14 60081440x900 - https://wall.alphacoders.com/by_resolution.php?w=1440&h=900 
2020/05/20 12:31:14 55191600x900 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=900 
2020/05/20 12:31:14 20321600x1000 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=1000 
2020/05/20 12:31:14 268361600x1200 - https://wall.alphacoders.com/by_resolution.php?w=1600&h=1200 
2020/05/20 12:31:14 150881680x1050 - https://wall.alphacoders.com/by_resolution.php?w=1680&h=1050 
2020/05/20 12:31:14 1819081920x1080 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1080 
2020/05/20 12:31:14 1109811920x1200 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1200 
2020/05/20 12:31:14 76841920x1280 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1280 
2020/05/20 12:31:14 77931920x1440 - https://wall.alphacoders.com/by_resolution.php?w=1920&h=1440 
2020/05/20 12:31:14 28732000x1333 - https://wall.alphacoders.com/by_resolution.php?w=2000&h=1333 
2020/05/20 12:31:14 37412048x1152 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1152 
2020/05/20 12:31:14 131212048x1365 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1365 
2020/05/20 12:31:14 21762048x1366 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1366 
2020/05/20 12:31:14 29052048x1367 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1367 
2020/05/20 12:31:14 52452048x1536 - https://wall.alphacoders.com/by_resolution.php?w=2048&h=1536 
2020/05/20 12:31:14 142022560x1440 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1440 
2020/05/20 12:31:14 399912560x1600 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1600 
2020/05/20 12:31:14 20882560x1920 - https://wall.alphacoders.com/by_resolution.php?w=2560&h=1920 
2020/05/20 12:31:14 58902880x1800 - https://wall.alphacoders.com/by_resolution.php?w=2880&h=1800 
2020/05/20 12:31:14 52063000x2000 - https://wall.alphacoders.com/by_resolution.php?w=3000&h=2000 
2020/05/20 12:31:14 199733840x2160 - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2160 
2020/05/20 12:31:14 50833840x2400 - https://wall.alphacoders.com/by_resolution.php?w=3840&h=2400 
2020/05/20 12:31:14 18775184x3456 - https://wall.alphacoders.com/by_resolution.php?w=5184&h=3456 
2020/05/20 12:31:14 19555616x3744 - https://wall.alphacoders.com/by_resolution.php?w=5616&h=3744 
2020/05/20 12:31:14 21265760x3840 - https://wall.alphacoders.com/by_resolution.php?w=5760&h=3840 
2020/05/20 12:31:14 26816000x4000 - https://wall.alphacoders.com/by_resolution.php?w=6000&h=4000 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章