GO 爬ZOL手機信息、練手項目

package demo

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strings"

	"github.com/axgle/mahonia"
)

// ZolSpider groups the scraping helpers in this file (page download,
// listing-page parsing and product-page parsing) as methods.
type ZolSpider struct {
	// indexUrl is not read or written by any code visible in this file.
	indexUrl string
}

// ConvertToString re-encodes src in two passes: it is first decoded with the
// mahonia decoder registered under srcCode, and the decoded text is then run
// through the decoder registered under tagCode, whose output is returned.
//
// mahonia.NewDecoder returns nil when the charset name is unknown, and calling
// a method on a nil decoder panics. To keep callers alive, an unknown srcCode
// yields src unchanged and an unknown tagCode yields the result of the first
// pass.
func (s ZolSpider) ConvertToString(src string, srcCode string, tagCode string) string {
	srcCoder := mahonia.NewDecoder(srcCode)
	if srcCoder == nil {
		return src
	}
	decoded := srcCoder.ConvertString(src)

	tagCoder := mahonia.NewDecoder(tagCode)
	if tagCoder == nil {
		return decoded
	}
	// eof=true: the whole input is available, so an incomplete trailing
	// sequence is finalised instead of being left for a later call.
	_, cdata, err := tagCoder.Translate([]byte(decoded), true)
	if err != nil {
		return decoded
	}
	return string(cdata)
}

// readUrlBody issues a GET request for url with browser-like Accept,
// User-Agent and Referer headers, reads the whole response body, and returns
// it after passing it through ConvertToString with "GBK" and "UTF-8".
//
// A non-nil error is returned when the request cannot be built or sent, when
// the server answers with a status other than 200, or when the body cannot be
// read; in all of those cases the returned string is empty.
//
// NOTE(review): the client has no timeout, so a stalled server blocks the
// caller indefinitely. Setting http.Client.Timeout needs the "time" import.
func (s ZolSpider) readUrlBody(url string) (string, error) {
	request, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return "", fmt.Errorf("building request for %q: %v", url, err)
	}
	request.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	request.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36")
	request.Header.Set("Referer", "http://www.baidu.com")

	client := &http.Client{}
	response, err := client.Do(request)
	if err != nil {
		// response is nil here; touching response.Body would panic.
		return "", fmt.Errorf("fetching %q: %v", url, err)
	}
	// Register the close before reading so the body is released on every path.
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		return "", fmt.Errorf("fetching %q: unexpected status %s", url, response.Status)
	}

	body, err := ioutil.ReadAll(response.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %q: %v", url, err)
	}
	return s.ConvertToString(string(body), "GBK", "UTF-8"), nil
}

// Patterns used by catchCategoryUrl, compiled once instead of on every call.
var (
	// zolMoreParamsLinkRe captures the href of each anchor with class "more"
	// whose text is the "more parameters >>" label.
	zolMoreParamsLinkRe = regexp.MustCompile(`<a class="more" href="(.*?)" target="_blank">更多參數&gt;&gt;<\/a>`)
	// zolPriceTypeRe captures the inner text of each <b class="price-type">.
	zolPriceTypeRe = regexp.MustCompile(`<b class="price-type">(.*?)</b>`)
)

// zolFirstGroups returns the first capture group of every match of re in text.
func zolFirstGroups(re *regexp.Regexp, text string) []string {
	matches := re.FindAllStringSubmatch(text, -1)
	groups := make([]string, len(matches))
	for i, m := range matches {
		groups[i] = m[1]
	}
	return groups
}

// catchCategoryUrl downloads the page at url and extracts two lists from it:
// the href values of the "more parameters" links, and the texts of the
// price-type elements. The two lists are collected independently, so their
// lengths may differ.
//
// If the page cannot be downloaded the error is printed and two empty slices
// are returned.
func (s ZolSpider) catchCategoryUrl(url string) ([]string, []string) {
	body, err := s.readUrlBody(url)
	if err != nil {
		fmt.Println(err)
		return []string{}, []string{}
	}
	return zolFirstGroups(zolMoreParamsLinkRe, body), zolFirstGroups(zolPriceTypeRe, body)
}
// Patterns used by catchProductInfo, compiled once instead of on every call.
var (
	// zolParamSectionRe selects the text from the detailed-parameters div up
	// to the first "copytable" cell.
	zolParamSectionRe = regexp.MustCompile(`<div class="detailed-parameters">(?sU:.*)<td class="copytable" colspan="2">`)
	// zolLinkNoiseRe matches data-rel attributes and whole <a>...</a> elements.
	zolLinkNoiseRe = regexp.MustCompile(`data-rel=\'(.*?)\'|<a(.*?)>(.*?)<\/a>`)
	// zolSpanIDRe matches <span id="...">text</span>; group 2 is the text.
	zolSpanIDRe = regexp.MustCompile(`<span id="(.*?)">(.*?)</span>`)
	// zolProNameRe matches the proName: '...' assignment in the page.
	zolProNameRe = regexp.MustCompile(`proName\: \'(?sU:.*)\'`)
	// zolManuNameRe matches the manuName: '...' assignment in the page.
	// The original pattern began with "|", so its empty first alternative
	// matched at offset 0 and the result was always "".
	zolManuNameRe = regexp.MustCompile(`manuName\: \'(?sU:.*)\'`)
	// zolRowRe matches one <tr>...</tr> table row.
	zolRowRe = regexp.MustCompile(`<tr>(?sU:.*)<\/tr>`)
	// zolCellRe matches one <td> or <th> cell. The original class "[d|h]"
	// also accepted a literal "|".
	zolCellRe = regexp.MustCompile(`<t[dh](.*?)>(?sU:.*)<\/t[dh]>`)
)

// catchProductInfo downloads the product page at url, prints the proName and
// manuName assignments found in it, and then prints one "left : right" line
// for every parameter-table row that has at least two cells, with markup
// stripped by trimHtml.
//
// The function only prints; it always returns the empty string. If the page
// cannot be downloaded the error is printed and nothing else is output.
func (s ZolSpider) catchProductInfo(url string) string {
	body, err := s.readUrlBody(url)
	if err != nil {
		fmt.Println(err)
		return ""
	}

	// Narrow to the parameter table, then strip markup that would otherwise
	// end up in the printed cell text.
	section := zolParamSectionRe.FindString(body)
	section = zolLinkNoiseRe.ReplaceAllString(section, "")
	section = zolSpanIDRe.ReplaceAllString(section, "$2")

	// Applied one after another on purpose: a single-pass replacer could
	// behave differently where one substitution exposes another.
	section = strings.Replace(section, "<br />", ",", -1)
	section = strings.Replace(section, "<span></i>>", "", -1)
	section = strings.Replace(section, "&nbsp;", " ", -1)
	section = strings.Replace(section, " class=\"hover-edit-param\"", "", -1)
	section = strings.Replace(section, "<em class=\"edit-param\" data-role=\"user-login\" >糾錯</em>", "", -1)

	fmt.Println(zolProNameRe.FindString(body))
	fmt.Println(zolManuNameRe.FindString(body))

	for _, row := range zolRowRe.FindAllString(section, -1) {
		cells := zolCellRe.FindAllString(row, -1)
		// Rows with fewer than two cells (including none at all) carry no
		// name/value pair; indexing them would panic.
		if len(cells) < 2 {
			continue
		}
		fmt.Println(trimHtml(cells[0]), ":", trimHtml(cells[1]))
	}

	return ""
}

// Patterns used by trimHtml, compiled once instead of on every call.
var (
	// trimHtmlTagRe matches anything between "<" and the nearest ">".
	trimHtmlTagRe = regexp.MustCompile("\\<[\\S\\s]+?\\>")
	// trimHtmlStyleRe matches a lower-case <style ...>...</style> element.
	trimHtmlStyleRe = regexp.MustCompile("\\<style[\\S\\s]+?\\</style\\>")
	// trimHtmlScriptRe matches a lower-case <script ...>...</script> element.
	trimHtmlScriptRe = regexp.MustCompile("\\<script[\\S\\s]+?\\</script\\>")
	// trimHtmlSpaceRe matches a run of two or more whitespace characters.
	trimHtmlSpaceRe = regexp.MustCompile("\\s{2,}")
)

// trimHtml strips HTML markup from src and returns the remaining text.
//
// Tags are lower-cased first so that the style/script patterns also catch
// upper-case markup. Style and script elements are then removed together
// with their content, every remaining tag becomes a newline, runs of two or
// more whitespace characters collapse to a single newline, and surrounding
// whitespace is trimmed.
func trimHtml(src string) string {
	// Lower-case every tag (text outside angle brackets is untouched).
	src = trimHtmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	// Drop <style> and <script> elements including their bodies.
	src = trimHtmlStyleRe.ReplaceAllString(src, "")
	src = trimHtmlScriptRe.ReplaceAllString(src, "")
	// Replace each remaining tag with a line break.
	src = trimHtmlTagRe.ReplaceAllString(src, "\n")
	// Collapse consecutive whitespace into one line break.
	src = trimHtmlSpaceRe.ReplaceAllString(src, "\n")
	return strings.TrimSpace(src)
}

// run fetches the listing page at url and calls catchProductInfo for the
// detail links found on it, stopping after the fourth link. A numbered
// separator line is printed after each of the first three products.
func (s ZolSpider) run(url string) {
	detailPaths, _ := s.catchCategoryUrl(url)
	for idx := 0; idx < len(detailPaths); idx++ {
		s.catchProductInfo("http://detail.zol.com.cn" + detailPaths[idx])
		// The limit is checked after the fetch, so index 3 is still processed
		// but gets no trailing separator.
		if idx > 2 {
			return
		}
		fmt.Println("****************************************************", idx+1, "*******************************************")
	}
}
// Spider is the package entry point: it runs a ZolSpider against the first
// page of the ZOL cell-phone listing.
func Spider() {
	// Leftover notes from an earlier version of the crawler:
	//   oldUrl  = 'http://detail.zol.com.cn/history/subcate57_0_1_0_1_%d.html'
	//   maxPage = 104
	//   oldPage = 200
	const listingPage = "http://detail.zol.com.cn/cell_phone_index/subcate57_0_list_1_0_1_1_0_1.html"

	crawler := &ZolSpider{}
	crawler.run(listingPage)
}

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章