鋼管門戶官網產品信息爬取-圖片存儲-文字存至excel--Python-request

# -*- coding: utf-8 -*-
"""
Created on Mon Nov 18 14:58:57 2019

@author: Administrator
"""

import urllib.request as request
import lxml.html as html
import xlsxwriter
import time
import random

# Request headers imitating a desktop Chrome browser. In the code shown here
# they are only referenced by the disabled proxy-based fetch path.
headers = {
    'User-Agent': (
        ' Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36'
    ),
}

# HTTP proxy handler. Alternative proxy addresses that are currently disabled:
#   114.239.42.226:9999, 183.154.55.73:9999,
#   1.198.73.115:9999, 183.166.125.221:9999
px = request.ProxyHandler({'http': '182.35.84.97:9999'})
# opener = request.build_opener(px)

# XPath expressions evaluated against the listing pages (class "chanpin")
# and the product detail pages (classes "place" and "content").
xpathNm = '//div[@class = "chanpin"]/ul/li/a/p/text()'
xpathNm2 = '//div[@class = "place"]/text()'
xpathUrl = '//div[@class = "chanpin"]/ul/li/a/@href'
xpathImg = '//div[@class = "chanpin"]/ul/li/a/img/@src'
xpathFenlei = '//div[@class  = "place"]/a/text()'
xpathJieshao = '//div[@class = "content"]/text()'

# Number of the first listing page to crawl.
a = 1
def jiexi(url, xpath, timeout=500):
    """Download *url* and return the XPath matches found in the page.

    The page is fetched directly with ``urlopen`` (the proxy/header based
    opener is not used), decoded as UTF-8 and parsed with ``lxml.html``.

    Args:
        url: Address of the page to download.
        xpath: XPath expression evaluated against the parsed document.
        timeout: Timeout in seconds passed to ``urlopen``. Defaults to 500,
            the value that used to be hard-coded.

    Returns:
        The result of ``dom.xpath(xpath)``; for the text/attribute
        expressions used in this file that is a list of strings.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Errors raised by ``urlopen`` (e.g. ``urllib.error.URLError``)
        propagate unchanged.
    """
    # The context manager closes the HTTP response even if read() or
    # decode() fails, so connections are not leaked across the many
    # requests this script performs.
    with request.urlopen(url, timeout=timeout) as response:
        data = response.read().decode("utf-8")

    dom = html.document_fromstring(data)
    return dom.xpath(xpath)
# Output workbook location. excelPath is only referenced by disabled code in
# this file; the workbook itself is opened from the raw-string path below.
# NOTE(review): xlsxwriter produces the XLSX format, so Excel may warn about
# the ".xls" extension -- confirm whether the file should be named ".xlsx".
excelPath = "C:/Users/Administrator/pachong/excel_1967416.xls"
xl = xlsxwriter.Workbook(
    r'C:\Users\Administrator\pachong\excel_1967416.xls'
)
# Single worksheet that receives every scraped value.
sheet = xl.add_worksheet()
def write_excel_xls_append(x, y, value):
    """Write *value* as a string into one cell of the module-level sheet.

    Args:
        x: Column letter(s) of the target cell, e.g. ``'C'``.
        y: Row number of the target cell; converted with ``str``.
        value: Text to store in the cell.
    """
    # Build an A1-style reference such as "C2" from column and row.
    cell_ref = x + str(y)
    sheet.write_string(cell_ref, value)
    
# Crawl the listing pages list_2_<a>.html up to page LAST_PAGE and fill the
# worksheet, starting at row 2:
#   column C      - image URL of every product on the listing page
#   column D      - text node 3 of the detail page's "place" div
#                   (presumably the product name -- verify against the site)
#   columns A, B  - link texts 2 and 3 of the "place" div
#                   (presumably category breadcrumbs -- verify)
#   column E      - text node 1 of the detail page's "content" div
# Product names could alternatively be read from the listing page via
# xpathNm; that variant is not used.
BASE_URL = "http://www.sdblygc.com"
LAST_PAGE = 12

x = 2  # next worksheet row for image URLs (column C)
y = 2  # next worksheet row for detail-page fields (columns A, B, D, E)

try:
    while a <= LAST_PAGE:
        url = BASE_URL + "/cpzx/list_2_" + str(a) + ".html"

        for imgUrl in jiexi(url, xpathImg):
            write_excel_xls_append('C', x, BASE_URL + imgUrl)
            x += 1

        for linkUrl in jiexi(url, xpathUrl):
            linkUrl = BASE_URL + linkUrl

            name = jiexi(linkUrl, xpathNm2)
            write_excel_xls_append('D', y, name[3])
            print(name[3])

            fenlei = jiexi(linkUrl, xpathFenlei)
            write_excel_xls_append('A', y, fenlei[2])
            write_excel_xls_append('B', y, fenlei[3])

            jieshao = jiexi(linkUrl, xpathJieshao)
            write_excel_xls_append('E', y, jieshao[1])

            y += 1

        a += 1
        # Random 3-5 second pause between listing pages.
        time.sleep(random.randint(3, 5))
finally:
    # Always write the workbook: a network error or an unexpected page layout
    # (IndexError on the fixed indexes above) would otherwise skip close()
    # and discard every row collected so far. The exception still propagates.
    xl.close()

練手保存 不做註釋!

發佈了13 篇原創文章 · 獲贊 14 · 訪問量 4萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章