Web scraping project notes, August 23

Studied the urllib library.

import urllib.request, urllib.parse
from lxml import etree
import re

domain = "http://www.ssme.sh.gov.cn"
url = "http://www.ssme.sh.gov.cn/public/search!productList.do"
header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
          "Connection": "keep-alive",
          "Content-Type": "application/x-www-form-urlencoded",
          "Cookie": "userName=%u4E0A%u6D77%u79FB%u52A8; NTKF_T2D_CLIENTID=guest5854E8D0-B295-3D14-2777-AE30ACA765C8; ssmevisit=d3ce576d-6285-47c5-9d96-c4b376121d3f; JSESSIONID=B540D1C65BEC09A8CA504F7056B30B36; nTalk_CACHE_DATA={uid:fw_1000_ISME9754_guest5854E8D0-B295-3D,tid:1566534429812073}; ssmevisittemp=cea07490-9887-47a3-beed-c04d9e39c9f1",
          "Host": "www.ssme.sh.gov.cn",
          "Upgrade-Insecure-Requests": "1",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}

for num in range(1, 171):
    # The product list is paginated; POST the page number as form data.
    data = {"pageNo": str(num)}
    postdata = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(url, headers=header, method="POST", data=postdata)
    response = urllib.request.urlopen(request)

    # Parse the page and collect the relative links to each product's detail page.
    html = etree.HTML(response.read())
    find = html.xpath('//div[@class="g_img"]/a/@href')

    for item in find:
        s = str(item)
        searchObject = re.search(r'/.*', s)
        if searchObject:
            # Append the absolute URL to a text file, one per line.
            with open('download/url_data.txt', 'a') as f:
                f.write(domain + searchObject.group() + '\n')

In a regular expression, ^ and $ anchor the match to the start and end of the string, i.e. the string has to begin (or end) with the given character: ^a will match in 'ab' but finds nothing in 'bac'.
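For example (throwaway strings, just to show the anchors):

import re

print(re.search(r'^a', 'ab'))   # <re.Match object; span=(0, 1), match='a'>
print(re.search(r'^a', 'bac'))  # None: the string does not start with 'a'
print(re.search(r'c$', 'bac'))  # matches, because the string ends with 'c'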

When encodings are inconsistent, you can open the file as a plain .txt file first and convert its encoding there; after the conversion the garbled characters are gone. Encoding a string turns it into bytes, while decoding is the process of turning bytes back into a string.
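A minimal sketch of encode/decode (the string here is just an example):

s = '上海'                                    # str
b = s.encode('utf-8')                         # encode: str -> bytes
print(b)                                      # b'\xe4\xb8\x8a\xe6\xb5\xb7'
print(b.decode('utf-8'))                      # decode: bytes -> str, prints 上海
print(b.decode('gbk', errors='replace'))      # decoding with the wrong codec is what produces mojibake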

To keep appending to a file across writes, open it with mode 'a'.
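For instance (made-up file name), each run of this adds one more line instead of overwriting the file:

with open('notes.txt', 'a') as f:   # 'a' appends; 'w' would truncate the file first
    f.write('one more line\n')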

import urllib.request
from lxml import etree
import csv
import time

header = {"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
          "Accept-Encoding": "gzip, deflate",
          "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
          "Connection": "keep-alive",
          "Cookie": "userName=%u4E0A%u6D77%u79FB%u52A8; NTKF_T2D_CLIENTID=guest5854E8D0-B295-3D14-2777-AE30ACA765C8; ssmevisit=d3ce576d-6285-47c5-9d96-c4b376121d3f; JSESSIONID=B540D1C65BEC09A8CA504F7056B30B36; nTalk_CACHE_DATA={uid:fw_1000_ISME9754_guest5854E8D0-B295-3D,tid:1566534429812073}; ssmevisittemp=351ecff8-92d2-422c-be98-5752ae312bcc",
          "Host": "www.ssme.sh.gov.cn",
          "Upgrade-Insecure-Requests": "1",
          "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}

for line in open('download/url_data.txt', 'r'):
    l = list()
    url = line.strip()  # drop the trailing newline, otherwise the request URL is invalid
    request = urllib.request.Request(url, headers=header, method="GET")
    response = urllib.request.urlopen(request)

    # Each detail page holds contact information in two tables; pull the fields with XPath.
    html = etree.HTML(response.read())
    uname = html.xpath('//div[@class="cs_c_info"]//tr[2]/td[1]/text()')
    ucell = html.xpath('//div[@class="cs_c_info"]//tr[2]/td[2]/text()')
    utel = html.xpath('//div[@class="cs_c_info"]//tr[3]/td[2]/text()')
    umail = html.xpath('//div[@class="cs_c_info"]//tr[4]/td/text()')
    company = html.xpath('//div[@class="cs_shop_bg clear"]/span/text()')
    place = html.xpath('//div[@class="cs_shop_info"]//tr[3]/td/text()')

    # Some fields may be missing on a page, so only append the ones that were found.
    if uname:
        l.append(uname[0])
    if ucell:
        l.append(ucell[0])
    if utel:
        l.append(utel[0])
    if umail:
        l.append(umail[0])
    if company:
        l.append(company[0])
    if place:
        l.append(place[0])
    print(l)

    # Append the row to the CSV; 'with' makes sure the file handle is closed each iteration.
    with open('Stu_csv.csv', 'a', newline='', encoding='utf-8') as out:
        csv_write = csv.writer(out, dialect='excel')
        csv_write.writerow(l)

    time.sleep(3)  # pause between requests to go easy on the server

In Python's re module, the difference between search and match is that search looks through the whole string, while match only tries from the beginning of the string and fails if the start does not match.
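A quick illustration with throwaway strings:

import re

print(re.search(r'\d+', 'abc123'))   # finds '123': search scans the whole string
print(re.match(r'\d+', 'abc123'))    # None: match only tries at position 0
print(re.match(r'\d+', '123abc'))    # finds '123', because the string starts with digits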
