學習了urllib庫
# Step 1 of the scraper: walk the paginated product list and append the
# detail-page URL of every listed entry to download/url_data.txt, one per line.
import gzip
import os
import re
import sys
import urllib.parse
import urllib.request
import zlib

from lxml import etree

domain = "http://www.ssme.sh.gov.cn"
url = "http://www.ssme.sh.gov.cn/public/search!productList.do"

# Browser-like request headers. The cookie values are hard-coded session data.
# NOTE(review): they presumably expire and must be refreshed before a new run.
header = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
        "image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "Cookie": (
        "userName=%u4E0A%u6D77%u79FB%u52A8; "
        "NTKF_T2D_CLIENTID=guest5854E8D0-B295-3D14-2777-AE30ACA765C8; "
        "ssmevisit=d3ce576d-6285-47c5-9d96-c4b376121d3f; "
        "JSESSIONID=B540D1C65BEC09A8CA504F7056B30B36; "
        "nTalk_CACHE_DATA={uid:fw_1000_ISME9754_guest5854E8D0-B295-3D,tid:1566534429812073}; "
        "ssmevisittemp=cea07490-9887-47a3-beed-c04d9e39c9f1"
    ),
    "Host": "www.ssme.sh.gov.cn",
    "Upgrade-Insecure-Requests": "1",
    # The previous value ended in a stray "}" that had slipped inside the quotes.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    ),
}

FIRST_PAGE = 1
LAST_PAGE = 170
OUTPUT_PATH = "download/url_data.txt"
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever

# Keeps everything from the first "/" of an href onwards.
# NOTE(review): this assumes the hrefs are root-relative ("/public/...");
# an absolute "http://..." href would be mangled. If the site ever emits
# those, switch to urllib.parse.urljoin(url, href).
HREF_PATH = re.compile(r"/.*")


def read_body(response):
    """Return the response payload as bytes, decompressed if necessary.

    The request advertises ``Accept-Encoding: gzip, deflate`` but urllib does
    not undo the content coding itself, so it has to be handled here.
    """
    raw = response.read()
    coding = (response.headers.get("Content-Encoding") or "").lower()
    if coding == "gzip":
        return gzip.decompress(raw)
    if coding == "deflate":
        try:
            return zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw


def fetch_page_links(page_no):
    """POST one ``pageNo`` to the list endpoint and return the hrefs found.

    Returns the ``href`` attribute of every ``<a>`` directly inside a
    ``<div class="g_img">``; an empty list if the body yields no tree.
    Raises ``OSError`` (including ``urllib.error.URLError``) on network errors.
    """
    postdata = urllib.parse.urlencode({"pageNo": str(page_no)}).encode("utf-8")
    request = urllib.request.Request(
        url, headers=header, method="POST", data=postdata
    )
    # The context manager closes the connection even if parsing fails.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        tree = etree.HTML(read_body(response))
    if tree is None:
        return []
    return tree.xpath('//div[@class="g_img"]/a/@href')


def collect_detail_urls():
    """Crawl pages FIRST_PAGE..LAST_PAGE and append the links to OUTPUT_PATH."""
    # open(..., 'a') does not create missing parent directories.
    os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
    # Open the output once instead of once per link.
    with open(OUTPUT_PATH, "a", encoding="utf-8") as f:
        for num in range(FIRST_PAGE, LAST_PAGE + 1):
            try:
                hrefs = fetch_page_links(num)
            except OSError as err:
                # One bad page should not throw away the rest of the crawl.
                print("page", num, "skipped:", err, file=sys.stderr)
                continue
            for item in hrefs:
                found = HREF_PATH.search(str(item))
                if found:
                    f.write(domain + found.group() + "\n")


if __name__ == "__main__":
    collect_detail_urls()
正則中 ^ 和 $ 分別匹配字符串的開頭和結尾(在 re.MULTILINE 多行模式下則是每一行的開頭和結尾)。也就是說,^a 要求字符串必須以 a 開頭:它能匹配 ‘ab’ 開頭的 a,而在 ‘bac’ 中匹配不了任何東西;同理,a$ 要求字符串必須以 a 結尾。
編碼不統一的時候,可以先把文件當作txt文本文件打開,轉換編碼之後即可消除亂碼。字符串編碼(encode)是把字符串變爲字節數據的過程,而解碼(decode)則是把字節數據轉換成字符串的過程。
要讓文件可以持續(追加)寫入,需要把文件的打開模式設爲 ‘a’。
# Step 2 of the scraper: request every URL listed in download/url_data.txt,
# extract the contact fields from the detail page and append them to
# Stu_csv.csv as one row per page.
import csv
import gzip
import re
import sys
import time
import urllib.parse
import urllib.request
import zlib

from lxml import etree

# Browser-like request headers. The cookie values are hard-coded session data.
# NOTE(review): they presumably expire and must be refreshed before a new run.
header = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
        "image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
    "Connection": "keep-alive",
    "Cookie": (
        "userName=%u4E0A%u6D77%u79FB%u52A8; "
        "NTKF_T2D_CLIENTID=guest5854E8D0-B295-3D14-2777-AE30ACA765C8; "
        "ssmevisit=d3ce576d-6285-47c5-9d96-c4b376121d3f; "
        "JSESSIONID=B540D1C65BEC09A8CA504F7056B30B36; "
        "nTalk_CACHE_DATA={uid:fw_1000_ISME9754_guest5854E8D0-B295-3D,tid:1566534429812073}; "
        "ssmevisittemp=351ecff8-92d2-422c-be98-5752ae312bcc"
    ),
    "Host": "www.ssme.sh.gov.cn",
    "Upgrade-Insecure-Requests": "1",
    # The previous value ended in a stray "}" that had slipped inside the quotes.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
    ),
}

URL_LIST_PATH = "download/url_data.txt"
CSV_PATH = "Stu_csv.csv"
REQUEST_TIMEOUT = 30  # seconds; without it a stalled connection blocks forever
DELAY_SECONDS = 3     # pause between two page requests

# XPath of each exported field, listed in CSV column order.
FIELD_XPATHS = (
    '//div[@class="cs_c_info"]//tr[2]/td[1]/text()',   # uname
    '//div[@class="cs_c_info"]//tr[2]/td[2]/text()',   # ucell
    '//div[@class="cs_c_info"]//tr[3]/td[2]/text()',   # utel
    '//div[@class="cs_c_info"]//tr[4]/td/text()',      # umail
    '//div[@class="cs_shop_bg clear"]/span/text()',    # company
    '//div[@class="cs_shop_info"]//tr[3]/td/text()',   # place
)


def read_body(response):
    """Return the response payload as bytes, decompressed if necessary.

    The request advertises ``Accept-Encoding: gzip, deflate`` but urllib does
    not undo the content coding itself, so it has to be handled here.
    """
    raw = response.read()
    coding = (response.headers.get("Content-Encoding") or "").lower()
    if coding == "gzip":
        return gzip.decompress(raw)
    if coding == "deflate":
        try:
            return zlib.decompress(raw)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib header.
            return zlib.decompress(raw, -zlib.MAX_WBITS)
    return raw


def first_or_blank(values):
    """Return the first XPath hit, or "" when the query matched nothing."""
    return values[0] if values else ""


def scrape_contact_row(page_url):
    """Fetch one detail page and return its fields as a fixed-width row.

    The row always has ``len(FIELD_XPATHS)`` cells. A field that is missing
    on the page becomes an empty cell, so later values never slide into the
    wrong CSV column.
    Raises ``OSError`` (including ``urllib.error.URLError``) on network errors
    and ``ValueError`` when ``page_url`` is not a usable URL.
    """
    request = urllib.request.Request(page_url, headers=header, method="GET")
    # The context manager closes the connection even if parsing fails.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        tree = etree.HTML(read_body(response))
    if tree is None:
        return [""] * len(FIELD_XPATHS)
    return [first_or_blank(tree.xpath(path)) for path in FIELD_XPATHS]


def export_contacts():
    """Scrape every URL in URL_LIST_PATH and append one CSV row per page."""
    # Both files are opened once and closed reliably. newline="" is what the
    # csv module asks for; like the former "\n" it disables newline
    # translation on write, so the output bytes are unchanged.
    with open(URL_LIST_PATH, "r", encoding="utf-8") as urls, \
            open(CSV_PATH, "a", newline="", encoding="utf-8") as out:
        csv_write = csv.writer(out, dialect="excel")
        for line in urls:
            page_url = line.strip()
            if not page_url:
                continue  # blank line: nothing to request
            try:
                row = scrape_contact_row(page_url)
            except (OSError, ValueError) as err:
                # One bad page should not throw away the rest of the run.
                print("skipped", page_url, "-", err, file=sys.stderr)
            else:
                print(row)
                csv_write.writerow(row)
                # Keep finished rows on disk if the long run is interrupted.
                out.flush()
            time.sleep(DELAY_SECONDS)


if __name__ == "__main__":
    export_contacts()
Python 的 re(正則模塊)中 search 方法和 match 方法的區別在於:前者會掃描整個字符串,返回第一個匹配到的位置;後者僅僅從字符串開頭開始匹配,若開頭不符合則不予匹配(返回 None)。