0. 本程序針對美團網的美食類商家數據,按好評排序進行採集。
要抓取保存的數據爲:
商家名 類型 地理位置 評論人數 均價 最低價格
1. 首先編寫網頁數據採集函數,使用 urllib.request 採集網頁源碼,具體實現如下:
def getHtml(url, timeout=30):
    """Download *url* and return the response body decoded as UTF-8.

    A browser-like ``User-Agent`` header is sent with the request.

    Args:
        url: Address to fetch; anything ``urllib.request`` can open.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the scraper.

    Returns:
        The page source as ``str``.

    Raises:
        urllib.error.URLError: The request could not be completed.
        UnicodeDecodeError: The body is not valid UTF-8.
    """
    user_agent = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [user_agent]
    # The context manager closes the connection even if read() fails.
    with opener.open(url, timeout=timeout) as response:
        raw = response.read()
    return raw.decode('utf-8')
2.根據網頁源碼解析獲取已上線城市的url
class GetCityUrl(HTMLParser):
    """Collect city links from the city-selection page.

    Every ``<a>`` tag carrying the attribute ``gaevent="changecity/build"``
    contributes one entry: its ``href`` is the key and the value is that
    same ``href`` with ``/category/meishi/all/rating`` appended.
    """

    # Attribute pair that marks an <a> tag as a city link.
    part = ('gaevent', 'changecity/build')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance mapping; a class-level dict would be shared by
        # every parser ever created.
        self.urldic = {}

    def handle_starttag(self, tag, attrs):
        """Record the href of each matching ``<a>`` tag."""
        if tag != 'a' or self.part not in attrs:
            return
        for name, value in attrs:
            # A valueless or empty href cannot be turned into a URL.
            if name == 'href' and value:
                self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        """Return the mapping of city URL to its rating-sorted food URL."""
        return self.urldic
3.獲取分頁url
class GetPages(HTMLParser):
    """Collect pagination links from one listing page.

    Every ``<a>`` whose ``href`` contains the text ``page`` is recorded
    once, prefixed with the base URL given to :meth:`setInitUrl`.
    """

    flg = 0  # unused; kept so existing attribute access keeps working

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would leak the pages of
        # previously parsed documents into every new parser.
        self.pagelist = []
        self.temphref = ''
        self.initurl = ''

    def setInitUrl(self, url):
        """Set the prefix that is prepended to every collected href."""
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        """Append each not-yet-seen pagination link to the page list."""
        if tag != 'a':
            return
        for name, value in attrs:
            # value is None for a valueless attribute such as <a href>.
            if name == 'href' and value and 'page' in value:
                self.temphref = self.initurl + value
                if self.temphref not in self.pagelist:
                    self.pagelist.append(self.temphref)

    def getList(self):
        """Return the collected page URLs in first-seen order."""
        return self.pagelist
4.解析網頁源碼 獲取有效信息
class MyHTMLParser(HTMLParser):
    """Extract merchant records from a listing page.

    Text nodes are accumulated into one tab-separated string. A ``<div>``
    whose class contains ``poi-tile-nodeal`` restarts the accumulation.
    When the sixth ``</div>`` since the last restart is seen, the string
    is stored as a record if it contains a price sign and has six fields;
    a five-field string gets a placeholder location inserted first.

    Args:
        results: Optional list that receives the finished records. When
            omitted, records go to the module-level ``arraystr`` list.
        **kwargs: Forwarded to ``HTMLParser``.
    """

    tempstr = str()
    divsum = int()

    def __init__(self, results=None, **kwargs):
        super().__init__(**kwargs)
        self.results = results
        self.tempstr = ''
        self.divsum = 0

    def handle_starttag(self, tag, attrs):
        """Restart accumulation at the opening tag of a merchant tile."""
        if tag != 'div':
            return
        for name, value in attrs:
            # value is None for a valueless attribute such as <div class>.
            if name == 'class' and value and 'poi-tile-nodeal' in value:
                self.tempstr = ''
                self.divsum = 0

    def handle_data(self, data):
        """Fold one text node into the record being built."""
        if not data or data.isspace():
            return
        if data == '¥':
            # No earlier price sign: fill the column before this price
            # with a placeholder.
            if '¥' not in self.tempstr:
                self.tempstr += '無' + '\t'
            self.tempstr += data
        elif data == '人評價':
            # Glue the label onto the preceding field by dropping the
            # tab that ended it.
            self.tempstr = self.tempstr[:-1] + data + '\t'
        elif data == '人均 ':
            self.tempstr += '人均'
        elif data.startswith('起'):
            self.tempstr = self.tempstr[:-1] + '起'
        else:
            self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        """Count closing divs and emit the record on the sixth one."""
        if tag != 'div':
            return
        self.divsum += 1
        if self.divsum != 6:
            return
        if '¥' in self.tempstr:
            fields = self.tempstr.split('\t')
            if len(fields) == 5:
                # Insert a placeholder as the third field. Joining adds
                # no trailing tab, so the repaired record really has six
                # fields and passes the check below.
                fields.insert(2, '無位置信息')
                self.tempstr = '\t'.join(fields)
            if len(fields) == 6:
                sink = self.results if self.results is not None else arraystr
                sink.append(self.tempstr)
        self.divsum = 0
        self.tempstr = ''
5.將信息存放於Excel中
def SaveExcel(listdata, filename='test.xls'):
    """Write the collected records to an Excel workbook.

    Row 0 holds the fixed column headers. Each following row is one
    element of *listdata*, split on tab characters with one cell per
    piece.

    Args:
        listdata: Iterable of tab-separated record strings.
        filename: Path of the ``.xls`` file to write.
    """
    header = ['商家名', '類型', '地理位置', '評論人數', '均價', '最低價格']
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    # Data rows start at 1, directly below the header row.
    for row, record in enumerate(listdata, start=1):
        for col, cell in enumerate(record.split('\t')):
            sheet.write(row, col, cell)
    workbook.save(filename)
以下是Excel中的數據:
附錄完整代碼:
# encoding: utf-8
'''
Scrape Meituan restaurant listings (sorted by rating) into an Excel file.

Created on 2016-07-22
python version 3.5
@author: baalhuo
'''
from html.parser import HTMLParser
import re
import urllib.request
import xlwt
import time

# Collected merchant records, one tab-separated string per merchant.
arraystr = list()

# Number of cities processed before the run stops.
MAX_CITIES = 4

TIME_FORMAT = "%Y-%m-%d %H:%M:%S"


# Parse the page source and extract the useful information.
class MyHTMLParser(HTMLParser):
    """Extract merchant records from a listing page.

    Text nodes are accumulated into one tab-separated string. A ``<div>``
    whose class contains ``poi-tile-nodeal`` restarts the accumulation.
    When the sixth ``</div>`` since the last restart is seen, the string
    is stored as a record if it contains a price sign and has six fields;
    a five-field string gets a placeholder location inserted first.

    Args:
        results: Optional list that receives the finished records. When
            omitted, records go to the module-level ``arraystr`` list.
        **kwargs: Forwarded to ``HTMLParser``.
    """

    tempstr = str()
    divsum = int()

    def __init__(self, results=None, **kwargs):
        super().__init__(**kwargs)
        self.results = results
        self.tempstr = ''
        self.divsum = 0

    def handle_starttag(self, tag, attrs):
        """Restart accumulation at the opening tag of a merchant tile."""
        if tag != 'div':
            return
        for name, value in attrs:
            # value is None for a valueless attribute such as <div class>.
            if name == 'class' and value and 'poi-tile-nodeal' in value:
                self.tempstr = ''
                self.divsum = 0

    def handle_data(self, data):
        """Fold one text node into the record being built."""
        if not data or data.isspace():
            return
        if data == '¥':
            # No earlier price sign: fill the column before this price
            # with a placeholder.
            if '¥' not in self.tempstr:
                self.tempstr += '無' + '\t'
            self.tempstr += data
        elif data == '人評價':
            # Glue the label onto the preceding field by dropping the
            # tab that ended it.
            self.tempstr = self.tempstr[:-1] + data + '\t'
        elif data == '人均 ':
            self.tempstr += '人均'
        elif data.startswith('起'):
            self.tempstr = self.tempstr[:-1] + '起'
        else:
            self.tempstr += data + '\t'

    def handle_endtag(self, tag):
        """Count closing divs and emit the record on the sixth one."""
        if tag != 'div':
            return
        self.divsum += 1
        if self.divsum != 6:
            return
        if '¥' in self.tempstr:
            fields = self.tempstr.split('\t')
            if len(fields) == 5:
                # Insert a placeholder as the third field. Joining adds
                # no trailing tab, so the repaired record really has six
                # fields and passes the check below.
                fields.insert(2, '無位置信息')
                self.tempstr = '\t'.join(fields)
            if len(fields) == 6:
                sink = self.results if self.results is not None else arraystr
                sink.append(self.tempstr)
        self.divsum = 0
        self.tempstr = ''


# Collect the URLs of the cities Meituan serves
# (844 cities/regions according to the original author's note).
class GetCityUrl(HTMLParser):
    """Collect city links from the city-selection page.

    Every ``<a>`` tag carrying the attribute ``gaevent="changecity/build"``
    contributes one entry: its ``href`` is the key and the value is that
    same ``href`` with ``/category/meishi/all/rating`` appended.
    """

    # Attribute pair that marks an <a> tag as a city link.
    part = ('gaevent', 'changecity/build')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance mapping; a class-level dict would be shared by
        # every parser ever created.
        self.urldic = {}

    def handle_starttag(self, tag, attrs):
        """Record the href of each matching ``<a>`` tag."""
        if tag != 'a' or self.part not in attrs:
            return
        for name, value in attrs:
            # A valueless or empty href cannot be turned into a URL.
            if name == 'href' and value:
                self.urldic[value] = value + '/category/meishi/all/rating'

    def getUrl(self):
        """Return the mapping of city URL to its rating-sorted food URL."""
        return self.urldic


# Collect the pagination URLs.
class GetPages(HTMLParser):
    """Collect pagination links from one listing page.

    Every ``<a>`` whose ``href`` contains the text ``page`` is recorded
    once, prefixed with the base URL given to :meth:`setInitUrl`.
    """

    flg = 0  # unused; kept so existing attribute access keeps working

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Per-instance state: a class-level list would leak the pages of
        # previously parsed cities into every new parser.
        self.pagelist = []
        self.temphref = ''
        self.initurl = ''

    def setInitUrl(self, url):
        """Set the prefix that is prepended to every collected href."""
        self.initurl = url

    def handle_starttag(self, tag, attrs):
        """Append each not-yet-seen pagination link to the page list."""
        if tag != 'a':
            return
        for name, value in attrs:
            # value is None for a valueless attribute such as <a href>.
            if name == 'href' and value and 'page' in value:
                self.temphref = self.initurl + value
                if self.temphref not in self.pagelist:
                    self.pagelist.append(self.temphref)

    def getList(self):
        """Return the collected page URLs in first-seen order."""
        return self.pagelist


# Download the page source.
def getHtml(url, timeout=30):
    """Download *url* and return the response body decoded as UTF-8.

    A browser-like ``User-Agent`` header is sent with the request.

    Args:
        url: Address to fetch; anything ``urllib.request`` can open.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the scraper.

    Returns:
        The page source as ``str``.

    Raises:
        urllib.error.URLError: The request could not be completed.
        UnicodeDecodeError: The body is not valid UTF-8.
    """
    user_agent = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11')
    opener = urllib.request.build_opener()
    opener.addheaders = [user_agent]
    # The context manager closes the connection even if read() fails.
    with opener.open(url, timeout=timeout) as response:
        raw = response.read()
    return raw.decode('utf-8')


# Save the collected information to Excel.
def SaveExcel(listdata, filename='e:/test3.xls'):
    """Write the collected records to an Excel workbook.

    Row 0 holds the fixed column headers. Each following row is one
    element of *listdata*, split on tab characters with one cell per
    piece.

    Args:
        listdata: Iterable of tab-separated record strings.
        filename: Path of the ``.xls`` file to write.
    """
    header = ['商家名', '類型', '地理位置', '評論人數', '均價', '最低價格']
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("sheet1")
    for col, title in enumerate(header):
        sheet.write(0, col, title)
    # Data rows start at 1, directly below the header row.
    for row, record in enumerate(listdata, start=1):
        for col, cell in enumerate(record.split('\t')):
            sheet.write(row, col, cell)
    workbook.save(filename)


def _parse_listing(html):
    """Feed one listing page to a fresh parser that fills ``arraystr``."""
    # A new parser per document keeps half-built records and buffered
    # markup from one page out of the next.
    parser = MyHTMLParser()
    parser.feed(html)
    parser.close()


par = GetCityUrl()
par.feed(getHtml('http://www.meituan.com/index/changecity/initiative'))
par.close()
urldic = par.getUrl()

print(time.strftime(TIME_FORMAT))
# Only the first MAX_CITIES cities are scraped.
for city_url, listing_url in list(urldic.items())[:MAX_CITIES]:
    data = getHtml(listing_url)
    getpage = GetPages()
    getpage.setInitUrl(city_url)
    getpage.feed(data)
    getpage.close()
    _parse_listing(data)
    for page_url in getpage.getList():
        _parse_listing(getHtml(page_url))
    # Marker row separating one city's records from the next.
    arraystr.append('切換地區 ')
SaveExcel(arraystr)
print(time.strftime(TIME_FORMAT))
print('Done')
學之,以記之。
使用Python抓取美團數據存於Excel中
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.