python 爬蟲實現
本文使用python3 實現從谷歌學術獲得搜索結果
模擬瀏覽器發送請求
網絡訪問使用請求—應答模型。客戶端發送請求,服務器響應請求。
使用chrome瀏覽器獲得請求方式
在f12開發者模式下,查看請求頭,發現是使用get方法。複製爲url得到請求內容
爲了模擬瀏覽器,所以使用headers。
在headers中可以將cookies刪除,測試不影響
在python中實現
使用urllib中的模塊
數據分析
使用正則表達式
分析html文件。通過正則表達式匹配
代碼塊
import urllib.parse
import urllib.request
import re

# Each Scholar result lives in a '<div class="gs_ri">...' block; all patterns
# are compiled once at module level instead of once per page/section.
_RESULT_RE = re.compile(r'<div class="gs_ri">.*?</div></div></div>')
_LINK_RE = re.compile(r'<a href=".*?"')
_AUTHOR_RE = re.compile(r'<div class="gs_a">.*?</div>')
_ABSTRACT_RE = re.compile(r'<div class="gs_rs">.*?</div>')
# Any remaining HTML tag.  (The original used '<a.*?>', which stripped the
# opening <a ...> tags but left '</a>' behind in the printed author line.)
_TAG_RE = re.compile(r'<.*?>')

# Browser-like headers so Scholar serves a normal HTML page (cookies proved
# unnecessary, per the article text above).
_HEADERS = {
    'Host': 'scholar.google.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Referer': 'https://scholar.google.com/schhp?hl=zh-CN',
    'Connection': 'keep-alive',
}


def fetch_page(keyword, start=0, timeout=120):
    """Download one page of Google Scholar results for *keyword*.

    *start* is the result offset (Scholar paginates results).  Returns the
    decoded HTML as a str.  Propagates urllib.error.URLError on network
    failure, just as the original inline code did.
    """
    # Bug fixes vs. the original: the keyword is URL-encoded (it was pasted
    # raw into the URL, so spaces / non-ASCII broke the request), and the
    # stray "+'234'" appended to the 2nd and 3rd page queries is gone.
    url = ('https://scholar.google.com/scholar?start=%d&hl=en&q=%s&btnG=&lr='
           % (start, urllib.parse.quote(keyword)))
    req = urllib.request.Request(url=url, headers=_HEADERS)
    # Context manager closes the connection; the original leaked every
    # response object.
    with urllib.request.urlopen(req, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')


def parse_results(html):
    """Parse a Scholar result page into a list of result dicts.

    Each dict has keys 'url', 'author' and 'abstract'.  'author' and
    'abstract' are '' when the corresponding div is absent (the original
    crashed with AttributeError on a missing author div).
    """
    results = []
    for block in _RESULT_RE.findall(html):
        link = _LINK_RE.search(block)
        # '<a href="URL"'  ->  URL
        url = link.group()[9:-1] if link else ''

        author = ''
        author_m = _AUTHOR_RE.search(block)
        if author_m:
            # Drop the surrounding '<div class="gs_a">' (18 chars) and
            # '</div>' (6 chars), then strip the inner link tags.
            author = _TAG_RE.sub('', author_m.group()[18:-6]).strip()

        abstract = ''
        abstract_m = _ABSTRACT_RE.search(block)
        if abstract_m:
            # Drop '<div class="gs_rs">' / '</div>' and the bold/br markup.
            abstract = (abstract_m.group()[19:-6]
                        .replace('<b>', ' ')
                        .replace('</b>', ' ')
                        .replace('<br>', ' '))

        results.append({'url': url, 'author': author, 'abstract': abstract})
    return results


def main():
    """Prompt for a query and print three pages of Scholar results."""
    keyword = input("keywords is?\n")
    print(keyword)
    # The original copy-pasted script fetched the pages at offsets 0, 20
    # and 30 (start=20 then start+=10); kept identical here.
    for start in (0, 20, 30):
        html = fetch_page(keyword, start)
        print("connect succeed!")
        results = parse_results(html)
        print("data get")
        print(len(results))
        for item in results:
            print("url:")
            print(item['url'])
            print("author:")
            print(item['author'])
            if item['abstract']:
                print("abstract:")
                print(item['abstract'])
            else:
                print("no abstract")
            print('')


if __name__ == "__main__":
    main()