有些小瑕疵:不知道爲啥爬取到的標題個數與摘要個數都不是整個頁面的全部,而且兩者也不相等。不過這是第一次做出來,簡單記錄一下啦~
import urllib.request
import re
import os
def url_open(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to fetch, as a string.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    req = urllib.request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36',
    )
    # Open the prepared Request (not the bare URL) so the header set above is
    # actually sent, and close the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        return response.read().decode('utf-8')
def get_title(html):
    """Extract article titles and abstracts from *html* into ``essay.txt``.

    Titles are the text of ``<a>`` tags carrying ``class="title"``; abstracts
    are the text of ``<p>`` tags carrying ``class="abstract"``. Entries are
    paired by position and written to ``essay.txt`` in the current working
    directory as ``<n><title>\\n<abstract>\\n``, numbered from 1.

    Args:
        html: Page markup as a string.

    Returns:
        None. The result is written to ``essay.txt`` (overwriting it).
    """
    # ``[^>]*`` keeps each match inside a single opening tag. With ``.*?`` and
    # re.S a match could begin at an earlier, unrelated tag and swallow
    # several entries, which makes the extracted counts come up short.
    title_pattern = r'<a\b[^>]*\bclass="title"[^>]*>(.*?)</a>'
    abstract_pattern = r'<p\b[^>]*\bclass="abstract"[^>]*>(.*?)</p>'
    titlelist = re.findall(title_pattern, html, re.S)
    abstractlist = re.findall(abstract_pattern, html, re.S)

    # Explicit UTF-8 so non-ASCII titles are written correctly regardless of
    # the platform's default encoding.
    with open(r'essay.txt', 'w', encoding='utf-8') as f:
        for index, title in enumerate(titlelist):
            # Pairing is positional; a title with no abstract at the same
            # position gets an empty abstract instead of raising IndexError.
            abstract = abstractlist[index] if index < len(abstractlist) else ''
            f.write(f'{index + 1}{title}\n{abstract}\n')
if __name__ == '__main__':
    # Raw string keeps the backslash literal instead of relying on the
    # unrecognised escape sequence "\T".
    output_dir = r"E:\Title"
    # exist_ok lets the script be re-run without failing on the existing folder.
    os.makedirs(output_dir, exist_ok=True)
    # get_title writes essay.txt relative to the working directory.
    os.chdir(output_dir)
    url = 'http://www.jianshu.com/'
    get_title(url_open(url))