以下腳本爬取的是百度資訊(新聞)搜索的內容;爬取網頁搜索結果也是一樣的寫法,把請求地址換一下即可:
from bs4 import BeautifulSoup
import re
import requests
import string
import json
import tkinter
import tkinter.messagebox
import xlwt
# Prepare the output workbook and write the header row for the Baidu results.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('baidu', cell_overwrite_ok=True)
for col, title in enumerate(('標題', '時間', '發佈者', '內容', '鏈接')):
    worksheet.write(0, col, label=title)
def search():
    """Scrape Baidu News search results into baidu.xls.

    Reads the search keyword from the global ``var`` entry and the number
    of result pages from ``var2``, fetches each result page, and appends
    one row per result (title, time, publisher, summary, link) to the
    module-level ``worksheet``. Saves ``workbook`` and shows a completion
    dialog when done.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    sl = var.get()
    pageNum = int(var2.get()) + 1
    index = 1  # next worksheet row; row 0 holds the header
    for j in range(1, pageNum):
        # timeout so a stalled request cannot freeze the Tk mainloop forever
        response = requests.get(
            'https://www.baidu.com/s?rtt=1&bsst=1&cl=2&tn=news&word=' + sl
            + '&x_bfe_rqs=03E80&x_bfe_tjscore=0.650971&tngroupname=organic_news'
            + '&newVideo=12&rsv_dl=news_b_pn&pn=' + str((j - 1) * 10),
            headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')
        links = soup.find_all('h3')                               # result titles
        names = soup.find_all("p", class_="c-author")             # publisher/time line
        details = soup.find_all("div", class_="c-summary c-row")  # summary containers
        # The three lists can have mismatched lengths when the page layout
        # varies; iterate only over complete rows instead of raising IndexError.
        for k in range(min(len(links), len(names), len(details))):
            divSp = details[k].div
            if divSp is not None:
                divSp.clear()
            hr = links[k].a
            if hr is None:  # an <h3> without an anchor is not a result entry
                continue
            worksheet.write(index, 0, label=hr.get_text().replace('\n', '').replace('\t', ''))
            # The c-author text is two fields separated by a double NBSP;
            # normalise it to '*' and split at most once.
            nameAndTime = names[k].get_text().replace('\n', '').replace('\t', '') \
                                            .replace('\xa0\xa0', '*').split('*', 1)
            worksheet.write(index, 1, label=nameAndTime[0])
            # Guard: some rows carry only one field, so [1] may not exist.
            worksheet.write(index, 2, label=nameAndTime[1] if len(nameAndTime) > 1 else '')
            tag = details[k].p  # drop the author line only after it was written above
            if tag is not None:
                tag.clear()
            tag1 = details[k].span
            if tag1 is not None:
                tag1.clear()
            worksheet.write(index, 3, label=details[k].get_text().replace('\n', '').replace('\t', ''))
            worksheet.write(index, 4, label=hr.get('href'))
            index += 1
    workbook.save('baidu.xls')
    tkinter.messagebox.showinfo('提示', '導出成功!')
# --- GUI: keyword entry, page-count entry, and the button that runs search() ---
top = tkinter.Tk()
top.geometry('300x300')

var = tkinter.StringVar(value='常州')   # search keyword, read by search()
tkinter.Entry(top, textvariable=var).pack()

var2 = tkinter.StringVar(value='1')     # number of result pages to fetch
tkinter.Entry(top, textvariable=var2).pack()

tkinter.Button(top, text='百度搜索', width=10, height=1, command=search).pack()
top.mainloop()
以下腳本爬取的是搜狗微信搜索的文章內容:
from bs4 import BeautifulSoup
import re
import requests
import string
import json
import time, datetime
import tkinter
import tkinter.messagebox
import xlwt
# Prepare the output workbook and write the header row for the Sogou results.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('sougou', cell_overwrite_ok=True)
for col, title in enumerate(('標題', '說明', '發佈者', '時間', '鏈接')):
    worksheet.write(0, col, label=title)
def search():
    """Scrape Sogou WeChat article search results into sougou.xls.

    Reads the search keyword from the global ``var`` entry and the number
    of result pages from ``var2``, fetches each result page, and appends
    one row per article (title, description, publisher, time, link) to
    the module-level ``worksheet``. Saves ``workbook`` and shows a
    completion dialog when done.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    sl = var.get()
    pageNum = int(var2.get()) + 1
    index = 1  # next worksheet row; row 0 holds the header
    for j in range(1, pageNum):
        url = ('https://weixin.sogou.com/weixin?query=' + sl
               + '&_sug_type_=&sut=1244&lkt=1%2C1587459452091%2C1587459452091'
               + '&s_from=input&_sug_=n&type=2&sst0=1587459452192&page=' + str(j)
               + '&ie=utf8&w=01019900&dr=1')
        # timeout so a stalled request cannot freeze the Tk mainloop forever
        response = requests.get(url, headers=headers, timeout=10)
        # Strip the inline <script>document.write(timeConvert(...))</script>
        # wrapper so the raw timestamp digits remain in the page text.
        html = response.text.replace('<script>document.write(timeConvert(', '') \
                            .replace('))</script>', '')
        soup = BeautifulSoup(html, 'lxml')
        links = soup.find_all('h3')                       # article titles
        details = soup.find_all("p", class_="txt-info")   # descriptions
        names = soup.find_all("a", class_="account")      # publishers
        times = soup.find_all("span", class_="s2")        # timestamps
        # zip stops at the shortest list, so layout glitches that leave the
        # four lists with different lengths can no longer raise IndexError.
        for link, detail, name, tm in zip(links, details, names, times):
            hr = link.find('a')
            if hr is None:  # an <h3> without an anchor is not a result entry
                continue
            worksheet.write(index, 0, label=hr.get_text())
            worksheet.write(index, 1, label=detail.get_text())
            worksheet.write(index, 2, label=name.get_text())
            # The timestamp text may carry stray quotes (e.g. "'1587459452'");
            # if it is not numeric at all, leave the time cell blank rather
            # than aborting the whole export.
            try:
                timeStamp = int(tm.get_text().replace("'", ""))
                otherStyleTime = time.strftime("%Y--%m--%d %H:%M:%S",
                                               time.localtime(timeStamp))
            except ValueError:
                otherStyleTime = ''
            worksheet.write(index, 3, label=otherStyleTime)
            worksheet.write(index, 4, label="https://weixin.sogou.com" + hr.get('href'))
            index += 1
    workbook.save('sougou.xls')
    tkinter.messagebox.showinfo('提示', '導出成功!')
# --- GUI: keyword entry, page-count entry, and the button that runs search() ---
top = tkinter.Tk()
top.geometry('300x300')

var = tkinter.StringVar(value='中華恐龍園')  # search keyword, read by search()
tkinter.Entry(top, textvariable=var).pack()

var2 = tkinter.StringVar(value='1')          # number of result pages to fetch
tkinter.Entry(top, textvariable=var2).pack()

tkinter.Button(top, text='搜狗微信搜索', width=10, height=1, command=search).pack()
top.mainloop()