python爬取百度 搜狗(資訊)內容

爬取的是資訊(新聞)搜索結果;普通網頁搜索也是一樣的做法,把請求地址換一下即可:

 

from bs4 import BeautifulSoup
import re
import requests
import string
import json
import tkinter
import tkinter.messagebox
import xlwt

# Output workbook: one sheet with a fixed header row
# (title / time / publisher / content / link).
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('baidu', cell_overwrite_ok=True)
for col, title in enumerate(('標題', '時間', '發佈者', '內容', '鏈接')):
    worksheet.write(0, col, label=title)
def search():
    """Scrape Baidu News results for the keyword held in the tkinter
    StringVar `var`, across the number of result pages given by `var2`,
    writing title/time/publisher/summary/link rows into the module-level
    `worksheet` and saving `workbook` to baidu.xls.

    Pops a tkinter message box when the export finishes.
    """
    headers = {
        # Plain desktop-browser headers so Baidu serves the normal HTML page.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    keyword = var.get()
    page_count = int(var2.get())
    row = 1  # row 0 already holds the column headers
    for page in range(1, page_count + 1):
        # Let requests build and percent-encode the query string instead of
        # raw string concatenation — the keyword is typically non-ASCII.
        params = {
            'rtt': '1', 'bsst': '1', 'cl': '2', 'tn': 'news',
            'word': keyword,
            'x_bfe_rqs': '03E80', 'x_bfe_tjscore': '0.650971',
            'tngroupname': 'organic_news', 'newVideo': '12',
            'rsv_dl': 'news_b_pn',
            'pn': str((page - 1) * 10),  # Baidu paginates in steps of 10
        }
        response = requests.get('https://www.baidu.com/s', params=params,
                                headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')
        links = soup.find_all('h3')                                # result titles
        names = soup.find_all("p", class_="c-author")              # publisher + time line
        details = soup.find_all("div", class_="c-summary c-row")   # summary blocks
        # zip() keeps the three parallel lists in lockstep and stops at the
        # shortest, so a malformed/short page cannot raise IndexError (the
        # original indexed names/details by len(links)).
        for link, name, detail in zip(links, names, details):
            inner = detail.div
            if inner is not None:
                inner.clear()  # drop nested duplicate block before extracting text
            anchor = link.a
            if anchor is None:
                continue  # <h3> without a link is not a real result entry
            worksheet.write(row, 0, label=anchor.get_text().replace('\n', '').replace('\t', ''))
            # c-author text is "publisher\xa0\xa0time"; split once on the
            # double-nbsp marker.
            parts = name.get_text().replace('\n', '').replace('\t', '').replace('\xa0\xa0', '*').split('*', 1)
            worksheet.write(row, 1, label=parts[0])
            # guard: pages occasionally omit the second field
            worksheet.write(row, 2, label=parts[1] if len(parts) > 1 else '')
            # Clear the author line and timestamp span so only the summary
            # text remains (done after the name was read above).
            tag = detail.p
            if tag is not None:
                tag.clear()
            tag1 = detail.span
            if tag1 is not None:
                tag1.clear()
            worksheet.write(row, 3, label=detail.get_text().replace('\n', '').replace('\t', ''))
            worksheet.write(row, 4, label=anchor.get('href'))
            row += 1
    workbook.save('baidu.xls')
    tkinter.messagebox.showinfo('提示', '導出成功!')
    
# Minimal tkinter UI: a keyword entry, a page-count entry, and a button
# that triggers search() above; mainloop() blocks until the window closes.
top = tkinter.Tk()
top.geometry('300x300')
var=tkinter.StringVar()
var.set('常州')  # default search keyword
tkinter.Entry(top,textvariable=var).pack()

var2=tkinter.StringVar()
var2.set('1')  # default number of result pages to fetch
tkinter.Entry(top,textvariable=var2).pack()

tkinter.Button(top, text='百度搜索',width=10,height=1,command=search).pack()
top.mainloop()

 

搜狗微信內容:

from bs4 import BeautifulSoup
import re
import requests
import string
import json
import time, datetime
import tkinter
import tkinter.messagebox
import xlwt

# Output workbook: one sheet with a fixed header row
# (title / description / publisher / time / link).
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('sougou', cell_overwrite_ok=True)
for col, title in enumerate(('標題', '說明', '發佈者', '時間', '鏈接')):
    worksheet.write(0, col, label=title)
def search():
    """Scrape Sogou WeChat article search for the keyword held in the
    tkinter StringVar `var`, across the number of pages given by `var2`,
    writing title/description/account/time/link rows into the
    module-level `worksheet` and saving `workbook` to sougou.xls.

    Pops a tkinter message box when the export finishes.
    """
    from urllib.parse import quote  # local: percent-encode the CJK keyword

    headers = {
        # Plain desktop-browser headers so Sogou serves the normal HTML page.
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, compress',
        'Accept-Language': 'en-us;q=0.5,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }
    keyword = var.get()
    page_count = int(var2.get())
    row = 1  # row 0 already holds the column headers
    for page in range(1, page_count + 1):
        # NOTE(review): lkt/sst0 look like captured session timestamps from
        # the original browser request; kept verbatim — confirm they are not
        # required to be fresh.
        url = ('https://weixin.sogou.com/weixin?query=' + quote(keyword)
               + '&_sug_type_=&sut=1244&lkt=1%2C1587459452091%2C1587459452091'
                 '&s_from=input&_sug_=n&type=2&sst0=1587459452192&page='
               + str(page) + '&ie=utf8&w=01019900&dr=1')
        response = requests.get(url, headers=headers, timeout=10)
        # Publication times arrive wrapped as
        # <script>document.write(timeConvert('...'))</script>; strip the
        # wrapper so the epoch value survives as plain span text.
        html = response.text.replace('<script>document.write(timeConvert(', '').replace('))</script>', '')
        soup = BeautifulSoup(html, 'lxml')
        titles = soup.find_all('h3')                          # article titles
        summaries = soup.find_all("p", class_="txt-info")     # descriptions
        accounts = soup.find_all("a", class_="account")       # WeChat accounts
        stamps = soup.find_all("span", class_="s2")           # epoch timestamps
        # zip() keeps the four parallel lists aligned and stops at the
        # shortest, so a short or captcha-blocked page cannot raise
        # IndexError (the original advanced a manual counter over links).
        for title, summary, account, stamp in zip(titles, summaries, accounts, stamps):
            anchor = title.find('a')
            if anchor is None:
                continue  # <h3> without a link is not an article entry
            worksheet.write(row, 0, label=anchor.get_text())
            worksheet.write(row, 1, label=summary.get_text())
            worksheet.write(row, 2, label=account.get_text())
            raw = stamp.get_text().replace("'", "")
            try:
                when = time.strftime("%Y--%m--%d %H:%M:%S", time.localtime(int(raw)))
            except ValueError:
                when = raw  # keep unparsable timestamps verbatim rather than crash
            worksheet.write(row, 3, label=when)
            # hrefs are site-relative; prefix the host to get a usable link.
            worksheet.write(row, 4, label="https://weixin.sogou.com" + anchor.get('href'))
            row += 1
    workbook.save('sougou.xls')
    tkinter.messagebox.showinfo('提示', '導出成功!')

# Minimal tkinter UI: a keyword entry, a page-count entry, and a button
# that triggers search() above; mainloop() blocks until the window closes.
top = tkinter.Tk()
top.geometry('300x300')
var=tkinter.StringVar()
var.set('中華恐龍園')  # default search keyword
tkinter.Entry(top,textvariable=var).pack()

var2=tkinter.StringVar()
var2.set('1')  # default number of result pages to fetch
tkinter.Entry(top,textvariable=var2).pack()

tkinter.Button(top, text='搜狗微信搜索',width=10,height=1,command=search).pack()
top.mainloop()

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章