"""Tencent News scraper.

- url: https://news.qq.com/
- Approach:
  1. Drive a browser with selenium to load the JS-rendered ul/li data.
  2. Parse the data with bs4 (BeautifulSoup).
  3. Export the parsed result with pandas.
"""
# Step 1: drive a real browser — the page content is rendered by JS,
# so a plain HTTP fetch would miss the news items.
import time
from selenium import webdriver

driver = webdriver.Chrome(executable_path = "D:/Anaconda/chromedriver.exe")
driver.get("https://news.qq.com")
# Scroll down in 200px steps so lazy-loaded items are fetched,
# pausing 2 s per step to give the page time to load.
for offset in range(200, 9801, 200):
    time.sleep(2)
    driver.execute_script("window.scrollTo(window.pageXOffset, %d);" % offset)
# Step 2: grab the rendered HTML and parse it.
from bs4 import BeautifulSoup
html=driver.page_source
driver.quit()  # fix: release the browser — the original leaked the Chrome process
info_bs=BeautifulSoup(html,"lxml")
# The news list is the sibling element right after the "jx-tit" header div;
# each <li> under it is one news item.
jxtits=info_bs.find_all("div",{"class":"jx-tit"})[0].find_next_sibling().find_all("li")
# Step 3: extract (index, title, url) from each <li> and export to CSV.
import pandas as pd

list_news = []
for i, jxtit in enumerate(jxtits):
    # Title: prefer the <img alt="...">; fall back to the lazy-load
    # placeholder text when the image has not been loaded yet.
    try:
        text = jxtit.find_all("img")[0]["alt"]
    except (IndexError, KeyError):
        try:
            text = jxtit.find_all("div", {"class": "lazyload-placeholder"})[0].text
        except IndexError:
            print(jxtit)  # neither title source present — log and skip the item
            continue
    # Link: first <a href>. Fix: the original bare `except` reused the
    # previous iteration's `url` (or raised NameError on the first item);
    # skip items without a link instead.
    try:
        url = jxtit.find_all("a")[0]["href"]
    except (IndexError, KeyError):
        print(jxtit)
        continue
    list_news.append([i + 1, text, url])

name = ['index', 'title', 'url']
test = pd.DataFrame(columns=name, data=list_news)
test.to_csv("C:/Users/SHOHOKU/Desktop/test.csv")
# Reference: https://blog.csdn.net/Riolu/article/details/105778202