通過 Python 爬取 CSDN「百度關鍵詞」頁面中的搜索詞和入口頁次數,再通過 Word Art 製作詞雲圖。
效果:
login.py(登錄CSDN)
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
# CSDN credentials typed into the login form by run(). The values below are
# placeholders ("account" / "password"); replace them before running.
username = "賬號"
password = "密碼"
def run():
    """Log in to CSDN with headless Chrome and save the session cookies.

    The module-level ``username`` and ``password`` are typed into the login
    form. After a one-second pause the browser's cookies are collected into
    a ``{name: value}`` dict and written to ``cookie.json`` (relative to the
    current working directory) as the ``str()`` of that dict, i.e. a Python
    dict literal rather than strict JSON.

    The browser is always shut down, even if one of the page interactions
    raises, so no headless Chrome process is left behind.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    try:
        browser.get(url="https://passport.csdn.net/login")  # open the login page
        # NOTE(review): presumably this link switches the page to the
        # account/password login form -- confirm against the live page.
        browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[2]/ul/li[1]/a').click()
        browser.find_element_by_xpath('//*[@id="all"]').send_keys(username)
        browser.find_element_by_xpath('//*[@id="password-number"]').send_keys(password)
        browser.find_element_by_xpath('//*[@id="app"]/div/div/div/div[2]/div[2]/form/div/div[6]/div/button').click()
        # Give the site a moment to respond to the submit before reading cookies.
        time.sleep(1)
        cookies = {}
        for co in browser.get_cookies():
            cookies[co["name"]] = co["value"]
        with open('cookie.json', 'w') as f:
            f.write(str(cookies))
        # NOTE: printed unconditionally; the login result is not checked here.
        print("登陸成功!")
    finally:
        # Release the browser process whether or not the steps above succeeded.
        browser.quit()
main.py(爬取搜索詞和入口頁次數)
# -*- coding: utf-8 -*-
import requests
from pyquery import PyQuery as pq
import login
def pa():
    """Log in, fetch the CSDN Baidu-keyword page, and print its statistics.

    Steps:
      1. Call ``login.run()`` and load the cookies it saved to ``cookie.json``.
      2. Check the cookies against the user-info API; if the returned
         ``code`` is not 200, log in once more and reload the cookies.
      3. Download the keyword page and, for every ``<tr>`` inside ``<tbody>``,
         take the ``<td>`` cells that have no ``rowspan`` attribute: the
         first is used as the keyword and the third as the count.
      4. Print one ``"<keyword> <count>"`` line per row.
    """
    login.run()  # log in
    cookie = _load_cookies()

    # Verify that the saved cookies are accepted.
    h1 = requests.get("https://me.csdn.net/api/user/show", cookies=cookie)
    if h1.json()["code"] != 200:
        login.run()  # log in again
        cookie = _load_cookies()

    h2 = requests.get("https://mp.csdn.net/data/baidukeyword", cookies=cookie)
    rows = pq(h2.text)("tbody")("tr").items()

    lines = []
    for row in rows:
        cells = row("td:not([rowspan])")
        keyword = cells.eq(0).text()
        number = cells.eq(2).text()
        lines.append(keyword + " " + str(number) + "\n")
    statistical = "".join(lines)
    print(statistical)


def _load_cookies(path="cookie.json"):
    """Return the cookie dict stored at *path* as a Python dict literal."""
    # Imported here so the helper does not depend on the file-level imports.
    import ast

    with open(path, "r") as file:
        # literal_eval only accepts Python literals, so a tampered cookie file
        # cannot execute code the way eval() would.
        return ast.literal_eval(file.read())
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    pa()
使用Word Art製作詞雲圖【Word Art】