The previous post covered crawling static pages; this time let's try crawling a dynamic page.
# Point Selenium at the local chromedriver binary
chrm = r"F:\Python\chromedriver_win32\chromedriver.exe"
# Scrape summaries, URLs, and other metadata into a database
import sqlite3

# Create the database file and the News table
db = r"E:\TencentNews.db"
with sqlite3.connect(db) as conn:
    sql = (
        "CREATE TABLE IF NOT EXISTS News( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "url varchar(100), "
        "title varchar(100), "
        "theme varchar(100), "
        "date date, "
        "time time)")
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
with sqlite3.connect(db) as conn:
    sql = (
        "CREATE TABLE IF NOT EXISTS NewsBody( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
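As a quick sanity check (a minimal sketch, not part of the original script), the freshly created tables can be listed from `sqlite_master`:

import sqlite3

with sqlite3.connect(r"E:\TencentNews.db") as conn:
    # sqlite_master holds one row per schema object; keep only the tables
    for (name,) in conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"):
        print(name)  # expect News and NewsBody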
###--------------------------------------
import time
import datetime
import sqlite3

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
def start_crawler(start_date, end_date):
    # If chromedriver is on the system PATH, the path argument can be omitted
    driver = webdriver.Chrome(chrm)
    driver.get("http://roll.news.qq.com/")
    for date in pd.date_range(start_date, end_date):
        # Switch the page's calendar to the target date
        driver = to_date(driver, date)
        print("Crawling news for day {}".format(date.day))
        exist_url = get_exist(date)
        while True:
            # Scrape the news list on the current page
            one_page = get_page_news(driver, date, exist_url)
            # Write it to the database
            to_db(one_page)
            # Jump to the next page
            try:
                driver.find_element_by_xpath(
                    '//div[@id="pageArea"]//a[text()="下一頁>"]').click()
                print("Moving on to the next page")
                time.sleep(1)  # throttle, giving the server time to respond
            except NoSuchElementException:
                print("Finished crawling {}".format(date))
                break
    print("Crawl finished normally; closing the browser!")
    driver.quit()
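The element lookups above use Selenium 3's `find_element_by_*` helpers, which were removed in Selenium 4. If you run this on a current install, the equivalent calls look like this (a sketch assuming Selenium 4; everything else stays the same):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the driver path via a Service object, not a positional argument
driver = webdriver.Chrome(service=Service(chrm))
# find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
driver.find_element(By.XPATH, '//div[@id="pageArea"]//a[text()="下一頁>"]').click()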
def to_date(driver, date):
    # Read the year/month currently shown by the calendar widget
    soup = BeautifulSoup(driver.page_source, "lxml")
    ym_s = soup.find("td", id="_CalendarYear_").h3.string
    ym = datetime.datetime.strptime(ym_s, "%Y年%m月")
    p_year, p_month = ym.year, ym.month  # year and month the page currently shows
    # Difference between the target date and the displayed year/month
    diff_year = date.year - p_year
    diff_month = date.month - p_month
    # Locate the year/month navigation buttons
    last_year = driver.find_element_by_xpath('//td[@title="上一年"]')
    last_month = driver.find_element_by_xpath('//td[@title="上一月"]')
    next_month = driver.find_element_by_xpath('//td[@title="下一月"]')
    next_year = driver.find_element_by_xpath('//td[@title="下一年"]')
    # Adjust the year
    if diff_year >= 0:
        for i in range(diff_year):
            next_year.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_year):
            last_year.click()
            time.sleep(0.1)
    # Adjust the month
    if diff_month >= 0:
        for i in range(diff_month):
            next_month.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_month):
            last_month.click()
            time.sleep(0.1)
    # Adjust the day
    driver.find_element_by_xpath(
        '//tbody//a[text()={}]'.format(date.day)).click()
    time.sleep(1)
    return driver
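`strptime` treats the CJK characters in `%Y年%m月` as literals to match, so the calendar header parses directly:

import datetime
ym = datetime.datetime.strptime("2017年06月", "%Y年%m月")
print(ym.year, ym.month)  # 2017 6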
def get_exist(date):
    db = r"E:\TencentNews.db"
    with sqlite3.connect(db) as conn:
        # Parameterized query for the URLs already stored for that day
        sql = "SELECT url FROM News WHERE `date` = ?"
        cursor = conn.cursor()
        cursor.execute(sql, (datetime.datetime.strftime(date, "%Y-%m-%d"),))
        result = cursor.fetchall()
    return set(*zip(*result))  # unpack the URLs from result; set() deduplicates
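`fetchall()` returns one single-element tuple per row, so `set(*zip(*result))` first transposes the rows into a single tuple of URLs with `zip`, then splats it into `set()`. A worked example:

rows = [("http://a",), ("http://b",)]  # the shape fetchall() returns
print(set(*zip(*rows)))                # {'http://a', 'http://b'}
print(set(*zip(*[])))                  # set(): the splat of an empty zip is just set()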
def get_page_news(driver, date, exist_url):
    soup = BeautifulSoup(driver.page_source, "lxml")
    one_page = []
    for i in soup.find("div", class_="list c")("li"):
        # Skip links that are already in the database
        if i.a["href"] in exist_url:
            continue
        one_page.append(
            (None,                                          # id: assigned by SQLite
             i.a["href"],                                   # url
             i.a.string,                                    # title
             i.find("span", class_="t-tit").string[1:-1],   # theme, brackets stripped
             datetime.datetime.strftime(date, "%Y-%m-%d"),  # date
             i.find("span", class_="t-time").string.split()[1]))  # time of day
    return one_page
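The field extraction above assumes each `<li>` on the roll page looks roughly like the snippet below (reconstructed for illustration; the real markup may differ), which makes the `[1:-1]` bracket-stripping and the `split()[1]` time extraction easier to follow:

from bs4 import BeautifulSoup

li_html = ('<li><span class="t-tit">[科技]</span> '
           '<a href="http://news.qq.com/a/x.htm">title</a> '
           '<span class="t-time">2017-06-01 10:30</span></li>')
i = BeautifulSoup(li_html, "lxml").li
print(i.a["href"], i.a.string,
      i.find("span", class_="t-tit").string[1:-1],          # -> 科技
      i.find("span", class_="t-time").string.split()[1])    # -> 10:30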
def to_db(data):
    db = r"E:\TencentNews.db"
    with sqlite3.connect(db) as conn:
        sql = (
            "INSERT INTO News "
            "(id,url,title,theme,date,time) "
            "VALUES (?, ?, ?, ?, ?, ?)")
        cursor = conn.cursor()
        cursor.executemany(sql, data)
        conn.commit()
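Each row handed to `executemany` must line up with the six placeholders. A hypothetical row (illustrative values only) shows why `id` is `None`: SQLite's `INTEGER PRIMARY KEY` auto-assigns the next rowid when it receives NULL:

to_db([(None,                       # id: None -> auto-assigned rowid
        "http://news.qq.com/a/x",   # url (hypothetical)
        "some title", "some theme",
        "2017-06-01", "10:30")])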
# Start the crawler
start_crawler("2017-06-01", "2017-06-05")
# Query the results with pandas
import sqlalchemy
import pandas as pd
sqlite_engine = sqlalchemy.create_engine('sqlite:///E:/TencentNews.db',
                                         encoding='utf-8')
df = pd.read_sql("SELECT * FROM News LIMIT 1", sqlite_engine)
df.date.values
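Note that SQLite stores the `date` column as plain text, so `df.date.values` comes back as strings. If proper timestamps are wanted, `pd.read_sql` can parse the column on the way in, for example:

df = pd.read_sql("SELECT * FROM News LIMIT 5", sqlite_engine,
                 parse_dates=["date"])
print(df.dtypes)  # the date column is now datetime64[ns]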
Crawling news for day 1
Moving on to the next page
…
Finished crawling 2017-06-05 00:00:00
Crawl finished normally; closing the browser!
# Part 2: crawl the full article body for every link stored in the database
def get_news_body(start_date, end_date):
    for date in pd.date_range(start_date, end_date):  # iterate over the target dates
        link_list = get_news_linksfrom_database(date)
        print("INFO: Crawling Date {} News Body Now...".format(date))
        for linkid, url in link_list:
            news_body = get_news_text(url)
            # Write the article body to the database
            writer_news_body_to_database(linkid, news_body)
    print("Finished crawling news bodies for {} through {}!".format(start_date, end_date))
def get_news_text(url):
    html = requests.get(url)
    html.encoding = html.apparent_encoding  # use the sniffed charset before decoding
    soup = BeautifulSoup(html.text, 'html.parser')
    try:
        return soup.find("div", {"id": "Cnt-Main-Article-QQ"}).text
    except AttributeError:  # find() returned None: no article container on this page
        return None
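`apparent_encoding` sniffs the charset from the raw response bytes (via chardet/charset_normalizer), which is why it is assigned before `html.text` is decoded. On a long crawl it can also help to guard the request itself; a hedged variant (the 10-second timeout is an arbitrary choice, not from the original):

def get_news_text(url):
    try:
        html = requests.get(url, timeout=10)  # don't hang forever on a dead link
        html.raise_for_status()               # treat 4xx/5xx responses as failures
    except requests.RequestException:
        return None
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, "html.parser")
    node = soup.find("div", {"id": "Cnt-Main-Article-QQ"})
    return node.text if node else None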
def writer_news_body_to_database(linkid, news_body):
    print("INFO: Writing News ID:{} To Database...".format(linkid))
    sql = (
        "INSERT INTO NewsBody (id, text) "
        "VALUES (?, ?)")
    db = r"E:\TencentNews.db"
    with sqlite3.connect(db) as conn:
        cursor = conn.cursor()
        cursor.execute(sql, (linkid, news_body))
        conn.commit()
def get_news_linksfrom_database(date):
    db = r"E:\TencentNews.db"
    # Anti-join: select the day's links that have no NewsBody row yet
    sql = (
        "SELECT news.id, news.url "
        "FROM news LEFT JOIN newsbody "
        "ON news.id = newsbody.id WHERE "
        "news.date = ? AND newsbody.id IS NULL")
    with sqlite3.connect(db) as conn:
        cursor = conn.cursor()
        cursor.execute(sql, (datetime.datetime.strftime(date, '%Y-%m-%d'),))
        result = cursor.fetchall()
    return result  # fetchall() already returns [] when there are no rows
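The `LEFT JOIN ... IS NULL` pattern is a classic anti-join: it keeps only that day's News rows with no matching NewsBody row yet, so re-running the script resumes where it left off instead of re-fetching bodies. The same filter can be written with `NOT EXISTS`, e.g.:

anti_join = (
    "SELECT news.id, news.url FROM news "
    "WHERE news.date = ? AND NOT EXISTS "
    "(SELECT 1 FROM newsbody WHERE newsbody.id = news.id)")
with sqlite3.connect(r"E:\TencentNews.db") as conn:
    print(conn.execute(anti_join, ("2017-06-01",)).fetchall())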
get_news_body("2017-06-01", "2017-06-02")
INFO: Crawling Date 2017-06-01 00:00:00 News Body Now…
INFO: Writing News ID:1 To Database…
INFO: Writing News ID:2 To Database…
…
INFO: Crawling Date 2017-06-02 00:00:00 News Body Now…
Finished crawling news bodies for 2017-06-01 through 2017-06-02!
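Mirroring the earlier pandas check, a join confirms the bodies landed next to their metadata (a small sketch reusing `sqlite_engine` from above):

df = pd.read_sql(
    "SELECT News.title, NewsBody.text FROM News "
    "JOIN NewsBody ON News.id = NewsBody.id LIMIT 3",
    sqlite_engine)
print(df)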