Python Web Scraping in Practice (Dynamic Pages)

The previous post covered scraping static pages. This time we'll try a dynamic page, where the content is rendered by JavaScript after the page loads, so a plain HTTP request isn't enough: instead we drive a real browser with Selenium.

# Path to the ChromeDriver executable
chrm = R"F:\Python\chromedriver_win32\chromedriver.exe"
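
Before building anything, a quick smoke test (just a sketch, reusing the chrm path above) confirms that Selenium can drive Chrome and hand back the JavaScript-rendered HTML:

from selenium import webdriver

driver = webdriver.Chrome(chrm)
driver.get("http://roll.news.qq.com/")
print(len(driver.page_source))  # size of the HTML after JavaScript has run
driver.quit()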

# Scrape the summary, url, and other metadata and write them into the database

import sqlite3
# Create the database (sqlite3.connect creates the file if it does not exist)
db = R"E:\TencentNews.db"

with sqlite3.connect(db) as conn:
    sql = (
        "Create Table If Not Exists News( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "url varchar(100), "
        "title varchar(100), "
        "theme varchar(100), "
        "date date, "
        "time time)")
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()

with sqlite3.connect(db) as conn:
    sql = (
        "Create Table If Not Exists NewsBody( "
        "id INTEGER PRIMARY KEY NOT NULL, "
        "text text, "
        "FOREIGN KEY(id) REFERENCES News(id))")
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
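
To confirm both tables exist, you can query sqlite_master; a minimal check, separate from the crawler itself:

with sqlite3.connect(db) as conn:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    print(cursor.fetchall())  # expect News and NewsBody to be listed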

###--------------------------------------
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import datetime
import sqlite3
import requests



def start_crawler(start_date, end_date):
    # If chromedriver is on the system PATH, the argument can be omitted
    driver = webdriver.Chrome(chrm)
    driver.get("http://roll.news.qq.com/")
    for date in pd.date_range(start_date, end_date):
        # Switch the page's calendar to the target date
        driver = to_date(driver, date)
        print("Crawling news for day {}".format(date.day))
        exist_url = get_exist(date)
        while True:
            # Scrape the news list on the current page
            one_page = get_page_news(driver, date, exist_url)
            # Write it to the database
            to_db(one_page)

            # Jump to the next page
            try:
                driver.find_element_by_xpath(
                    '//div[@id="pageArea"]//a[text()="下一頁>"]').click()
                print("Moving on to the next page")
                time.sleep(1)  # throttle the crawl, giving the server time to respond
            except NoSuchElementException:
                print("Finished crawling {}".format(date))
                break
    print("All dates crawled; closing the browser!")
    driver.quit()
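
As a side note, the whole crawl can also run without a visible browser window. A minimal sketch using Chrome's headless mode (chrome_options is the Selenium 3 spelling of the keyword; newer versions call it options):

from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument("--headless")     # run Chrome without opening a window
opts.add_argument("--disable-gpu")  # commonly paired with headless mode
driver = webdriver.Chrome(chrm, chrome_options=opts)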


def to_date(driver, date):
    # Read the year and month currently shown on the page's calendar
    soup = BeautifulSoup(driver.page_source, "lxml")
    ym_s = soup.find("td", id="_CalendarYear_").h3.string
    ym = datetime.datetime.strptime(ym_s, "%Y年%m月")
    p_year, p_month = ym.year, ym.month  # year and month currently displayed
    # Difference between the target date and what the page shows
    diff_year = date.year - p_year
    diff_month = date.month - p_month
    # Locate the year/month navigation buttons
    last_year = driver.find_element_by_xpath('//td[@title="上一年"]')
    last_month = driver.find_element_by_xpath('//td[@title="上一月"]')
    next_month = driver.find_element_by_xpath('//td[@title="下一月"]')
    next_year = driver.find_element_by_xpath('//td[@title="下一年"]')
    # Adjust the year
    if diff_year >= 0:
        for i in range(diff_year):
            next_year.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_year):
            last_year.click()
            time.sleep(0.1)
    # Adjust the month
    if diff_month >= 0:
        for i in range(diff_month):
            next_month.click()
            time.sleep(0.1)
    else:
        for i in range(-diff_month):
            last_month.click()
            time.sleep(0.1)
    # Adjust the day
    driver.find_element_by_xpath(
        '//tbody//a[text()={}]'.format(date.day)).click()
    time.sleep(1)
    return driver


def get_exist(date):
    db = 'E:\\TencentNews.db'
    with sqlite3.connect(db) as conn:
        # Parameterized query instead of string formatting
        sql = "SELECT url FROM News WHERE `date`=?"
        cursor = conn.cursor()
        cursor.execute(sql, (datetime.datetime.strftime(date, "%Y-%m-%d"),))
        result = cursor.fetchall()
    return {url for (url,) in result}  # collect the urls, deduplicated by the set


def get_page_news(driver, date, exist_url):
    soup = BeautifulSoup(driver.page_source, "lxml")
    one_page = []
    # Calling the Tag like a function is BeautifulSoup shorthand for find_all
    for i in soup.find("div", class_="list c")("li"):
        if i.a["href"] in exist_url:  # skip urls already in the database
            continue
        one_page.append(
            (None,  # id: let SQLite auto-assign the INTEGER PRIMARY KEY
             i.a["href"],
             i.a.string,
             i.find("span", class_="t-tit").string[1:-1],
             datetime.datetime.strftime(date, "%Y-%m-%d"),
             i.find("span", class_="t-time").string.split()[1]))
    return one_page


def to_db(data):
    db = 'E:\\TencentNews.db'
    with sqlite3.connect(db) as conn:
        sql = (
            "INSERT INTO News "
            "(id,url,title,theme,date,time) "
            "VALUES (?, ?, ?, ?, ?, ?)")
        cursor = conn.cursor()
        cursor.executemany(sql, data)
        conn.commit()


# Start the crawler
start_crawler("2017-06-01", "2017-06-05")

# Query the results with pandas
import sqlalchemy
import pandas as pd

sqlite_engine = sqlalchemy.create_engine('sqlite:///E:/TencentNews.db',
                                         encoding='utf-8')


df = pd.read_sql("SELECT * FROM News limit 1", sqlite_engine)
df.date.values
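
Because SQLite stores the date column as plain text, df.date comes back as strings. If you want real datetime values, pandas can parse the column while reading (parse_dates is a standard read_sql argument):

df = pd.read_sql("SELECT * FROM News limit 1", sqlite_engine,
                 parse_dates=["date"])
df.date.values  # now datetime64 values instead of strings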

Crawling news for day 1
Moving on to the next page

Finished crawling 2017-06-05 00:00:00
All dates crawled; closing the browser!

##
# Part 2: crawl the full article body for each link stored in the database

def get_news_body(start_date, end_date):
    for date in pd.date_range(start_date, end_date):  # iterate over each target date
        link_list = get_news_linksfrom_database(date)
        print("INFO: Crawling Date {} News Body Now...".format(date))
        for linkid, url in link_list:
            news_body = get_news_text(url)
            # Write the body to the database
            writer_news_body_to_database(linkid, news_body)

    print("Finished crawling news bodies from {} to {}!".format(start_date, end_date))

def get_news_text(url):
    html = requests.get(url)
    html.encoding = html.apparent_encoding  # guess the page's encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    try:
        return soup.find("div", {"id": "Cnt-Main-Article-QQ"}).text
    except AttributeError:  # find() returned None: article container not on the page
        return None
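
requests.get has no timeout by default, and network failures raise exceptions; a slightly more defensive variant (the 10-second timeout is an arbitrary choice for illustration) could look like this:

def get_news_text(url):
    try:
        html = requests.get(url, timeout=10)  # fail fast on unresponsive servers
        html.raise_for_status()               # treat HTTP errors (404, 500, ...) as failures
    except requests.RequestException:
        return None
    html.encoding = html.apparent_encoding
    soup = BeautifulSoup(html.text, 'html.parser')
    node = soup.find("div", {"id": "Cnt-Main-Article-QQ"})
    return node.text if node else None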


def writer_news_body_to_database(linkid, news_body):
    print("INFO: Writing News ID:{} To Database...".format(linkid))
    sql = (
        "INSERT INTO newsbody (id, text) "
        "VALUES (?, ?)")
    db = 'E:\\TencentNews.db'
    with sqlite3.connect(db) as conn:
        cursor = conn.cursor()
        cursor.execute(sql, (linkid, news_body))
        conn.commit()


def get_news_linksfrom_database(date):
    db = 'E:\\TencentNews.db'

    # Anti-join: select the news rows that have no matching newsbody row yet
    sql = (
        "SELECT news.id, news.url "
        "FROM news LEFT JOIN newsbody "
        "ON news.id = newsbody.id "
        "WHERE news.date = ? AND newsbody.id IS NULL;")

    with sqlite3.connect(db) as conn:
        cursor = conn.cursor()
        cursor.execute(sql, (datetime.datetime.strftime(date, '%Y-%m-%d'),))
        result = cursor.fetchall()
    return result  # fetchall already returns [] when there are no rows
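
The LEFT JOIN ... IS NULL pattern above keeps only the news rows with no body stored yet. If you find it more readable, an equivalent NOT EXISTS formulation would be:

sql = (
    "SELECT id, url FROM news "
    "WHERE date = ? AND NOT EXISTS "
    "(SELECT 1 FROM newsbody WHERE newsbody.id = news.id);")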

get_news_body("2017-06-01", "2017-06-02")

INFO: Crawling Date 2017-06-01 00:00:00 News Body Now...
INFO: Writing News ID:1 To Database...
INFO: Writing News ID:2 To Database...

INFO: Crawling Date 2017-06-02 00:00:00 News Body Now...
Finished crawling news bodies from 2017-06-01 to 2017-06-02!
