steamdb免費遊戲信息爬取(不是爬蟲教學,日常記錄,賊不工整,不喜勿看)

日常記錄而已,不是特別工整,不喜勿噴,不喜勿看。

1. 數據來源於steamdb, 目標網址:https://steamdb.info/upcoming/free/

2. 由於網址存在反爬措施,在沒有cookie的情況下,網站會由js進行跳轉,跳轉過程中post表單的數據由js計算而來,詳情請看另一篇文章:steamdb反爬機制分析。爲了簡化工作量,使用selenium訪問網頁,待跳轉完成後獲取cookie,cookie有效期爲一天。

3. cookie有效時,網站直接採用requests進行訪問,爬取數據。當訪問失敗時,使用selenium訪問網站刷新cookie。

4. 上療效,原先準備作爲網站小功能的一部分,每日定時更新並提供訂閱定時發送郵件功能,但是由於太耗資源,因而放棄了。

5. 然後直接上代碼,這裏只貼有關steamdb的代碼,涉及到網站的部分就不貼了

5.1 本代碼負責手動更新cookie,也用於selenium測試。

# -*- coding: utf-8 -*-
# @Author  : LG

from selenium import webdriver

import argparse
from selenium.webdriver.support.ui import WebDriverWait

def update_cookie(url, file_name, delayed):
    """Open ``url`` in headless Firefox and save the browser cookies to a file.

    The function waits until an element with id ``live-promotions`` can be
    found (or the wait times out) and then writes every cookie to
    ``file_name``, one ``name,value`` pair per line, for other programs to
    read later.

    Args:
        url: Page to open.
        file_name: Path of the cookie file; it is overwritten.
        delayed: Maximum number of seconds to wait for the element.

    Returns:
        True once the cookie file has been written.

    Raises:
        selenium.common.exceptions.TimeoutException: the element did not
            appear within ``delayed`` seconds.
    """
    # Headless mode: do not open a browser window.
    option = webdriver.FirefoxOptions()
    option.add_argument('--headless')

    driver = webdriver.Firefox(options=option)
    try:
        driver.get(url)
        # Wait until the page exposes the expected element, or time out.
        # find_element('id', ...) is used instead of find_element_by_id so the
        # call also works on Selenium releases that dropped the *_by_* helpers.
        WebDriverWait(driver, delayed).until(
            lambda drv: drv.find_element('id', 'live-promotions'))
        cookies = driver.get_cookies()
    finally:
        # Always end the browser session, even when the page load or the wait
        # fails; otherwise the headless Firefox process is left running.
        driver.quit()

    # Save the cookies to the given file; other programs read it on update.
    with open(file_name, 'w') as f:
        for cookie in cookies:
            print(cookie)
            f.write(cookie['name'] + ',' + cookie['value'] + '\n')
    return True

if __name__ == '__main__':
    # Command-line entry point. Every option has a default, so running the
    # script without arguments refreshes cookie.txt from the steamdb page.
    cli = argparse.ArgumentParser(description='update cookie')
    option_specs = (
        (('-u', '--url'), str, 'https://steamdb.info/upcoming/free/'),
        (('-f', '--filename'), str, 'cookie.txt'),
        (('-d', '--delayed'), int, 15),
    )
    for flags, value_type, fallback in option_specs:
        cli.add_argument(*flags, type=value_type, default=fallback)
    opts = cli.parse_args()

    update_cookie(url=opts.url, file_name=opts.filename, delayed=opts.delayed)

5.2 自動更新以及自動發送郵件等功能的實現。

# Convert a scraped timestamp (treated as UTC) to local time.
def time_analysis(time_str):
    """Convert a scraped timestamp string to Asia/Chongqing local time.

    ``time_str`` is split on whitespace; field 0 is read as the day, field 1
    as the English month name, field 2 as the year and field 4 as
    ``HH:MM:SS``. Field 3 and anything after field 4 are ignored. The parsed
    value is treated as UTC and converted to the ``Asia/Chongqing`` zone.

    Args:
        time_str: Timestamp text taken from a table cell.

    Returns:
        The local time formatted as ``YYYY-MM-DD HH:MM:SS``.

    Raises:
        ValueError: ``time_str`` has fewer than five fields, an unknown month
            name, or date/time fields that cannot be parsed.
    """
    months = {'January': '01', 'February': '02', 'March': '03', 'April': '04', 'May': '05', 'June': '06',
              'July': '07', 'August': '08', 'September': '09', 'October': '10', 'November': '11', 'December': '12'}

    # split() without an argument splits on any run of whitespace. The former
    # split('') raised "ValueError: empty separator" for every input.
    fields = time_str.split()
    if len(fields) < 5:
        raise ValueError('unexpected time format: {!r}'.format(time_str))
    day, month_name, year, clock = fields[0], fields[1], fields[2], fields[4]
    if month_name not in months:
        raise ValueError('unknown month in time string: {!r}'.format(time_str))

    stamp = '{}-{}-{} {}'.format(year, months[month_name], day, clock)
    utc_dt = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
    local_tz = pytz.timezone('Asia/Chongqing')
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    # Format explicitly instead of cutting str(local_dt) at '+', which only
    # works for zones with a positive UTC offset.
    return local_dt.strftime("%Y-%m-%d %H:%M:%S")

# Refresh the cookie file through a headless browser.
def update_cookie(delayed=15):
    """Visit the steamdb page in headless Firefox and rewrite ``cookie.txt``.

    Args:
        delayed: Maximum number of seconds to wait for the element with id
            ``live-promotions``. The original body used this name without
            defining it; it is now a parameter whose default (15) matches the
            default of the command-line cookie script.

    Returns:
        True when the cookies were written to ``cookie.txt``. False when the
        wait timed out or the browser ended up with exactly one cookie, which
        the original code treats as a failed refresh.
    """
    # Imported locally so the block does not depend on a file-level import.
    from selenium.common.exceptions import TimeoutException

    url = 'https://steamdb.info/upcoming/free/'
    log_path = 'XXX/steamfree_log.txt'

    option = webdriver.FirefoxOptions()
    option.add_argument('--headless')

    driver = webdriver.Firefox(options=option)
    cookies = None
    try:
        driver.get(url)
        WebDriverWait(driver, delayed).until(
            lambda drv: drv.find_element('id', 'live-promotions'))
        cookies = driver.get_cookies()
    except TimeoutException:
        # The page never showed the expected element; reported as a failure
        # below so callers get False instead of an exception.
        pass
    finally:
        # Always shut the browser down; every call used to leave a headless
        # Firefox process behind.
        driver.quit()

    # Simple debugging log only.
    if cookies is None or len(cookies) == 1:
        with open(log_path, 'a') as f:
            f.write("{} {}\n".format(datetime.now(), 'cookie更新失敗'))
        return False

    with open('cookie.txt', 'w') as f:
        for cookie in cookies:
            f.write(cookie['name'] + ',' + cookie['value'] + '\n')

    # The success path used to log the failure message as well.
    with open(log_path, 'a') as f:
        f.write("{} {}\n".format(datetime.now(), 'cookie更新成功'))

    return True

# Read the saved cookies and request the page with them.
def get_html(url='https://steamdb.info/upcoming/free/'):
    """Request ``url`` with the cookies stored in ``cookie.txt``.

    ``cookie.txt`` is expected to hold one ``name,value`` pair per line. If
    the file cannot be read, the request is sent with an empty Cookie header
    so the caller can react to the resulting status code.

    Args:
        url: Page to request.

    Returns:
        The response object returned by ``session.get``.
    """
    session = requests.session()

    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"}
    cookie_parts = []
    try:
        # The context manager closes the file; it used to stay open.
        with open('cookie.txt', 'r') as f:
            for line in f:
                # Split on the first comma only, so a cookie value that itself
                # contains commas is kept whole.
                name, sep, value = line.rstrip('\n').partition(',')
                if not sep:
                    # Blank or malformed line: skip it and keep reading.
                    continue
                cookie_parts.append(name + "=" + value + ";")
    except (OSError, UnicodeDecodeError):
        # Missing or unreadable cookie file: continue with whatever was read.
        pass
    headers["Cookie"] = ''.join(cookie_parts)
    html = session.get(url, headers=headers)
    return html

# Fetch the page and extract the tables to scrape. The two tables differ
# slightly in layout and are handled separately later on.
def get_tables():
    """Return every ``<table>`` element of the page, or None on failure.

    When the first request does not answer with HTTP 200, the failure is
    logged, the cookies are refreshed through update_cookie() and the request
    is retried once.
    """
    response = get_html()
    if response.status_code != 200:
        with open('XXX/steamfree_log.txt', 'a') as log:
            log.write("{} {} {}\n".format(datetime.now(), '請求失敗刷新cookie', response.status_code))
        # Request failed: refresh the cookies and give up if that fails too.
        if not update_cookie():
            return None
        # Read the page again with the refreshed cookies.
        response = get_html()
        if response.status_code != 200:
            return None

    # Request succeeded.
    return BeautifulSoup(response.text, 'lxml').find_all('table')


def currently(currently_table):
    """Add one Steamfree record per body row of the first table.

    For each row, cell 0 supplies the link and image source, cell 1 the name,
    cell 3 the type, and cells 4 and 5 the start and end times, which are
    converted with time_analysis(). Records are added to the session only.

    Returns:
        True after all rows have been added.
    """
    for row in currently_table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        # The site keeps a dedicated database for this feature.
        record = Steamfree(
            link=cells[0].a['href'],
            pic=cells[0].img['src'],
            name=cells[1].a.b.string,
            type=cells[3].string or cells[3].b.string,
            start=time_analysis(cells[4].string),
            end=time_analysis(cells[5].string),
        )
        db.session.add(record)
    return True


def upcoming(upcoming_table):
    """Add one Steamfree record per body row of the second table.

    For each row, cell 0 supplies the link and image source, cell 1 the name,
    cell 2 the type, and cells 3 and 4 the start and end times, which are
    converted with time_analysis(). Records are added to the session only.

    Returns:
        True after all rows have been added.
    """
    for row in upcoming_table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        record = Steamfree(
            link=cells[0].a['href'],
            pic=cells[0].img['src'],
            name=cells[1].a.b.string,
            type=cells[2].string or cells[2].b.string,
            start=time_analysis(cells[3].string),
            end=time_analysis(cells[4].string),
        )
        db.session.add(record)
    return True


def steamfree_delete_all():
    """Delete every Steamfree row and commit.

    Called before the table is refilled, because an update replaces all rows.
    """
    session = db.session
    for record in Steamfree.query.all():
        session.delete(record)
    session.commit()


def steamdb_free_update():
    """Manually refresh the stored free-game records and log the outcome.

    This is the manual refresh that the front end calls. The stored rows are
    replaced only when the page was fetched and contains at least the two
    tables that are parsed; otherwise the existing rows are left untouched
    and a failure line is logged.
    """
    log_path = 'XXX/steamfree_log.txt'
    tables = get_tables()
    # get_tables() returns None on failure and a list of tables otherwise.
    # The former isinstance(tables, int) test never matched, so a failed fetch
    # still deleted every row and then crashed on tables[0].
    if tables is not None and len(tables) >= 2:
        steamfree_delete_all()
        currently(tables[0])
        upcoming(tables[1])
        # currently()/upcoming() only add to the session. Commit explicitly so
        # the rows are saved without relying on configuration not visible here.
        db.session.commit()
        outcome = 'steamfree手動更新成功'
    else:
        outcome = 'steamfree手動更新失敗'
    with open(log_path, 'a') as f:
        f.write("{} {}\n".format(datetime.now(), outcome))

# Scheduled job: e-mail the stored records to every subscriber.
def send_steamfree_auto():
    """Send the stored Steamfree records to each subscriber and log each send."""
    # The job is scheduled with flask_apscheduler; database access needs an
    # application context.
    with scheduler.app.app_context():
        games = Steamfree.query.all()
        # Subscribed users.
        for subscriber in Steamfree_sub.query.all():
            send_email(
                to=subscriber.email,
                subject='steam免費信息',
                template='laboratory/email/steam_free_send',
                steamfrees=games,
                link='',
            )
            with open('XXX/steamfree_log.txt', 'a') as log:
                log.write("{} {} {}\n".format(datetime.now(), 'steamfree自動發送郵件成功', subscriber.email))

# Scheduled job: refresh the stored records automatically.
def steamdb_free_update_auto():
    """Refresh the stored free-game records inside an app context and log it.

    The stored rows are replaced only when the page was fetched and contains
    at least the two tables that are parsed; otherwise the existing rows are
    left untouched and a failure line is logged.
    """
    log_path = 'XXX/steamfree_log.txt'
    with scheduler.app.app_context():
        tables = get_tables()
        # get_tables() returns None on failure and a list of tables otherwise.
        # The former isinstance(tables, int) test never matched, so a failed
        # fetch still deleted every row and then crashed on tables[0].
        if tables is not None and len(tables) >= 2:
            steamfree_delete_all()
            currently(tables[0])
            upcoming(tables[1])
            # currently()/upcoming() only add to the session. Commit
            # explicitly so the rows are saved before the context is left.
            db.session.commit()
            outcome = 'steamfree自動更新成功'
        else:
            outcome = 'steamfree自動更新失敗'
        with open(log_path, 'a') as f:
            f.write("{} {}\n".format(datetime.now(), outcome))


# Refresh the database automatically every 6 hours.
scheduler.add_job(
    id='1',
    func=steamdb_free_update_auto,
    trigger='interval',  # 'interval' is a recurring job that runs every so often
    hours=6,
)

# Send the e-mails automatically every day at 10:00.
scheduler.add_job(
    id='2',
    func=send_steamfree_auto,
    trigger='cron',  # job that runs at fixed times
    day_of_week='0-6',
    hour=10,
    minute=0,
    second=0,
)

5.3 數據庫模型也貼一下

# Model holding the Steam free-game records.
class Steamfree(db.Model):
    """One scraped free-game entry; the dates are stored as strings."""
    __bind_key__ = 'laboratory'    # the site has several databases, so the bind must be named
    # NOTE(review): `__name__` was presumably meant to be `__tablename__`.
    # Confirm which table name is actually in use before changing it, because
    # renaming would point the model at a different table.
    __name__ = 'steamfrees'
    id = db.Column(db.Integer, primary_key=True)    # primary key
    link = db.Column(db.Text)    # link to the game's Steam page
    pic = db.Column(db.Text)    # thumbnail image
    name = db.Column(db.String)    # game name
    type = db.Column(db.String)    # kind of free offer
    start = db.Column(db.String)    # start time
    end = db.Column(db.String)    # end time


# Model holding the subscriptions to the Steam free-game service.
class Steamfree_sub(db.Model):
    """One subscription, identified only by the subscriber's e-mail address."""
    __bind_key__ = 'laboratory'
    # NOTE(review): `__name__` was presumably meant to be `__tablename__`.
    # Confirm which table name is actually in use before changing it.
    __name__ = 'steamfree_subs'
    id = db.Column(db.Integer, primary_key=True)
    email = db.Column(db.Text)    # only the subscriber's e-mail address is stored

5.4 網站前端也發出來吧(但是基礎頁面不發)

{% extends "base.html" %}

{% block contents %}
{# Steam free-game listing: subscribe/unsubscribe buttons, a short notice,
   and one table row per record in `steamfrees`. #}

<div class="container">
    <div style="margin-bottom: 20px">
        <h2>Steam免費遊戲資訊</h2>
        <div class="pull-right">
            <a class="btn btn-success" href="{{ url_for('laboratory.subscribe') }}">訂閱</a>
            <a class="btn btn-danger" href="{{ url_for('laboratory.unsubscribe') }}">退訂</a>
        </div>
        {# The refresh interval below matches the scheduler job (every 6 hours). #}
        <p>數據來源於
            <a href="https://steamdb.info/upcoming/free/" class="text-dark">steamdb</a>,
            本站每6小時刷新一次數據,並於每天10:00發送一封郵件到訂閱的用戶郵箱中。
            您可以免費訂閱,並可隨時退訂。
        </p>
    </div>
    <div style="background-color: white">
        <table class="table table-striped table-hover">
            <thead>
            <tr>
                <th></th>
                <th>遊戲名</th>
                <th>免費類型</th>
                <th>開始時間</th>
                <th>結束時間</th>
            </tr>
            </thead>
            <tbody>
            {% for steamfree in steamfrees %}
                <tr>
                    <td><img src="{{ steamfree.pic }}" alt="{{ steamfree.name }}"></td>
                    {# rel="noopener noreferrer" keeps the page opened in the new tab from reaching window.opener. #}
                    <td><a href="{{ steamfree.link }}" target="_blank" rel="noopener noreferrer">{{ steamfree.name }}</a></td>
                    <td>
                        {# Only the 'Weekend' and 'Keep' types are labelled; any other type renders an empty cell. #}
                        {% if steamfree.type == 'Weekend' %}
                        <span class="text-dark">限時遊玩</span>
                        {% elif steamfree.type == 'Keep' %}
                        <span class="text-success">限時領取</span>
                        {% endif %}
                    </td>
                    <td>{{ steamfree.start }}</td>
                    <td>{{ steamfree.end }}</td>
                </tr>
            {% endfor %}
            </tbody>

        </table>

    </div>
</div>

{% endblock %}

5.5 視圖,這裏只貼部分代碼

# Main page of the free-game listing.
@laboratory_blueprint.route('/laboratory/steamfree')
def steamfree():
    """Render the listing template with every stored Steamfree record."""
    context = {
        'steamfrees': Steamfree.query.all(),
        'current_user': current_user,
    }
    return render_template('laboratory/steamfree.html', **context)

# Manual refresh.
@laboratory_blueprint.route('/laboratory/steamfree/update')
@login_required
def steamfree_update():
    """Run the manual data refresh, then redirect to the listing page."""
    steamdb_free_update()
    listing_url = url_for('laboratory.steamfree')
    return redirect(listing_url)

# Subscribe.
@laboratory_blueprint.route('/laboratory/steamfree/subscribe')
@login_required
def subscribe():
    """Subscribe the logged-in user's e-mail address, then show the listing.

    A subscription row is created only when none exists for the address. In
    both cases a message is flashed and the user is redirected to the
    listing page.
    """
    existing = Steamfree_sub.query.filter(Steamfree_sub.email == current_user.email).first()
    if existing is None:
        db.session.add(Steamfree_sub(email=current_user.email))
        # Commit explicitly, as unsubscribe() does for its delete; the new row
        # was previously added to the session but never committed here.
        db.session.commit()
        flash('您已訂閱steam免費遊戲資訊。')
    else:
        flash('您已訂閱steam免費遊戲資訊,無需重複訂閱。')

    return redirect(url_for('laboratory.steamfree'))

# Unsubscribe.
@laboratory_blueprint.route('/laboratory/steamfree/unsubscribe')
@login_required
def unsubscribe():
    """Remove the logged-in user's subscription, if any, then show the listing."""
    subscription = Steamfree_sub.query.filter(Steamfree_sub.email == current_user.email).first()
    if subscription is not None:
        db.session.delete(subscription)
        db.session.commit()
        flash('您已退訂steam免費遊戲資訊。')
    else:
        flash('您還未訂閱steam免費遊戲資訊,無需退訂。')
    return redirect(url_for('laboratory.steamfree'))

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章