日常記錄而已,不是特別工整,不喜勿噴,不喜勿看。
1. 數據來源於steamdb, 目標網址:https://steamdb.info/upcoming/free/
2. 由於網站存在反爬措施,在沒有cookie的情況下,網站會由js進行跳轉,跳轉過程中post表單的數據由js計算而來,詳情請看另一篇文章:steamdb反爬機制分析。爲了簡化工作量,這裏使用selenium訪問網頁,完成跳轉後獲取cookie,cookie有效期爲一天。
3. cookie有效時,直接使用requests訪問網站並爬取數據;當訪問失敗時,再使用selenium訪問網站以刷新cookie。
4. 上療效,原先準備作爲網站小功能的一部分,每日定時更新並提供訂閱定時發送郵件功能,但是由於太耗資源,因而放棄了。
5. 然後直接上代碼,這裏只貼有關steamdb的代碼,涉及到網站的部分就不貼了
5.1 本代碼負責手動更新cookie. 也用於selenium測試。
# -*- coding: utf-8 -*-
# @Author : LG
from selenium import webdriver
import argparse
from selenium.webdriver.support.ui import WebDriverWait
def update_cookie(url, file_name, delayed):
    """Load *url* in headless Firefox and save the browser cookies to *file_name*.

    The page counts as loaded once an element with id ``live-promotions`` can
    be found. ``WebDriverWait`` raises ``TimeoutException`` if that does not
    happen within *delayed* seconds; the exception propagates to the caller.

    Args:
        url: Page to open.
        file_name: Text file to overwrite; one ``name,value`` line per cookie.
        delayed: Maximum number of seconds to wait for the marker element.

    Returns:
        True once the cookie file has been written.
    """
    # Headless mode: do not open a browser window.
    option = webdriver.FirefoxOptions()
    option.add_argument('--headless')
    driver = webdriver.Firefox(options=option)
    try:
        driver.get(url)
        # Wait until the marker element exists or the timeout expires.
        # find_element('id', ...) is used instead of find_element_by_id(),
        # which newer Selenium releases no longer provide.
        WebDriverWait(driver, delayed).until(
            lambda drv: drv.find_element('id', 'live-promotions'))
        cookies = driver.get_cookies()
    finally:
        # Always end the browser session, including when the wait times out,
        # so no headless Firefox process is left behind.
        driver.quit()
    # The cookies are read before the file is opened, so a browser failure
    # does not truncate an existing cookie file.
    with open(file_name, 'w') as f:
        for cookie in cookies:
            print(cookie)
            f.write(cookie['name'] + ',' + cookie['value'] + '\n')
    return True
if __name__ == '__main__':
    # Command-line entry point: parse the URL, the output file name and the
    # wait timeout, then refresh the cookie file once.
    parser = argparse.ArgumentParser(description='update cookie')
    for short_flag, long_flag, value_type, fallback in (
            ('-u', '--url', str, 'https://steamdb.info/upcoming/free/'),
            ('-f', '--filename', str, 'cookie.txt'),
            ('-d', '--delayed', int, 15),
    ):
        parser.add_argument(short_flag, long_flag, type=value_type, default=fallback)
    args = parser.parse_args()
    update_cookie(args.url, args.filename, args.delayed)
5.2 自動更新以及自動發送郵件等功能的實現。
# Convert a scraped timestamp (treated as UTC) to local time.
def time_analysis(time_str):
    """Convert a scraped UTC timestamp string to an ``Asia/Chongqing`` local-time string.

    The input is split on whitespace. Tokens 0, 1 and 2 are read as day,
    English month name and year, and token 4 as ``HH:MM:SS``; token 3 is
    ignored. (Presumably the page renders something like
    ``'14 November 2019 – 18:00:00 UTC'`` -- confirm against the live page.)

    Args:
        time_str: Timestamp text taken from a table cell.

    Returns:
        The local time formatted as ``'YYYY-MM-DD HH:MM:SS'``.

    Raises:
        IndexError: If *time_str* has fewer than five whitespace-separated tokens.
        ValueError: If the rearranged text does not match ``%Y-%m-%d %H:%M:%S``
            (for example, an unknown month name).
    """
    months = {'January': '01', 'February': '02', 'March': '03', 'April': '04',
              'May': '05', 'June': '06', 'July': '07', 'August': '08',
              'September': '09', 'October': '10', 'November': '11', 'December': '12'}
    # split('') raises "ValueError: empty separator" on every call; splitting
    # on any whitespace also copes with non-breaking spaces in the scraped text.
    parts = time_str.split()
    # An unknown month name is passed through unchanged so that strptime
    # reports it as a ValueError.
    month_num = months.get(parts[1], parts[1])
    stamp = '{}-{}-{} {}'.format(parts[2], month_num, parts[0], parts[4])
    utc_dt = datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S")
    local_tz = pytz.timezone('Asia/Chongqing')
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)
    # Drop the trailing '+HH:MM' offset from the string form.
    return str(local_dt).split('+')[0]
# Refresh the saved cookies with a headless browser.
def update_cookie(delayed=15):
    """Refresh ``cookie.txt`` by loading the SteamDB page in headless Firefox.

    Args:
        delayed: Maximum number of seconds to wait for the element with id
            ``live-promotions``. The original body used this name without
            defining it; it is now a parameter with a default.

    Returns:
        True if the cookies were written to ``cookie.txt``. False if the page
        did not finish loading within *delayed* seconds or exactly one cookie
        was returned. Each outcome is appended to the log file.
    """
    # Imported locally because only this function needs it.
    from selenium.common.exceptions import TimeoutException

    url = 'https://steamdb.info/upcoming/free/'
    log_path = 'XXX/steamfree_log.txt'

    option = webdriver.FirefoxOptions()
    option.add_argument('--headless')
    driver = webdriver.Firefox(options=option)
    try:
        driver.get(url)
        try:
            # find_element('id', ...) replaces find_element_by_id(), which
            # newer Selenium releases no longer provide.
            WebDriverWait(driver, delayed).until(
                lambda drv: drv.find_element('id', 'live-promotions'))
            cookies = driver.get_cookies()
        except TimeoutException:
            # The marker element never appeared; report failure via the return value.
            cookies = None
    finally:
        # Always end the browser session; the original never closed the driver,
        # so every refresh left a Firefox process running.
        driver.quit()

    # Kept from the original: exactly one cookie is treated as a failed refresh
    # (presumably only a pre-redirect cookie was set -- TODO confirm).
    if cookies is None or len(cookies) == 1:
        with open(log_path, 'a') as f:
            f.write("{} {}\n".format(datetime.now(), 'cookie更新失敗'))
        return False

    with open('cookie.txt', 'w') as f:
        for cookie in cookies:
            f.write(cookie['name'] + ',' + cookie['value'] + '\n')
    with open(log_path, 'a') as f:
        # The original logged the failure message on this success path.
        f.write("{} {}\n".format(datetime.now(), 'cookie更新成功'))
    return True
# Read the saved cookies and request the page.
def get_html(url='https://steamdb.info/upcoming/free/'):
    """Request *url* with the cookies stored in ``cookie.txt``.

    Each line of ``cookie.txt`` is expected to be ``name,value``. Only the
    first comma separates name from value, so values containing commas are
    kept intact. Lines without a comma are skipped. If the file cannot be
    read, the request is sent with an empty ``Cookie`` header.

    Args:
        url: Page to request.

    Returns:
        The ``requests`` response object; callers inspect ``status_code``
        and ``text``.
    """
    headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:70.0) Gecko/20100101 Firefox/70.0"}
    cookie_parts = []
    try:
        with open('cookie.txt', 'r') as f:
            for line in f:
                name, sep, value = line.rstrip('\n').partition(',')
                if sep:
                    cookie_parts.append(name + "=" + value + ";")
    except OSError:
        # Deliberate best effort: with no readable cookie file the request goes
        # out without cookies and the caller reacts to the status code.
        pass
    headers["Cookie"] = ''.join(cookie_parts)
    # The response body is already downloaded when get() returns (no streaming),
    # so closing the session here does not affect the returned response.
    with requests.session() as session:
        return session.get(url, headers=headers)
# Extract the tables to scrape from the page. The two tables have slightly
# different layouts and are handled separately by the callers.
def get_tables():
    """Fetch the page and return all of its ``<table>`` elements.

    If the first request does not return HTTP 200, the failure is logged, the
    cookies are refreshed through ``update_cookie()`` and the request is
    retried once.

    Returns:
        The result of ``soup.find_all('table')`` on success, or None when the
        cookie refresh fails or the retry still does not return HTTP 200.
    """
    response = get_html()
    if response.status_code != 200:
        with open('XXX/steamfree_log.txt', 'a') as log:
            log.write("{} {} {}\n".format(datetime.now(), '請求失敗刷新cookie', response.status_code))
        # Request failed: refresh the cookies, giving up if that fails too.
        if not update_cookie():
            return None
        # Read the page again with the new cookies.
        response = get_html()
        if response.status_code != 200:
            return None
    # Request succeeded.
    return BeautifulSoup(response.text, 'lxml').find_all('table')
def currently(currently_table):
    """Add one ``Steamfree`` row to the session for each body row of the first table.

    Columns used: 0 (link and thumbnail), 1 (name), 3 (type), 4 (start) and
    5 (end). Rows are added to ``db.session`` only; this function does not commit.

    Args:
        currently_table: The parsed first ``<table>`` element.

    Returns:
        True after all rows have been added.
    """
    for row in currently_table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        # Steamfree is the model the site added specifically for this feature.
        record = Steamfree(
            link=cells[0].a['href'],
            pic=cells[0].img['src'],
            name=cells[1].a.b.string,
            # The type is either the cell's own text or the text of a nested <b>.
            type=cells[3].string or cells[3].b.string,
            start=time_analysis(cells[4].string),
            end=time_analysis(cells[5].string),
        )
        db.session.add(record)
    return True
def upcoming(upcoming_table):
    """Add one ``Steamfree`` row to the session for each body row of the second table.

    Columns used: 0 (link and thumbnail), 1 (name), 2 (type), 3 (start) and
    4 (end). Rows are added to ``db.session`` only; this function does not commit.

    Args:
        upcoming_table: The parsed second ``<table>`` element.

    Returns:
        True after all rows have been added.
    """
    for row in upcoming_table.find('tbody').find_all('tr'):
        cells = row.find_all('td')
        record = Steamfree(
            link=cells[0].a['href'],
            pic=cells[0].img['src'],
            name=cells[1].a.b.string,
            # The type is either the cell's own text or the text of a nested <b>.
            type=cells[2].string or cells[2].b.string,
            start=time_analysis(cells[3].string),
            end=time_analysis(cells[4].string),
        )
        db.session.add(record)
    return True
def steamfree_delete_all():
    """Delete every ``Steamfree`` row and commit; called before fresh data is inserted."""
    session = db.session
    for stale_row in Steamfree.query.all():
        session.delete(stale_row)
    session.commit()
def steamdb_free_update():
    """Manually refresh the stored free-game rows from the scraped page.

    Exposed so the front end can trigger a refresh on demand. Existing rows
    are replaced only when the page was fetched and contains at least the two
    tables that are parsed. Otherwise the stored rows are left untouched and
    a failure line is logged.
    """
    tables = get_tables()
    # get_tables() returns None on failure, never an int. The old
    # `not isinstance(tables, int)` check was therefore always true, so a
    # failed fetch wiped the table and then crashed on `tables[0]`.
    if tables is not None and len(tables) >= 2:
        steamfree_delete_all()
        currently(tables[0])
        upcoming(tables[1])
        # Persist the inserted rows explicitly instead of relying on an
        # implicit commit elsewhere.
        db.session.commit()
        with open('XXX/steamfree_log.txt', 'a') as f:
            f.write("{} {}\n".format(datetime.now(), 'steamfree手動更新成功'))
    else:
        with open('XXX/steamfree_log.txt', 'a') as f:
            f.write("{} {}\n".format(datetime.now(), 'steamfree手動更新失敗'))
# Scheduled job: e-mail the current list to every subscriber.
def send_steamfree_auto():
    """Send the stored free-game list to each subscriber and log each send.

    Runs inside ``scheduler.app.app_context()`` because the scheduled job
    needs an application context for the database queries.
    """
    with scheduler.app.app_context():
        games = Steamfree.query.all()
        # Every subscribed address gets the same list.
        for subscriber in Steamfree_sub.query.all():
            send_email(
                to=subscriber.email,
                subject='steam免費信息',
                template='laboratory/email/steam_free_send',
                steamfrees=games,
                link='',
            )
            with open('XXX/steamfree_log.txt', 'a') as log:
                log.write("{} {} {}\n".format(datetime.now(), 'steamfree自動發送郵件成功', subscriber.email))
# Scheduled job: refresh the stored rows.
def steamdb_free_update_auto():
    """Refresh the stored free-game rows from the scraped page (scheduled job).

    Runs inside ``scheduler.app.app_context()`` because the database access
    needs an application context. Existing rows are replaced only when the
    page was fetched and contains at least the two tables that are parsed.
    Otherwise the stored rows are left untouched and a failure line is logged.
    """
    with scheduler.app.app_context():
        tables = get_tables()
        # get_tables() returns None on failure, never an int. The old
        # `not isinstance(tables, int)` check was therefore always true, so a
        # failed fetch wiped the table and then crashed on `tables[0]`.
        if tables is not None and len(tables) >= 2:
            steamfree_delete_all()
            currently(tables[0])
            upcoming(tables[1])
            # Persist the inserted rows explicitly instead of relying on an
            # implicit commit elsewhere.
            db.session.commit()
            with open('XXX/steamfree_log.txt', 'a') as f:
                f.write("{} {}\n".format(datetime.now(), 'steamfree自動更新成功'))
        else:
            with open('XXX/steamfree_log.txt', 'a') as f:
                f.write("{} {}\n".format(datetime.now(), 'steamfree自動更新失敗'))
# Refresh the database every 6 hours. trigger='interval' makes this a
# recurring job that runs once per interval.
scheduler.add_job(
    id='1',
    func=steamdb_free_update_auto,
    trigger='interval',
    hours=6,
)
# Send the subscription e-mail at 10:00:00; trigger='cron' makes this a
# clock-scheduled job, and day_of_week='0-6' selects every day of the week.
scheduler.add_job(
    id='2',
    func=send_steamfree_auto,
    trigger='cron',
    day_of_week='0-6',
    hour=10,
    minute=0,
    second=0,
)
5.3 數據庫模型也貼一下
# Steam free-games table.
class Steamfree(db.Model):
    """One free-game entry as stored by ``currently()`` and ``upcoming()``."""

    # The site has several databases, so the bind must be named explicitly.
    __bind_key__ = 'laboratory'
    # NOTE(review): `__name__` is not the declarative table-name attribute
    # (`__tablename__` is), so this line probably does not name the table
    # 'steamfrees'. Changing it would alter which table the model maps to;
    # confirm against the deployed schema first.
    __name__ = 'steamfrees'

    id = db.Column(db.Integer, primary_key=True)
    link = db.Column(db.Text)     # link to the game's page
    pic = db.Column(db.Text)      # thumbnail image URL
    name = db.Column(db.String)   # game name
    type = db.Column(db.String)   # kind of free offer
    start = db.Column(db.String)  # start time
    end = db.Column(db.String)    # end time
# Subscription table for the Steam free-games service.
class Steamfree_sub(db.Model):
    """One subscriber to the free-games e-mail."""

    __bind_key__ = 'laboratory'
    # NOTE(review): `__name__` is not the declarative table-name attribute
    # (`__tablename__` is), so this line probably does not name the table
    # 'steamfree_subs'. Confirm against the deployed schema before changing it.
    __name__ = 'steamfree_subs'

    id = db.Column(db.Integer, primary_key=True)
    email = db.Column(db.Text)  # only the subscriber's e-mail address is stored
5.4 網站前端也發出來吧(但是基礎頁面不發)
{% extends "base.html" %}
{% block contents %}
{# Steam free-games page: subscribe/unsubscribe buttons and one table row per item in `steamfrees`. #}
<div class="container">
  <div style="margin-bottom: 20px">
    <h2>Steam免費遊戲資訊</h2>
    <div class="pull-right">
      <a class="btn btn-success" href="{{ url_for('laboratory.subscribe') }}">訂閱</a>
      <a class="btn btn-danger" href="{{ url_for('laboratory.unsubscribe') }}">退訂</a>
    </div>
    {# The refresh wording matches the scheduler: an interval job every 6 hours and a mail job at 10:00. #}
    <p>數據來源於
      <a href="https://steamdb.info/upcoming/free/" class="text-dark">steamdb</a>,
      本站每6小時刷新一次數據,並於每天10:00發送一封郵件到訂閱的用戶郵箱中。
      您可以免費訂閱,並可隨時退訂。
    </p>
  </div>
  <div style="background-color: white">
    <table class="table table-striped table-hover">
      <thead>
        <tr>
          <th></th>
          <th>遊戲名</th>
          <th>免費類型</th>
          <th>開始時間</th>
          <th>結束時間</th>
        </tr>
      </thead>
      <tbody>
        {% for steamfree in steamfrees %}
        <tr>
          <td><img src="{{ steamfree.pic }}" alt="{{ steamfree.name }}"></td>
          {# rel="noopener noreferrer" stops the opened page from reaching this one through window.opener. #}
          <td><a href="{{ steamfree.link }}" target="_blank" rel="noopener noreferrer">{{ steamfree.name }}</a></td>
          <td>
            {# Only the 'Weekend' and 'Keep' types get a label; any other type leaves the cell empty. #}
            {% if steamfree.type == 'Weekend' %}
            <span class="text-dark">限時遊玩</span>
            {% elif steamfree.type == 'Keep' %}
            <span class="text-success">限時領取</span>
            {% endif %}
          </td>
          <td>{{ steamfree.start }}</td>
          <td>{{ steamfree.end }}</td>
        </tr>
        {% endfor %}
      </tbody>
    </table>
  </div>
</div>
{% endblock %}
5.5 視圖,這裏只貼部分代碼
# Main page of the free-games feed.
@laboratory_blueprint.route('/laboratory/steamfree')
def steamfree():
    """Render the free-games page with every stored ``Steamfree`` row."""
    context = {
        'steamfrees': Steamfree.query.all(),
        'current_user': current_user,
    }
    return render_template('laboratory/steamfree.html', **context)
# Manual refresh.
@laboratory_blueprint.route('/laboratory/steamfree/update')
@login_required
def steamfree_update():
    """Run a manual data refresh, then redirect to the free-games page."""
    steamdb_free_update()
    target = url_for('laboratory.steamfree')
    return redirect(target)
# Subscribe.
@laboratory_blueprint.route('/laboratory/steamfree/subscribe')
@login_required
def subscribe():
    """Subscribe the logged-in user's e-mail address to the free-games mail.

    If no subscription exists for ``current_user.email``, one is stored;
    otherwise nothing changes. Either way a message is flashed and the user
    is redirected to the free-games page.
    """
    sub = Steamfree_sub.query.filter(Steamfree_sub.email == current_user.email).first()
    if sub is None:
        db.session.add(Steamfree_sub(email=current_user.email))
        # Commit explicitly, as unsubscribe() does; the original only added the
        # row to the session, so it was saved only if something else committed.
        db.session.commit()
        flash('您已訂閱steam免費遊戲資訊。')
    else:
        flash('您已訂閱steam免費遊戲資訊,無需重複訂閱。')
    return redirect(url_for('laboratory.steamfree'))
# Unsubscribe.
@laboratory_blueprint.route('/laboratory/steamfree/unsubscribe')
@login_required
def unsubscribe():
    """Remove the logged-in user's subscription, if any, then redirect to the free-games page."""
    existing = Steamfree_sub.query.filter(Steamfree_sub.email == current_user.email).first()
    if existing is not None:
        db.session.delete(existing)
        db.session.commit()
        message = '您已退訂steam免費遊戲資訊。'
    else:
        message = '您還未訂閱steam免費遊戲資訊,無需退訂。'
    flash(message)
    return redirect(url_for('laboratory.steamfree'))