"""Scrape article listings from woshipm.com and load them into MySQL.

Approach:
1. Crawl the data into a local file first.
2. Define an ORM class and let SQLAlchemy create the table automatically.
3. Read the local file back and build an ORM object for each record.
4. Use SQLAlchemy to perform the database INSERT operations automatically.
"""
import requests
import time
import json
import datetime
from sqlalchemy import create_engine, Column, String, Integer, Float, Date
from sqlalchemy.ext.declarative import declarative_base
# 用於創建一個ORM類
from sqlalchemy.orm import sessionmaker
# mysql會話工廠
engine = create_engine('mysql://username:password@ipAdress/datebaseName')
# Connect to the database — placeholder DSN; fill in real credentials/host/db name.
Base = declarative_base()
# Declarative base object; the ORM class below inherits from it.
def strtodate(s: str, mode: str = '%Y/%m/%d') -> datetime.datetime:
    """Parse a date string such as '2020/1/2' into a datetime.

    Args:
        s: The date string to parse.
        mode: strptime format string; defaults to '%Y/%m/%d'.

    Returns:
        The parsed datetime.datetime (time components are zero).

    Raises:
        ValueError: If ``s`` does not match ``mode``.
    """
    # Original annotated the return as `datetime` (the module), which is not
    # a valid type; the function actually returns datetime.datetime.
    return datetime.datetime.strptime(s, mode)
class PMarticle(Base):
    """ORM model: one row per scraped article in the `article` table."""
    __tablename__ = 'article'
    # Table name
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Auto-incrementing integer primary key
    title = Column(String(255))
    # Article title (schema allows up to 255 chars)
    url = Column(String(255))
    # Article permalink URL
    date = Column(Date)
    # Publication date
Base.metadata.create_all(engine)
# Create the table(s) automatically if they do not already exist.
# NOTE: runs at import time, so importing this module touches the database.
def main():
    """Crawl listing pages into j.txt, then load every article into MySQL.

    Phase 1: fetch each listing page's JSON and append one JSON line per
    page to j.txt. Phase 2: read j.txt back and insert one row per article.
    """
    # --- Phase 1: crawl pages and append each page's JSON to a local file ---
    for page in range(1, 3):  # the full site has ~4194 pages
        url = "http://www.woshipm.com/__api/v1/stream-list/page/" + str(page)
        # timeout so a stalled request cannot hang the crawl forever
        resp = requests.get(url, timeout=10)
        page_json = resp.json()
        # `with` closes the file automatically — the original's extra
        # f.close() inside the with-block was redundant
        with open('j.txt', 'a', encoding='utf-8') as f:
            f.write(json.dumps(page_json) + '\n')
        print('已完成第' + str(page) + '頁')

    # --- Phase 2: replay the cached file and insert every article ---
    DBsession = sessionmaker(bind=engine)
    # One session for the whole load (the original opened and closed a new
    # session per article, which churns connections and commits per row).
    session = DBsession()
    try:
        with open('j.txt', 'r', encoding='utf-8') as f1:
            for line in f1:
                print(line)
                payload = json.loads(line)['payload']
                for article in payload:
                    session.add(PMarticle(
                        title=article['title'],
                        url=article['permalink'],
                        date=strtodate(article['date']),
                    ))
        session.commit()  # single commit: the load is all-or-nothing
    finally:
        session.close()  # always release the connection, even on error
if __name__ == "__main__":
main()