需求
批量上載本地 excel 中存儲數據至數據庫。
本地excel中數據爲二維結構,需先轉換爲一維表。
環境、工具、技術路線
Win7
Python 3.6
Spyder
流程圖
代碼
# _*_ coding: utf-8 _*_
'''
getConfig.py
從配置文件中讀取數據
'''
from configparser import ConfigParser
class Conf():
    """Reader for option values stored in an INI-style configuration file."""

    def __init__(self, path):
        """Remember the location of the config file; nothing is read yet."""
        self._path = path

    def getInfo(self, sec, U, P, ip, port, db):
        """Return five option values taken from one section of the file.

        The file is parsed afresh on every call.

        Args:
            sec: Name of the section to read from.
            U, P, ip, port, db: Names of the five options to look up
                inside ``sec``.

        Returns:
            A 5-tuple of strings, in the same order as the option-name
            arguments.
        """
        parser = ConfigParser()
        parser.read(self._path)
        option_names = (U, P, ip, port, db)
        return tuple(parser.get(sec, option) for option in option_names)
# _*_ coding: utf-8 _*_
'''
getDoc.py
1.獲取路徑下的文件
2.獲取指定【關鍵詞】篩選的文件
'''
import os
class Doc():
    """Directory-listing helper bound to a single folder path."""

    def __init__(self, path):
        """Store the folder path; the folder is not touched until queried."""
        self._path = path

    def getAll(self):
        """Return the names of all entries in the folder (``os.listdir``)."""
        return os.listdir(self._path)

    def getSome(self, key):
        """Return the entry names that contain the substring ``key``."""
        matches = []
        for name in os.listdir(self._path):
            if key in name:
                matches.append(name)
        return matches
# _*_ coding: utf-8 _*_
'''
up_sp_cash.py
- 上載消費和現金
'''
import os
import time
import pandas as pd
from getPath import Doc
from getConfig import Conf
from sqlalchemy import create_engine
# Folder that sp() scans for the CSV files to upload.
PATH = r'H:\SZ_數據\Download'

# Category labels: prefixes of the per-day CSV column names (see getHeader)
# and the values written to the category column of the flattened table.
CAT = (
    '總點擊',
    '搜索點擊',
    '無線搜索點擊',
    '自主投放',
    '新產品',
)

# Column names of the flattened (one value per row) table.
HEADER = (
    '日期',
    '用戶名',
    '類別',
    '金額',
)

# Timer used to measure the script's total run time.
now = time.perf_counter
def sp(key):
    """Upload every CSV in ``PATH`` whose file name contains ``key``.

    For each matching file, and for each day of the period encoded in its
    name (see ``getDat``), the day's per-account columns are selected, rows
    whose first-category value is not positive are dropped, the last
    category is derived, the wide table is flattened to one value per row
    and the result is appended to the database table named ``key``.

    Args:
        key: Keyword used to select files, to build the CSV column names
            and as the name of the target table.
    """
    doc = Doc(PATH)
    # One engine for the whole run instead of one per day: create_engine()
    # does not connect by itself, a connection is only taken inside
    # ``engine.begin()``, so each day is still committed in its own short
    # transaction.
    engine = create_engine(getUrl())
    try:
        for f in doc.getSome(key):
            # Read by full path rather than os.chdir(PATH), so the process
            # working directory is left untouched.
            df = pd.read_csv(os.path.join(PATH, f),
                             engine='python', encoding='GBK')
            df.rename(columns={'賬戶名稱': '用戶名'}, inplace=True)
            df.set_index('用戶名', inplace=True, drop=True)
            for dat in getDat(f):
                # Column names of this day, one per category in CAT.
                lis = getHeader(key, dat)
                # Keep rows with a positive first-category value. The copy
                # makes the column assignment below write to an independent
                # frame instead of a slice of ``df``.
                df1 = df.loc[df[lis[0]] > 0, lis[:-1]].copy()
                # Last category = first - second - fourth category.
                df1[lis[-1]] = df1[lis[0]] - df1[lis[1]] - df1[lis[3]]
                # Flatten to one (date, account, category, amount) row each.
                newDf = pd.DataFrame(columns=HEADER)
                newDf = toDimension(df1, newDf, dat, lis)
                print(dat, newDf.head(2))
                # Commit happens when the ``with`` block exits.
                with engine.begin() as conn:
                    newDf.to_sql(key, con=conn, if_exists='append',
                                 index=False)
    finally:
        # Close pooled connections even if a file or upload fails.
        engine.dispose()
def getDat(f):
    """Return every day of the quarter encoded in the file name ``f``.

    The name is expected to start with a two-digit year, followed by '年'
    and a quarter marker, e.g. ``'15年Q1消費.csv'`` ->
    ``'20150101'`` ... ``'20150331'``.

    Args:
        f: File name to parse.

    Returns:
        An iterator of ``'YYYYMMDD'`` strings covering the quarter, first
        and last day included.

    Raises:
        IndexError: If ``f`` does not contain '年'.
        ValueError: If no quarter marker (Q1-Q4) follows the year.
    """
    # First and last day (MMDD) of each quarter, checked in this order.
    quarter_bounds = (
        ('Q1', ('0101', '0331')),
        ('Q2', ('0401', '0630')),
        ('Q3', ('0701', '0930')),
        ('Q4', ('1001', '1231')),
    )

    def QtoDateRange(Q):
        # The first two characters of Q are the two-digit year.
        for marker, bounds in quarter_bounds:
            if marker in Q:
                return ['20' + Q[:2] + mmdd for mmdd in bounds]
        # Fail loudly here instead of returning None, which would surface
        # as an opaque unpacking TypeError in the caller.
        raise ValueError(
            'no quarter marker (Q1-Q4) found in file name {!r}'.format(f))

    parts = f.split('年')
    st, ed = QtoDateRange(parts[0] + parts[1].split('消費')[0])
    return map(lambda day: day.strftime('%Y%m%d'), pd.date_range(st, ed))
def getHeader(key, dat):
    """Return one column name per category: category + ``key`` + ``dat``.

    The names are produced in the order of ``CAT``.
    """
    names = []
    for label in CAT:
        names.append(label + key + dat)
    return names
def toDimension(df, newDf, dat, lis):
    """Flatten a wide per-account table into rows and add them to ``newDf``.

    Each row of ``df`` yields one output row per category in ``CAT``:
    ``[dat, account, category, value]``, where ``account`` is the row's
    index label and ``value`` comes from the column ``lis[n]`` matching the
    n-th category.

    Args:
        df: Wide table indexed by account, holding the columns in ``lis``.
        newDf: Frame the new rows are added after; it is not modified.
        dat: Date string written to the first column of every new row.
        lis: Column names of ``df``, aligned by position with ``CAT``.

    Returns:
        A new DataFrame with the rows of ``newDf`` followed by the new
        rows, indexed 0..n-1.
    """
    if len(df.index) == 0:
        return newDf

    # Same columns the per-cell lookup used: lis[n] for the n-th category.
    cols = [lis[n] for n, _ in enumerate(CAT)]

    # Walk rows positionally, so duplicated account labels still give one
    # scalar per cell, and collect plain lists instead of growing a frame.
    rows = []
    for record in df[cols].itertuples(index=True, name=None):
        acc, values = record[0], record[1:]
        rows.extend([dat, acc, cat, val] for cat, val in zip(CAT, values))

    flat = pd.DataFrame(rows, columns=HEADER)
    # Nothing to prepend: skip concatenating an empty frame.
    if newDf.empty and list(newDf.columns) == list(flat.columns):
        return flat
    # Single concat replaces the per-row DataFrame.append, which was
    # quadratic and no longer exists in pandas 2.0.
    return pd.concat([newDf, flat], ignore_index=True, sort=False)
def getUrl():
    """Build the database connection URL from the local config file.

    Reads the options ``accountname``, ``password``, ``ip``, ``port`` and
    ``dbname`` from the ``SQL Server`` section.

    Returns:
        A ``mssql+pymssql://user:password@host:port/database`` URL string.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    conf = Conf(r'H:\SZ_數據\Python\c.s.conf')
    user, password, host, port, database = conf.getInfo(
        'SQL Server', 'accountname', 'password', 'ip', 'port', 'dbname')
    # Percent-encode the credentials so that characters such as '@', ':'
    # or '/' inside them are not mistaken for URL delimiters. The config
    # file is expected to hold the raw, un-encoded values.
    return 'mssql+pymssql://%s:%s@%s:%s/%s' % (
        quote(user, safe=''), quote(password, safe=''),
        host, port, database)
if __name__ == '__main__':
    # Upload both data sets and report the total wall time in minutes.
    started = now()
    for keyword in ('消費', '現金'):
        sp(keyword)
    elapsed_min = (now() - started) / 60
    print('Runtime: {:.3f} min'.format(elapsed_min))
Optimize
- 多線程
- 算法?
- 其它?
Q/A
- SyntaxError(低級錯誤)
pd.DataFrame([0, 1], columns=['a', 'b']
->pd.DataFrame([[0, 1]], columns=['a', 'b'])
- pd.ExcelWriter(更好的使用習慣)
# df.to_excel()
with pd.ExcelWriter(os.path.join(PATH, 't' + dat + '.xlsx')) as writer:
df_2.to_excel(writer, sheet_name='sht1')
- 使用下面語句時
1.直到 newDf 上載完才 commit;
2.這個過程中會鎖住相關的表,拒絕所有操作(包括查詢),其他訪問只能排隊等待;因此應在上載前才連接數據庫,上載完成後即斷開。
with create_engine(getUrl()).begin() as conn:
newDf.to_sql(key, con=conn, if_exists='append', index=False)