目錄下有如圖所示的 60 個 txt 文件,每個 txt 文件裏的數據大約有 7000 萬行
目的:先對每個 txt 文件裏的數據去重,再把 60 個文件合併成一個總文件;然後把總文件裏的數據按第一列、第二列分組,統計第三列去重後的出現次數(即 UV)。
每個文件的內容如下:
代碼如下:
# -*- coding:utf-8 -*-
from datetime import datetime
import pandas as pd
import os
def Main(source_dir='/ford_tongji/uaad/',
         target_dir='/ford/ford_tongji/uaad/distinct/',
         target_txt='/ford/ford_tongji/uaad/distinct/merge_result.txt',
         uv_txt='uv_result.txt'):
    """Deduplicate, merge and aggregate a directory of tab-separated files.

    Pipeline:
      1. For every file in *source_dir* whose name contains '201708' or
         '201709', drop duplicate rows and write a 'distinct_' copy into
         *target_dir*.  201708 files store columns as (ad, number, name)
         and are reordered to the canonical (number, name, ad) layout;
         201709 files are already in that layout.
      2. Concatenate every 'distinct_2017*' file in *target_dir* into the
         single merged file *target_txt*.
      3. Group the merged data by (number, name) and count the distinct
         'ad' values per group (the "UV"), writing the result to *uv_txt*.

    All parameters default to the original hard-coded paths, so existing
    callers invoking ``Main()`` behave exactly as before.
    """
    print("開始。。。。。")
    print("加載規則數據")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # Pass 1: per-file deduplication.
    for file in os.listdir(source_dir):
        if '201708' in file:
            # 201708 layout is (ad, number, name) -> reorder after dedup.
            data = pd.read_csv(os.path.join(source_dir, file), sep="\t",
                               header=None, names=['ad', 'number', 'name'])
            data = data.drop_duplicates()[['number', 'name', 'ad']]
            # header=False (not None): to_csv's `header` is a boolean flag.
            data.to_csv(os.path.join(target_dir, 'distinct_' + file),
                        sep="\t", header=False, index=False)
        if '201709' in file:
            data = pd.read_csv(os.path.join(source_dir, file), sep="\t",
                               header=None,
                               names=['number', 'name', 'ad']).drop_duplicates()
            data.to_csv(os.path.join(target_dir, 'distinct_' + file),
                        sep="\t", header=False, index=False)
    # Pass 2: concatenate the deduplicated files into one merged file.
    # BUG FIX: the original opened `file` (bare name) instead of the full
    # path under target_dir, which raised FileNotFoundError unless the
    # process CWD happened to be target_dir.
    with open(target_txt, "w") as ff_write:
        for file in os.listdir(target_dir):
            if 'distinct_2017' in file:
                with open(os.path.join(target_dir, file), 'r') as f_read:
                    for line in f_read:
                        # Normalise whitespace/newlines exactly as before.
                        ff_write.write("\t".join(line.strip().split("\t")) + "\n")
    # Pass 3: UV — number of distinct 'ad' values per (number, name) pair.
    uv = (pd.read_csv(target_txt, sep="\t", header=None,
                      names=['number', 'name', 'ad'])
          .groupby(['number', 'name'])
          .agg({'ad': pd.Series.nunique}))
    uv.to_csv(uv_txt, sep="\t", header=False, index=True)
    print("處理完成。。。。。")
    print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))


if __name__ == "__main__":
    Main()
版權聲明:本文爲博主原創文章,未經博主允許不得轉載。