目錄
factfile_path 爲json文件
casefile_path 爲txt文件
去除重複行(法一:利用內置set)
先了解Python splitlines()用法
str.splitlines([keepends])
按照行('\r', '\r\n', '\n')分隔,返回一個包含各行作爲元素的列表,參數 keepends 默認爲 False,不保留換行符,如果爲 True,則保留換行符。在處理中,建議默認不保留,在寫入的時候再定義換行符。
json涉及字符串與字典的轉換,需一行一行寫入,txt可利用file.writelines(list)進行整體寫入
import json
# f11 (source json file) and f22 (source txt file) are the originals;
# f and f2 are the new files being written.
with open(casefile_path2, 'a+', newline='', encoding='utf-8') as f, \
        open(casefile_path, 'r', encoding='utf-8') as f11:
    # casefile holds one JSON object per line: de-duplicate, then rewrite
    casetmp = f11.read().splitlines()
    print("casetmp", type(casetmp))  # <class 'list'>
    casetmp1 = list(set(casetmp))  # de-duplicate with the built-in set
    # restore the original line order; NOTE: list.index makes this O(n^2),
    # fine for small files but slow on large ones
    casetmp1.sort(key=casetmp.index)
    for each in casetmp1:
        case = json.loads(each)  # str -> dict
        json.dump(case, f, sort_keys=True, ensure_ascii=False)
        f.write("\n")  # one JSON object per output line
with open(factfile_path2, 'a+', newline='', encoding='utf-8') as f2, \
        open(factfile_path, 'r', encoding='utf-8') as f22:
    # factfile is plain txt: de-duplicate, then write in one call
    facttmp = f22.read().splitlines()
    print("facttmp", type(facttmp))  # <class 'list'>
    facttmp1 = list(set(facttmp))  # de-duplicate with the built-in set
    facttmp1.sort(key=facttmp.index)  # restore original order (O(n^2) via list.index)
    # splitlines() dropped the newlines, so re-append one per line
    fact_article = [tmp + "\n" for tmp in facttmp1]
    f2.writelines(fact_article)
簡化txt代碼
# Simplified txt de-duplication: read all lines, drop duplicates, write back.
# NOTE: the original snippet referenced undefined names `tmp` and `tmpnew`
# (NameError); fixed to `tmp_list` / `tmp_new`. Also note set() does not
# preserve the original line order.
with open('a.txt', encoding='utf-8') as file1:
    tmp_list = file1.read().splitlines()
tmp_new = set(tmp_list)  # de-duplicate with the built-in set
tmp_only = [line + "\n" for line in tmp_new]  # re-append the newline each line lost
with open('b.txt', 'w', encoding='utf-8') as file2:
    file2.writelines(tmp_only)
(這種.splitlines()方法需要注意,txt文本如果不規範,每一行以“\n”爲換行符,但是一行內含有“\n”特殊字符,這種文本就不適用,使用的結果是文本去重後,行數也許不減反增,這是因爲它把文件中本是一行的數據拆分了,用這種方法的首先考慮自己的文件格式,其次可以用下面的程序“查看文件行數長度”進行檢驗)
去除重複行(法二:建造list或set檢查是否已存在)
如果考慮到文本的長度是否會超出顯示,可以截取單行前多少字符,進行比較,比如我的數據有唯一的標識符id,我就對唯一的進行比較,而不是整行進行比較,如果不考慮文本長度問題就可以直接添加進list或set判斷。
import json
# Method 2: keep a set of already-seen IDs and skip any line whose ID was
# written before. f11 (json) and f22 (txt) are read in lockstep: the json
# record decides whether the paired txt line is kept.
# Fixes vs. the original: the `with` header ended in ',' instead of ':'
# (SyntaxError); `line` was parsed twice; `id` shadowed the builtin; and
# f22.readline() was only called for non-duplicates, which let the two
# files drift out of sync whenever a duplicate was skipped.
with open(casefile_path2, 'a+', newline='', encoding='utf-8') as f, \
        open(factfile_path2, 'a+', newline='', encoding='utf-8') as f2, \
        open(casefile_path, 'r', encoding='utf-8') as f11, \
        open(factfile_path, 'r', encoding='utf-8') as f22:
    lines_seen = set()
    for line in f11:
        # always advance f22 so the two inputs stay paired line-for-line
        fact_article = f22.readline()
        case = json.loads(line)
        case_id = case["ID"]  # unique record identifier; compare IDs, not whole lines
        if case_id not in lines_seen:
            lines_seen.add(case_id)
            json.dump(case, f, sort_keys=True, ensure_ascii=False)
            f.write('\n')
            f2.write(fact_article)
簡化代碼:
# Simplified streaming version: write each line the first time it appears.
# Fixes vs. the original: typo `outfiile`, and neither file was ever closed
# — both are now managed by a single with-statement.
read_path = 'a.txt'
write_path = 'b.txt'
lines_seen = set()
with open(read_path, 'r', encoding='utf-8') as infile, \
        open(write_path, 'a+', encoding='utf-8') as outfile:
    for line in infile:
        if line not in lines_seen:
            outfile.write(line)
            lines_seen.add(line)
按行寫入json/txt文件
def save_data(case, fact_article):
    """Persist one record pair.

    `case` (a dict) is appended to the json file as a single JSON line;
    `fact_article` (a string) is appended to the txt file as one line.
    Output paths come from the module-level globals below.
    """
    global factfile_path
    global casefile_path
    with open(casefile_path, 'a+', encoding='utf-8') as case_file:
        # one JSON object per line; sorted keys, non-ASCII kept readable
        json.dump(case, case_file, sort_keys=True, ensure_ascii=False)
        case_file.write('\n')
    with open(factfile_path, 'a+', newline='', encoding='utf-8') as fact_file:
        fact_file.write(fact_article)
        fact_file.write('\n')
同時寫入文件,將f11文件追加寫入f。兩個文件同步寫入。json文件比較特殊需要一行一行進行。
# Append every record from the *_path2 files onto the main files, keeping the
# two outputs in step. The json file must be copied line by line (each line is
# a separate JSON document), while the txt line can be passed through as-is.
with open(casefile_path, 'a+', newline='', encoding='utf-8') as f, \
        open(factfile_path, 'a+', newline='', encoding='utf-8') as f2, \
        open(casefile_path2, 'r', encoding='utf-8') as f11, \
        open(factfile_path2, 'r', encoding='utf-8') as f22:
    # f11 is a _io.TextIOWrapper; iterating it yields one str per line
    for line in f11:
        parsed = json.loads(line)  # str -> dict
        json.dump(parsed, f, sort_keys=True, ensure_ascii=False)
        f.write('\n')  # terminate the JSON line
        # the matching txt line already ends with '\n', so no extra newline
        f2.write(f22.readline())
查看文件行數長度
def zhuijia(factfile_path):
    """Print and return the number of lines in *factfile_path*.

    Fixes vs. the original: the file is now opened with a context manager
    (it leaked on any exception before close()), and lines are counted
    lazily instead of materializing the whole file via readlines().
    Returning the count is new but backward-compatible (callers that
    ignored the old None return are unaffected).
    """
    with open(factfile_path, encoding='utf-8') as factfile:
        factfile_lines = sum(1 for _ in factfile)
    print(factfile_lines)
    return factfile_lines