使用 chardet 檢測文件編碼及處理亂碼

def save_data(line):
    """Append one decoded text line to the cleaned output CSV.

    The output file is reopened in append mode on every call and written as
    UTF-8. ``newline=""`` turns off newline translation, so whatever line
    ending ``line`` carries is written unchanged.

    Args:
        line: Text to append (a ``str``).
    """
    with open(
        "new微博評論.csv",
        mode="a+",
        newline="",
        encoding="utf-8",
    ) as out_file:
        out_file.write(line)
# Re-encoding pass: read the raw CSV as bytes, append every line that decodes
# as UTF-8 to the cleaned file via save_data(), and report every line that
# does not, without aborting the run.
#
# The file is read in binary mode so decoding happens per line and one bad
# line cannot stop the read. The ``with`` block closes the handle on exit.
with open("微博評論.csv", "rb") as f:
    # ``i`` is the 1-based line number; it is only used in the diagnostics.
    for i, line in enumerate(f, start=1):
        try:
            n_line = line.decode('utf8')
            save_data(n_line)
        except Exception as e:
            # Best-effort by design: log the failure together with the line
            # number and the raw bytes, then carry on with the next line.
            print(type(e), e)
            print("=========================")
            print(i, line)

使用 chardet 檢查文件編碼

import chardet
 
def judge(data):
    """Return the encoding name that chardet reports for ``data``.

    Args:
        data: Raw bytes to inspect.

    Returns:
        The value stored under the ``"encoding"`` key of the dictionary
        returned by ``chardet.detect``.
    """
    detection = chardet.detect(data)
    return detection["encoding"]
 
def error(e, q=1):
    """Show ``e`` as a prompt, wait for Enter, then optionally quit.

    Args:
        e: Message passed to ``input`` as the prompt text.
        q: When truthy (the default), the program exits with status 0 once
            the prompt has been acknowledged; when falsy the function just
            returns.
    """
    # Block until the user has acknowledged the message.
    input(e)
    if not q:
        return
    exit(0)
 
def trans(path):
    """Read a text file of unknown encoding and return its lines.

    The file is read as bytes, its encoding is guessed with ``judge`` and the
    content is decoded and split into lines with trailing whitespace removed.
    If splitting on ``"\\n"`` yields a single element, that element is split
    on ``"\\r"`` instead, so files that use bare carriage returns as line
    separators are still broken into lines.

    Args:
        path: Path of the file to read.

    Returns:
        A list of ``str`` lines. ``None`` is returned only when decoding fails
        and ``error`` returns instead of exiting.
    """
    # Context manager so the handle is closed even if read() raises.
    with open(path, "rb") as fh:
        data = fh.read()

    # judge() may yield None when no encoding could be detected (for example
    # for an empty file). Fall back to UTF-8, the encoding the error message
    # asks users to provide, rather than passing None to decode().
    coding = judge(data) or "utf-8"
    # GBK is a superset of GB2312, so decoding as GBK accepts more input.
    if coding == "GB2312":
        coding = "GBK"

    try:
        text = data.decode(coding)
    except (UnicodeError, LookupError) as e:
        # UnicodeError: the bytes are not valid in the detected encoding.
        # LookupError: Python has no codec for the detected encoding name.
        print(e)
        error("[!] 無法使用此文本,請使用utf8編碼的文本")
        return None

    arr = [row.rstrip() for row in text.split("\n")]
    if len(arr) == 1:
        return arr[0].split("\r")
    return arr
 
# Demo: decode "123.txt" with trans() and print the resulting list of lines.
decoded_lines = trans("123.txt")
print(decoded_lines)

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼?請在上方評論欄輸入並且點擊發布。
相關文章