import chardet
import os
# Root path handed to get_filelist when this file is run as a script.
filename = 'H:\\python\\source' # folder path
# Encoding every visited file is converted to.
Aim_Format = 'utf-8' # target encoding
# Placeholder value; no function visible here reads this global
# (Encoding_Format_Trans assigns its own local of the same name).
code_ifo = 'xxx'
def write_file(content, file):
    """Save ``content`` to the path ``file``, opened in binary write mode.

    Args:
        content: Bytes-like data to store.
        file: Path of the file to create; an existing file is truncated
            because the file is opened with mode 'wb'.
    """
    with open(file, mode='wb') as out:
        out.write(content)
def get_filelist(dir, file):
    """Walk ``dir`` recursively, collecting and converting every file found.

    Each regular file is appended to ``file`` and immediately passed to
    Encoding_Format_Trans with the module-level Aim_Format as target.
    Directories are descended into in ``os.listdir`` order. A path that is
    neither a regular file nor a directory is ignored.

    Args:
        dir: Path of a file or directory to process.
        file: List that accumulates the file paths; it is mutated in place.

    Returns:
        The same ``file`` list that was passed in.
    """
    if os.path.isfile(dir):
        file.append(dir)
        Encoding_Format_Trans(dir, Aim_Format)  # convert the encoding
        return file
    if os.path.isdir(dir):
        for entry in os.listdir(dir):
            get_filelist(os.path.join(dir, entry), file)
    return file
def get_file_info(file):
    """Return the encoding name chardet detects for a single file.

    Args:
        file: Path of the file to inspect; it is read in full in binary mode.

    Returns:
        The detected encoding name with surrounding whitespace stripped, or
        ``None`` when chardet reports no encoding (e.g. for an empty file).
    """
    # Use a context manager so the handle is closed instead of leaked.
    with open(file, 'rb') as f:
        data = f.read()
    encoding = chardet.detect(data)['encoding']
    # chardet reports None when it cannot guess; calling .strip() on that
    # value would raise AttributeError.
    return encoding.strip() if encoding is not None else None
def Encoding_Format_Trans(path_name_, _Aim_Format):
    """Convert the file at ``path_name_`` in place to ``_Aim_Format``.

    The current encoding is detected with get_file_info. When it differs
    from the target, the file content is decoded, re-encoded and written
    back over the same path with write_file. The encoding detected before
    and after is printed.

    Args:
        path_name_: Path of the file to convert.
        _Aim_Format: Name of the target encoding, e.g. 'utf-8'.
    """
    # An empty file has nothing to convert and gives encoding detection
    # nothing to work with, so skip it instead of failing the whole run.
    if os.path.getsize(path_name_) == 0:
        print('skip ', path_name_)
        return
    code_ifo = get_file_info(path_name_)
    print('before ', code_ifo)
    if not code_ifo:
        # Without an encoding name there is no codec to decode with;
        # leave the file untouched.
        return
    if code_ifo != _Aim_Format:
        if code_ifo == 'GB2312':
            # GBK covers a larger character set and can decode some text
            # that comes out garbled when treated as GB2312.
            code_ifo = 'gbk'
        # Read and close the file before it is overwritten below.
        with open(path_name_, 'rb') as f:
            raw = f.read()
        # NOTE(review): 'ignore' silently drops undecodable bytes, so a wrong
        # detection loses data in the overwritten file.
        file_decode = raw.decode(code_ifo, 'ignore')  # source bytes -> str
        file_encode = file_decode.encode(_Aim_Format)  # str -> target encoding
        write_file(file_encode, path_name_)
    code_test = get_file_info(path_name_)
    print('after ', code_test)
if __name__ == "__main__":
    # Walk the configured folder and convert every file found. The result is
    # bound to a name that does not shadow the built-in ``list``.
    file_list = get_filelist(filename, [])
    print('over')