文件存儲格式轉換(ASCII&UTF-8)
在用 Source Insight[version 3.50.0080] 看用在 Linux 上的代碼時發現對中文註釋的支持很不友好,看到網上又說要改註釋字體爲“新宋體”(/“宋體”)的,但我沒弄成。就想着直接把編碼爲 UTF-8 的文件存爲 ASCII,首先想到的是“記事本”中的“另存爲”,但當文件太多時顯然不行。
搜了好多,發現一篇寫得還不錯的文章《sourceinsight中文顯示亂碼問題徹底解決辦法》,
簡單明瞭,不過好像有點問題——會把原本就是 ASCII 的文件弄壞。我把它稍微改進了一下(改爲在命令行輸入目標文件夾,但並不能修復關於 ASCII 的問題 -.-。另外,記事本存爲 UTF-8 時其實是 “UTF-8 with BOM”,這也帶來了不少問題),貼在下邊:
@echo off
rem Round-trips every matching source file under the given folder through
rem native2ascii (UTF-8 -> \uXXXX escapes -> platform default encoding).
rem Usage: <this script> <target folder>
setlocal
rem %~1 strips surrounding quotes so the folder can be re-quoted consistently.
set "DIR=%~1"
if "%DIR%"=="" (
    echo "Should input the directory name"
    exit /b 1
)
rem Scratch file used between the two native2ascii passes.
set "TMPFILE=%DIR%\temp"
for /R "%DIR%" %%i in (*.h *.c *.cpp *.cs *.mak *.java) do (
    echo %%i
    rem Quote every path so files and folders containing spaces still work.
    native2ascii -encoding UTF-8 "%%i" "%TMPFILE%"
    native2ascii -reverse "%TMPFILE%" "%%i"
)
rem Do not leave the scratch file behind in the target folder.
if exist "%TMPFILE%" del "%TMPFILE%"
echo ALL DONE
pause
endlocal
關於 native2ascii 的一些參考資料:
所以,就自己寫了個 python 程序來實現所需功能:ASCII(這裏實際指 ANSI,即 GBK)與 UTF-8 互相轉換:
注:需要自行安裝 chardet 模塊,且我的 python 環境是 2.7
使用方式: python transformFormat.py fileOrDirName toUTF_8(True/False) fileExtensions(c,cpp,h,cs,mak)[optional]
比如:python transformFormat.py H:\test True c cpp h
就可以將 H:\test 文件夾下的所有後綴爲 .c/.cpp/.h 的文件轉爲 UTF-8 模式(原來的格式並不牽扯)
"""
transformFormat.py: convert the encoding of files in place, especially between
ASCII/GBK and UTF-8.
"""
class Transform(object):
    """Convert files between GBK and UTF-8 in place.

    ``main`` is the entry point: it collects the files under a root path,
    keeps those with a wanted extension and hands each one to ``transform``.
    """

    def listFiles(self, root=''):
        """Return the paths of all regular files at or below ``root``.

        root -- a file path (returned as a one-element list) or a directory,
                which is listed recursively with ``os.listdir``.
        """
        import os

        if os.path.isfile(root):  # root is just a file
            return [root]

        allFiles = []
        for name in os.listdir(root):  # root is a directory
            path = os.path.join(root, name)
            if os.path.isdir(path):
                allFiles += self.listFiles(root=path)
            elif os.path.isfile(path):
                allFiles.append(path)
        return allFiles

    def transform(self, fileName, toUTF_8):
        """Re-encode ``fileName`` in place.

        fileName -- path of the file to convert.
        toUTF_8  -- True: convert GBK -> UTF-8; False: convert UTF-8 -> GBK.

        The current encoding is guessed with ``chardet``. A file whose guessed
        encoding already matches the requested direction is left untouched.
        Conversion and write errors are reported on stdout, not raised.
        """
        import chardet
        import codecs

        # Binary mode keeps the bytes exactly as stored (no newline
        # translation) and gives chardet/decode the byte string they need.
        with open(fileName, 'rb') as f:
            data = f.read()
        if data[:3] == codecs.BOM_UTF8:  # In case of UTF-8 with BOM
            data = data[3:]
        try:
            print('Transform begin, file: ' + fileName + ';toUTF_8: ' + str(toUTF_8))
            encodeType = chardet.detect(data)['encoding']
            if encodeType is None:
                # chardet could not guess (e.g. an empty file): nothing to do.
                print(fileName + ' Skipped, encoding not detected')
                return
            encodeType = encodeType.upper()
            print('%s %s' % (fileName, encodeType))
            # Any detected name containing 'UTF' is treated as "already UTF-8".
            alreadyUTF_8 = 'UTF' in encodeType
            if toUTF_8 == alreadyUTF_8:  # Do not need to transform, already OK
                print(fileName + ' Already')
                return
            if toUTF_8:
                data = data.decode('gbk', 'ignore').encode('utf-8')
            else:
                data = data.decode('utf-8', 'ignore').encode('gbk')
            # The file is only opened for writing after conversion succeeded,
            # so a failed conversion never truncates it.
            with open(fileName, 'wb') as f:
                f.write(data)
            print(fileName + ' OK')
        except Exception as e:
            print('WRONG with ' + fileName)
            print(e)

    def main(self, root='', toUTF_8=True, fileExtensions=''):
        """Convert every file under ``root`` whose extension is wanted.

        root           -- file or directory to process.
        toUTF_8        -- conversion direction, passed through to ``transform``.
        fileExtensions -- iterable of extensions (with or without the leading
                          dot), or one comma-separated string such as 'c,cpp,h'.
        """
        import os

        if hasattr(fileExtensions, 'split'):
            # A plain string: split it rather than substring-matching against
            # it, which would let 'c' match 'cpp'.
            fileExtensions = fileExtensions.split(',')
        wanted = set(ext.strip().lstrip('.') for ext in fileExtensions)
        wanted.discard('')  # files without an extension never match

        targets = [path for path in self.listFiles(root=root)
                   if os.path.splitext(path)[1][1:] in wanted]
        if not targets:
            print('No file to transform')
            return
        for path in targets:
            self.transform(path, toUTF_8)
#t = Transform()
#root = 'H:\leetcode\wingide\he'
#fE = ['c','cpp','h','cs','mak','txt']
#t.main(root=root,toUTF_8=False, fileExtensions = fE)
#exit()
if __name__ == '__main__':
print('Usage: python transformFormat.py fileOrDirName toUTF_8(True/False) fileExtensions(c,cpp,h,cs,mak)[optional]')
import sys
#print(sys.argv)
if len(sys.argv) < 2:
print("No file name!")
exit()
if len(sys.argv) == 2:
print('Should give toUTF_8')
exit()
root = sys.argv[1]
if len(sys.argv) >= 3:
if sys.argv[2] == 'True':
toUTF_8 = True
elif sys.argv[2] == 'False':
toUTF_8 = False
else:
print('toUTF should be True or False')
fileExtensions = ['c','cpp','h','cs','mak']
if len(sys.argv) > 3:
fileExtensions = sys.argv[3:]
print('Transform begin, root: ' + root + ';toUTF_8: ' + str(toUTF_8) + ';fileExtensions:' + str(fileExtensions))
t = Transform()
t.main(root=root, toUTF_8=toUTF_8,fileExtensions=fileExtensions)
print('Transform Over')
參考資料: