關於UCS-2編碼就不多說了,Google一下就大把大把的了
直接上代碼
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
; Target: MASM, 32-bit x86, flat memory model, stdcall as the default
; calling convention. Identifiers are case-sensitive (casemap:none).
        .386
        .model  flat, stdcall
        option  casemap:none

; Declarations pulled in from the include files.
        include     windows.inc
        include     kernel32.inc
        include     user32.inc

; Import libraries to link against.
        includelib  kernel32.lib
        includelib  msvcrt.lib
        includelib  user32.lib
; C library routines used by this program, declared by hand with the
; C calling convention (the caller removes the arguments from the stack).
; The Win32 file API could of course be used instead.
fopen   PROTO C :DWORD, :DWORD                  ; (filename, mode)
fread   PROTO C :DWORD, :DWORD, :DWORD, :DWORD  ; (buffer, nSize, count, fp)
fclose  PROTO C :DWORD                          ; (fp)
feof    PROTO C :DWORD                          ; (fp)
fseek   PROTO C :DWORD, :DWORD, :DWORD          ; (fp, lOffset, origin)
ftell   PROTO C :DWORD                          ; (fp)
calloc  PROTO C :DWORD, :DWORD                  ; (num, len)
free    PROTO C :DWORD                          ; (memblock)
.data
szTestFile db
'test.txt', 0
szMode db
'r', 0
szOpenErr db
'Failed to open file', 0
szNotUCS2 db
'File not
encoding by UCS-2', 0
.code
; 對已打開的fp進行讀取並轉換成UTF-8,存放到buf
;len爲buf的長度,應預留一位作0填充
;fIsBe標記編碼是Big Endian 還是Little Endian
_ReadUCS2AsUTF8 proc
fp:dword
, buf:dword
, len:dword
, fIsBe:byte
local
@word
:word
.if
!buf
ret
.endif
;先將buf填0,這裏沒判斷len....
invoke
RtlZeroMemory, buf, len
.if
!fp
ret
.endif
mov
edi
, buf
.while
1
;按字讀取
invoke
fread, addr
@word
, sizeof word
, 1, fp
mov
ax
, @word
.if
fIsBe == 1 ; big endian, 需交換低高字節
xchg
al
, ah
.endif
.if
ax
< 080h ; ASCII
stosb
dec
len
.if
len == 1
.break
.endif
.elseif
ax
< 0800h
mov
bx
, ax
shr
ax
, 6
or
ax
, 0C0h
stosb
dec
len
.if
len == 1
.break
.endif
mov
ax
, bx
and
ax
, 03Fh
or
ax
, 080h
stosb
dec
len
.if
len == 1
.break
.endif
.elseif
ax
< 0FFFFh
mov
bx
, ax
shr
ax
, 12
or
ax
, 0E0h
stosb
.if
len == 1
.break
.endif
mov
ax
, bx
shr
ax
, 6
and
ax
, 03Fh
or
ax
, 080h
stosb
dec
len
.if
len == 1
.break
.endif
mov
ax
, bx
and
ax
, 03Fh
or
ax
, 080h
stosb
dec
len
.if
len == 1
.break
.endif
.endif
invoke
feof, fp
.if
eax
.break
.endif
.endw
ret
_ReadUCS2AsUTF8 endp
; Byte-order marks as they appear when the first two file bytes are loaded
; as one little-endian word.
BOM_UCS2_LE equ 0FEFFh                          ; file bytes FF FE
BOM_UCS2_BE equ 0FFFEh                          ; file bytes FE FF

;-----------------------------------------------------------------------
; _Test - demo driver: open szTestFile, check its byte-order mark, convert
;         the UCS-2 contents to UTF-8 with _ReadUCS2AsUTF8, convert that to
;         the ANSI code page and show the result in a message box.
;
; ABI:   stdcall, no arguments, no meaningful return value.
; Clobb: eax, ecx, edx, flags.
; Every allocation is released and the file is closed on all paths that
; get past fopen.
;-----------------------------------------------------------------------
_Test proc
        local   @buffer:dword                   ; UTF-8 text, later the ANSI text
        local   @fp:dword                       ; FILE* of the input file
        local   @len:dword                      ; file length, later conversion sizes
        local   @cbUtf8:dword                   ; capacity of the UTF-8 buffer
        local   @mark:word                      ; first two bytes of the file (BOM)
        local   @unicode:dword                  ; UTF-16 intermediate text

        ; NULL pointers make the shared cleanup safe: free(NULL) does nothing.
        xor     eax, eax
        mov     @buffer, eax
        mov     @unicode, eax

        invoke  fopen, addr szTestFile, addr szMode
        .if !eax
            invoke  MessageBox, 0, addr szOpenErr, 0, 0
            ret
        .endif
        mov     @fp, eax

        ; Determine the file length, then rewind.
        invoke  fseek, @fp, 0, SEEK_END
        invoke  ftell, @fp
        mov     @len, eax
        invoke  fseek, @fp, 0, SEEK_SET

        ; Read the two-byte mark. A failed ftell (-1), a file shorter than
        ; the mark, or a failed read all count as "not UCS-2".
        mov     @mark, 0
        .if sdword ptr @len >= 2
            invoke  fread, addr @mark, sizeof @mark, 1, @fp
            .if eax != 1
                mov     @mark, 0
            .endif
        .endif
        .if @mark != BOM_UCS2_LE && @mark != BOM_UCS2_BE
            invoke  MessageBox, 0, addr szNotUCS2, 0, 0
            jmp     _cleanup
        .endif

        ; One UCS-2 code unit (2 bytes) becomes at most 3 UTF-8 bytes, so a
        ; buffer of the file's own length can be too small. Size it as
        ; 3 * (file length / 2) + 1; counting the BOM's two bytes as well
        ; leaves a little headroom beyond the terminating NUL.
        mov     eax, @len
        shr     eax, 1
        lea     eax, [eax + eax*2 + 1]
        mov     @cbUtf8, eax
        invoke  calloc, @cbUtf8, 1
        mov     @buffer, eax
        .if !eax
            jmp     _cleanup
        .endif

        .if @mark == BOM_UCS2_BE
            invoke  _ReadUCS2AsUTF8, @fp, @buffer, @cbUtf8, 1
        .else
            invoke  _ReadUCS2AsUTF8, @fp, @buffer, @cbUtf8, 0
        .endif

        ; UTF-8 -> UTF-16: first ask for the required size, then convert.
        invoke  MultiByteToWideChar, CP_UTF8, 0, @buffer, -1, 0, 0
        .if !eax
            jmp     _cleanup
        .endif
        mov     @len, eax
        invoke  calloc, @len, 2
        mov     @unicode, eax
        .if !eax
            jmp     _cleanup
        .endif
        invoke  MultiByteToWideChar, CP_UTF8, 0, @buffer, -1, @unicode, @len

        ; The UTF-8 copy is no longer needed; clear the pointer so the
        ; cleanup below cannot free it a second time.
        invoke  free, @buffer
        mov     @buffer, 0

        ; UTF-16 -> ANSI code page, same two-step pattern.
        invoke  WideCharToMultiByte, CP_ACP, 0, @unicode, -1, 0, 0, 0, 0
        mov     @len, eax
        invoke  calloc, @len, 1
        mov     @buffer, eax
        .if eax
            invoke  WideCharToMultiByte, CP_ACP, 0, @unicode, -1, @buffer, @len, 0, 0
            invoke  MessageBox, 0, @buffer, 0, 0
        .endif

_cleanup:
        invoke  fclose, @fp
        invoke  free, @buffer
        invoke  free, @unicode
        ret
_Test endp
; Program entry point: run the demo once, then end the process with
; exit code 0.
start:
        invoke  _Test
        push    0                               ; uExitCode
        call    ExitProcess
        end     start
;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
挺簡單的一個測試程序,新建一個test.txt放到程序根目錄下即可
可用Notepad++轉換成其它編碼多測試幾下