用MASM实现读UCS-2文件

关于UCS-2编码就不多说了,Google一下就大把大把的了

直接上代码

;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
; Target: 32-bit x86, MASM syntax, flat memory model.
; Default calling convention for PROC/PROTO/INVOKE is stdcall (Win32 API);
; C runtime routines are declared explicitly with "proto C".
.386
.model  flat, stdcall
option  casemap :none           ; identifiers are case sensitive

include  windows.inc
include  kernel32.inc
include  user32.inc

includelib  kernel32.lib
includelib  msvcrt.lib
includelib  user32.lib

; C runtime routines used by this program (resolved through msvcrt.lib).
; All of them use the C (cdecl) calling convention, hence "PROTO C".
; The equivalent Win32 file/heap APIs could be used instead.

; --- stream I/O ---
fopen   PROTO C filename:DWORD, mode:DWORD
fread   PROTO C buffer:DWORD, nSize:DWORD, count:DWORD, fp:DWORD
fclose  PROTO C fp:DWORD
feof    PROTO C fp:DWORD
fseek   PROTO C fp:DWORD, lOffset:DWORD, origin:DWORD
ftell   PROTO C fp:DWORD

; --- heap ---
calloc  PROTO C num:DWORD, len:DWORD
free    PROTO C memblock:DWORD

.data

szTestFile        db  'test.txt', 0                     ; input file, relative to the current directory
; The file holds raw 16-bit code units, so it must be opened in BINARY mode.
; Text mode ('r') makes the CRT translate CR/LF pairs and stop at 0x1A (Ctrl+Z),
; which corrupts or truncates UCS-2 data.
szMode            db  'rb', 0
szOpenErr         db  'Failed to open file', 0          ; shown when fopen fails
szNotUCS2         db  'File not  encoding by UCS-2', 0  ; shown when no UCS-2 BOM is found

.code

;-----------------------------------------------------------------------
; _ReadUCS2AsUTF8 - read 16-bit UCS-2 code units from an open stream and
;                   store them in buf as a NUL-terminated UTF-8 string.
;
; ABI:   stdcall (callee pops arguments), 32-bit flat model
; In:    fp    = FILE* positioned at the first code unit (just past the BOM)
;        buf   = destination buffer
;        len   = size of buf in bytes; one byte is reserved for the
;                terminating 0, so at most len-1 bytes of UTF-8 are written
;        fIsBe = non-zero: units are big endian; 0: little endian
; Out:   eax   = number of UTF-8 bytes written (terminator not counted);
;                0 when buf or fp is NULL
; Clobb: ecx, edx, flags (ebx and edi are saved/restored via USES)
;
; Notes: * buf is zero-filled up front, which also provides the terminator.
;        * A character is written only if its whole UTF-8 sequence fits,
;          so the output never ends in a truncated multi-byte sequence.
;        * Reading stops as soon as fread cannot deliver a complete unit
;          (end of file, read error, or a trailing odd byte).
;-----------------------------------------------------------------------
_ReadUCS2AsUTF8    proc     uses ebx edi fp:dword , buf:dword , len:dword , fIsBe:byte
                   local    @word :word

        xor         eax , eax                   ; default return: 0 bytes written
        .if         !buf
                    ret
        .endif

        ; zero the whole buffer; whatever is not overwritten acts as terminator
        invoke     RtlZeroMemory, buf, len

        .if         !fp
                    xor     eax , eax
                    ret
        .endif

        mov         edi , buf                   ; edi = write cursor
        cld                                     ; stosb must advance upwards

        .while     1

                    ; read one 16-bit code unit; fread returns the number of
                    ; complete items read, so anything but 1 means EOF/error
                    invoke     fread, addr  @word , sizeof word , 1, fp
                    .break     .if eax != 1

                    movzx       eax , @word
                    .if         fIsBe != 0      ; big endian: swap the two bytes
                                xchg     al , ah
                    .endif

                    ; invariant: len = bytes still free in buf, including the
                    ; one reserved for the terminator

                    .if         eax  < 080h         ; U+0000..U+007F -> 0xxxxxxx
                                .break  .if len < 2
                                stosb
                                dec     len

                    .elseif     eax  < 0800h        ; U+0080..U+07FF -> 110xxxxx 10xxxxxx
                                .break  .if len < 3
                                mov     ebx , eax
                                shr     eax , 6
                                or      al , 0C0h
                                stosb

                                mov     eax , ebx
                                and     al , 03Fh
                                or      al , 080h
                                stosb

                                sub     len , 2

                    .else                           ; U+0800..U+FFFF -> 1110xxxx 10xxxxxx 10xxxxxx
                                .break  .if len < 4
                                mov     ebx , eax
                                shr     eax , 12
                                or      al , 0E0h
                                stosb

                                mov     eax , ebx
                                shr     eax , 6
                                and     al , 03Fh
                                or      al , 080h
                                stosb

                                mov     eax , ebx
                                and     al , 03Fh
                                or      al , 080h
                                stosb

                                sub     len , 3
                    .endif
        .endw

        mov         eax , edi                   ; return bytes written
        sub         eax , buf
        ret
_ReadUCS2AsUTF8    endp

;-----------------------------------------------------------------------
; _Test - open szTestFile, check its 2-byte UCS-2 byte-order mark, convert
;         the contents to UTF-8 with _ReadUCS2AsUTF8, then to the ANSI
;         code page (via UTF-16), and show the result in a message box.
;
; ABI:   stdcall, no arguments
; Out:   nothing meaningful in eax
; Clobb: eax, ecx, edx, flags
;
; Every failure path goes through _exit, which closes the file and frees
; both heap blocks (free(NULL) is a no-op, so unset pointers are kept at 0).
;-----------------------------------------------------------------------
_Test        proc
            local  @buffer
            local  @fp
            local  @len
            local  @mark[2]:byte
            local  @unicode

        mov         @buffer, 0
        mov         @unicode, 0

        invoke     fopen, addr  szTestFile, addr  szMode
        .if         !eax
                    invoke     MessageBox, 0, addr  szOpenErr, 0, 0
                    ret
        .endif

        mov         @fp, eax

        ; file length in bytes
        invoke     fseek, @fp, 0, SEEK_END
        invoke     ftell, @fp
        .if         sdword ptr eax < 2      ; ftell error (-1) or too short to hold a BOM
                    invoke     MessageBox, 0, addr  szNotUCS2, 0, 0
                    jmp     _exit
        .endif

        ; Worst case every 2-byte unit becomes 3 bytes of UTF-8, plus one
        ; byte for the terminator: size = (fileLen / 2) * 3 + 1
        shr         eax , 1
        lea         eax , [eax + eax*2 + 1]
        mov         @len, eax

        invoke     fseek, @fp, 0, SEEK_SET
        invoke     calloc, @len, 1
        test        eax , eax
        jz          _exit                   ; out of memory: just clean up
        mov         @buffer, eax

        ; Read the byte-order mark. UCS-2 uses a 2-byte mark (other encodings
        ; such as UCS-4 use longer ones). This check is still not strict:
        ; a UTF-32 LE file also begins with FF FE.
        invoke     fread, addr  @mark, 2, 1, @fp
        .if         eax != 1
                    invoke     MessageBox, 0, addr  szNotUCS2, 0, 0
                    jmp     _exit
        .endif

        movzx       eax , word ptr @mark    ; al = first byte, ah = second byte
        .if         eax == 0FFFEh           ; bytes FE FF -> Big Endian
                    invoke     _ReadUCS2AsUTF8, @fp, @buffer, @len, 1
        .elseif     eax == 0FEFFh           ; bytes FF FE -> Little Endian
                    invoke     _ReadUCS2AsUTF8, @fp, @buffer, @len, 0
        .else                               ; anything else is not UCS-2
                    invoke     MessageBox, 0, addr  szNotUCS2, 0, 0
                    jmp     _exit
        .endif

        ; UTF-8 -> UTF-16: first call returns the required size in WCHARs
        invoke     MultiByteToWideChar, CP_UTF8, 0, @buffer, -1, 0, 0
        test        eax , eax
        jz          _exit
        mov         @len, eax
        invoke     calloc, @len, 2
        test        eax , eax
        jz          _exit
        mov         @unicode, eax
        invoke     MultiByteToWideChar, CP_UTF8, 0, @buffer, -1, @unicode, @len
        test        eax , eax
        jz          _exit

        invoke     free, @buffer
        mov         @buffer, 0              ; avoid a double free at _exit

        ; UTF-16 -> ANSI: first call returns the required size in bytes
        invoke     WideCharToMultiByte, CP_ACP, 0, @unicode, -1, 0, 0, 0, 0
        test        eax , eax
        jz          _exit
        mov         @len, eax
        invoke     calloc, @len, 1
        test        eax , eax
        jz          _exit
        mov         @buffer, eax
        invoke     WideCharToMultiByte, CP_ACP, 0, @unicode, -1, @buffer, @len, 0, 0
        test        eax , eax
        jz          _exit

        invoke     MessageBox, 0, @buffer, 0, 0
_exit:
        invoke     fclose, @fp
        invoke     free, @buffer
        invoke     free, @unicode

        ret
_Test        endp

; Program entry point: run the demo once, then terminate with exit code 0.
start:
        invoke     _Test
        invoke     ExitProcess, 0

end     start

;>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

 

挺简单的一个测试程序:新建一个以UCS-2(带BOM)编码保存的test.txt,放到程序运行时的当前目录(通常就是程序所在目录)下即可

可用Notepad++转换成其它编码多测试几下

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章