gbk轉utf8 亂碼問題

原創

2020-06-02 05:25

c++中gbk轉utf8時，如果漢字數量爲奇數，最後一個漢字會出現亂碼。

原因：

1、char *strOut = new char[nLength+1]; 申請的長度是不夠的。例如“天安門”是3個漢字，nLength爲3，但UTF-8格式一個漢字佔三個字節，至少要申請10個字節(3*3+1)。

2、奇數個漢字轉碼後，再由UTF-8轉成GBK時，最後一個字符一直顯示爲“？”。因爲一個漢字轉成UTF-8需要3個字節，3個漢字就成了9個字節，而它會2個字節2個字節地轉換成字符，當字節數是奇數時，最後1個字節轉字符就會計算錯誤，然後直接把最後這個字符賦爲“?”，這樣改變了數據，影響後面的解碼。

解決方案與示例：

#include "stdafx.h"
#include <stdio.h>
#include <windows.h>
#include <sstream>
#include <fstream>
#include <stdio.h>
#include <iostream>
//GBK編碼轉換到UTF8編碼
using namespace std;
// Converts a NUL-terminated multibyte string to UTF-8 by going through UTF-16.
//
// NOTE(review): the source is decoded with CP_ACP (the system ANSI code page),
// so the input is treated as GBK only on systems whose ANSI code page is
// GBK (936) -- confirm this matches the deployment environment.
//
// Parameters:
//   lpGBKStr     - NUL-terminated input string; if NULL the function returns 0.
//   lpUTF8Str    - output buffer, or NULL to only query the required size.
//   nUTF8StrLen  - capacity of lpUTF8Str in bytes (ignored when lpUTF8Str is NULL).
//
// Returns:
//   - lpUTF8Str == NULL: the number of bytes the UTF-8 result needs (per the
//     Win32 documentation this count includes the terminating NUL because the
//     source length is passed as -1).
//   - otherwise: the number of bytes written, or 0 if the buffer is too small
//     or any conversion step fails.
int GBKToUTF8(unsigned char * lpGBKStr, unsigned char * lpUTF8Str, int nUTF8StrLen)
{
    if (!lpGBKStr)
        return 0;

    const char * lpSource = reinterpret_cast<const char *>(lpGBKStr);

    // Size query: how many wide characters the intermediate string needs.
    // Bail out before allocating anything if the query itself fails.
    const int nWideLen = ::MultiByteToWideChar(CP_ACP, 0, lpSource, -1, NULL, 0);
    if (nWideLen <= 0)
        return 0;

    wchar_t * lpUnicodeStr = new wchar_t[nWideLen + 1];
    int nRetLen = 0;

    // Every path below falls through to the single delete[] so the
    // intermediate buffer is released even when a conversion step fails.
    if (::MultiByteToWideChar(CP_ACP, 0, lpSource, -1, lpUnicodeStr, nWideLen))
    {
        // Size query for the UTF-8 output.
        nRetLen = ::WideCharToMultiByte(CP_UTF8, 0, lpUnicodeStr, -1, NULL, 0, NULL, NULL);

        if (lpUTF8Str)
        {
            if (nUTF8StrLen < nRetLen)
            {
                // Caller's buffer cannot hold the result.
                nRetLen = 0;
            }
            else
            {
                nRetLen = ::WideCharToMultiByte(CP_UTF8, 0, lpUnicodeStr, -1,
                                                reinterpret_cast<char *>(lpUTF8Str), nUTF8StrLen,
                                                NULL, NULL);
            }
        }
        // When lpUTF8Str is NULL, nRetLen already holds the required size.
    }

    delete[] lpUnicodeStr;
    return nRetLen;
}

// Converts a NUL-terminated UTF-8 string to the system ANSI code page by
// going through UTF-16.
//
// NOTE(review): the output is encoded with CP_ACP (the system ANSI code page),
// so the result is GBK only on systems whose ANSI code page is GBK (936) --
// confirm this matches the deployment environment.
//
// Parameters:
//   lpUTF8Str   - NUL-terminated UTF-8 input; if NULL the function returns 0.
//   lpGBKStr    - output buffer, or NULL to only query the required size.
//   nGBKStrLen  - capacity of lpGBKStr in bytes (ignored when lpGBKStr is NULL).
//
// Returns:
//   - lpGBKStr == NULL: the number of bytes the converted string needs (per
//     the Win32 documentation this count includes the terminating NUL because
//     the source length is passed as -1).
//   - otherwise: the number of bytes written, or 0 if the buffer is too small
//     or any conversion step fails.
int UTF8ToGBK(unsigned char * lpUTF8Str, unsigned char * lpGBKStr, int nGBKStrLen)
{
    if (!lpUTF8Str)
        return 0;

    const char * lpSource = reinterpret_cast<const char *>(lpUTF8Str);

    // Size query: how many wide characters the intermediate string needs.
    // Bail out before allocating anything if the query itself fails.
    const int nWideLen = ::MultiByteToWideChar(CP_UTF8, 0, lpSource, -1, NULL, 0);
    if (nWideLen <= 0)
        return 0;

    wchar_t * lpUnicodeStr = new wchar_t[nWideLen + 1];
    int nRetLen = 0;

    // Every path below falls through to the single delete[] so the
    // intermediate buffer is released even when a conversion step fails.
    if (::MultiByteToWideChar(CP_UTF8, 0, lpSource, -1, lpUnicodeStr, nWideLen))
    {
        // Size query for the ANSI-code-page output.
        nRetLen = ::WideCharToMultiByte(CP_ACP, 0, lpUnicodeStr, -1, NULL, 0, NULL, NULL);

        if (lpGBKStr)
        {
            if (nGBKStrLen < nRetLen)
            {
                // Caller's buffer cannot hold the result.
                nRetLen = 0;
            }
            else
            {
                // The required size (nRetLen) is used as the write limit; it is
                // known to fit because nGBKStrLen >= nRetLen at this point.
                nRetLen = ::WideCharToMultiByte(CP_ACP, 0, lpUnicodeStr, -1,
                                                reinterpret_cast<char *>(lpGBKStr), nRetLen,
                                                NULL, NULL);
            }
        }
        // When lpGBKStr is NULL, nRetLen already holds the required size.
    }

    delete[] lpUnicodeStr;
    return nRetLen;
}

// Example usage of the two converters: converts a literal to UTF-8, writes it
// to result.txt, converts it back, and writes the round-tripped text to
// result.txt again.
//
// NOTE(review): both outputs go to the same file name, so the second write
// replaces the UTF-8 output of the first -- confirm this is intended.
//
// Returns 0 on every path, including conversion failure.
int main()
{
    char cGBKStr[] = "奇數個漢字";
    unsigned char * lpSource = reinterpret_cast<unsigned char *>(cGBKStr);

    // First call with a NULL output buffer only asks for the required size.
    int nRetLen = GBKToUTF8(lpSource, NULL, 0);
    printf("轉換後的字符串需要的空間長度爲：%d ", nRetLen);

    char * lpUTF8Str = new char[nRetLen + 1];
    nRetLen = GBKToUTF8(lpSource, reinterpret_cast<unsigned char *>(lpUTF8Str), nRetLen);
    if (!nRetLen)
    {
        printf("GBKToUTF8轉換失敗！");
        delete[] lpUTF8Str;
        return 0;
    }
    printf("GBKToUTF8轉換成功！");

    std::ofstream out("result.txt");
    if (out.is_open())
    {
        out << lpUTF8Str;
        out.close();
    }

    // Convert back: size query first, then the real conversion.
    nRetLen = UTF8ToGBK(reinterpret_cast<unsigned char *>(lpUTF8Str), NULL, 0);
    printf("轉換後的字符串需要的空間長度爲：%d ", nRetLen);

    char * lpGBKStr = new char[nRetLen + 1];
    nRetLen = UTF8ToGBK(reinterpret_cast<unsigned char *>(lpUTF8Str),
                        reinterpret_cast<unsigned char *>(lpGBKStr), nRetLen);
    if (!nRetLen)
    {
        printf("UTF8ToGBK轉換失敗！ ");
        delete[] lpGBKStr;
        delete[] lpUTF8Str;
        return 0;
    }
    printf("UTF8ToGBK轉換成功！ ");

    std::ofstream out1("result.txt");
    if (out1.is_open())
    {
        out1 << lpGBKStr;
        out1.close();
    }

    delete[] lpGBKStr;
    delete[] lpUTF8Str;
    return 0;
}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

gbk轉utf8 亂碼問題

原因：

c++ excel庫 xlnt使用心得

linux下gTest筆記

gbk轉utf8 亂碼問題

c++ mupdf編譯

caffe 修改opencv 路徑

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結