C++ 中將 GBK 轉換爲 UTF-8 時,如果漢字數量爲奇數,最後一個漢字會出現亂碼。
原因:
1、char *strOut = new char[nLength+1]; 申請的長度是不夠的。例如“天安門”是3個漢字,nLength爲3,但UTF-8格式下一個漢字佔3個字節,因此至少要申請10個字節(3*3+1)。2、奇數個漢字轉碼後,再由UTF-8轉成GBK時,最後一個字符一直顯示爲“?”。因爲一個漢字轉成UTF-8需要3個字節,3個漢字就成了9個字節,而它會2個字節2個字節地轉換成字符,當字節數是奇數時,最後1個字節轉字符就會計算錯誤,然後直接把最後這個字符賦爲“?”,這樣就改變了數據,影響後面的解碼。
解決方案與示例:
#include "stdafx.h"
#include <windows.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <vector>
//GBK編碼轉換到UTF8編碼
using namespace std;
// Converts a NUL-terminated multibyte string to UTF-8 by going through UTF-16.
//
// NOTE(review): despite the name, the source text is decoded with CP_ACP (the
// system ANSI code page), which is GBK only on systems whose ANSI code page is
// 936 - confirm this is what callers expect.
//
// Parameters:
//   lpGBKStr    - NUL-terminated input string; if NULL the function returns 0.
//   lpUTF8Str   - output buffer; if NULL the function only reports the number
//                 of bytes required (the count is taken with a source length
//                 of -1, so it covers the terminating NUL as well).
//   nUTF8StrLen - size of lpUTF8Str in bytes.
//
// Returns the number of bytes required/written, or 0 when the input is NULL,
// a conversion step fails, or the output buffer is too small.
int GBKToUTF8(unsigned char * lpGBKStr, unsigned char * lpUTF8Str, int nUTF8StrLen)
{
    if (!lpGBKStr)
        return 0;

    const char * lpSrc = reinterpret_cast<const char *>(lpGBKStr);

    // Step 1: ask how many UTF-16 code units are needed, then decode.
    const int nWideLen = ::MultiByteToWideChar(CP_ACP, 0, lpSrc, -1, NULL, 0);
    if (nWideLen <= 0)
        return 0;

    // The vector owns the intermediate buffer, so every return path below
    // releases it (the previous raw new[] leaked when the decode failed).
    std::vector<wchar_t> wideBuf(static_cast<size_t>(nWideLen));
    if (!::MultiByteToWideChar(CP_ACP, 0, lpSrc, -1, &wideBuf[0], nWideLen))
        return 0;

    // Step 2: ask how many UTF-8 bytes are needed.
    const int nNeeded = ::WideCharToMultiByte(CP_UTF8, 0, &wideBuf[0], -1, NULL, 0, NULL, NULL);

    if (!lpUTF8Str) // size query only
        return nNeeded;

    if (nUTF8StrLen < nNeeded) // caller's buffer is too small
        return 0;

    return ::WideCharToMultiByte(CP_UTF8, 0, &wideBuf[0], -1,
                                 reinterpret_cast<char *>(lpUTF8Str), nUTF8StrLen, NULL, NULL);
}
// Converts a NUL-terminated UTF-8 string to a multibyte string by going
// through UTF-16.
//
// NOTE(review): despite the name, the output is encoded with CP_ACP (the
// system ANSI code page), which is GBK only on systems whose ANSI code page is
// 936 - confirm this is what callers expect.
//
// Parameters:
//   lpUTF8Str  - NUL-terminated UTF-8 input; if NULL the function returns 0.
//   lpGBKStr   - output buffer; if NULL the function only reports the number
//                of bytes required (the count is taken with a source length
//                of -1, so it covers the terminating NUL as well).
//   nGBKStrLen - size of lpGBKStr in bytes.
//
// Returns the number of bytes required/written, or 0 when the input is NULL,
// a conversion step fails, or the output buffer is too small.
int UTF8ToGBK(unsigned char * lpUTF8Str, unsigned char * lpGBKStr, int nGBKStrLen)
{
    if (!lpUTF8Str)
        return 0;

    const char * lpSrc = reinterpret_cast<const char *>(lpUTF8Str);

    // Step 1: ask how many UTF-16 code units are needed, then decode.
    const int nWideLen = ::MultiByteToWideChar(CP_UTF8, 0, lpSrc, -1, NULL, 0);
    if (nWideLen <= 0)
        return 0;

    // The vector owns the intermediate buffer, so every return path below
    // releases it (the previous raw new[] leaked when the decode failed).
    std::vector<wchar_t> wideBuf(static_cast<size_t>(nWideLen));
    if (!::MultiByteToWideChar(CP_UTF8, 0, lpSrc, -1, &wideBuf[0], nWideLen))
        return 0;

    // Step 2: ask how many output bytes are needed.
    const int nNeeded = ::WideCharToMultiByte(CP_ACP, 0, &wideBuf[0], -1, NULL, 0, NULL, NULL);

    if (!lpGBKStr) // size query only
        return nNeeded;

    if (nGBKStrLen < nNeeded) // caller's buffer is too small
        return 0;

    return ::WideCharToMultiByte(CP_ACP, 0, &wideBuf[0], -1,
                                 reinterpret_cast<char *>(lpGBKStr), nGBKStrLen, NULL, NULL);
}
// Example usage of GBKToUTF8 / UTF8ToGBK: converts a string containing an odd
// number of Chinese characters to UTF-8, writes it to result.txt, converts it
// back, and writes the round-tripped text to result.txt again.
// Always returns 0, including when a conversion fails.
int main()
{
    char cGBKStr[] = "奇數個漢字";
    unsigned char * lpSrc = reinterpret_cast<unsigned char *>(cGBKStr);

    // First call with a NULL output buffer only queries the required size.
    int nRetLen = GBKToUTF8(lpSrc, NULL, 0);
    printf("轉換後的字符串需要的空間長度爲:%d ", nRetLen);

    // Vectors own the buffers, so no manual delete[] is needed on any path;
    // the extra zero-initialised byte guarantees NUL termination.
    std::vector<char> utf8Buf(static_cast<size_t>(nRetLen) + 1);
    char * lpUTF8Str = &utf8Buf[0];

    nRetLen = GBKToUTF8(lpSrc, reinterpret_cast<unsigned char *>(lpUTF8Str), nRetLen);
    if (!nRetLen)
    {
        printf("GBKToUTF8轉換失敗!");
        return 0;
    }
    printf("GBKToUTF8轉換成功!");

    std::ofstream out("result.txt");
    if (out.is_open())
    {
        out << lpUTF8Str;
        out.close();
    }

    // Convert back again: size query first, then the real conversion.
    nRetLen = UTF8ToGBK(reinterpret_cast<unsigned char *>(lpUTF8Str), NULL, 0);
    printf("轉換後的字符串需要的空間長度爲:%d ", nRetLen);

    std::vector<char> gbkBuf(static_cast<size_t>(nRetLen) + 1);
    char * lpGBKStr = &gbkBuf[0];

    nRetLen = UTF8ToGBK(reinterpret_cast<unsigned char *>(lpUTF8Str),
                        reinterpret_cast<unsigned char *>(lpGBKStr), nRetLen);
    if (!nRetLen)
    {
        printf("UTF8ToGBK轉換失敗! ");
        return 0;
    }
    printf("UTF8ToGBK轉換成功! ");

    // NOTE(review): this reopens the same file, so the UTF-8 output written
    // above is overwritten by the round-tripped text - confirm this is intended.
    std::ofstream out1("result.txt");
    if (out1.is_open())
    {
        out1 << lpGBKStr;
        out1.close();
    }
    return 0;
}