charset.h
#pragma once
#include <iostream>
#include <string>
// Wide string <-> ANSI code page.
std::string UnicodeToAnsi(const std::wstring& unicode);
std::wstring AnsiToUnicode(const std::string& ansi);
// ANSI code page <-> UTF-8 (converted through an intermediate wide string).
std::string AnsiToUtf8(const std::string& strSrc);
std::string Utf8ToAnsi(const std::string& strSrc);
// Wide string <-> UTF-8.
std::string UnicodeToUtf8(const std::wstring& wstrSrc);
std::wstring Utf8ToUnicode(const std::string& strSrc);
// GBK <-> UTF-8.
std::string GBKToUtf8(const std::string& gbk);
std::string Utf8ToGBK(const std::string& utf8);
// GB2312 (Windows code page 936) <-> wide string.
std::wstring GB2312ToUnicode(const std::string& gb2312);
std::string UnicodeToGB2312(const std::wstring& unicode);
// BIG5 (Windows code page 950) <-> wide string.
std::wstring BIG5ToUnicode(const std::string& big5);
std::string UnicodeToBIG5(const std::wstring& unicode);
// Traditional Chinese in BIG5 <-> Simplified Chinese in GB2312.
std::string FBIG5ToGB2312(const std::string& big5);
// NOTE(review): this one takes its argument by value, unlike its siblings; the
// definition uses the same signature, so both must be changed together.
std::string GB2312ToFBIG5(const std::string gb2312);
// Checks whether the `size` bytes starting at `pBuffer` are laid out as UTF-8.
bool IsUTF8(const void* pBuffer, long size);
charset.cpp
#include "charset.h"
#include <Windows.h>
/**
 * Converts a wide (UTF-16) string to the calling thread's ANSI code page.
 *
 * The input is handed to WideCharToMultiByte as a NUL-terminated string, so
 * conversion stops at the first embedded L'\0'.
 *
 * @param unicode Wide string to convert.
 * @return The converted bytes without a trailing terminator; empty when the
 *         input is empty or the conversion fails.
 *
 * NOTE(review): this uses CP_THREAD_ACP while AnsiToUnicode uses CP_ACP; the
 * two can differ when the thread locale is not the system default. Confirm
 * the asymmetry is intended.
 */
std::string UnicodeToAnsi(const std::wstring& unicode)
{
    const wchar_t* const src = unicode.c_str();

    // With a source length of -1 the reported size includes the terminating NUL.
    const int required = WideCharToMultiByte(CP_THREAD_ACP, 0, src, -1, NULL, 0, NULL, NULL);
    if (required <= 0) {
        return std::string();
    }

    std::string strRet(static_cast<std::string::size_type>(required), '\0');
    const int written = WideCharToMultiByte(CP_THREAD_ACP, 0, src, -1, &strRet[0], required, NULL, NULL);

    // Drop the terminator the API wrote so that size() equals the text length.
    strRet.resize(written > 0 ? static_cast<std::string::size_type>(written - 1) : 0);
    return strRet;
}
/**
 * Converts a string in the system ANSI code page (CP_ACP) to a wide string.
 *
 * The input is treated as NUL-terminated, so conversion stops at the first
 * embedded '\0'.
 *
 * @param ansi Bytes in the system ANSI code page.
 * @return The wide string without a trailing terminator; empty when the input
 *         is empty or the conversion fails.
 */
std::wstring AnsiToUnicode(const std::string& ansi)
{
    const char* const src = ansi.c_str();

    // With a source length of -1 the reported size includes the terminating NUL.
    const int required = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
    if (required <= 0) {
        return std::wstring();
    }

    std::wstring wstrRet(static_cast<std::wstring::size_type>(required), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, src, -1, &wstrRet[0], required);

    // Drop the terminator the API wrote so that size() equals the text length.
    wstrRet.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);
    return wstrRet;
}
/**
 * Converts a string in the system ANSI code page (CP_ACP) to UTF-8.
 *
 * The text is first widened with MultiByteToWideChar and then encoded by
 * UnicodeToUtf8. The input is treated as NUL-terminated.
 *
 * @param ansi Bytes in the system ANSI code page.
 * @return UTF-8 bytes without a trailing terminator; empty when the input is
 *         empty or the widening step fails.
 */
std::string AnsiToUtf8(const std::string& ansi)
{
    const char* const src = ansi.c_str();

    // With a source length of -1 the reported size includes the terminating NUL.
    const int required = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
    if (required <= 0) {
        return std::string();
    }

    std::wstring wide(static_cast<std::wstring::size_type>(required), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, src, -1, &wide[0], required);
    wide.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);

    std::string utf8 = UnicodeToUtf8(wide);

    // The text itself cannot contain NUL (conversion stops at the first one),
    // so a NUL in the last position can only be a counted terminator; trim it.
    if (!utf8.empty() && utf8[utf8.size() - 1] == '\0') {
        utf8.resize(utf8.size() - 1);
    }
    return utf8;
}
/**
 * Converts UTF-8 text to the system ANSI code page (CP_ACP).
 *
 * The text is first decoded by Utf8ToUnicode and then narrowed with
 * WideCharToMultiByte. The intermediate wide string is treated as
 * NUL-terminated.
 *
 * @param utf8 UTF-8 encoded bytes.
 * @return Bytes in the system ANSI code page without a trailing terminator;
 *         empty when there is nothing to convert or the conversion fails.
 */
std::string Utf8ToAnsi(const std::string& utf8)
{
    const std::wstring wide = Utf8ToUnicode(utf8);
    const wchar_t* const src = wide.c_str();

    // With a source length of -1 the reported size includes the terminating NUL.
    const int required = WideCharToMultiByte(CP_ACP, 0, src, -1, NULL, 0, NULL, NULL);
    if (required <= 0) {
        return std::string();
    }

    std::string strRet(static_cast<std::string::size_type>(required), '\0');
    const int written = WideCharToMultiByte(CP_ACP, 0, src, -1, &strRet[0], required, NULL, NULL);

    // Drop the terminator the API wrote so that size() equals the text length.
    strRet.resize(written > 0 ? static_cast<std::string::size_type>(written - 1) : 0);
    return strRet;
}
/**
 * Encodes a wide (UTF-16) string as UTF-8.
 *
 * The input is handed to WideCharToMultiByte as a NUL-terminated string, so
 * conversion stops at the first embedded L'\0'.
 *
 * @param unicode Wide string to encode.
 * @return UTF-8 bytes without a trailing terminator; empty when the input is
 *         empty or the conversion fails.
 */
std::string UnicodeToUtf8(const std::wstring& unicode)
{
    const wchar_t* const src = unicode.c_str();

    // Ask the API for the exact size; with a source length of -1 that size
    // includes the terminating NUL.
    const int required = WideCharToMultiByte(CP_UTF8, 0, src, -1, NULL, 0, NULL, NULL);
    if (required <= 0) {
        return std::string();
    }

    std::string strRet(static_cast<std::string::size_type>(required), '\0');
    const int written = WideCharToMultiByte(CP_UTF8, 0, src, -1, &strRet[0], required, NULL, NULL);

    // Drop the terminator the API wrote so that size() equals the text length.
    strRet.resize(written > 0 ? static_cast<std::string::size_type>(written - 1) : 0);
    return strRet;
}
/**
 * Decodes UTF-8 text into a wide (UTF-16) string.
 *
 * The input is treated as NUL-terminated, so decoding stops at the first
 * embedded '\0'.
 *
 * @param utf8 UTF-8 encoded bytes.
 * @return The wide string without a trailing terminator; empty when the input
 *         is empty or the conversion fails.
 */
std::wstring Utf8ToUnicode(const std::string& utf8)
{
    const char* const src = utf8.c_str();

    // With a source length of -1 the reported size includes the terminating NUL.
    const int required = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
    if (required <= 0) {
        return std::wstring();
    }

    std::wstring wstrRet(static_cast<std::wstring::size_type>(required), L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, src, -1, &wstrRet[0], required);

    // Drop the terminator the API wrote so that size() equals the text length.
    wstrRet.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);
    return wstrRet;
}
std::string GBKToUtf8(const std::string& gbk)
{
return AnsiToUtf8(gbk);
}
std::string Utf8ToGBK(const std::string& utf8)
{
return Utf8ToAnsi(utf8);
}
/**
 * Checks whether a byte buffer is laid out as UTF-8.
 *
 * Each lead byte must be followed by the right number of continuation bytes
 * (10xxxxxx). One- to four-byte sequences are accepted (lead bytes up to
 * 0xF4), so characters outside the Basic Multilingual Plane pass the check.
 * Only the byte structure is examined: overlong encodings and surrogate code
 * points are not rejected.
 *
 * A multi-byte sequence that is cut off by the end of the buffer is tolerated
 * and its remaining bytes are not inspected, so a buffer that holds only the
 * first part of a longer stream still passes.
 *
 * @param pBuffer Start of the bytes to inspect; may be null.
 * @param size    Number of bytes available at pBuffer.
 * @return false as soon as a stray continuation byte, an unsupported lead byte
 *         or a bad continuation byte is found; true otherwise, including for a
 *         null buffer or a non-positive size.
 */
bool IsUTF8(const void* pBuffer, long size)
{
    // Nothing to inspect; also avoids pointer arithmetic on a null buffer.
    if (!pBuffer || size <= 0) {
        return true;
    }

    const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
    const unsigned char* const end = cur + size;

    while (cur < end) {
        const unsigned char lead = *cur;
        int trailing = 0;

        if (lead < 0x80) {          // 0xxxxxxx: ASCII
            trailing = 0;
        } else if (lead < 0xC0) {   // 10xxxxxx: continuation byte without a lead
            return false;
        } else if (lead < 0xE0) {   // 110xxxxx: two-byte sequence
            trailing = 1;
        } else if (lead < 0xF0) {   // 1110xxxx: three-byte sequence
            trailing = 2;
        } else if (lead < 0xF5) {   // 11110xxx up to 0xF4: four-byte sequence
            trailing = 3;
        } else {                    // 0xF5..0xFF never appear in UTF-8
            return false;
        }

        // Sequence runs past the end of the buffer: accept the truncated tail.
        if (end - cur <= trailing) {
            return true;
        }

        for (int i = 1; i <= trailing; ++i) {
            if ((cur[i] & 0xC0) != 0x80) {
                return false;
            }
        }
        cur += trailing + 1;
    }
    return true;
}
//GB2312 轉換成 Unicode
std::wstring GB2312ToUnicode(const std::string& gb2312)
{
UINT nCodePage = 936; //GB2312
int size = MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, NULL, 0);
std::wstring wstrRet(size, 0);
MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, (LPWSTR)wstrRet.c_str(), size);
return wstrRet;
}
//BIG5 轉換成 Unicode
std::wstring BIG5ToUnicode(const std::string& big5)
{
UINT nCodePage = 950; //BIG5
int size = MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, NULL, 0);
std::wstring wstrRet(size, 0);
MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, (LPWSTR)wstrRet.c_str(), size);
return wstrRet;
}
//Unicode 轉換成 GB2312
std::string UnicodeToGB2312(const std::wstring& unicode)
{
UINT nCodePage = 936; //GB2312
int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL);
std::string strRet(size, 0);
WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL);
return strRet;
}
//Unicode 轉換成 BIG5
std::string UnicodeToBIG5(const std::wstring& unicode)
{
UINT nCodePage = 950; //BIG5
int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL);
std::string strRet(size, 0);
WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL);
return strRet;
}
//繁體中文BIG5 轉換成 簡體中文 GB2312
std::string FBIG5ToGB2312(const std::string& big5)
{
LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC);
std::wstring unicode = BIG5ToUnicode(big5);
std::string gb2312 = UnicodeToGB2312(unicode);
int size = LCMapStringA(lcid, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, NULL, 0);
std::string strRet(size, 0);
LCMapStringA(0x0804, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size);
return strRet;
}
//簡體中文 GB2312 轉換成 繁體中文BIG5
std::string GB2312ToFBIG5(const std::string gb2312)
{
LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC);
int size = LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, NULL, 0);
std::string strRet(size, 0);
LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size);
std::wstring unicode = GB2312ToUnicode(strRet);
std::string big5 = UnicodeToBIG5(unicode);
return big5;
}
main.cpp
#include "charset.h"
// Writes `len` bytes starting at `bytes` to stdout, each as two lowercase hex
// digits followed by a space. Prints nothing when `len` is zero or negative.
void showHex(const char* bytes, int len) {
    const char* cursor = bytes;
    int remaining = len;
    while (remaining > 0) {
        printf("%02x ", static_cast<unsigned char>(*cursor));
        ++cursor;
        --remaining;
    }
}
// Prints one line to stdout: the label right-aligned in a 10-character field,
// a colon, then the hex dump of every byte held by `str`.
void showHex(std::string charset, std::string str) {
    const char* const label = charset.c_str();
    printf("%10s: ", label);

    const char* const payload = str.c_str();
    showHex(payload, str.size());

    printf("\n");
}
void showHex(std::string charset, std::wstring str) {
printf("%10s: ", charset.data());
showHex((char*)str.data(), 2 * str.size());
printf("\n");
}
int main(int argc, char* argv[])
{
std::wstring wstr(L"中abc國");
std::string str("中abc國");
std::string ansi;
std::string utf8;
std::string gbk;
std::wstring unicode;
showHex("unicode", wstr);
showHex("ansi", str);
ansi = UnicodeToAnsi(wstr); showHex("ansi", ansi);
unicode = AnsiToUnicode(ansi); showHex("unicode", unicode);
utf8 = AnsiToUtf8(str); showHex("utf8", utf8);
ansi = Utf8ToAnsi(utf8); showHex("ansi", ansi);
utf8 = UnicodeToUtf8(wstr); showHex("utf8", utf8);
unicode = Utf8ToUnicode(utf8); showHex("unicode", unicode);
gbk = Utf8ToGBK(utf8); showHex("gbk", gbk);
utf8 = GBKToUtf8(gbk); showHex("utf8", utf8);
getchar();
return 0;
}