c++字節轉碼

charset.h

#pragma once

#include <iostream>
#include <string>

// In this header "Unicode" means text held in a std::wstring, while the
// std::string parameters and results hold bytes in the named encoding.

// Wide string <-> ANSI multibyte bytes.
std::string  UnicodeToAnsi(const std::wstring& unicode);
std::wstring AnsiToUnicode(const std::string& ansi);

// ANSI multibyte bytes <-> UTF-8 bytes.
std::string  AnsiToUtf8(const std::string& strSrc);
std::string  Utf8ToAnsi(const std::string& strSrc);

// Wide string <-> UTF-8 bytes.
std::string  UnicodeToUtf8(const std::wstring& wstrSrc);
std::wstring Utf8ToUnicode(const std::string& strSrc);

// GBK bytes <-> UTF-8 bytes.
std::string  GBKToUtf8(const std::string& gbk);
std::string  Utf8ToGBK(const std::string& utf8);

// GB2312 bytes <-> wide string.
std::wstring GB2312ToUnicode(const std::string& gb2312);
std::string  UnicodeToGB2312(const std::wstring& unicode);

// Big5 bytes <-> wide string.
std::wstring BIG5ToUnicode(const std::string& big5);
std::string  UnicodeToBIG5(const std::wstring& unicode);

// Traditional Chinese (Big5 bytes) <-> Simplified Chinese (GB2312 bytes).
// Note: GB2312ToFBIG5 takes its argument by value, unlike every other
// function declared here.
std::string  FBIG5ToGB2312(const std::string& big5);
std::string  GB2312ToFBIG5(const std::string gb2312);

// Reports whether the `size` bytes starting at `pBuffer` look like UTF-8.
bool IsUTF8(const void* pBuffer, long size);

charset.cpp

#include "charset.h"
#include <Windows.h>


/**
 * Converts wide (UTF-16) text to multibyte text using CP_THREAD_ACP.
 *
 * @param unicode  Text to convert. It is handed to the API as a
 *                 NUL-terminated string, so conversion stops at the first
 *                 embedded L'\0'.
 * @return         The converted bytes without a terminating NUL, or an empty
 *                 string when the conversion fails.
 *
 * NOTE(review): AnsiToUnicode in this file uses CP_ACP, not CP_THREAD_ACP.
 * The two can differ when the thread locale has been changed - confirm which
 * one is intended before relying on a round trip.
 */
std::string UnicodeToAnsi(const std::wstring& unicode)
{
    const wchar_t* src = unicode.c_str();

    // First pass: ask for the required buffer size. With a source length of
    // -1 the reported size includes the terminating NUL.
    const int size = WideCharToMultiByte(CP_THREAD_ACP, 0, src, -1, NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }

    // Second pass: convert into the string's own writable storage.
    std::string strRet(static_cast<std::string::size_type>(size), '\0');
    const int written = WideCharToMultiByte(CP_THREAD_ACP, 0, src, -1, &strRet[0], size, NULL, NULL);

    // Keep the text only: drop the terminator so size() equals the number of
    // converted bytes (or drop everything if the second pass failed).
    strRet.resize(written > 0 ? static_cast<std::string::size_type>(written - 1) : 0);
    return strRet;
}

/**
 * Converts multibyte text in the system ANSI code page (CP_ACP) to wide
 * (UTF-16) text.
 *
 * @param ansi  Bytes to convert. They are handed to the API as a
 *              NUL-terminated string, so conversion stops at the first
 *              embedded '\0'.
 * @return      The converted text without a terminating NUL, or an empty
 *              string when the conversion fails.
 */
std::wstring AnsiToUnicode(const std::string& ansi)
{
    const char* src = ansi.c_str();

    // First pass: required size in wide characters, terminator included.
    const int size = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
    if (size <= 0) {
        return std::wstring();
    }

    // Second pass: convert into the string's own writable storage.
    std::wstring wstrRet(static_cast<std::wstring::size_type>(size), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, src, -1, &wstrRet[0], size);

    // Drop the terminator so size() equals the number of converted characters.
    wstrRet.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);
    return wstrRet;
}

/**
 * Converts multibyte text in the system ANSI code page (CP_ACP) to UTF-8.
 *
 * The text is first decoded to UTF-16 here and then encoded by
 * UnicodeToUtf8, so the exact form of the result is whatever that function
 * produces.
 *
 * @param ansi  Bytes to convert. They are handed to the API as a
 *              NUL-terminated string, so conversion stops at the first
 *              embedded '\0'.
 * @return      The UTF-8 bytes, or an empty string when decoding fails.
 */
std::string AnsiToUtf8(const std::string& ansi)
{
    const char* src = ansi.c_str();

    // First pass: required size in wide characters, terminator included.
    const int size = MultiByteToWideChar(CP_ACP, 0, src, -1, NULL, 0);
    if (size <= 0) {
        return std::string();
    }

    // Second pass: decode into the string's own writable storage.
    std::wstring wstrTemp(static_cast<std::wstring::size_type>(size), L'\0');
    const int written = MultiByteToWideChar(CP_ACP, 0, src, -1, &wstrTemp[0], size);

    // The intermediate should contain the text only, not the terminator.
    wstrTemp.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);

    return UnicodeToUtf8(wstrTemp);
}

/**
 * Converts UTF-8 text to multibyte text in the system ANSI code page (CP_ACP).
 *
 * The bytes are decoded to UTF-16 by Utf8ToUnicode and then encoded here.
 *
 * @param utf8  UTF-8 bytes to convert.
 * @return      The converted bytes without a terminating NUL, or an empty
 *              string when the encoding step fails.
 */
std::string Utf8ToAnsi(const std::string& utf8)
{
    const std::wstring wstrTemp = Utf8ToUnicode(utf8);

    // The intermediate is consumed as a NUL-terminated string, so it does not
    // matter whether it carries a terminator of its own inside size().
    const wchar_t* src = wstrTemp.c_str();

    // First pass: required size in bytes, terminator included.
    const int size = WideCharToMultiByte(CP_ACP, 0, src, -1, NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }

    // Second pass: encode into the string's own writable storage.
    std::string strRet(static_cast<std::string::size_type>(size), '\0');
    const int written = WideCharToMultiByte(CP_ACP, 0, src, -1, &strRet[0], size, NULL, NULL);

    // Drop the terminator so size() equals the number of converted bytes.
    strRet.resize(written > 0 ? static_cast<std::string::size_type>(written - 1) : 0);
    return strRet;
}

/**
 * Converts wide (UTF-16) text to UTF-8.
 *
 * @param unicode  Text to convert. It is handed to the API as a
 *                 NUL-terminated string, so conversion stops at the first
 *                 embedded L'\0'.
 * @return         The UTF-8 bytes without a terminating NUL, or an empty
 *                 string when the conversion fails.
 */
std::string UnicodeToUtf8(const std::wstring& unicode)
{
    const wchar_t* src = unicode.c_str();

    // First pass: the API reports the exact number of bytes needed
    // (terminator included), so no worst-case estimate is required.
    const int size = WideCharToMultiByte(CP_UTF8, 0, src, -1, NULL, 0, NULL, NULL);
    if (size <= 0) {
        return std::string();
    }

    // Second pass: encode into the string's own writable storage.
    std::string strRet(static_cast<std::string::size_type>(size), '\0');
    const int written = WideCharToMultiByte(CP_UTF8, 0, src, -1, &strRet[0], size, NULL, NULL);

    // Drop the terminator so size() equals the number of UTF-8 bytes.
    strRet.resize(written > 0 ? static_cast<std::string::size_type>(written - 1) : 0);
    return strRet;
}

/**
 * Converts UTF-8 text to wide (UTF-16) text.
 *
 * @param utf8  Bytes to convert. They are handed to the API as a
 *              NUL-terminated string, so conversion stops at the first
 *              embedded '\0'.
 * @return      The converted text without a terminating NUL, or an empty
 *              string when the conversion fails.
 */
std::wstring Utf8ToUnicode(const std::string& utf8)
{
    const char* src = utf8.c_str();

    // First pass: required size in wide characters, terminator included.
    const int size = MultiByteToWideChar(CP_UTF8, 0, src, -1, NULL, 0);
    if (size <= 0) {
        return std::wstring();
    }

    // Second pass: decode into the string's own writable storage.
    std::wstring wstrRet(static_cast<std::wstring::size_type>(size), L'\0');
    const int written = MultiByteToWideChar(CP_UTF8, 0, src, -1, &wstrRet[0], size);

    // Drop the terminator so size() equals the number of converted characters.
    wstrRet.resize(written > 0 ? static_cast<std::wstring::size_type>(written - 1) : 0);
    return wstrRet;
}


std::string GBKToUtf8(const std::string& gbk)
{
    return AnsiToUtf8(gbk);
}

std::string Utf8ToGBK(const std::string& utf8)
{
    return Utf8ToAnsi(utf8);
}

/**
 * Checks whether a byte buffer has the structure of UTF-8 text.
 *
 * Every lead byte must announce a 1- to 4-byte sequence and be followed by
 * the matching number of continuation bytes (10xxxxxx). This is a structural
 * check only: overlong encodings and surrogate code points are not rejected.
 *
 * A multi-byte sequence that is cut off by the end of the buffer is
 * tolerated: scanning stops there and the buffer is still reported as UTF-8.
 *
 * @param pBuffer  Start of the bytes to inspect.
 * @param size     Number of bytes available at pBuffer.
 * @return         false as soon as an invalid byte is found, true otherwise
 *                 (including for a null pointer or a non-positive size).
 */
bool IsUTF8(const void* pBuffer, long size)
{
    if (!pBuffer || size <= 0) {
        return true;
    }

    const unsigned char* cur = static_cast<const unsigned char*>(pBuffer);
    const unsigned char* const end = cur + size;

    while (cur < end) {
        const unsigned char lead = *cur;
        int trailing; /* continuation bytes expected after the lead byte */

        if (lead < 0x80) {          /* 0xxxxxxx: ASCII */
            trailing = 0;
        }
        else if (lead < 0xC0) {     /* 10xxxxxx: continuation byte without a lead */
            return false;
        }
        else if (lead < 0xE0) {     /* 110xxxxx: 2-byte sequence */
            trailing = 1;
        }
        else if (lead < 0xF0) {     /* 1110xxxx: 3-byte sequence */
            trailing = 2;
        }
        else if (lead < 0xF5) {     /* 11110xxx up to 0xF4: 4-byte sequence */
            trailing = 3;
        }
        else {                      /* 0xF5..0xFF never occur in UTF-8 */
            return false;
        }

        /* Sequence runs past the buffer: stop scanning, keep the verdict. */
        if (end - cur <= trailing) {
            break;
        }

        for (int i = 1; i <= trailing; ++i) {
            if ((cur[i] & 0xC0) != 0x80) {
                return false;
            }
        }

        cur += trailing + 1;
    }

    return true;
}



//GB2312 轉換成 Unicode
std::wstring GB2312ToUnicode(const std::string& gb2312)
{
    UINT nCodePage = 936; //GB2312
    int size = MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, NULL, 0);

    std::wstring wstrRet(size, 0);
    MultiByteToWideChar(nCodePage, 0, gb2312.c_str(), -1, (LPWSTR)wstrRet.c_str(), size);

    return wstrRet;
}

//BIG5 轉換成 Unicode
std::wstring BIG5ToUnicode(const std::string& big5)
{
    UINT nCodePage = 950; //BIG5
    int size = MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, NULL, 0);

    std::wstring wstrRet(size, 0);
    MultiByteToWideChar(nCodePage, 0, big5.c_str(), -1, (LPWSTR)wstrRet.c_str(), size);

    return wstrRet;
}

//Unicode 轉換成 GB2312
std::string UnicodeToGB2312(const std::wstring& unicode)
{
    UINT nCodePage = 936; //GB2312
    int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL);

    std::string strRet(size, 0);
    WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL);

    return strRet;
}

//Unicode 轉換成 BIG5
std::string UnicodeToBIG5(const std::wstring& unicode)
{
    UINT nCodePage = 950; //BIG5
    int size = WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, NULL, 0, NULL, NULL);

    std::string strRet(size, 0);
    WideCharToMultiByte(nCodePage, 0, unicode.c_str(), -1, (LPSTR)strRet.c_str(), size, NULL, NULL);

    return strRet;
}

//繁體中文BIG5 轉換成 簡體中文 GB2312
std::string FBIG5ToGB2312(const std::string& big5)
{
    LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC);
    std::wstring unicode = BIG5ToUnicode(big5);

    std::string gb2312 = UnicodeToGB2312(unicode);
    int size = LCMapStringA(lcid, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, NULL, 0);

    std::string strRet(size, 0);
    LCMapStringA(0x0804, LCMAP_SIMPLIFIED_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size);

    return strRet;
}

//簡體中文 GB2312 轉換成 繁體中文BIG5
std::string GB2312ToFBIG5(const std::string gb2312)
{
    LCID lcid = MAKELCID(MAKELANGID(LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED), SORT_CHINESE_PRC);
    int size = LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, NULL, 0);

    std::string strRet(size, 0);
    LCMapStringA(lcid, LCMAP_TRADITIONAL_CHINESE, gb2312.c_str(), -1, (LPSTR)strRet.c_str(), size);

    std::wstring unicode = GB2312ToUnicode(strRet);
    std::string big5 = UnicodeToBIG5(unicode);

    return big5;
}

main.cpp

#include "charset.h"

// Prints the first `len` bytes at `bytes` to stdout as two-digit lowercase
// hex values, each followed by a space. Nothing is printed when len <= 0.
void showHex(const char* bytes, int len) {
    const char* cursor = bytes;
    for (int remaining = len; remaining > 0; --remaining) {
        // Go through unsigned char so bytes >= 0x80 are not sign-extended.
        const unsigned char value = (unsigned char)*cursor;
        printf("%02x ", value);
        ++cursor;
    }
}

void showHex(std::string charset, std::string str) {
    printf("%10s: ", charset.data());
    showHex(str.data(), str.size());
    printf("\n");
}

void showHex(std::string charset, std::wstring str) {
    printf("%10s: ", charset.data());
    showHex((char*)str.data(), 2 * str.size());
    printf("\n");
}

// Demo driver: runs the same sample text through the conversion functions
// and prints the bytes of every result in hex, then waits for a key press.
int main(int argc, char* argv[])
{
    const std::wstring wstr = L"中abc國";
    // NOTE(review): the bytes of this narrow literal depend on how the
    // compiler encodes it; the "ansi" label below assumes that matches the
    // ANSI code page - confirm for the build settings in use.
    const std::string str = "中abc國";

    // The two inputs as they are.
    showHex("unicode", wstr);
    showHex("ansi", str);

    // Wide -> ANSI -> wide.
    std::string ansi = UnicodeToAnsi(wstr);
    showHex("ansi", ansi);

    std::wstring unicode = AnsiToUnicode(ansi);
    showHex("unicode", unicode);

    // ANSI -> UTF-8 -> ANSI.
    std::string utf8 = AnsiToUtf8(str);
    showHex("utf8", utf8);
    ansi = Utf8ToAnsi(utf8);
    showHex("ansi", ansi);

    // Wide -> UTF-8 -> wide.
    utf8 = UnicodeToUtf8(wstr);
    showHex("utf8", utf8);
    unicode = Utf8ToUnicode(utf8);
    showHex("unicode", unicode);

    // UTF-8 -> GBK -> UTF-8.
    std::string gbk = Utf8ToGBK(utf8);
    showHex("gbk", gbk);
    utf8 = GBKToUtf8(gbk);
    showHex("utf8", utf8);

    // Keep the console window open until a key is pressed.
    getchar();
    return 0;
}
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章