C++ 實現 UTF-16 轉 UTF-8

1.源碼實現

#include <iostream>
#include <cstring>

/**
 * Stateless UTF-16 conversion helpers.
 *
 * All functions take UTF-16 code units in native byte order (one unit per
 * `unsigned short`). Malformed input means a lone low surrogate (DC00-DFFF)
 * or a high surrogate (D800-DBFF) that is not followed by a low surrogate.
 */
class UTF16 {
public:
    /**
     * Decode the single code point that starts at utf16[0].
     *
     * If utf16[0] is a high surrogate, utf16[1] is read as well, so the
     * caller must make sure it is readable.
     *
     * @param utf16 Input code units.
     * @param ucs4  Output, two elements: ucs4[0] receives the low 16 bits of
     *              the code point and ucs4[1] the high 16 bits.
     * @return 1 if one code unit was consumed, 2 if a surrogate pair was
     *         consumed, -1 on malformed input or a null argument (ucs4 is
     *         left untouched in that case).
     */
    static int toUCS4(const unsigned short *utf16, unsigned short *ucs4);

    /**
     * Convert the single code point that starts at utf16[0] to UTF-8.
     *
     * If utf16[0] is a high surrogate, utf16[1] is read as well, so the
     * caller must make sure it is readable. No terminator is written.
     *
     * @param utf16 Input code units.
     * @param utf8  Output buffer with room for at least 4 bytes.
     * @return Number of bytes written (1-4), or -1 on malformed input or a
     *         null argument.
     */
    static int toUTF8(const unsigned short *utf16, unsigned char *utf8);

    /**
     * Convert n UTF-16 code units to a NUL-terminated UTF-8 string.
     *
     * Conversion stops at the first malformed sequence; everything converted
     * up to that point is kept and terminated. Code units beyond index n-1
     * are never read, so a high surrogate in the last position counts as
     * malformed.
     *
     * @param utf16 Input code units.
     * @param n     Number of code units to convert.
     * @param utf8  Output buffer; needs up to 3*n + 1 bytes.
     * @return Number of bytes written, not counting the terminating NUL.
     *         Returns 0 without writing anything if utf8 is null.
     */
    static int toUTF8(const unsigned short *utf16, int n, unsigned char *utf8);
};

// Kept as in the original file: code after this point may rely on
// unqualified standard-library names.
using namespace std;

namespace {

/**
 * Decode one code point from src.
 *
 * @param src          First code unit of the sequence.
 * @param nextReadable Whether src[1] may be read.
 * @param cp           Receives the decoded code point on success.
 * @return Code units consumed (1 or 2), or -1 if the sequence is malformed.
 */
int decodeUtf16(const unsigned short *src, bool nextReadable, char32_t &cp)
{
    const unsigned short lead = src[0];

    if(lead < 0xd800 || lead > 0xdfff)
    {
        cp = lead;
        return 1;
    }

    // A sequence must not start with a low surrogate, and a high surrogate
    // needs a following unit to pair with.
    if(lead >= 0xdc00 || !nextReadable)
    {
        return -1;
    }

    const unsigned short trail = src[1];

    if(trail < 0xdc00 || trail > 0xdfff)
    {
        return -1;
    }

    // The pair carries 20 payload bits; the 0x10000 offset must be ADDED
    // (OR-ing in the carry bit corrupts every even-numbered plane >= 2).
    cp = static_cast<char32_t>(0x10000u
        + ((static_cast<unsigned long>(lead & 0x3ff) << 10) | (trail & 0x3ffu)));

    return 2;
}

/**
 * Encode cp (at most U+10FFFF, which is all UTF-16 can express) as UTF-8.
 *
 * @return Number of bytes written to out (1-4).
 */
int encodeUtf8(char32_t cp, unsigned char *out)
{
    if(cp <= 0x7f)
    {
        /* U+0000 - U+007F: 0xxxxxxx */
        out[0] = static_cast<unsigned char>(cp);
        return 1;
    }

    if(cp <= 0x7ff)
    {
        /* U+0080 - U+07FF: 110xxxxx 10xxxxxx */
        out[0] = static_cast<unsigned char>(0xc0 | (cp >> 6));
        out[1] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
        return 2;
    }

    if(cp <= 0xffff)
    {
        /* U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
        out[0] = static_cast<unsigned char>(0xe0 | (cp >> 12));
        out[1] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
        out[2] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
        return 3;
    }

    /* U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    out[0] = static_cast<unsigned char>(0xf0 | ((cp >> 18) & 0x07));
    out[1] = static_cast<unsigned char>(0x80 | ((cp >> 12) & 0x3f));
    out[2] = static_cast<unsigned char>(0x80 | ((cp >> 6) & 0x3f));
    out[3] = static_cast<unsigned char>(0x80 | (cp & 0x3f));
    return 4;
}

} // namespace

int UTF16::toUCS4(const unsigned short *utf16, unsigned short *ucs4)
{
    if(utf16 == nullptr || ucs4 == nullptr)
    {
        return -1;
    }

    char32_t cp = 0;
    const int consumed = decodeUtf16(utf16, true, cp);

    if(consumed < 0)
    {
        return -1;
    }

    // Split explicitly into halves so the result does not depend on the
    // host's byte order.
    ucs4[0] = static_cast<unsigned short>(cp & 0xffff);
    ucs4[1] = static_cast<unsigned short>(cp >> 16);

    return consumed;
}

int UTF16::toUTF8(const unsigned short *utf16, unsigned char *utf8)
{
    if(utf16 == nullptr || utf8 == nullptr)
    {
        return -1;
    }

    char32_t cp = 0;

    if(decodeUtf16(utf16, true, cp) < 0)
    {
        return -1;
    }

    return encodeUtf8(cp, utf8);
}

int UTF16::toUTF8(const unsigned short *utf16, int n, unsigned char *utf8)
{
    if(utf8 == nullptr)
    {
        return 0;
    }

    int written = 0;

    if(utf16 != nullptr)
    {
        int i = 0;

        while(i < n)
        {
            char32_t cp = 0;
            // Only allow a look-ahead when the next unit is inside [0, n).
            const int consumed = decodeUtf16(utf16 + i, i + 1 < n, cp);

            if(consumed < 0)
            {
                break;  // keep what was converted so far
            }

            written += encodeUtf8(cp, utf8 + written);
            i += consumed;
        }
    }

    utf8[written] = 0x00;

    return written;
}

/**
 * Demo: converts the two UTF-16 code units U+4F60 U+597D to UTF-8 and
 * prints the result followed by a newline.
 *
 * @return Always 0.
 */
int main()
{
    // Only the first two units are converted; the trailing zeros are padding.
    const unsigned short utf16[4] = {0x4F60, 0x597D, 0x00, 0x00};
    unsigned char utf8[128];

    UTF16::toUTF8(utf16, 2, utf8);

    // The counted toUTF8 overload NUL-terminates its output, so the buffer
    // can be printed as a C string. <iostream> is what this file includes;
    // printf would need <cstdio>, which is not included.
    std::cout << reinterpret_cast<const char *>(utf8) << '\n';

    return 0;
}

2.編譯源碼

$ g++ -o test test.cpp -std=c++11

3.運行及其結果

$ ./test
你好
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章