C++中使用locale和流实现的、与mbsrtowcs和wcsrtombs等效的函数

是否存在使用std::locale和C++流功能、与mbsrtowcs和wcsrtombs等效的C++函数?

我想找出使用标准库在std::string和std::wstring之间来回转换的最佳方法.std::locale似乎几乎可以做到这一点,但我对其中的一些细节以及它可能存在的局限性不太确定.

一些细节:我在Linux上,它使用utf-8作为本机编码.我想从utf-8 std :: string转到std :: wstring并返回而不会丢失信息.

我认为Windows上的语言环境可能存在一些限制,但我并不特别关注它们.只要答案适用于Linux,并且除了libstdc++之外没有其他依赖(即不依赖Boost),我就满意了.

如能提供背景资料的链接,不胜感激.

注意:似乎有些混乱.多个char可以表示UTF-8中的单个字符,因此在从wchar_t转换为char时不考虑此问题的函数将不起作用.

解决方法:

对于这个任务来说,使用locale有些大材小用 – UTF-8和UTF-16之间可以通过简单的位运算相互转换.下面是基于我对之前一个问题的回答整理的一些代码.

 

// Appends the UTF-8 encoding of `codepoint` to `out`, using 1 to 4 bytes
// depending on its magnitude. The caller is expected to pass a value no
// greater than 0x10ffff; larger values would be truncated by the bit masks.
static void UTF16to8_appendCodepoint(std::string & out, unsigned int codepoint)
{
    if (codepoint <= 0x7f)
    {
        out.append(1, static_cast<char>(codepoint));
    }
    else if (codepoint <= 0x7ff)
    {
        out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else if (codepoint <= 0xffff)
    {
        out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
    else
    {
        out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
        out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
    }
}

// Converts a NUL-terminated UTF-16 string (one code unit per wchar_t) to UTF-8.
//
// in:      pointer to the first code unit, or NULL.
// returns: the UTF-8 encoding of the input; an empty string when `in` is NULL.
//
// A high surrogate (0xd800-0xdbff) immediately followed by a low surrogate
// (0xdc00-0xdfff) is combined into one supplementary code point. Any surrogate
// that is not part of such a pair, and any unit above 0x10ffff (possible when
// wchar_t is wider than 16 bits), is replaced by U+FFFD so the result is
// always well-formed UTF-8 and no input unit is silently dropped or
// misinterpreted as another character.
std::string UTF16to8(const wchar_t * in)
{
    std::string out;
    if (in == NULL)
        return out;

    const unsigned int replacement = 0xfffd;
    unsigned int high = 0;      // pending high surrogate, valid only if haveHigh
    bool haveHigh = false;
    for (;  *in != 0;  ++in)
    {
        const unsigned int unit = static_cast<unsigned int>(*in);

        if (unit >= 0xd800 && unit <= 0xdbff)
        {
            // A second high surrogate means the previous one never got its partner.
            if (haveHigh)
                UTF16to8_appendCodepoint(out, replacement);
            high = unit;
            haveHigh = true;
            continue;
        }

        unsigned int codepoint;
        if (unit >= 0xdc00 && unit <= 0xdfff)
        {
            // A low surrogate is only meaningful right after a high surrogate.
            if (haveHigh)
                codepoint = 0x10000 + ((high - 0xd800) << 10) + (unit - 0xdc00);
            else
                codepoint = replacement;
        }
        else
        {
            // Ordinary unit: a pending high surrogate was left unpaired.
            if (haveHigh)
                UTF16to8_appendCodepoint(out, replacement);
            codepoint = (unit <= 0x10ffff) ? unit : replacement;
        }
        haveHigh = false;
        UTF16to8_appendCodepoint(out, codepoint);
    }

    // Input ended in the middle of a surrogate pair.
    if (haveHigh)
        UTF16to8_appendCodepoint(out, replacement);
    return out;
}

// Converts a NUL-terminated UTF-8 string to UTF-16, one code unit per wchar_t.
//
// in:      pointer to the first byte, or NULL.
// returns: the UTF-16 encoding of the input; an empty string when `in` is NULL.
//
// Code points above 0xffff are written as a surrogate pair. The pair is
// computed from (codepoint - 0x10000), as UTF-16 requires; without that
// subtraction the high surrogate is wrong for every supplementary character.
//
// Malformed input is replaced by U+FFFD instead of being dropped or turned
// into an embedded NUL: stray continuation bytes, sequences cut short by a
// non-continuation byte or by the end of the string, the bytes 0xf8-0xff,
// and decoded values that are surrogates or exceed 0x10ffff.
// Overlong encodings are not detected.
std::wstring UTF8to16(const char * in)
{
    std::wstring out;
    if (in == NULL)
        return out;

    const wchar_t replacement = static_cast<wchar_t>(0xfffd);
    unsigned int codepoint = 0;
    int following = 0;          // continuation bytes still expected
    for (;  *in != 0;  ++in)
    {
        const unsigned char ch = static_cast<unsigned char>(*in);
        const bool isContinuation = (ch >= 0x80 && ch <= 0xbf);

        if (following > 0 && !isContinuation)
        {
            // The multi-byte sequence in progress was cut short.
            out.append(1, replacement);
            following = 0;
        }

        if (ch <= 0x7f)
        {
            codepoint = ch;
        }
        else if (isContinuation)
        {
            if (following > 0)
            {
                codepoint = (codepoint << 6) | (ch & 0x3f);
                --following;
            }
            else
            {
                // Continuation byte with no lead byte before it.
                codepoint = 0xfffd;
            }
        }
        else if (ch <= 0xdf)
        {
            codepoint = ch & 0x1f;
            following = 1;
        }
        else if (ch <= 0xef)
        {
            codepoint = ch & 0x0f;
            following = 2;
        }
        else if (ch <= 0xf7)
        {
            codepoint = ch & 0x07;
            following = 3;
        }
        else
        {
            // 0xf8-0xff never appear in UTF-8.
            codepoint = 0xfffd;
        }

        if (following == 0)
        {
            if (codepoint > 0x10ffff || (codepoint >= 0xd800 && codepoint <= 0xdfff))
                codepoint = 0xfffd;

            if (codepoint > 0xffff)
            {
                const unsigned int offset = codepoint - 0x10000;
                out.append(1, static_cast<wchar_t>(0xd800 + (offset >> 10)));
                out.append(1, static_cast<wchar_t>(0xdc00 + (offset & 0x03ff)));
            }
            else
            {
                out.append(1, static_cast<wchar_t>(codepoint));
            }
        }
    }

    // Input ended while continuation bytes were still expected.
    if (following > 0)
        out.append(1, replacement);
    return out;
}

如果你的wchar_t是32位而不是16位,那么可以使用下面这个版本(未经测试).

 

// Converts a NUL-terminated UTF-32 string (one code point per wchar_t) to UTF-8.
//
// in:      pointer to the first code point, or NULL.
// returns: the UTF-8 encoding of the input; an empty string when `in` is NULL.
//
// Asserts that wchar_t is at least 4 bytes wide.
//
// Values that are not Unicode scalar values (the surrogate range
// 0xd800-0xdfff and anything above 0x10ffff, including negative wchar_t
// values after conversion to unsigned) are replaced by U+FFFD. Encoding them
// directly would mask off high bits and could produce ill-formed or overlong
// sequences (for example 0x200000 would come out as an overlong NUL).
std::string UTF32to8(const wchar_t * in)
{
    assert(sizeof(wchar_t) >= 4);
    std::string out;
    if (in == NULL)
        return out;

    for (;  *in != 0;  ++in)
    {
        unsigned int codepoint = static_cast<unsigned int>(*in);
        if (codepoint > 0x10ffff || (codepoint >= 0xd800 && codepoint <= 0xdfff))
            codepoint = 0xfffd;

        if (codepoint <= 0x7f)
        {
            out.append(1, static_cast<char>(codepoint));
        }
        else if (codepoint <= 0x7ff)
        {
            out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
            out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
        else if (codepoint <= 0xffff)
        {
            out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
            out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
            out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
        else
        {
            out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
            out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
            out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
            out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
        }
    }
    return out;
}

// Converts a NUL-terminated UTF-8 string to UTF-32, one code point per wchar_t.
//
// in:      pointer to the first byte, or NULL.
// returns: the decoded code points; an empty string when `in` is NULL.
//
// Asserts that wchar_t is at least 4 bytes wide.
//
// Malformed input is replaced by U+FFFD instead of being dropped or turned
// into an embedded NUL: stray continuation bytes, sequences cut short by a
// non-continuation byte or by the end of the string, the bytes 0xf8-0xff,
// and decoded values that are surrogates or exceed 0x10ffff.
// Overlong encodings are not detected.
std::wstring UTF8to32(const char * in)
{
    assert(sizeof(wchar_t) >= 4);
    std::wstring out;
    if (in == NULL)
        return out;

    const wchar_t replacement = static_cast<wchar_t>(0xfffd);
    unsigned int codepoint = 0;
    int following = 0;          // continuation bytes still expected
    for (;  *in != 0;  ++in)
    {
        const unsigned char ch = static_cast<unsigned char>(*in);
        const bool isContinuation = (ch >= 0x80 && ch <= 0xbf);

        if (following > 0 && !isContinuation)
        {
            // The multi-byte sequence in progress was cut short.
            out.append(1, replacement);
            following = 0;
        }

        if (ch <= 0x7f)
        {
            codepoint = ch;
        }
        else if (isContinuation)
        {
            if (following > 0)
            {
                codepoint = (codepoint << 6) | (ch & 0x3f);
                --following;
            }
            else
            {
                // Continuation byte with no lead byte before it.
                codepoint = 0xfffd;
            }
        }
        else if (ch <= 0xdf)
        {
            codepoint = ch & 0x1f;
            following = 1;
        }
        else if (ch <= 0xef)
        {
            codepoint = ch & 0x0f;
            following = 2;
        }
        else if (ch <= 0xf7)
        {
            codepoint = ch & 0x07;
            following = 3;
        }
        else
        {
            // 0xf8-0xff never appear in UTF-8.
            codepoint = 0xfffd;
        }

        if (following == 0)
        {
            if (codepoint > 0x10ffff || (codepoint >= 0xd800 && codepoint <= 0xdfff))
                codepoint = 0xfffd;
            out.append(1, static_cast<wchar_t>(codepoint));
        }
    }

    // Input ended while continuation bytes were still expected.
    if (following > 0)
        out.append(1, replacement);
    return out;
}

来源: https://codeday.me/bug/20190626/1298707.html

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章