Linux string conversion from UTF-8 to UNICODE, UCS-2LE, and UCS-4LE


Linux string conversion from UTF-8 to UNICODE, UCS-2LE, and UCS-4LE.

Windows developers are often surprised to find that Linux differs from Windows in two ways when it comes to character sets.

1. A standard char * string is UTF-8 encoded by default, so strlen may be greater than 2*(number of Chinese characters) + (number of English characters).

2. Each wchar_t in a UNICODE (wide) string is 4 bytes long.


See the following example on Ubuntu 16.04:


#include <iostream>
using namespace std;


#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <iconv.h>

#include <wchar.h>  //wprintf is here!!!!!!!!


// Writes iLen bytes starting at str to stdout, each as a two-digit lowercase
// hexadecimal value followed by one space, then ends the line.
// A zero or negative iLen prints only the line terminator.
void PrintHexData( unsigned char *str, int iLen )
{
	const unsigned char *cursor = str;
	int remaining = iLen;

	while ( remaining > 0 )
	{
		printf( "%02x ", *cursor );
		++cursor;
		--remaining;
	}

	printf( "\n" );
}

// Demonstrates how text is represented on Linux:
//   1. prints sizeof(char), sizeof(wchar_t) and RAND_MAX;
//   2. compares strlen() of a UTF-8 literal with wcslen() of the same text
//      written as a wide literal;
//   3. converts the UTF-8 bytes to every encoding named in strCharSet with
//      iconv and hex-dumps each result;
//   4. hex-dumps the raw bytes of the wide literal for comparison.
//
// Always returns 0. A failing iconv_open()/iconv() call is reported on stderr
// with perror() and the remaining character sets are still processed.
int main()
{
	char     szName[128] =  "Linux-7字符串轉換測試";

	wchar_t  wzName[128] =  L"Linux-7字符串轉換測試";

	// Destination buffer for iconv, reused for every target character set.
	wchar_t  szTransName[128] = { 0 };

	const char * strCharSet[] = { "UCS-2LE",  "UCS-4LE",  "UNICODE" };
	const size_t nCharSets = sizeof( strCharSet ) / sizeof( strCharSet[0] );

	std::cout << "char size     : " << sizeof(char)    <<  "  Bytes" <<  std::endl;
	std::cout<<  "wchar_t size  : " << sizeof(wchar_t) <<  "  Bytes"  <<  std::endl;
	std::cout << "RAND_MAX      : " << RAND_MAX        <<  std::endl;

	// strlen() counts bytes, so for UTF-8 text containing non-English
	// characters it is larger than the character count reported by wcslen().
	const size_t iNameByteLen = strlen( szName );
	std::cout << "String:  " << szName <<  std::endl;
	std::cout << "          length of strlen: " <<  iNameByteLen << std::endl;
	printf( "          length  wcslen of wchar_t %lu \n",
			static_cast<unsigned long>( wcslen( wzName ) ) );

	// Convert the UTF-8 bytes to each target encoding and dump the result.
	std::cout << "string: " << szName  << " from utf-8 conversion result:"  << std::endl;

	for ( size_t iset = 0;  iset < nCharSets;  ++iset )
	{
		std::cout << "dst char set: " << strCharSet[iset] << std::endl;

		iconv_t ct = iconv_open( strCharSet[iset], "utf-8" );
		if ( ct == (iconv_t)-1 )
		{
			// Report an unsupported conversion instead of skipping it silently.
			perror( "iconv_open" );
			continue;
		}

		// iconv advances the pointers and decrements the remaining-byte
		// counters, so the starting sizes are kept separately for the report.
		char * s_in  = szName;
		char * s_out = reinterpret_cast<char *>( szTransName );
		const size_t iInLen1  = iNameByteLen;
		const size_t iOutLen1 = sizeof( szTransName );
		size_t iInLen  = iInLen1;
		size_t iOutLen = iOutLen1;

		// Fill with 0xff so bytes iconv did not write are easy to spot.
		memset( szTransName, 0xff, iOutLen1 );

		// iconv returns size_t; (size_t)-1 signals failure. iconv is a C
		// function and does not throw, so no try/catch is needed here.
		const size_t iconved = iconv( ct, &s_in, &iInLen, &s_out, &iOutLen );
		const bool bFailed = ( iconved == static_cast<size_t>( -1 ) );
		if ( bFailed )
		{
			// Report before iconv_close() gets a chance to overwrite errno.
			perror( "iconv" );
		}

		iconv_close( ct );

		// Number of bytes iconv actually produced (all of them on success,
		// the part written before the error otherwise).
		const size_t iConvBytes = iOutLen1 - iOutLen;
		printf( "ICONV in len:  %lu=>%lu  out len %lu=>%lu (%lu)  conv ret: %ld\n",
				static_cast<unsigned long>( iInLen1 ),
				static_cast<unsigned long>( iInLen ),
				static_cast<unsigned long>( iOutLen1 ),
				static_cast<unsigned long>( iOutLen ),
				static_cast<unsigned long>( iConvBytes ),
				bFailed ? -1L : static_cast<long>( iconved ) );

		PrintHexData( reinterpret_cast<unsigned char *>( szTransName ),
				static_cast<int>( iConvBytes ) );
	}


	printf( "expected Unicode linux string : \n" );
	PrintHexData( reinterpret_cast<unsigned char *>( wzName ),
			static_cast<int>( wcslen( wzName ) * sizeof(wchar_t) ) );

	// Wait for a key press before exiting.
	getchar();
	return 0;
}


running result:



char size     : 1  Bytes
wchar_t size  : 4  Bytes
RAND_MAX      : 2147483647
String:  Linux-7字符串轉換測試
          length of strlen: 28
          length  wcslen of wchar_t 14 
string: Linux-7字符串轉換測試 from utf-8 conversion result:
dst char set: UCS-2LE
ICONV in len:  28=>0  out len 512=>484 (28)  conv ret: 0
4c 00 69 00 6e 00 75 00 78 00 2d 00 37 00 57 5b 26 7b 32 4e 6c 8f 62 63 4b 6d d5 8b 
dst char set: UCS-4LE
ICONV in len:  28=>0  out len 512=>456 (56)  conv ret: 0
4c 00 00 00 69 00 00 00 6e 00 00 00 75 00 00 00 78 00 00 00 2d 00 00 00 37 00 00 00 57 5b 00 00 26 7b 00 00 32 4e 00 00 6c 8f 00 00 62 63 00 00 4b 6d 00 00 d5 8b 00 00 
dst char set: UNICODE
ICONV in len:  28=>0  out len 512=>482 (30)  conv ret: 0
ff fe 4c 00 69 00 6e 00 75 00 78 00 2d 00 37 00 57 5b 26 7b 32 4e 6c 8f 62 63 4b 6d d5 8b 
expected Unicode linux string : 
4c 00 00 00 69 00 00 00 6e 00 00 00 75 00 00 00 78 00 00 00 2d 00 00 00 37 00 00 00 57 5b 00 00 26 7b 00 00 32 4e 00 00 6c 8f 00 00 62 63 00 00 4b 6d 00 00 d5 8b 00 00 


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章