前提:
- 案例爲C++語言
- 適用於將形如:&#、&#x等開頭的字符串,轉換爲中文顯示
- 如有問題歡迎評論溝通。
說明
- 經過查閱資料,發現以上所說字符是 HTML、XML 等 SGML 類語言的轉義序列(escape sequence)。它們不是編碼。&# 後面跟的是十進制數字,&#x 後面跟的是十六進制數字。它們有一個專業名詞:NCR(numeric character reference,數字字符引用)。
- 轉碼思路說明:十六進制字符串—>數值(Unicode 碼位)—>UTF-16 字節—>GB18030 編碼的中文
代碼
#include <iconv.h>
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;
// Helpers defined further down in this file; declared here so they can be
// called before their definitions.
int htoi(const char s[]);
void CSConvert(std::string strSourceCS, const char* pSourceBuffer, int iSourceLen,
               std::string strTargetCS, std::string& strTarget);

/// Reads the body of a numeric character reference.
///
/// @param text Text being scanned.
/// @param pos  In: index just past the "&#" marker. Out (on success only):
///             index just past the digits and an optional trailing ';'.
/// @return The referenced code point, or -1 when the text at `pos` is not a
///         usable reference (no digits, more than 7 digits, zero, a surrogate,
///         or a value above U+10FFFF).
static long readNcr(const std::string& text, std::string::size_type& pos)
{
    std::string::size_type p = pos;
    const bool isHex = p < text.length() && (text[p] == 'x' || text[p] == 'X');
    if (isHex)
        ++p;

    const std::string::size_type digitsBegin = p;
    while (p < text.length())
    {
        const unsigned char c = static_cast<unsigned char>(text[p]);
        if (isHex ? !std::isxdigit(c) : !std::isdigit(c))
            break;
        ++p;
    }

    const std::string digits = text.substr(digitsBegin, p - digitsBegin);
    // Seven digits at most keeps the value inside int range for both bases.
    if (digits.empty() || digits.length() > 7)
        return -1;

    long codePoint = 0;
    if (isHex)
    {
        codePoint = htoi(digits.c_str());
    }
    else
    {
        for (std::string::size_type i = 0; i < digits.length(); ++i)
            codePoint = codePoint * 10 + (digits[i] - '0');
    }

    const bool isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    if (codePoint <= 0 || codePoint > 0x10FFFF || isSurrogate)
        return -1;

    if (p < text.length() && text[p] == ';')
        ++p;
    pos = p;
    return codePoint;
}

/// Replaces every numeric character reference in `strSource` ("&#xHHHH;" or
/// "&#DDDD;") with the GB18030 encoding of the character it names.
///
/// All other text, including semicolons that do not terminate a reference, is
/// copied through unchanged. Malformed references are left as they are. The
/// input and the result are also echoed to standard output.
///
/// @param strSource Text that may contain numeric character references.
/// @return The text with the references decoded.
std::string strTrans(std::string strSource)
{
    // "%s" is a printf placeholder; a stream would print it literally.
    std::cout << "轉化前的字符串爲 : " << strSource << std::endl;

    std::ostringstream oss;
    std::string::size_type cursor = 0;
    for (;;)
    {
        const std::string::size_type marker = strSource.find("&#", cursor);
        if (marker == std::string::npos)
        {
            oss << strSource.substr(cursor);
            break;
        }
        // Literal text in front of the reference.
        oss << strSource.substr(cursor, marker - cursor);

        std::string::size_type next = marker + 2;
        const long codePoint = readNcr(strSource, next);
        if (codePoint < 0)
        {
            // Not a reference after all: keep the marker and rescan after it.
            oss << "&#";
            cursor = marker + 2;
            continue;
        }

        // Split into UTF-16 code units; code points above U+FFFF need a
        // surrogate pair.
        long units[2] = {codePoint, 0};
        int unitCount = 1;
        if (codePoint > 0xFFFF)
        {
            const long offset = codePoint - 0x10000;
            units[0] = 0xD800 + (offset >> 10);
            units[1] = 0xDC00 + (offset & 0x3FF);
            unitCount = 2;
        }

        // Big-endian bytes followed by one zero code unit, which is passed
        // along so the converter sees a terminated string.
        char utf16[6] = {0};
        for (int i = 0; i < unitCount; ++i)
        {
            utf16[2 * i] = static_cast<char>((units[i] >> 8) & 0xFF);
            utf16[2 * i + 1] = static_cast<char>(units[i] & 0xFF);
        }

        std::string strGBKWord;
        CSConvert("UTF-16BE", utf16, 2 * unitCount + 2, "GB18030", strGBKWord);
        oss << strGBKWord;
        cursor = next;
    }

    const std::string result = oss.str();
    std::cout << "轉化後的字符串爲 : " << result << std::endl;
    return result;
}
/// Parses a hexadecimal string into an integer.
///
/// An optional "0x"/"0X" prefix is skipped. Parsing stops at the first
/// character that is not a hexadecimal digit (0-9, a-f, A-F), so an empty or
/// non-hex string yields 0. The value is accumulated in an unsigned integer,
/// so over-long input wraps around instead of overflowing a signed int.
///
/// @param s NUL-terminated string to parse.
/// @return The value of the leading hexadecimal digits.
int htoi(const char s[])
{
    int i = 0;
    if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
    {
        i = 2;  // skip the prefix
    }

    unsigned int n = 0;
    for (; std::isxdigit(static_cast<unsigned char>(s[i])); ++i)
    {
        const int c = std::tolower(static_cast<unsigned char>(s[i]));
        const int digit = (c > '9') ? 10 + (c - 'a') : c - '0';
        n = 16u * n + static_cast<unsigned int>(digit);
    }
    return static_cast<int>(n);
}
/// Splits `val` on every occurrence of `token`, discarding empty pieces.
///
/// @param val   String to split.
/// @param token Separator. An empty separator never splits anything: `val` is
///              returned as a single piece (or nothing if `val` is empty).
/// @return The non-empty substrings of `val` found between separators, in
///         their original order.
std::vector<std::string> getTokenList(const std::string& val, const std::string& token)
{
    std::vector<std::string> slist;
    if (token.empty())
    {
        // find("") matches at the current offset every time, so scanning for
        // it would never make progress.
        if (!val.empty())
        {
            slist.push_back(val);
        }
        return slist;
    }

    std::string::size_type start = 0;
    while (start < val.length())
    {
        const std::string::size_type pos = val.find(token, start);
        if (pos == std::string::npos)
        {
            slist.push_back(val.substr(start));  // the last piece
            break;
        }
        if (pos != start)
        {
            slist.push_back(val.substr(start, pos - start));
        }
        start = pos + token.length();
    }
    return slist;
}
//編碼轉換方法
void CSConvert(string strSourceCS /*"UTF-8"*/,const char* pSourceBuffer,int iSourceLen,string strTargetCS/*"GB2312"*/,string& strTarget)
{
UErrorCode status = U_ZERO_ERROR;
UChar target[iSourceLen*2];
UConverter *conv;
int32_t len;
//1 convert strSourceCS string to Unicode
// set up the converter
conv = ucnv_open(strSourceCS.c_str(), &status);
assert(U_SUCCESS(status));
// convert to Unicode
len = ucnv_toUChars(conv, target, iSourceLen*2, pSourceBuffer, iSourceLen, &status);
assert(U_SUCCESS(status));
// close the converter
ucnv_close(conv);
//2 convert Unicode string to strTargetCS
// set up the converter
conv = ucnv_open(strTargetCS.c_str(), &status);
assert(U_SUCCESS(status));
// convert to strTargetCS
char gbTarget[iSourceLen*2];
len = ucnv_fromUChars(conv, gbTarget, iSourceLen*2, target, -1, &status);
assert(U_SUCCESS(status));
// close the converter
ucnv_close(conv);
strTarget = gbTarget;
return ;
}
這樣就可以調用了,如下:
int main()
{
string strSource = "用户";
string strTar = strTrans(strSource);
cout<<"轉換後的字符串爲:"<<strTar<<endl;
return 0;
}
代碼就是這樣了,供學習交流