在開發的過程中碰到了在utf-8的字符串裏頭有非法字符的問題,搜了下,有不少人遇到了相同的問題。
有種方法是使用iconv,iconv.open("UTF-8", "UTF-8//IGNORE"),由於要處理的字符長度變化幅度大,每次轉換需要分配的內存比較大,同時還需要清零,比較費時。
另外觀察感覺iconv本身比較耗時,所以尋求其他簡單的方法。
綜合stackoverflow的帖子,方法如下。
Table 3-7. Well-Formed UTF-8 Byte Sequences
Code Points First Byte Second Byte Third Byte Fourth Byte
U+0000..U+007F 00..7F
U+0080..U+07FF C2..DF 80..BF
U+0800..U+0FFF E0 A0..BF 80..BF
U+1000..U+CFFF E1..EC 80..BF 80..BF
U+D000..U+D7FF ED 80..9F 80..BF
U+E000..U+FFFF EE..EF 80..BF 80..BF
U+10000..U+3FFFF F0 90..BF 80..BF 80..BF
U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
int correct_non_utf_8(string *str){
int i,f_size=str->size();
unsigned char c,c2,c3,c4;
string to;
to.reserve(f_size);
//~ size_t pos = from->find("'advsearch': ' Avansert s");
for(i=0 ; i<f_size ; i++){
c=(unsigned char)(*str)[i];
if(c<32){//control char
if(c==9 || c==10 || c==13){//allow only \t \n \r
to.append(1,c);
}
}else if(c<127){//normal ASCII
to.append(1,c);
}else if(c<160){//control char
if(c2==128){//fix microsoft mess, add euro
to.append(1,226);
to.append(1,130);
to.append(1,172);
}
if(c2==133){//fix IBM mess, add NEL = \n\r
to.append(1,10);
to.append(1,13);
}
}else if(c<192){//invalid for UTF8, converting ASCII
to.append(1,(unsigned char)194);
to.append(1,c);
}else if(c<194){//invalid for UTF8, converting ASCII
to.append(1,(unsigned char)195);
to.append(1,c-64);
}else if(c<224){//possibly 2byte UTF8
c2=(unsigned char)(*str)[i+1];
if(c2>127 && c2<192){//valid 2byte UTF8
if(c==194 && c2<160){//control char, skipping
;
}else{
to.append(1,c);
to.append(1,c2);
}
i++;
}else{//invalid UTF8, converting ASCII
to.append(1,(unsigned char)195);
to.append(1,c-64);
}
}else if(c<240){//possibly 3byte UTF8
c2=(unsigned char)(*str)[i+1];
c3=(unsigned char)(*str)[i+2];
if(c2>127 && c2<192 && c3>127 && c3<192){//valid 3byte UTF8
to.append(1,c);
to.append(1,c2);
to.append(1,c3);
i+=2;
}else{//invalid UTF8, converting ASCII
to.append(1,(unsigned char)195);
to.append(1,c-64);
}
}else if(c<245){//possibly 4byte UTF8
c2=(unsigned char)(*str)[i+1];
c3=(unsigned char)(*str)[i+2];
c4=(unsigned char)(*str)[i+3];
if(c2>127 && c2<192 && c3>127 && c3<192 && c4>127 && c4<192){//valid 4byte UTF8
to.append(1,c);
to.append(1,c2);
to.append(1,c3);
to.append(1,c4);
i+=3;
}else{//invalid UTF8, converting ASCII
to.append(1,(unsigned char)195);
to.append(1,c-64);
}
}else if(c<256){//invalid UTF8, converting ASCII
to.append(1,(unsigned char)195);
to.append(1,c-64);
}else{
cout<<"WTF? more than 256 values per Byte ? ("<<(unsigned int)c<<")"<<endl;
}
}
*str=to;
return 1;
}