關於字符集的一些總結:
1. std::wstring s(L"abc"); //前綴L表示寬字符字面量,元素類型為wchar_t。wchar_t的大小取決於平台:在Windows上佔兩個字節(UTF-16),在Linux/macOS上通常佔四個字節(UTF-32)。
"A" = 41(十六進制,下同)
"ABC" = 41 42 43
L"A" = 00 41(假設wchar_t佔兩個字節,並按高位字節在前的寫法表示;在小端機器的內存中實際順序為 41 00)
L"ABC" = 00 41 00 42 00 43
string, wstring相互之間的轉換:
#include <locale>
#include <codecvt>
#include <string>
// A single reusable converter object: narrow strings hold UTF-8, wide strings hold
// UTF-16 code units stored in wchar_t (this is what codecvt_utf8_utf16 implements).
using Utf8Utf16Codec = std::codecvt_utf8_utf16<wchar_t>;
std::wstring_convert<Utf8Utf16Codec> converter;
// wide (UTF-16) -> narrow (UTF-8)
std::string narrow = converter.to_bytes(wide_utf16_source_string);
// narrow (UTF-8) -> wide (UTF-16)
std::wstring wide = converter.from_bytes(narrow_utf8_source_string);
#include <string>
#include <codecvt>
#include <locale>
// Sample inputs: one char-based string and one wchar_t-based string.
std::string input_str{"this is a -string-, which is a sequence based on the -char- type."};
std::wstring input_wstr{L"this is a -wide- string, which is based on the -wchar_t- type."};
// conversion
// Each conversion uses its own temporary converter between UTF-8 bytes and wchar_t.
using Utf8WideConverter = std::wstring_convert<std::codecvt_utf8<wchar_t>>;
std::wstring str_turned_to_wstr = Utf8WideConverter().from_bytes(input_str);
std::string wstr_turned_to_str = Utf8WideConverter().to_bytes(input_wstr);
這兩段代碼寫得非常清晰,均來源於Stack Overflow。需要注意:std::wstring_convert 與 <codecvt> 自 C++17 起已被標記為棄用(deprecated),在新版編譯器上會產生棄用警告。
去除中文標點符號和空格:
//定義轉換對象
wstring_convert<codecvt_utf8<wchar_t>> conv;
//按行讀取文件
while (!infile.eof()){
string s;
getline(infile, s);
//轉換成寬字節類型
wstring ws = conv.from_bytes(s);
wstring nws;
//過濾每一行中的標點和空格
for (wchar_t ch : ws){
//檢查是否是標點和空格
if (!iswpunct(ch) && !iswspace(ch)){
nws.push_back(ch);
}
}
//將過濾後的文本重新轉換成UTF-8編碼的多字節類型
string ns = conv.to_bytes(nws);
//重新寫回文件
outfile << ns;
}