C++混合中文英文字符串匹配查找改進

原創

2020-06-09 17:41

問題起源於一個同學 daidaimadaima 的私信，他的問題是

他寫的版本是

#include<iostream>
using namespace std; 
std::string dic[] = { "自然語言處理","準時","課程","作業","有","老番茄","意思","上課","計算語言學","開心" };

/// Reports whether `str` equals a dictionary word or is a prefix of one.
///
/// The comparison is byte-wise, so `str` must use the same encoding as the
/// literals in `dic`. An empty `str` is a prefix of every word and therefore
/// yields true.
///
/// @param str candidate text to look up.
/// @return true if at least one entry of `dic` starts with `str`.
bool inDict(const std::string& str)
{
    // Iterate over the array itself instead of a hard-coded element count, so
    // the loop stays correct when words are added to or removed from `dic`.
    for (const std::string& word : dic)
    {
        // compare(0, n, str) checks the first n bytes of `word` against `str`
        // without building a temporary substring.
        if (word.compare(0, str.length(), str) == 0)
        {
            return true;
        }
    }
    return false;
}


// Greedy left-to-right segmentation of `sentence` against the word list:
// characters are appended to the current segment `s1` for as long as the
// result is still a dictionary word or a prefix of one (see inDict); when it
// is not, the segment is printed after a "/" and a new segment is started.
int main()
{
    string sentence = "自然語言處理課程有意思";
    // Byte length of one Chinese character in the source file's encoding;
    // used below as the step size through `sentence`.
    string word = "一";
    int wordlen = word.length();

	int i;
    string s1 = "";  // segment accumulated so far
    // NOTE(review): `dic` is an array, so streaming it prints the address of
    // its first element, not the words it contains.
    cout << "詞典：" << dic << endl;

	cout << "句子：" << sentence << endl;
    // NOTE(review): the fixed step assumes every character in `sentence` is
    // exactly `wordlen` bytes wide; a single-byte ASCII character in the
    // sentence would misalign every later substr() call.
    for (i = 0; i < sentence.length(); i = i + wordlen)
    {
        // Candidate segment: the current segment plus the next character.
        string tmp = s1 + sentence.substr(i, wordlen);

        if (inDict(tmp))
        {
            // Still a word or word-prefix: keep growing the segment.
            s1 = s1 + sentence.substr(i, wordlen);
        }
        else
        {
            // The next character does not extend the segment: emit what was
            // collected and restart from that character.
            cout << "/" << s1;
            s1 = sentence.substr(i, wordlen);
        }
    }
    // Emit the last segment; this happens without checking it against the
    // dictionary.
    cout << "/" << s1;
}

但是，我運行之後並沒有發現亂碼。可能他指的是輸出 dic 時顯示的是一串數字（數組的地址）而不是詞表內容，也有可能他指的是字符串中、英文混雜時分詞結果出錯。下面是我改進後的版本：

#include<iostream>
#include<vector>
#include<string.h>
using namespace std;
vector<string>dic = { "自然語言處理","準時","課程","作業","有","老番茄","意思","上課","計算語言學","開心" };

/// Returns the number of entries in `dic` as an `int`.
///
/// The vector is taken by const reference so that a call does not copy every
/// string it holds.
///
/// @param dic word list to measure.
/// @return element count of `dic`, converted to `int`.
int GET_ARRAY_LEN(const std::vector<std::string>& dic) {
    return static_cast<int>(dic.size());
}

/// Reports whether `str` equals a dictionary word or is a prefix of one.
///
/// The comparison is byte-wise, so `str` must use the same encoding as the
/// literals in `dic`. An empty `str` is a prefix of every word and therefore
/// yields true.
///
/// @param str candidate text to look up.
/// @return true if at least one entry of `dic` starts with `str`.
bool inDict(const std::string& str)
{
    for (const std::string& word : dic)
    {
        // For std::string, `a == b` and `a.compare(b) == 0` both compare the
        // character contents, so either spelling gives the same answer.
        // The compare(pos, len, str) overload is used here because it checks
        // the leading bytes of `word` without allocating a temporary
        // substring.
        if (word.compare(0, str.size(), str) == 0)
        {
            return true;  // one match is enough; no need to scan the rest
        }
    }
    return false;
}


/// Segments a sentence that mixes Chinese and single-byte ASCII characters.
///
/// The sentence is read one character at a time. Each character is appended
/// to the current segment while the result is still a dictionary word or a
/// prefix of one (see inDict) and is echoed to stdout. When the character
/// does not extend the segment, a "/" is printed and a new segment starts
/// from that character; the character itself is only echoed if it can begin
/// a dictionary word.
int main()
{
    // Alternative input to try: 自然語言處理d課程有意思嗎意思
    const std::string sentence = "自然語言處理d課程有意思嗎有意思";

    // Byte width of one Chinese character in this source file's encoding
    // (e.g. 2 under GBK, 3 under UTF-8). Measuring a sample literal avoids
    // hard-coding either value. The loop below assumes every non-ASCII
    // character in `sentence` has this same width.
    const std::string cword = "一";
    const std::string::size_type cwordlen = cword.length();

    std::string s1;  // current segment: a dictionary word or word-prefix
    std::cout << "句子：" << sentence << std::endl;

    for (std::string::size_type i = 0; i < sentence.length();)
    {
        // A byte with its high bit set belongs to a multi-byte character.
        // The byte is tested as unsigned char because the signedness of plain
        // char is implementation-defined: a `sentence[i] < 0` test is never
        // true on platforms where char is unsigned.
        const bool multibyte = static_cast<unsigned char>(sentence[i]) >= 0x80;
        const std::string::size_type step = multibyte ? cwordlen : 1;

        const std::string tmp = sentence.substr(i, step);  // next character
        const std::string tmpStr = s1 + tmp;               // segment + next character

        if (inDict(tmpStr))
        {
            // Still a word or word-prefix: grow the segment and echo the
            // character just consumed.
            s1 = tmpStr;
            std::cout << tmp;
        }
        else
        {
            // Segment boundary. Restart from the current character, but only
            // echo it when it can start a dictionary word, so characters
            // unknown to the dictionary are dropped from the output.
            std::cout << "/";
            s1 = tmp;
            if (inDict(s1))
            {
                std::cout << s1;
            }
        }
        i += step;
    }

}

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

C++混合中文英文字符串匹配查找改進

Android啓動過程-萬字長文(Android14)

【SQL進階】CASE語句的使用

optional install error: Error: Unsupported URL Type: npm:vue-loader@^16.1.0

這種嵌套字典類型的數據，我想把它讀取到df裏，如何操作？

微調真的能讓LLM學到新東西嗎:引入新知識可能讓模型產生更多的幻覺

iNeuOS工業互聯網操作系統，增加電力IEC104協議

微服務實踐k8s&dapr開發部署實驗（3）訂閱發佈

chromedriver版本

kbgressdb之數據結構V0.2

java與數據庫oracle連接學習之jdbc（6）改進讀取信息的方法

java與數據庫oracle連接學習之jdbc（7）通過映射實現共享

java與數據庫oracle連接學習之jdbc（5）代碼包裝成一個公共類

java與數據庫oracle連接學習之jdbc（4)做一個工具類，減少代碼複雜度

java與數據庫oracle連接學習之jdbc（3)防止插入注入的入侵

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結