mmseg4j支持單個字母、數字及組合搜索

原文地址:http://blog.csdn.net/july_2/article/details/24481935         
如題,看到這個題目也許覺得功能有些多餘,字母、數字連在一塊的話,是不會單獨分出來的,分詞時候是連在一塊的,也算正常搜索需求。如輸入 :

        String txt = "IBM12二次修改123";        分詞效果:

        i bm |123 | 二 | 次 | 修 | 改

        現在,有一個需求:需要對字母、數字都分詞,分詞效果要達到:

        i | b | m |  1 | 2 | 3 | 二 | 次 | 修 | 改

        類似在數據庫中使用like加百分號雙向查詢效果,使用最初版本的mmseg4j無法滿足需求,經過閱讀mmseg4j部分源代碼,稍微修改了一點點,即可滿足需求(暫不考慮效率)。

  •  未修改前,通過完整單詞可以查詢到結果,但通過單個字母查詢不到結果,如下圖:

         單詞完全匹配搜索:

           字母模糊搜索:

  • 修改mmseg4j源代碼MMSeg.java中的next部分代碼,其實就是屏蔽了部分代碼,很簡單:

        

[plain] view plaincopy
  1. public Word next() throws IOException {  
  2.         //先從緩存中取  
  3.         Word word = bufWord.poll();  
  4.         if(word == null) {  
  5.             bufSentence.setLength(0);  
  6.   
  7.             int data = -1;  
  8.             boolean read = true;  
  9. //          while(read && (data=readNext()) != -1) {  
  10.             while((data=readNext()) != -1) {  
  11.                 read = false;   //默認一次可以讀出同一類字符,就可以分詞內容  
  12.                 int type = Character.getType(data);  
  13.                 String wordType = Word.TYPE_WORD;  
  14.                 switch(type) {  
  15.                 case Character.UPPERCASE_LETTER:  
  16.                 case Character.LOWERCASE_LETTER:  
  17.                 case Character.TITLECASE_LETTER:  
  18.                 case Character.MODIFIER_LETTER:  
  19.                     /*  
  20.                      * 1. 0x410-0x44f -> А-я //俄文  
  21.                      * 2. 0x391-0x3a9 -> Α-Ω //希臘大寫  
  22.                      * 3. 0x3b1-0x3c9 -> α-ω //希臘小寫  
  23.                      */  
  24.                     data = toAscii(data);  
  25.                     NationLetter nl = getNation(data);  
  26.                     if(nl == NationLetter.UNKNOW) {  
  27.                         read = true;  
  28.                         break;  
  29.                     }  
  30.                     wordType = Word.TYPE_LETTER;  
  31.                     bufSentence.appendCodePoint(data);  
  32.                     switch(nl) {  
  33.                     case EN:  
  34.                         //字母后面的數字,如: VH049PA  
  35. //                      ReadCharByAsciiOrDigit rcad = new ReadCharByAsciiOrDigit();  
  36. //                      readChars(bufSentence, rcad);  
  37. //                      if(rcad.hasDigit()) {  
  38. //                          wordType = Word.TYPE_LETTER_OR_DIGIT;  
  39. //                      }  
  40.                         //only english  
  41.                         //readChars(bufSentence, new ReadCharByAscii());  
  42.                         break;  
  43.                     case RA:  
  44.                         readChars(bufSentence, new ReadCharByRussia());  
  45.                         break;  
  46.                     case GE:  
  47.                         readChars(bufSentence, new ReadCharByGreece());  
  48.                         break;  
  49.                     }  
  50.                     bufWord.add(createWord(bufSentence, wordType));  
  51.   
  52.                     bufSentence.setLength(0);  
  53.   
  54.                     break;  
  55.                 case Character.OTHER_LETTER:  
  56.                     /*  
  57.                      * 1. 0x3041-0x30f6 -> ぁ-ヶ   //日文(平|片)假名  
  58.                      * 2. 0x3105-0x3129 -> ㄅ-ㄩ   //注意符號  
  59.                      */  
  60.                     bufSentence.appendCodePoint(data);  
  61.                     readChars(bufSentence, new ReadCharByType(Character.OTHER_LETTER));  
  62.   
  63.                     currentSentence = createSentence(bufSentence);  
  64.   
  65.                     bufSentence.setLength(0);  
  66.   
  67.                     break;  
  68.                 case Character.DECIMAL_DIGIT_NUMBER:  
  69.                     bufSentence.appendCodePoint(toAscii(data));  
  70. //                  readChars(bufSentence, new ReadCharDigit());    //讀後面的數字, AsciiLetterOr  
  71.                     wordType = Word.TYPE_DIGIT;  
  72.                     int d = readNext();  
  73.                     if(d > -1) {  
  74.                         if(seg.isUnit(d)) { //單位,如時間  
  75.                             bufWord.add(createWord(bufSentence, startIdx(bufSentence)-1, Word.TYPE_DIGIT)); //先把數字添加(獨立)  
  76.   
  77.                             bufSentence.setLength(0);  
  78.   
  79.                             bufSentence.appendCodePoint(d);  
  80.                             wordType = Word.TYPE_WORD;  //單位是 word  
  81.                         } else {    //後面可能是字母和數字  
  82.                             pushBack(d);  
  83. //                          if(readChars(bufSentence, new ReadCharByAsciiOrDigit()) > 0) {   //如果有字母或數字都會連在一起.  
  84. //                              wordType = Word.TYPE_DIGIT_OR_LETTER;  
  85. //                          }  
  86.                         }  
  87.                     }  
  88.   
  89.                     bufWord.add(createWord(bufSentence, wordType));  
  90.   
  91.   
  92.                     bufSentence.setLength(0);   //緩存的字符清除  
  93.   
  94.                     break;  
  95.                 case Character.LETTER_NUMBER:  
  96.                     // ⅠⅡⅢ 單分  
  97.                     bufSentence.appendCodePoint(data);  
  98.                     readChars(bufSentence, new ReadCharByType(Character.LETTER_NUMBER));  
  99.   
  100.                     int startIdx = startIdx(bufSentence);  
  101.                     for(int i=0; i<bufSentence.length(); i++) {  
  102.                         bufWord.add(new Word(new char[] {bufSentence.charAt(i)}, startIdx++, Word.TYPE_LETTER_NUMBER));  
  103.                     }  
  104.   
  105.                     bufSentence.setLength(0);   //緩存的字符清除  
  106.   
  107.                     break;  
  108.                 case Character.OTHER_NUMBER:  
  109.                     //①⑩㈠㈩⒈⒑⒒⒛⑴⑽⑾⒇ 連着用  
  110.                     bufSentence.appendCodePoint(data);  
  111.                     readChars(bufSentence, new ReadCharByType(Character.OTHER_NUMBER));  
  112.   
  113.                     bufWord.add(createWord(bufSentence, Word.TYPE_OTHER_NUMBER));  
  114.                     bufSentence.setLength(0);  
  115.                     break;  
  116.                 default :  
  117.                     //其它認爲無效字符  
  118.                     read = true;  
  119.                 }//switch  
  120.             }  
  121.                   
  122.             // 中文分詞  
  123.             if(currentSentence != null) {  
  124.                 do {  
  125.                     Chunk chunk = seg.seg(currentSentence);  
  126.                     for(int i=0; i<chunk.getCount(); i++) {  
  127.                         bufWord.add(chunk.getWords()[i]);  
  128.                     }  
  129.                 } while (!currentSentence.isFinish());  
  130.                   
  131.                 currentSentence = null;  
  132.             }  
  133.               
  134.             word = bufWord.poll();  
  135.         }  
  136.           
  137.         return word;  
  138.     }  
         主要是註釋了一些代碼,對字母、數字不要連續處理。

  • 再次搜索字母查詢,效果如下:


         綜上,這樣就簡單完成了數據庫中類似like和百分號雙向匹配需求。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章