隨便搞搞搜索引擎技術,剛開始分詞而已,綜合考察了幾個比較有名的分詞方法,決定還是用中科院的分詞程序。其中C#有個開源的,且網上已經有用SharpICTCLAS爲lucene.net寫的分詞接口了,不過想試試用好一點的分詞程序的效果,所以選了2009共享版的。本人編程技術還是非常菜的,如有不對請大家指出。
分詞接口的代碼我是綜合這兩篇博客:http://ythzjk.javaeye.com/blog/334194,http://www.cnblogs.com/birdshover/archive/2009/04/03/1122305.html#1494633。
代碼都不長也不難,仔細閱讀後很容易把自己的分詞程序放進lucene。據我觀察似乎只要修改Tokenizer接口中next()方法就夠了。不過他們的代碼我覺得有個問題,每次初始化分詞程序都是在Tokenizer的構造方法中,這就影響了分詞效率,每分一句話(標題、正文)就要加載一次字典。於是我把加載詞典的部分寫成靜態的。
調用ICTCLAS30.dll的方法,其實只用到了幾個方法而已
using System;using System.Collections.Generic;
using System.Runtime.InteropServices;
namespace TestLucene
{
// Mirrors the native ICTCLAS result_t struct returned by ICTCLAS_ParagraphProcessAW.
// Explicit layout with 4-byte offsets must match the native C layout exactly —
// do not reorder fields or change offsets.
[StructLayout(LayoutKind.Explicit)]
public struct result_t
{
[FieldOffset(0)]
public int start;      // token start position (native units — byte-based; see tokenizer note on CJK offsets)
[FieldOffset(4)]
public int length;     // token length
[FieldOffset(8)]
public int sPos;
[FieldOffset(12)]
public int sPosLow;
[FieldOffset(16)]
public int POS_id;     // part-of-speech id
[FieldOffset(20)]
public int word_ID;    // dictionary word id
[FieldOffset(24)]
public int word_type;
[FieldOffset(28)]
public int weight; }   // word weight assigned by the segmenter
/// <summary>
/// Class1 的摘要說明。
/// </summary>
/// <summary>
/// P/Invoke bindings for the native ICTCLAS 2009 Chinese word-segmentation
/// library (ICTCLAS30.dll). Only the entry points this project uses are declared.
/// </summary>
public class ICTCLAS
{
    // Relative path: the native DLL must sit next to the executable.
    const string path = @"ICTCLAS30.dll";

    /// <summary>Initializes the segmenter; sInitDirPath is the directory holding its data files.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Init")]
    public static extern bool ICTCLAS_Init(String sInitDirPath);

    /// <summary>Segments a paragraph; bPOStagged != 0 appends /POS tags to each word.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcess")]
    public static extern String ICTCLAS_ParagraphProcess(String sParagraph, int bPOStagged);

    /// <summary>Releases the segmenter's resources.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_Exit")]
    public static extern bool ICTCLAS_Exit();

    /// <summary>Loads a user dictionary file; returns the number of imported entries.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ImportUserDict")]
    public static extern int ICTCLAS_ImportUserDict(String sFilename);

    /// <summary>Segments a whole file into a destination file.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcess")]
    public static extern bool ICTCLAS_FileProcess(String sSrcFilename, String sDestFilename, int bPOStagged);

    /// <summary>File segmentation variant (no POS-tag flag).</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_FileProcessEx")]
    public static extern bool ICTCLAS_FileProcessEx(String sSrcFilename, String sDestFilename);

    /// <summary>Counts the words a paragraph would segment into (for sizing the result_t buffer).</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_GetParagraphProcessAWordCount")]
    public static extern int ICTCLAS_GetParagraphProcessAWordCount(String sParagraph);

    /// <summary>Fills a caller-allocated result_t array for the paragraph last counted.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_ParagraphProcessAW")]
    public static extern void ICTCLAS_ParagraphProcessAW(int nCount, [Out, MarshalAs(UnmanagedType.LPArray)] result_t[] result);

    /// <summary>Adds a single word to the in-memory user dictionary.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_AddUserWord")]
    public static extern int ICTCLAS_AddUserWord(String sWord);

    /// <summary>Persists the in-memory user dictionary.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_SaveTheUsrDic")]
    public static extern int ICTCLAS_SaveTheUsrDic();

    /// <summary>Removes a word from the user dictionary.</summary>
    [DllImport(path, CharSet = CharSet.Ansi, EntryPoint = "ICTCLAS_DelUsrWord")]
    public static extern int ICTCLAS_DelUsrWord(String sWord);
}
}
ICTCLASAnalyzer:
using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using System.Collections;
namespace TestLucene
{
/// <summary>
/// Lucene.Net analyzer that segments Chinese text with ICTCLAS, then applies
/// StandardFilter, LowerCaseFilter and a stop-word filter.
/// </summary>
public class ICTCLASAnalyzer : Analyzer
{
    // Chinese + English stop words, one per line in the file at StopPath.
    public string[] CHINESE_ENGLISH_STOP_WORDS;
    // NOTE(review): hard-coded absolute path — consider making this configurable.
    public string StopPath = @"E:/MyCsProj/TestLucene/Stopwords.txt";

    /// <summary>
    /// Loads the stop-word list. Fixes two defects of the original version:
    /// the StreamReader was never disposed, and loading stopped at the first
    /// blank line instead of reading the whole file.
    /// </summary>
    public ICTCLASAnalyzer()
    {
        List<string> stopWords = new List<string>();
        using (StreamReader reader = new StreamReader(StopPath, System.Text.Encoding.UTF8))
        {
            string noise;
            while ((noise = reader.ReadLine()) != null)
            {
                if (!string.IsNullOrEmpty(noise))
                {
                    stopWords.Add(noise);
                }
            }
        }
        CHINESE_ENGLISH_STOP_WORDS = stopWords.ToArray();
    }

    /// <summary>
    /// Constructs an ICTCLASTokenizer filtered by a StandardFilter,
    /// a LowerCaseFilter and a StopFilter.
    /// </summary>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
        TokenStream result = new ICTCLASTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, CHINESE_ENGLISH_STOP_WORDS);
        return result;
    }
}
Tokenizer:
using System;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis;
using System.IO;
namespace TestLucene
{
/// <summary>
/// Lucene.Net tokenizer backed by the ICTCLAS segmenter. The whole input is read
/// and segmented once in the constructor; Next() then replays the tokens while
/// recovering their offsets in the original text via IndexOf.
/// </summary>
class ICTCLASTokenizer : Tokenizer
{
    //result_t[] result; // the native result_t offsets are byte-based and do not line up
                         // with .NET char positions for mixed CJK/ASCII text, so the
                         // tagged-string output is parsed instead.
    int startIndex = 0;  // start offset of the current token in `sentence`
    int endIndex = 0;    // end offset of the previous token; IndexOf resumes from here
    int i = 0;           // cursor into pResult
    string[] pResult;    // segmenter output, "word/POS"-style entries; null if init failed

    /// <summary>The text being tokenized.</summary>
    private string sentence;

    // The dictionary is loaded once per process (static initializers), not once per
    // tokenizer, so each analyzed field does not pay the initialization cost again.
    public static bool flag = ICTCLAS.ICTCLAS_Init(@"E:/MyCsProj/ttt/bin/Debug");
    public static int userdic = ICTCLAS.ICTCLAS_ImportUserDict("userdict.txt");

    /// <summary>Constructs a tokenizer for this Reader and segments its full content.</summary>
    public ICTCLASTokenizer(System.IO.TextReader reader)
    {
        this.input = reader;
        sentence = input.ReadToEnd();
        if (!flag)
        {
            System.Console.WriteLine("Init ICTCLAS failed!");
            return; // pResult stays null; Next() returns null immediately
        }
        string sResult = ICTCLAS.ICTCLAS_ParagraphProcess(sentence, 1);
        // Split the tagged result on the space that follows a /w (punctuation) tag.
        pResult = Regex.Split(sResult, @"(?<=/w) ");
    }

    /// <summary>
    /// Returns the next token in the stream, or null when the stream is exhausted
    /// (or initialization failed).
    /// </summary>
    public override Token Next()
    {
        if (pResult == null)
        {
            return null; // init failed — behave as an empty stream instead of throwing NRE
        }
        // The last element is the trailing fragment after the final /w tag; skip it
        // (preserves the original Length-1 bound).
        while (i < pResult.Length - 1)
        {
            string word = pResult[i].Split('/')[0];
            i++;
            // BUGFIX: the original pattern was "/s", which can never match here because
            // Split('/') already removed every '/'. The intent was to count whitespace
            // characters so they can be trimmed from the emitted token: that is @"\s".
            int space = Regex.Matches(word, @"\s").Count;
            startIndex = sentence.IndexOf(word, endIndex);
            if (startIndex < 0)
            {
                // Segmenter output not found in the source text (e.g. normalization by
                // the native library) — skip defensively instead of throwing on Substring.
                continue;
            }
            endIndex = startIndex + word.Length;
            return new Token(sentence.Substring(startIndex + space, word.Length - space),
                             startIndex + space, endIndex);
        }
        return null;
    }
}
}