Lucene2.0中使用基於詞典的中文分詞器建立索引

 

前一陣的分詞器寫好了,想用它建立索引,下面是具體代碼

package org.iceshirley.index;
import java.sql.*;
import java.io.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.document.*;
import org.iceshirley.MMChineseAnalyzer.*;
public class Index {

    
protected LinkToDb ltb=null;
    
private MMChineseAnalyzer analyzer=null;
    
public Index(){
        analyzer
=new MMChineseAnalyzer();
    }

    
public void createConnection() throws SQLException{
        String conurl
="jdbc:mysql://localhost:3306/searchdb?user=root&password=821210&useUnicode=true&characterEncoding=GBK";
        ltb
=new LinkToDb("com.mysql.jdbc.Driver",conurl);
        System.out.println(
"connection");
    }

    
public int getTableNum(){
        
int count=ltb.GetTableNum();
        
return count;
    }

    
public void close() throws SQLException{
        ltb.close();
    }

    
public void creatindex(String index,int count) throws IOException{
        
//int count=0;
        Directory dir=FSDirectory.getDirectory(index,true);
        
//new org.apache.lucene.analysis.standard.StandardAnalyzer()
        IndexWriter writer=new IndexWriter(dir,analyzer,true);
        writer.setMergeFactor(
100);
        writer.setUseCompoundFile(
true);
        ResultSet rs
=ltb.GetResult();    
        
try{
            
while(rs.next()){
                Document doc
=new Document();
                String url
=rs.getString("url");
                String title
=rs.getString("title");
                String text
=rs.getString("text");
                String date
=rs.getString("date");
                String encode
=rs.getString("encode");
                doc.add(
new Field("url",url,Field.Store.YES,Field.Index.NO,Field.TermVector.NO));
                doc.add(
new Field("title",title,Field.Store.YES,Field.Index.TOKENIZED,Field.TermVector.NO));
                doc.add(
new Field("text",text,Field.Store.COMPRESS,Field.Index.TOKENIZED,Field.TermVector.WITH_POSITIONS_OFFSETS));
                doc.add(
new Field("date",date,Field.Store.YES,Field.Index.NO,Field.TermVector.NO));
                doc.add(
new Field("encode",encode,Field.Store.YES,Field.Index.NO,Field.TermVector.NO));
                
//----------------test--------------//
                /*
                java.util.Enumeration fields=doc.fields();
                while(fields.hasMoreElements()){
                    Field field=(Field)fields.nextElement();
                    Reader reader=null;
                    if (field.readerValue() != null){
                        reader = field.readerValue();
                        System.out.println("1");
                     }else if (field.stringValue() != null){
                        reader = new StringReader(field.stringValue());
                        System.out.println("12");
                     }
                    org.apache.lucene.analysis.TokenStream ts=analyzer.tokenStream("test",reader);
                    org.apache.lucene.analysis.Token token=null;
                    while((token=ts.next())!=null){
                       System.out.println(token.termText());
                    }
                }
                System.out.println("----------------------------------------");
*/

                
//--------------------------------//
                try{
                    writer.addDocument(doc);
                    
//System.out.println(doc.getField("url")+" has beed saved to index");
                }
catch(Exception e){
                    e.printStackTrace();
                    
//System.out.println(e.getMessage());
                    
//System.out.println("can not add doc");
                }

            
        
            }
 
            writer.optimize();
            writer.close();
            System.out.println(
"completed");
        }

        
catch(SQLException e){
            System.out.println(
"error "+e.getMessage());
        }

    }

    
    
public  static void main(String[] args)throws IOException,SQLException{
        String indexdir
="c:/indexdir";
        
int count=0;
        Index ci
=new Index();
        
try{
            ci.createConnection();
            count
=ci.getTableNum();
            
if(count<1){
                System.out.println(
"no record in the database");
            }

            
else{
                ci.creatindex(indexdir,count);
            }

        }

        
catch(SQLException e){
            System.out.println(e.getMessage());
        }

        ci.close();
    }


}

// Database connection helper (連接數據庫): wraps the JDBC connection and the
// two prepared queries against the complete_queue table.
class LinkToDb {
    /** JDBC connection shared by the two prepared statements below. */
    protected Connection con;
    /** Counts the rows in complete_queue. */
    protected PreparedStatement preCount;
    /** Fetches every row of complete_queue. */
    protected PreparedStatement preSelect;

    /**
     * Loads the JDBC driver, opens the connection, and prepares both queries.
     * On failure the fields stay null and the accessors degrade gracefully
     * (GetTableNum() returns 0, GetResult() returns null).
     *
     * @param driver fully-qualified JDBC driver class name
     * @param sqlurl JDBC connection URL
     */
    LinkToDb(String driver, String sqlurl) {
        try {
            Class.forName(driver);
            con = DriverManager.getConnection(sqlurl);
            preCount = con.prepareStatement("SELECT count(*) as qty FROM complete_queue;");
            preSelect = con.prepareStatement("SELECT * FROM complete_queue;");
        } catch (Exception e) {
            // FIX: the original printed only e.getMessage(), which is null for
            // ClassNotFoundException — print the full trace instead.
            e.printStackTrace();
        }
    }

    /** @return the row count of complete_queue, or 0 if the query fails. */
    public int GetTableNum() {
        int count = 0;
        ResultSet rs = null;
        try {
            rs = preCount.executeQuery();
            // FIX: check next() instead of assuming a row exists.
            if (rs.next()) {
                count = rs.getInt("qty");
            }
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            // FIX: the original never closed this ResultSet.
            if (rs != null) {
                try {
                    rs.close();
                } catch (SQLException ignored) {
                    // best effort cleanup
                }
            }
        }
        return count;
    }

    /**
     * @return a ResultSet over all rows of complete_queue, or null on failure;
     *         the caller is responsible for closing it
     */
    public ResultSet GetResult() {
        ResultSet rs = null;
        try {
            rs = preSelect.executeQuery();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        return rs;
    }

    /** Closes the prepared statements and the underlying connection. */
    public void close() throws SQLException {
        // FIX: release the statements too, and guard against a failed
        // constructor having left any of these null (original NPE'd).
        if (preCount != null) {
            preCount.close();
        }
        if (preSelect != null) {
            preSelect.close();
        }
        if (con != null) {
            con.close();
        }
    }
}

 lucene2.0中,Field字段有很大的改變,原先的Field.text等方法不再存在,取而代之的是直接使用Field的構造函數,共有5種構造函數

Field(String name, byte[] value, Field.Store store)
Field(String name, Reader reader)
Field(String name, Reader reader, Field.TermVector termVector)
Field (String name, String value, Field.Store store, Field.Index index)
Field (String name, String value, Field.Store store, Field.Index index, Field.TermVector termVector)

在Field中有三個內部類:Field.Index,Field.Store,Field.TermVector,而構造函數也用到了它們。TermVector是Lucene 1.4新增的,它提供一種向量機制來進行模糊查詢,這個不常用。它們的不同的組合,在全文檢索中有着不同的作用。

| Field.Index | Field.Store | 說明 |
| --- | --- | --- |
| TOKENIZED(分詞) | YES | 被分詞索引且存儲 |
| TOKENIZED | NO | 被分詞索引但不存儲 |
| NO | YES | 這是不能被搜索的,它只是被搜索內容的附屬物。如URL等 |
| UN_TOKENIZED | YES/NO | 不被分詞,它作爲一個整體被搜索,搜一部分是搜不出來的 |
| NO | NO | 沒有這種用法 |

可以根據自己的需要,來決定你使用哪個方法構造Field
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章