說白了就是兩個函數:一個建立索引(寫),另一個來查找(讀),所以涉及到 Java IO 的一些知識。
import java.io.*;
import java.nio.file.Paths;
import java.util.Date;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
* This class demonstrates the process of creating an index with Lucene
* for plain-text files.
*/
public class TxtFileIndexer {
    /**
     * Walks the files directly under a hard-coded data directory and adds every
     * {@code .txt} file to a Lucene index stored on disk.
     *
     * Fields per document: stored "path", tokenized (unstored) "content",
     * stored "fileSize", stored+tokenized "filename".
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or a file cannot be read
     */
    public static void main(String[] args) throws Exception {
        // indexDir is the directory that hosts Lucene's index files.
        Directory indexDir = FSDirectory.open(Paths.get("G:\\luceneout"));
        // dataDir is the directory that hosts the text files to be indexed.
        File dataDir = new File("G:\\downloads\\LJParser_release\\LJParser_Packet\\訓練分類用文本\\交通");
        Analyzer luceneAnalyzer = new StandardAnalyzer(); // analyzer used for tokenizing content
        IndexWriterConfig config = new IndexWriterConfig(luceneAnalyzer);

        // listFiles() returns null when the directory does not exist or is not
        // readable; the original code would have thrown an NPE in the loop.
        File[] dataFiles = dataDir.listFiles();
        if (dataFiles == null) {
            System.err.println("Cannot list directory: " + dataDir.getPath());
            return;
        }

        long startTime = System.currentTimeMillis();
        // try-with-resources guarantees the writer is closed (and the index
        // left consistent) even if addDocument throws mid-loop.
        try (IndexWriter indexWriter = new IndexWriter(indexDir, config)) {
            for (File dataFile : dataFiles) {
                if (dataFile.isFile() && dataFile.getName().endsWith(".txt")) {
                    System.out.println("Indexing file " + dataFile.getCanonicalPath());
                    // Each file becomes one Document.
                    Document document = new Document();
                    // NOTE(review): FileReader uses the platform default charset;
                    // if the training files are not in that encoding, content will
                    // be mojibake — confirm the files' encoding before changing.
                    // The Reader is consumed by the writer during addDocument; a
                    // Reader-based TextField is tokenized but NOT stored.
                    Reader txtReader = new FileReader(dataFile);
                    document.add(new StringField("path", dataFile.getPath(), Store.YES));
                    document.add(new TextField("content", txtReader));
                    document.add(new LongField("fileSize", dataFile.length(), Store.YES));
                    document.add(new TextField("filename", dataFile.getName(), Store.YES));
                    indexWriter.addDocument(document); // write one document into the index
                }
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("It takes " + (endTime - startTime)
                + " milliseconds to create index for the files in directory "
                + dataDir.getPath());
    }
}
讀取索引並查找
import java.io.File;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
/**
* This class is used to demonstrate the
* process of searching on an existing
* Lucene index
*
*/
public class TxtFileSearcher {
    /**
     * Opens the on-disk index created by TxtFileIndexer, runs a hard-coded
     * query against the "content" field, and prints the top 100 hits.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        // Directory that stores the index files.
        Directory indexDir = FSDirectory.open(Paths.get("G:\\luceneout"));
        // try-with-resources: the reader (and via it the directory handles) is
        // closed deterministically; the original leaked both on every run.
        try (DirectoryReader ireader = DirectoryReader.open(indexDir)) {
            IndexSearcher searcher = new IndexSearcher(ireader);
            // Target query string.
            String queryStr = "大數據挖掘";
            // Build a parser over the "content" field using the same analyzer
            // family the indexer used, then fetch at most 100 hits.
            QueryParser parser = new QueryParser("content", new StandardAnalyzer());
            Query query = parser.parse(queryStr);
            TopDocs docs = searcher.search(query, 100);
            System.out.print("一共搜索到結果:" + docs.totalHits + "條");
            // Print per-hit information.
            for (ScoreDoc scoreDoc : docs.scoreDocs) {
                System.out.print("序號爲:" + scoreDoc.doc);
                System.out.print("評分爲:" + scoreDoc.score);
                Document document = searcher.doc(scoreDoc.doc);
                System.out.print("路徑爲:" + document.get("path"));
                // NOTE(review): "content" was indexed from a Reader and never
                // stored, so this always prints null — store the text (or drop
                // this line) if the content should be displayed.
                System.out.print("內容爲" + document.get("content"));
                System.out.print("文件大小爲" + document.get("fileSize"));
                System.out.print("文件名爲" + document.get("filename"));
                System.out.println();
            }
        }
    }
}
運行結果
下面是文件目錄
兩個類都需要用到同一種分詞器:前者在寫入索引時用它配置 IndexWriter,後者則用它構造詞法分析器(QueryParser)來解析查詢。