Lucene的練習

今天看了看Lucene，其實好早以前就通過買的參考書學習過了。現在又重溫一下。

截至2014年底，Lucene最新的版本是4.10。但是4.8及以上的版本至少需要JDK1.7，4.0的版本則支持JDK1.6。只是4.0版本可用的中文分詞器都不太好，所以這裏還是介紹一下常用的3.5版本吧。

3.5版本搭配的分詞器是IKAnalyzer（第三方中文分詞器）。這裏用到的IKAnalyzer版本在Lucene 4.0上已經不支持了。

還是先說JAR文件吧，工程中使用的JAR文件：lucene-core-3.5.0.jar、lucene-analyzers-3.5.0.jar、lucene-highlighter-3.5.0.jar、IKAnalyzer3.2.5Stable.jar，其中包括核心包、分詞包和高亮包。

工程中寫了兩個類，一個是創建索引的類，一個是搜索的類：

創建索引：

package com.zyujie.lucene;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Builds the demo city index with Lucene 3.5 and the IKAnalyzer Chinese tokenizer.
 */
public class CreateIndexFile {

	/** Directory that holds the index files. */
	private static final String INDEX_DIR = "D:\\lucene\\cityDemo\\cityIndex";

	/**
	 * Rebuilds the city index: clears the index directory, then writes one
	 * document per city returned by {@link #getCityDatas()}.
	 * Failures while opening the index or adding documents are printed to
	 * stderr and not rethrown.
	 *
	 * @throws Exception if closing the index writer fails
	 */
	public void createIndex() throws Exception {
		// Remove the old index files before rebuilding.
		this.deleteIndexFile(INDEX_DIR);
		File indexFile = new File(INDEX_DIR);
		Analyzer analyzer = new IKAnalyzer();	// IKAnalyzer provides Chinese word segmentation
		IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, analyzer);
		iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(FSDirectory.open(indexFile), iwc);
			for (Document doc : this.getCityDatas()) {
				indexWriter.addDocument(doc);
			}
			System.out.println("索引已經創建！");
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// The writer is still null when opening the index failed; closing it
			// unconditionally would raise a NullPointerException from this block.
			if (indexWriter != null) {
				indexWriter.close();
			}
		}
	}

	/**
	 * Deletes every entry directly inside the given index directory.
	 * A path that does not exist or is not a directory is treated as already empty.
	 *
	 * @param indexFilePath path of the index directory
	 */
	public void deleteIndexFile(String indexFilePath) {
		File indexDir = new File(indexFilePath);
		// File.list() returns null when the path is not a directory or cannot be read.
		String[] names = indexDir.list();
		if (names != null) {
			for (String name : names) {
				// Resolve against the directory rather than concatenating a "\\" separator.
				File entry = new File(indexDir, name);
				if (!entry.delete()) {
					System.err.println("Could not delete index file: " + entry.getPath());
				}
			}
		}
		System.out.println("已清空索引文件");
	}

	/**
	 * Creates the sample data set.
	 *
	 * @return one document per city, in insertion order
	 */
	public List<Document> getCityDatas() {
		List<Document> cityList = new ArrayList<Document>();
		cityList.add(newCity("010", "北京", "華北地區", "2114.8萬",
				"中華人民共和國的首都、直轄市和國家中心城市，也是中國的政治、文化、科教和國際交往中心"));
		cityList.add(newCity("021", "上海", "華東地區", "2500萬",
				"上海（Shanghai），簡稱“滬”，有“東方巴黎”的美稱，中華人民共和國直轄市之一"));
		cityList.add(newCity("020", "廣州", "華南地區", "1292.68萬",
				"廣州，簡稱穗，別稱羊城、花城，廣東省省會，位於廣東省中南部"));
		cityList.add(newCity("022", "天津", "華北地區", "11946.88萬",
				"天津，簡稱津，中華人民共和國直轄市、中國國家中心城市、中國北方經濟中心"));
		cityList.add(newCity("028", "成都", "西南地區", "1417萬",
				"成都，四川省省會[1] ，成都位於中國華西地區東部，成都平原腹地，古爲蜀國地"));
		cityList.add(newCity("023", "重慶", "西南地區", "2970萬",
				"重慶，簡稱巴和渝，別稱山城、渝都、橋都，中華人民共和國直轄市"));
		return cityList;
	}

	/**
	 * Builds one city document. All fields are stored; "id" and "population"
	 * are indexed without analysis, the remaining fields are analyzed.
	 */
	private static Document newCity(String id, String name, String area,
			String population, String description) {
		Document doc = new Document();
		doc.add(new Field("id", id, Store.YES, Index.NOT_ANALYZED));
		doc.add(new Field("name", name, Store.YES, Index.ANALYZED));
		doc.add(new Field("area", area, Store.YES, Index.ANALYZED));
		doc.add(new Field("population", population, Store.YES, Index.NOT_ANALYZED));
		doc.add(new Field("description", description, Store.YES, Index.ANALYZED));
		return doc;
	}
}

搜索類：

package com.zyujie.lucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
import org.wltea.analyzer.lucene.IKSimilarity;

public class SearchCity {

	public void searchCityInfo(String queryStr){
		String filePath = "D:\\lucene\\cityDemo\\cityIndex";	//索引文件夾目錄
		String[] fields = { "id", "name", "area", "description" };	//允許搜索的字段
		
		File indexFile = new File(filePath);	//索引文件存放的目錄
		Analyzer analyzer = new IKAnalyzer();	//IKAnalyzer支持中文分詞器
		
		IndexReader indexReader = null;
		IndexSearcher indexSearcher = null;
		try {
			QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer);	//構造查詢器用來搜索多個字段
                        //PrefixQuery prefixQuery = new PrefixQuery(new Term("name","成")); //與查詢前綴匹配的查詢，suggest功能
                        Query query = queryParser.parse(queryStr);
			indexReader = IndexReader.open(FSDirectory.open(indexFile));	//indexReader讀取索引目錄
			indexSearcher = new IndexSearcher(indexReader);	//實例化搜索對象
			indexSearcher.setSimilarity(new IKSimilarity());	//設置匹配實現

			TopDocs docs = indexSearcher.search(query, 10);	//搜索返回的條目數，關鍵詞包括最多的前10條
			int totalHits = docs.totalHits;	//匹配的結果總數
			System.out.println("total : " + totalHits);

			//爲允許搜索的字段，設置高亮，html標籤
			Formatter formatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
			Scorer scorer = new QueryScorer(query);
			Highlighter highlighter = new Highlighter(formatter, scorer);
			
			//複雜的很長的文本字段，需要截取部分來顯示，設置片斷邊界大小，可以保證分詞的完整性
			Fragmenter fragmenter = new SimpleFragmenter(50);
			highlighter.setTextFragmenter(fragmenter);
			
			for (ScoreDoc doc : docs.scoreDocs) {
				Document document = indexSearcher.doc(doc.doc);	//從符合結果的集合中取出每個文檔對象
				
				System.out.println("城市區號：-->" + document.get("id"));	//通過key得到值
				//返回高亮後的結果，如果當前屬性值沒有出現關鍵字則出現null
				String hc_name = highlighter.getBestFragment(analyzer, "name", document.get("name"));	//需要高亮顯示的字段
				if (hc_name == null) {
					String description = document.get("name");
					int endIndex = Math.min(50, description.length());	//返回較小的值
					hc_name = description.substring(0, endIndex);// 最多前50個字符
				}
				System.out.println("城市名稱：-->" + hc_name);
				System.out.println("所在區域：-->" + document.get("area"));
				System.out.println("人口：-->" + document.get("population"));
//				System.out.println("簡介：-->" + document.get("description"));
				
				//返回高亮後的結果，如果當前屬性值沒有出現關鍵字則出現null
				String hc_description = highlighter.getBestFragment(analyzer, "description", document.get("description"));
				if (hc_description == null) {
					String description = document.get("description");
					int endIndex = Math.min(50, description.length());
					hc_description = description.substring(0, endIndex);
				}
				System.out.println("簡介：-->" + hc_description);
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// 使用完畢需要關閉！
			try {
				indexReader.close();
				indexSearcher.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
}

寫了一個main方法去調用：

/**
 * Demo driver: searches the existing city index for "上海" and prints the hits.
 */
public static void main(String[] args) throws Exception {
	// To (re)build the index first, run once:
	//   new CreateIndexFile().createIndex();
	new SearchCity().searchCityInfo("上海");
}

運行結果：

total : 1
城市區號：-->021
城市名稱：--><font color='red'>上海</font>
所在區域：-->華東地區
人口：-->2500萬
簡介：--><font color='red'>上海</font>（Shanghai），簡稱“滬”，有“東方巴黎”的美稱，中華人民共和國直轄市之一

以上只是Lucene3.5簡單的應用，還有一些更多功能，可參見demo和api。

Velocity的練習

tomcat配置https訪問系統

Java性能優化的一些小技巧

Java web工程判斷用戶是否重複登錄

Quartz整合Spring的練習

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結