Apache Lucene in Practice

A project at work needed full-text search built on Lucene, so I read through Lucene in Action (skipping the analyzer-implementation parts, since in most cases you don't need to write your own analyzer and there are plenty of open-source ones), put Lucene to use, and the feature is now live in production. Here I'm sharing my experience for anyone who can use it~

First, a quick description of what Lucene is.

Lucene is a sub-project of the Apache Software Foundation's Jakarta project: an open-source full-text search engine toolkit. It is not a complete full-text search engine but rather the architecture for one, providing a complete query engine and indexing engine plus part of a text-analysis engine. Lucene's goal is to give developers a simple, easy-to-use toolkit for adding full-text search to a target system, or for building a complete search engine on top of it. (Here I only added full-text search to an existing system.)

Lucene is an open-source library for full-text indexing and search, supported and provided by the Apache Software Foundation. It offers a simple yet powerful API for building and searching full-text indexes. In the Java world, Lucene is a mature, free, open-source tool, and it has been the most popular free Java information-retrieval library for years. One note: an information-retrieval library, while related to search engines, should not be confused with one.

OK, enough rambling; let's get to the point~

(reference diagram)

First, pull in the Lucene packages. The Maven dependencies are below (I used the newest version available at the time, since recent versions optimize the index format and somewhat reduce disk overhead).

<!-- Lucene -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>4.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>4.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-highlighter</artifactId>
    <version>4.9.0</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-common</artifactId>
    <version>4.9.0</version>
</dependency>
<!-- Lucene -->

To start, specify the path of the Lucene index directory (I recommend keeping it in a configuration file):

# lucene index path
batch.lucene.index.path=/home1/irteam/owfs/lucene_index

When building the index, the JavaBean objects fetched from the database have to be converted into Lucene Document objects, so I wrote a utility class that wraps the common Lucene operations for reuse.
The key code is below for reference.

@Service
public class IndexUtil {

	@Value("${batch.lucene.index.path}")
	private String indexPath;
    // convert a Java object into a Lucene Document
	public Document toDocument(IndexMstr index, Document oldDoc) {
		Document doc = new Document();
		if (StringUtils.isNotBlank(index.getTaskId())) {// string type (the common case); StringField is indexed as one un-analyzed token
			doc.add(new StringField("taskId", index.getTaskId(), Store.YES));
		} else if (StringUtils.isNotBlank(oldDoc.get("taskId"))) {
			doc.add(new StringField("taskId", oldDoc.get("taskId"), Store.YES));
		}
		if (StringUtils.isNotBlank(index.getSummary())) {// text type; TextField content is analyzed into tokens
			doc.add(new TextField("summary", index.getSummary(), Store.YES));
		} else if (StringUtils.isNotBlank(oldDoc.get("summary"))) {
			doc.add(new TextField("summary", oldDoc.get("summary"), Store.YES));
		}
		if (index.getRegistDate() != null) {// date type: Lucene has no date field, so the timestamp is stored as a long and converted back for display
			doc.add(new LongField("registDate", index.getRegistDate().getTime(), Store.YES));
		} else if (StringUtils.isNotBlank(oldDoc.get("registDate"))) {
			doc.add(new LongField("registDate", Long.parseLong(oldDoc.get("registDate")), Store.YES));
		}
		if (StringUtils.isNotBlank(index.getSyncIndexType())) {// Lucene has no one-to-many relations, but fields from several related tables are needed as search conditions, so the related values are joined with spaces into one field (note: this shortcut is fine because the field is only a search condition; if the data also needs to be read back, store the List as JSON instead)
			if (StringUtils.isNotBlank(index.getAnalysisNmArr())) {
				doc.add(new StringField("analysisNmArr", index.getAnalysisNmArr(), Store.YES));
			}
			if (StringUtils.isNotBlank(index.getAnalysisConArr())) {
				doc.add(new StringField("analysisConArr", index.getAnalysisConArr(), Store.YES));
			}
		} else {
			List<AnalysisIndexMstr> analysisIndexList = index.getAnalysisIndexList();
			if (CollectionUtils.isNotEmpty(analysisIndexList)) {// in-depth analysis fields: analysisNmArr, analysisConArr
				StringBuilder analysisNmSB = new StringBuilder();
				StringBuilder analysisConSB = new StringBuilder();
				for (AnalysisIndexMstr analysisIndex : analysisIndexList) {
					if (analysisIndex != null) {
						if (StringUtils.isNotBlank(analysisIndex.getAnalysisNm())) {
							analysisNmSB.append(SOCUtil.html2Text(analysisIndex.getAnalysisNm().trim()) + " ");
						}
						if (StringUtils.isNotBlank(analysisIndex.getAnalysisCon())) {
							analysisConSB.append(SOCUtil.html2Text(analysisIndex.getAnalysisCon()) + " ");
						}
					}
				}
				if (StringUtils.isNotBlank(analysisNmSB.toString())) {
					doc.add(new StringField("analysisNmArr", SOCUtil.html2Text(analysisNmSB.toString().trim()), Store.YES));
				}
				if (StringUtils.isNotBlank(analysisConSB.toString())) {
					doc.add(new TextField("analysisConArr", SOCUtil.html2Text(analysisConSB.toString().trim()), Store.YES));
				}
			}
		}
		return doc;
	}
    // batch-convert Java objects into Lucene Documents
	public List<Document> toDocuments(List<IndexMstr> indexMstrList) {
		List<Document> docList = new ArrayList<Document>();
		for (IndexMstr index : indexMstrList) {
			docList.add(toDocument(index, new Document()));
		}
		return docList;
	}
	// paged Lucene query: convert one page of hits back into Java objects
	public List<IndexMstr> toObjectByPage(IndexSearcher indexSearcher, TopFieldDocs topDocs, PagerInfo pagerInfo, IndexParam indexParam) throws IOException {
		if (StringUtils.isEmpty(indexParam.getPage())) {
			indexParam.setPage(SocConstant.PAGE_ONE);
		}
		if (StringUtils.isEmpty(indexParam.getPageSize())) {
			indexParam.setPageSize(String.valueOf(SocConstant.PAGE_SIZE));
		}
		SimpleDateFormat sdf = new SimpleDateFormat(DateUtil.DATE_FULL_PATTERN_DB_KOREAN);
		List<IndexMstr> indexMstrList = new ArrayList<IndexMstr>();
		ScoreDoc[] scoreDocs = topDocs.scoreDocs;
		pagerInfo.setPage(Integer.parseInt(indexParam.getPage()));
		pagerInfo.init(SocConstant.PAGE_AJAX_DEFAULT_TYPE, scoreDocs.length,
			Integer.parseInt(indexParam.getPageSize()),
			SocConstant.INDEX_SIZE);
		indexParam.setStart(pagerInfo.getStartRownum() - 1);
		indexParam.setEnd(pagerInfo.getEndRownum());
		for (int i = indexParam.getStart(); i < Math.min(indexParam.getEnd(), scoreDocs.length); i++) {// never step past the actual hit count
			int docId = scoreDocs[i].doc;
			Document doc = indexSearcher.doc(docId);
			IndexMstr indexMstr = new IndexMstr();
			indexMstr.setTaskId(doc.get("taskId"));
			indexMstr.setTicketId(doc.get("ticketId"));
			//use highlighter for ticketNm
			String ticketNm = StringUtils.isBlank(indexParam.getSearch()) ? doc.get("ticketNm") : toHighlighter(indexParam.getSearch(), doc.get("ticketNm"));
			indexMstr.setTicketNm(ticketNm);
			//use highlighter for taskNm
			String taskNm = StringUtils.isBlank(indexParam.getSearch()) ? doc.get("taskNm") : toHighlighter(indexParam.getSearch(), doc.get("taskNm"));
			indexMstr.setTaskNm(taskNm);
			indexMstr.setEmpNm(doc.get("empNm"));
			indexMstr.setCorpNm(doc.get("corpNm"));
			indexMstr.setTaskIndexType(TaskIndexType.instanceOf(doc.get("taskTypeId")));
			if (StringUtils.isNotBlank(doc.get("registDate"))) {
				indexMstr.setRegistDateStr(sdf.format(new Date(Long.parseLong(doc.get("registDate")))));
			}
			indexMstrList.add(indexMstr);
		}
		return indexMstrList;
	}
	// custom highlighting: Lucene's Highlighter works on the analyzed tokens, but here the whole search string has to be matched and highlighted as one unit, so I rolled my own (if you read the Highlighter source, you'll find it does something quite similar)
	private String toHighlighter(String search, String fieldValue) {
		if (StringUtils.isBlank(fieldValue)) {
			return fieldValue;
		}
		// Pattern.quote / Matcher.quoteReplacement keep regex metacharacters in user input from breaking replaceAll
		return fieldValue.replaceAll("(?i)" + Pattern.quote(search),
				Matcher.quoteReplacement("<font color='#FF0000'>" + search + "</font>"));
	}

}
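For comparison, below is roughly what stock lucene-highlighter usage looks like; unlike toHighlighter above, it highlights the analyzed tokens rather than the literal search string. This is only a sketch: query, doc, and the analyzer are assumed to be in scope, and the field name is illustrative.

import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.util.Version;

// wrap every token that matches the query; throws IOException / InvalidTokenOffsetsException
Highlighter highlighter = new Highlighter(
		new SimpleHTMLFormatter("<font color='#FF0000'>", "</font>"),
		new QueryScorer(query));
String fragment = highlighter.getBestFragment(
		new CJKAnalyzer(Version.LUCENE_4_9), "taskNm", doc.get("taskNm"));
// getBestFragment returns null when nothing in the field text matches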

 

The init sync batch from the reference diagram is implemented as follows.
It indexes the accumulated backlog of data, and because the backlog is large, the work is split into batches so the DB queries don't time out.
(Creating several IndexWriter instances that operate on the same index directory at once throws an error, because Lucene holds a write lock, so it's best to hand the IndexWriter to Spring to manage; a sketch follows.)
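A minimal sketch of that Spring-managed writer, assuming Spring Java config (the class name LuceneConfig is mine; the property key matches the config above):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class LuceneConfig {

	@Value("${batch.lucene.index.path}")
	private String indexPath;

	// one shared writer for the whole application: Lucene allows a single
	// IndexWriter per index directory (it holds the write lock); Spring
	// closes it on shutdown via destroyMethod
	@Bean(destroyMethod = "close")
	public IndexWriter indexWriter() throws IOException {
		FSDirectory directory = FSDirectory.open(new File(indexPath));
		IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_4_9,
				new CJKAnalyzer(Version.LUCENE_4_9));
		return new IndexWriter(directory, conf);
	}
}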

@Override
	public void execute(ExtendedMap jobDataMap) {
		log.info("[IndexRsyncJob] create lucene index job start........");
		File indexFile = new File(indexPath);
		initIndexFile(indexFile);
		CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_4_9);
		IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
		IndexWriter indexWriter = null;// declared outside the try block so finally can close it
		try {
			FSDirectory directory = FSDirectory.open(indexFile);
			indexWriter = new IndexWriter(directory, conf);
			System.err.println("create lucene index start...................." + DateUtil.getFullDate());
			taskIndexBO.createTaskIndex(TaskIndexType.TASK_INTRUSION.getCode(), 200, indexFile, indexWriter);//create intrusion task lucene index
			taskIndexBO.createTaskIndex(TaskIndexType.TASK_INCIDENT.getCode(), 200, indexFile, indexWriter);//create incident task lucene index
			taskIndexBO.createTaskIndex(TaskIndexType.TASK_REQUEST.getCode(), 200, indexFile, indexWriter);//create request task lucene index
			System.err.println("create lucene index end...................." + DateUtil.getFullDate());
		} catch (IOException e) {
			log.error("[IndexRsyncJob] create lucene index error:" + e.getMessage());
		} finally {
			try {
				if (indexWriter != null) {
					indexWriter.close();
				}
			} catch (IOException e) {
				log.error("[IndexRsyncJob] close lucene IndexWriter error:" + e.getMessage());
			}
		}
	}
	public void createTaskIndex(String taskTypeId, int synCnt, File indexFile, IndexWriter indexWriter) {
		int num = 0;
		try{
			IndexParam indexParam = new IndexParam();
			indexParam.setTaskTypeId(taskTypeId);
			int taskTotal = taskIndexDAO.selectTaskCnt(indexParam);
			for (int i = num; i < taskTotal; i += synCnt) {
				num = i;
				indexParam.setStart(i);
				indexParam.setEnd(synCnt);
				List<IndexMstr> taskIndexList = taskIndexDAO.selectTaskList(indexParam);
				List<Document> docList = new IndexUtil().toDocuments(taskIndexList);
				indexWriter.addDocuments(docList);
				indexWriter.commit();
				log.info("[task(total:" + taskTotal + ")] create end by count >>>>>>>>> " + i);
			}
		} catch (Exception e) {
			log.error("[TaskIndexBO] create lucene index error:" + e.getMessage());
			// naive retry: this restarts from offset 0, so a persistent failure recurses forever
			// and rows indexed before the error are added again; resuming from num would be safer
			createTaskIndex(taskTypeId, synCnt, indexFile, indexWriter);
		}
	}

The incremental sync batch from the reference diagram is implemented as follows.
Every time the system changes one of the related tables, it writes the changed rows into a temporary index table; a batch job that runs every 10 minutes then applies those rows to the Lucene index incrementally.

public void execute(ExtendedMap jobDataMap) {
		log.info("[IndexRsyncDBJob] create lucene index job start........");
		File indexFile = new File(indexPath);
		indexBO.createIndex(indexFile);
		log.info("[IndexRsyncDBJob] create lucene index job end........");
	}
	public void createIndex(File indexFile) {
		List<Integer> indexIdList = new ArrayList<Integer>();
		List<IndexMstr> indexList = indexDAO.selectIndexList();
		CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_4_9);
		IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
		try {
			FSDirectory directory = FSDirectory.open(indexFile);
			indexWriter = new IndexWriter(directory, conf);// indexWriter and indexReader are assumed to be instance fields of this BO
			for (IndexMstr indexMstr : indexList) {
				try {
					indexReader = DirectoryReader.open(directory);// reopen per record so each lookup sees the commits from earlier iterations
					IndexSearcher indexSearcher = new IndexSearcher(indexReader);
					if ("A".equalsIgnoreCase(indexMstr.getSyncIndexType())) {
						TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("taskId", indexMstr.getTaskId())), Integer.MAX_VALUE);
						if (topDocs.totalHits == 0) {
							Document doc = indexUtil.toDocument(indexMstr, new Document());
							indexWriter.addDocument(doc);
						}
					} else if ("D".equalsIgnoreCase(indexMstr.getSyncIndexType())) {
						indexWriter.deleteDocuments(new Term("taskId", indexMstr.getTaskId()));
					} else if ("U".equalsIgnoreCase(indexMstr.getSyncIndexType())) {
						TopDocs topDocs = indexSearcher.search(new TermQuery(new Term("taskId", indexMstr.getTaskId())), Integer.MAX_VALUE);
						if (topDocs.totalHits > 0) {
							Document oldDoc = indexSearcher.doc(topDocs.scoreDocs[0].doc);
							Document doc = indexUtil.toDocument(indexMstr, oldDoc);
							indexWriter.updateDocument(new Term("taskId", indexMstr.getTaskId()), doc);
						}
					}
					indexWriter.commit();
					indexIdList.add(indexMstr.getIndexId());
				} catch (IOException e) {
					log.error("[IndexBO] before create lucene index for db error:" + e.getMessage());
				} finally {
					try {
						if (indexReader != null) {
							indexReader.close();
						}
					} catch (IOException e) {
						log.error("[IndexBO] close IndexReader error : " + e.getMessage());
					}
				}
			}
		} catch (Exception e) {
			log.error("[IndexBO] after create lucene index for db error:" + e.getMessage());
		} finally {
			try {
				if (indexWriter != null) {
					indexWriter.close();
				}
			} catch (IOException e) {
				log.error("[IndexBO] close IndexWriter error : " + e.getMessage());
			}
		}
		if (CollectionUtils.isNotEmpty(indexIdList)) {
			indexDAO.deleteIndexs(indexIdList);
		}
	}
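One detail worth calling out in the update branch: IndexWriter.updateDocument is an atomic delete-then-add, so the replacement document must carry every field, not just the changed ones; that is why toDocument merges values from the old stored document. Conceptually:

// updateDocument(term, doc) behaves like the two calls below, but atomically:
// indexWriter.deleteDocuments(new Term("taskId", indexMstr.getTaskId()));
// indexWriter.addDocument(doc);
indexWriter.updateDocument(new Term("taskId", indexMstr.getTaskId()), doc);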

That covers the most troublesome parts: building the index and keeping the database and the index in sync. Now for the search side. You can combine multiple Query objects with Occur logic to build complex queries, and Lucene also accepts wildcards in search terms. The Occur values read as boolean operators: Occur.MUST means AND, Occur.SHOULD means OR, and Occur.MUST_NOT means NOT; see the sketch right below.
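A minimal sketch of how the three Occur values combine (Lucene 4.9 API; the field names and values are illustrative):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;

// (taskNm contains "lucene" OR "search") AND corpId = "C001" AND NOT taskTypeId = "D"
BooleanQuery query = new BooleanQuery();

BooleanQuery keyword = new BooleanQuery();
keyword.add(new WildcardQuery(new Term("taskNm", "*lucene*")), Occur.SHOULD); // OR
keyword.add(new WildcardQuery(new Term("taskNm", "*search*")), Occur.SHOULD); // OR

query.add(keyword, Occur.MUST);                                        // AND
query.add(new TermQuery(new Term("corpId", "C001")), Occur.MUST);      // AND
query.add(new TermQuery(new Term("taskTypeId", "D")), Occur.MUST_NOT); // NOT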

private void doSearch(Model model, PagerInfo pagerInfo, IndexParam indexParam, boolean isInit) {
		File indexFile = new File(indexPath);
		try {
			dir = FSDirectory.open(indexFile);
			reader= DirectoryReader.open(dir);
			BooleanQuery query = new BooleanQuery();
			BooleanQuery searchQuery = new BooleanQuery();
			// leading wildcards cannot use the term index and scan every term; acceptable at this data volume
			searchQuery.add(new WildcardQuery(new Term("ticketNm", "*" + indexParam.getSearch() + "*")), Occur.SHOULD);
			searchQuery.add(new WildcardQuery(new Term("taskNm", "*" + indexParam.getSearch() + "*")), Occur.SHOULD);
			searchQuery.add(new WildcardQuery(new Term("summary", "*" + indexParam.getSearch() + "*")), Occur.SHOULD);
			searchQuery.add(new WildcardQuery(new Term("detectionLog", "*" + indexParam.getSearch() + "*")), Occur.SHOULD);
			searchQuery.add(new WildcardQuery(new Term("analysisNmArr", "*" + indexParam.getSearch() + "*")), Occur.SHOULD);
			searchQuery.add(new WildcardQuery(new Term("analysisConArr", "*" + indexParam.getSearch() + "*")), Occur.SHOULD);
			query.add(searchQuery, Occur.MUST);
			if(StringUtils.isNotEmpty(indexParam.getTaskTypeIds())) {
				BooleanQuery taskTypeQuery = new BooleanQuery();
				String[] taskTypeIdArr = indexParam.getTaskTypeIds().split(",");
				for (int i = 0; i < taskTypeIdArr.length; i++) {
					taskTypeQuery.add(new TermQuery(new Term("taskTypeId", taskTypeIdArr[i])), Occur.SHOULD);
				}
				query.add(taskTypeQuery, Occur.MUST);
			}
			if(StringUtils.isNotEmpty(indexParam.getCorpId())) {
				query.add(new WildcardQuery(new Term("corpId", indexParam.getCorpId())), Occur.MUST);
			}
			if(StringUtils.isNotEmpty(indexParam.getEmpNm())) {
				query.add(new WildcardQuery(new Term("empNm", "*" + indexParam.getEmpNm() + "*")), Occur.MUST);
			}
			if(StringUtils.isNotEmpty(indexParam.getStartDt()) || StringUtils.isNotEmpty(indexParam.getEndDt())) {
				Long startDtLong = StringUtils.isNotEmpty(indexParam.getStartDt()) ? DateUtil.stringTODate(indexParam.getStartDt(), DateUtil.DATE_FULL_PATTERN_DB_KOREAN).getTime() : null;
				Long endDtLong = StringUtils.isNotEmpty(indexParam.getEndDt()) ? DateUtil.stringTODate(indexParam.getEndDt(), DateUtil.DATE_FULL_PATTERN_DB_KOREAN).getTime() : null;
				
				query.add(NumericRangeQuery.newLongRange("registDate", startDtLong, endDtLong, true, true) , Occur.MUST);
			}
			IndexSearcher indexSearcher = new IndexSearcher(reader);
			TopFieldDocs topDocs = indexSearcher.search(query, 100, new Sort(new SortField("registDate", SortField.Type.LONG, true)));
			List<IndexMstr> indexList = new IndexUtil().toObjectByPage(indexSearcher, topDocs, pagerInfo, indexParam);
			model.addAttribute("indexList", indexList);
			model.addAttribute("indexParam", indexParam);
			if (isInit) {
				model.addAttribute("page", pagerInfo.getPage());
				initTaskCount(indexSearcher, model);
			}
		} catch (IOException e) {
			log.error("[PublicSearchController] doSearch error : " + e.getMessage());
		} finally {
			try {
				if(reader != null) {
					reader.close();
				}
			} catch (IOException e) {
				log.error("[PublicSearchController] close IndexReader for doSearch error : " + e.getMessage());
			}
		}
	}

	private void initTaskCount(IndexSearcher indexSearcher, Model model) throws IOException {
		TermQuery query_I = new TermQuery(new Term("taskTypeId", TaskIndexType.TASK_INTRUSION.getCode()));
		TermQuery query_D = new TermQuery(new Term("taskTypeId", TaskIndexType.TASK_DDos.getCode()));
		TermQuery query_A = new TermQuery(new Term("taskTypeId", TaskIndexType.TASK_INCIDENT.getCode()));
		TermQuery query_T = new TermQuery(new Term("taskTypeId", TaskIndexType.TASK_REQUEST.getCode()));
		TermQuery query_P = new TermQuery(new Term("taskTypeId", TaskIndexType.POLICY_DEVELOP.getCode()));
		TermQuery query_R = new TermQuery(new Term("taskTypeId", TaskIndexType.POLICY_VENDOR.getCode()));
		TermQuery query_M = new TermQuery(new Term("taskTypeId", TaskIndexType.POLICY_ESM.getCode()));
		TermQuery query_N = new TermQuery(new Term("taskTypeId", TaskIndexType.POLICY_EXCEPTION.getCode()));
		int total_I = indexSearcher.search(query_I, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_I", total_I);
		int total_A = indexSearcher.search(query_A, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_A", total_A);
		int total_T = indexSearcher.search(query_T, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_T", total_T);
		int total_D = indexSearcher.search(query_D, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_D", total_D);
		int total_P = indexSearcher.search(query_P, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_P", total_P);
		int total_R = indexSearcher.search(query_R, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_R", total_R);
		int total_M = indexSearcher.search(query_M, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_M", total_M);
		int total_N = indexSearcher.search(query_N, Integer.MAX_VALUE).totalHits;
		model.addAttribute("total_N", total_N);
	}
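Running a full search with Integer.MAX_VALUE just to read totalHits works, but it builds a top-docs priority queue it never uses. A lighter option in Lucene 4.x is TotalHitCountCollector, which only counts hits; a sketch for one of the counts above:

import org.apache.lucene.search.TotalHitCountCollector;

TotalHitCountCollector collector = new TotalHitCountCollector();
indexSearcher.search(query_I, collector); // counts hits without collecting documents
model.addAttribute("total_I", collector.getTotalHits());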

Because the data volume in my system is modest, I search the on-disk index directly. With larger volumes you can speed up both index building and querying by caching the index in a RAMDirectory. In my tests, anything up to a few million documents can be searched directly on disk; conversely, when the data set is small, the in-memory cache can actually make things worse.
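If you do go the in-memory route, a minimal sketch under the Lucene 4.9 API (indexPath as configured above) might look like this; note the RAM copy is a snapshot, so it has to be reloaded after the index changes:

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;

// copy the on-disk index into memory once, then search the copy
FSDirectory diskDir = FSDirectory.open(new File(indexPath));
RAMDirectory ramDir = new RAMDirectory(diskDir, IOContext.READ);
DirectoryReader reader = DirectoryReader.open(ramDir);
IndexSearcher searcher = new IndexSearcher(reader);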
If anything is unclear, or anything here is wrong, feel free to point it out; happy to discuss~

 
