使用Solrj管理Solr索引

Solrj是Solr搜索服務器的一個比較基礎的客戶端工具,可以非常方便地與Solr搜索服務器進行交互,最基本的功能就是管理Solr索引,包括添加、更新、刪除和查詢等。對於一些比較基礎的應用,用Solrj基本夠用,而且你可以非常容易地通過使用Solrj的API實現與Solr搜索服務器進行交互,實現對Solr的基本管理功能。如果你的應用比較複雜,可以擴展Solrj來滿足需要。

下面是一個使用Solrj的API實現與Solr服務器交互的工具類SolrPostServer,能夠實現索引的添加、更新、刪除和查詢功能。SolrPostServer類中兩個內部類是與訪問MongoDB的配置和工具。

在實際應用中,對於是否進行commit,可以有兩種方式:

  • 一種是直接在客戶端進行計算,亦即,進行索引時計算添加的文檔數,滿足設置的值則進行手動commit,這種方式比較靈活,你可以根據搜索服務器的運行狀況選擇合理的commit文檔數量;
  • 另一種是,直接在Solr搜索服務器上進行配置。一般來說,對索引進行大批量更新,不會選擇在搜索服務器業務繁忙的時候進行,所以能夠自動進行commit也便利了對索引的管理,更新文檔完全可以實現自動化處理。
在Solr服務器端進行配置有關commit的功能,可以在requestHandler中進行配置,示例如下:
	<requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
		<maxPendingDeletes>10000</maxPendingDeletes>
		<autoCommit>
			<maxDocs>20</maxDocs>
			<maxTime>86000</maxTime>
		</autoCommit>
	</requestHandler>
上面autoCommit中的maxDocs指定了pending多少個文檔後執行一次commit,而maxTime指定了多長時間間隔進行一次commit,一般這兩個選項只需要配置一個即可滿足需要。另外,每次commit會將最近的更新生效,但是如果一次commit操作尚未完成,又達到了下一次commit的時刻,這樣做會嚴重影響索引的吞吐量。
在Solr 4.0將會實現一種基於“軟自動提交”(soft auto commit)的功能,它會根據當前的系統上下文決定是否提交(簡單的情況就是,確保每次commit完成,也就是最近的索引數據更新已經更新同步到磁盤上之後再自動執行下一次commit)。

實現代碼如下所示:

package org.shirdrn.solr.solrj;

import java.io.IOException;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.HttpClient;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.apache.solr.client.solrj.ResponseParser;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.SolrParams;

import com.mongodb.BasicDBObject;
import com.mongodb.DBCollection;
import com.mongodb.DBCursor;
import com.mongodb.DBObject;
import com.mongodb.Mongo;
import com.mongodb.MongoException;

/**
 * Solr server for indexes operations.
 * 
 * @author shirdrn
 * @date   2011-12-20
 */
public class SolrPostServer {

	private static final Logger LOG = Logger.getLogger(SolrPostServer.class);

	/** Solrj HTTP client used for every index operation. */
	private CommonsHttpSolrServer server;
	/** Optional response parser; when unset, {@link XMLResponseParser} is applied. */
	private ResponseParser responseParser;

	/** Connection settings of the MongoDB holding the source records. */
	private MongoConfig mongoConfig;
	/** Names of the MongoDB collections to pull source records from. */
	private String[] collectionNames;
	/** Batch size: buffered documents are flushed to Solr once this many accumulate. */
	private int maxCommitCount = 100;
	/** Whether {@link #postUpdate()} triggers an optimize when it finishes. */
	private boolean manualOptimize = true;
	/** true: send each document individually (rely on server-side autoCommit);
	 *  false: buffer documents and add/commit them in batches of maxCommitCount. */
	private boolean manualCommit = false;
	/** Buffer holding documents awaiting a batched add/commit. */
	private Collection<SolrInputDocument> docContainer = new ArrayList<SolrInputDocument>();
	/** Documents pushed so far by this instance.
	 *  (Was a mutable static shared by all instances; made per-instance.) */
	private int totalCount = 0;

	/**
	 * Creates a facade that posts MongoDB records to a Solr core.
	 *
	 * @param url         base URL of the Solr core, e.g. http://host:8080/server/fragment/
	 * @param httpClient  optional pre-configured client; when null, a default one
	 *                    with timeouts, pooling and compression is created
	 * @param mongoConfig connection settings for the source MongoDB
	 */
	public SolrPostServer(String url, HttpClient httpClient, MongoConfig mongoConfig) {
		try {
			if (httpClient == null) {
				server = new CommonsHttpSolrServer(url);
				server.setSoTimeout(500000); // socket read timeout
				server.setConnectionTimeout(5000);
				server.setDefaultMaxConnectionsPerHost(10);
				server.setMaxTotalConnections(100);
				server.setAllowCompression(true);
				server.setMaxRetries(1); // defaults to 0; > 1 not recommended
			} else {
				server = new CommonsHttpSolrServer(url, httpClient);
			}
		} catch (MalformedURLException e) {
			LOG.error("Malformed Solr url: " + url, e);
		}
		this.mongoConfig = mongoConfig;
		initialize();
	}

	/**
	 * Applies the configured {@link ResponseParser} to the server,
	 * defaulting to {@link XMLResponseParser}.
	 */
	private void initialize() {
		server.setParser(responseParser != null ? responseParser : new XMLResponseParser());
	}

	/**
	 * Pulls every record from the configured MongoDB collections and posts it to
	 * Solr. When {@code manualCommit} is true each record is added individually
	 * (commit left to the server-side autoCommit configuration); otherwise records
	 * are buffered and committed in batches of {@code maxCommitCount}. Finally a
	 * commit of any remainder and an optional optimize are performed.
	 */
	@SuppressWarnings("unchecked")
	public void postUpdate() {
		try {
			for (String name : collectionNames) {
				LOG.info("MongoDB collection name: " + name);
				DBCollection collection = MongoHelper.newHelper(mongoConfig).getCollection(name);
				DBCursor cursor = collection.find(new BasicDBObject());
				try {
					while (cursor.hasNext()) {
						try {
							Map<Object, Object> record = cursor.next().toMap();
							add(record, manualCommit);
							++totalCount;
							LOG.info("Add fragment: _id = " + record.get("_id").toString());
						} catch (IOException e) {
							LOG.error("Failed to add document.", e);
						}
					}
				} finally {
					// Close even when iteration fails; the original leaked the
					// cursor on any exception thrown inside the loop.
					cursor.close();
				}
			}
			LOG.info("Add totalCount: " + totalCount);
			finallyCommit();
			optimize(manualOptimize);
		} catch (MongoException e) {
			LOG.error("MongoDB error during postUpdate.", e);
		} catch (SolrServerException e) {
			LOG.error("Solr error during postUpdate.", e);
		} catch (IOException e) {
			LOG.error("I/O error during postUpdate.", e);
		}
	}

	/**
	 * Delete lucene {@link Document}s by their unique IDs.
	 * @param strings the document IDs to delete
	 */
	public void deleteById(List<String> strings) {
		try {
			server.deleteById(strings);
		} catch (SolrServerException e) {
			LOG.error("Failed to delete by ids.", e);
		} catch (IOException e) {
			LOG.error("Failed to delete by ids.", e);
		}
	}

	/**
	 * Delete lucene {@link Document}s matching a query.
	 * @param query a Solr query string selecting the documents to delete
	 */
	public void deleteByQuery(String query) {
		try {
			server.deleteByQuery(query);
		} catch (SolrServerException e) {
			LOG.error("Failed to delete by query: " + query, e);
		} catch (IOException e) {
			LOG.error("Failed to delete by query: " + query, e);
		}
	}

	/**
	 * Executes a query and projects each hit onto the requested fields.
	 *
	 * @param params the Solr query parameters
	 * @param fields names of the stored fields to extract from each hit
	 * @return one map per hit (field name to value); empty on error
	 */
	public List<Map<String, Object>> query(SolrParams params, String[] fields) {
		List<Map<String, Object>> results = new ArrayList<Map<String, Object>>();
		try {
			SolrDocumentList documents = server.query(params).getResults();
			for (SolrDocument doc : documents) {
				Map<String, Object> row = new HashMap<String, Object>();
				for (String field : fields) {
					row.put(field, doc.getFieldValue(field));
				}
				results.add(row);
			}
		} catch (SolrServerException e) {
			LOG.error("Query failed.", e);
		}
		return results;
	}

	/**
	 * When controlling the committing action at client side, flushes and commits
	 * any documents still buffered after the last full batch.
	 * @throws SolrServerException on Solr failure
	 * @throws IOException on communication failure
	 */
	private void finallyCommit() throws SolrServerException, IOException {
		if (!docContainer.isEmpty()) {
			server.add(docContainer);
			docContainer.clear();
			commit(false, false);
		}
	}

	/**
	 * Commits pending updates on the server.
	 * @param waitFlush    block until index changes are flushed to disk
	 * @param waitSearcher block until a new searcher is opened
	 */
	public void commit(boolean waitFlush, boolean waitSearcher) {
		try {
			server.commit(waitFlush, waitSearcher);
		} catch (SolrServerException e) {
			LOG.error("Commit failed.", e);
		} catch (IOException e) {
			LOG.error("Commit failed.", e);
		}
	}

	/**
	 * When controlling the optimizing action at client side, optimizes the index
	 * and commits; on any failure the pending changes are rolled back.
	 * @param waitFlush    block until index changes are flushed to disk
	 * @param waitSearcher block until a new searcher is opened
	 */
	public void optimize(boolean waitFlush, boolean waitSearcher) {
		try {
			server.optimize(waitFlush, waitSearcher);
			commit(waitFlush, waitSearcher);
		} catch (Exception e) {
			LOG.error("Encounter error when optimizing.", e);
			try {
				server.rollback();
			} catch (SolrServerException e1) {
				LOG.error("Rollback failed.", e1);
			} catch (IOException e1) {
				LOG.error("Rollback failed.", e1);
			}
		}
	}

	/**
	 * Conditionally optimizes (blocking on flush and searcher).
	 * @param optimize whether to run the optimize at all
	 */
	private void optimize(boolean optimize) {
		if (optimize) {
			optimize(true, true);
		}
	}

	/**
	 * Adds a {@link SolrInputDocument} built from a MongoDB record (a Map),
	 * either immediately or into the batch buffer.
	 *
	 * @param m        the MongoDB record
	 * @param oneByOne true to send the document immediately; false to buffer it
	 * @throws SolrServerException on Solr failure
	 * @throws IOException on communication failure
	 */
	private void add(Map<Object, Object> m, boolean oneByOne) throws SolrServerException, IOException {
		SolrInputDocument doc = createDocument(m);
		if (oneByOne) {
			server.add(doc);
		} else {
			docContainer.add(doc);
			// Flush once the batch reaches maxCommitCount. The original tested
			// with '>', which only committed at maxCommitCount + 1 documents.
			if (docContainer.size() >= maxCommitCount) {
				server.add(docContainer);
				server.commit(false, false);
				docContainer.clear();
			}
		}
	}

	/**
	 * Builds a {@link SolrInputDocument} from one MongoDB record.
	 * Assumes the record carries all listed keys with the expected types
	 * (strings, ints, and an embedded "fragment" object) — TODO confirm schema.
	 *
	 * @param record the MongoDB record as a map
	 * @return the populated document
	 */
	private SolrInputDocument createDocument(Map<Object, Object> record) {
		String id = record.get("_id").toString();
		String articleId = (String) record.get("articleId");
		String title = (String) record.get("title");
		String url = (String) record.get("url");
		String spiderName = (String) record.get("spiderName");
		String fragment = makeFragment((BasicDBObject) record.get("fragment"));
		String word = (String) record.get("word");
		int pictureCount = (Integer) record.get("pictureCount");
		int selectedCount = (Integer) record.get("selectedCount");
		int fragmentSize = (Integer) record.get("fragmentSize");

		SolrInputDocument doc = new SolrInputDocument();
		doc.addField("_id", id, 1.0f);
		doc.addField("articleId", articleId, 1.0f);
		doc.addField("title", title, 1.0f);
		doc.addField("url", url, 1.0f);
		doc.addField("spiderName", spiderName, 1.0f);
		doc.addField("fragment", fragment, 1.0f);
		doc.addField("word", word, 1.0f);
		// Additional processing for lucene payload metadata.
		doc.addField("pictureCount", word + "|" + pictureCount);
		// NOTE(review): float division yields Infinity when fragmentSize == 0
		// — confirm the source guarantees fragmentSize > 0.
		doc.addField("coverage", word + "|" + (float) selectedCount / fragmentSize);
		return doc;
	}

	/**
	 * Concatenates the fragment's entry values, separated by &lt;br&gt; tags.
	 * @param fragment the embedded fragment object from the record
	 * @return the joined HTML snippet
	 */
	@SuppressWarnings("unchecked")
	private String makeFragment(BasicDBObject fragment) {
		StringBuilder builder = new StringBuilder();
		Iterator<Map.Entry<Integer, String>> iter = fragment.toMap().entrySet().iterator();
		while (iter.hasNext()) {
			builder.append(iter.next().getValue().trim()).append("<br>");
		}
		return builder.toString();
	}

	/**
	 * Sets the {@link ResponseParser}; default is {@link XMLResponseParser}.
	 * Takes effect only for servers constructed afterwards — call before use.
	 * @param responseParser the parser to apply
	 */
	public void setResponseParser(ResponseParser responseParser) {
		this.responseParser = responseParser;
	}

	/**
	 * Pulling document resource from multiple collections of MongoDB.
	 * @param collectionNames the collections to read
	 */
	public void setCollectionNames(String[] collectionNames) {
		this.collectionNames = collectionNames;
	}

	/** Sets the batch size used when buffering documents. */
	public void setMaxCommitCount(int maxCommitCount) {
		this.maxCommitCount = maxCommitCount;
	}

	/** true: add documents one by one; false: batch them client-side. */
	public void setManualCommit(boolean manualCommit) {
		this.manualCommit = manualCommit;
	}

	/** Whether {@link #postUpdate()} runs an optimize when done. */
	public void setManualOptimize(boolean manualOptimize) {
		this.manualOptimize = manualOptimize;
	}

	/**
	 * Mongo database configuration.
	 *
	 * @author shirdrn
	 * @date   2011-12-20
	 */
	public static class MongoConfig implements Serializable {
		private static final long serialVersionUID = -3028092758346115702L;
		private String host;
		private int port;
		private String dbname;
		private String collectionName;

		public MongoConfig(String host, int port, String dbname, String collectionName) {
			this.host = host;
			this.port = port;
			this.dbname = dbname;
			this.collectionName = collectionName;
		}

		@Override
		public boolean equals(Object obj) {
			if (this == obj) {
				return true;
			}
			// Original blind-cast threw ClassCastException for foreign types.
			if (!(obj instanceof MongoConfig)) {
				return false;
			}
			MongoConfig other = (MongoConfig) obj;
			return host.equals(other.host) && port == other.port
					&& dbname.equals(other.dbname) && collectionName.equals(other.collectionName);
		}

		// equals/hashCode must be overridden together (original lacked hashCode).
		@Override
		public int hashCode() {
			int result = host.hashCode();
			result = 31 * result + port;
			result = 31 * result + dbname.hashCode();
			result = 31 * result + collectionName.hashCode();
			return result;
		}
	}

	/**
	 * Mongo database utility. Lazily creates a single shared {@link Mongo}
	 * connection and closes it via a JVM shutdown hook.
	 * NOTE(review): the singleton keeps the first config it was given; later
	 * calls with a different {@link MongoConfig} silently reuse the old
	 * connection — confirm all callers share one configuration.
	 *
	 * @author shirdrn
	 * @date   2011-12-20
	 */
	static class MongoHelper {
		private static Mongo mongo;
		private static MongoHelper helper;
		private MongoConfig mongoConfig;

		private MongoHelper(MongoConfig mongoConfig) {
			this.mongoConfig = mongoConfig;
		}

		public synchronized static MongoHelper newHelper(MongoConfig mongoConfig) {
			try {
				if (helper == null) {
					helper = new MongoHelper(mongoConfig);
					mongo = new Mongo(mongoConfig.host, mongoConfig.port);
					Runtime.getRuntime().addShutdownHook(new Thread() {
						@Override
						public void run() {
							if (mongo != null) {
								mongo.close();
							}
						}
					});
				}
			} catch (Exception e) {
				LOG.error("Failed to connect to MongoDB.", e);
			}
			return helper;
		}

		public DBCollection getCollection(String collectionName) {
			DBCollection c = null;
			try {
				c = mongo.getDB(mongoConfig.dbname).getCollection(collectionName);
			} catch (Exception e) {
				LOG.error("Failed to get collection: " + collectionName, e);
			}
			return c;
		}
	}
}

下面,我們可以通過寫一個測試用例測試一下。

首先,我的Solr搜索服務器已經部署好並啓動成功,對應的url爲http://192.168.0.197:8080/server/fragment/。測試用例如下所示:

package org.shirdrn.solr.solrj;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import junit.framework.TestCase;

import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.request.MapSolrParams;
import org.shirdrn.solr.solrj.SolrPostServer.MongoConfig;

@SuppressWarnings("deprecation")
public class TestSolrPostServer extends TestCase {

	/** Facade under test; talks to a live Solr instance configured in setUp. */
	SolrPostServer myServer;
	MongoConfig config;
	String url;
	String[] collectionNames;

	@Override
	protected void setUp() throws Exception {
		super.setUp(); // original skipped the superclass hook
		url = "http://192.168.0.197:8080/server/fragment/";
		config = new MongoConfig("192.168.0.184", 27017, "fragment", "");
		myServer = new SolrPostServer(url, null, config);
		myServer.setMaxCommitCount(100);
	}

	@Override
	protected void tearDown() throws Exception {
		super.tearDown();
	}

	/** Pushes every record of the listed MongoDB collections into the index. */
	public void testPostUpdate() {
		collectionNames = new String[] {
				"sina",
				"lvping",
				"daodao",
				"go2eu",
				"mafengwo",
				"lotour",
				"17u",
				"sohu",
				"baseSe",
				"bytravel"
		};
		myServer.setCollectionNames(collectionNames);
		myServer.setManualCommit(true);
		myServer.setManualOptimize(false);
		myServer.postUpdate();
	}

	/** Deletes a fixed set of documents by id, then commits and optimizes. */
	public void testPostDelete() {
		List<String> strings = new ArrayList<String>();
		strings.add("4ef051342c4117a38f63ee97");
		strings.add("4ef051322c4117a38f63ee36");
		strings.add("4ef051a42c4117a38f63fb51");
		strings.add("4ef050d92c4117a38f63dda4");
		strings.add("4ef051fe2c4117a38f640bc8");
		strings.add("4ef048ef2c4117a38f6207ce");
		strings.add("4ef049062c4117a38f620e13");
		strings.add("4ef046f12c4117a38f6185c0");
		myServer.deleteById(strings);
		myServer.commit(false, false);
		myServer.optimize(true, false);
	}

	/** Runs a filtered query and checks a full first page is returned. */
	public void testQuery() {
		// MapSolrParams takes Map<String, String>; the original used raw types
		// and needed @SuppressWarnings({"rawtypes", "unchecked"}).
		Map<String, String> map = new HashMap<String, String>();
		map.put(CommonParams.Q, "法國");
		map.put(CommonParams.START, "0");
		map.put(CommonParams.ROWS, "10");
		map.put(CommonParams.FQ, "word:盧浮宮");
		SolrParams params = new MapSolrParams(map);
		List<Map<String, Object>> results = myServer.query(params, new String[] {"_id", "title", "url"});
		assertEquals(10, results.size());
	}
}

在實際開發的過程中,使用Solrj客戶端可以非常容易爲測試做一些基本操作,如創建索引,測試Solr基本參數及其開發定製Solr相關接口(Retrieval、Highlighting、Faceted Search、Clustering等等)。


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章