轉自 http://blog.csdn.net/caoxu1987728/archive/2008/07/18/2673492.aspx
由文章標題可知 我們要建立數據庫和索引。
一,定義Product類
此類相當於MVC中的容器,裝載了數據庫和索引所需要的對象,例如:category、name、type、content、summary、imageURI、originalUrl、updatedtime。順序沒關係,代碼如下:
package com.luceneheritrixbook.core;
/**
 * Plain data holder for one product record.
 * Every property is a String that starts out as null and is exposed through a
 * getter/setter pair; the class performs no validation.
 */
public class Product {
// All properties are explicitly initialised to null until the matching setter is called.
private String category=null;
private String name=null;
private String type=null;
private String content=null;
private String summary=null;
private String imageURI=null;
// Exposed through getUpdatetime()/setUpdatetime(): the accessors are spelled without the "d".
private String updatedtime=null;
private String originalUrl=null;
/** Returns the category, or null if none has been set. */
public String getCategory() {
return category;
}
/** Stores the category. */
public void setCategory(String category) {
this.category = category;
}
/** Returns the content text, or null if none has been set. */
public String getContent() {
return content;
}
/** Stores the content text. */
public void setContent(String content) {
this.content = content;
}
/** Returns the image URI, or null if none has been set. */
public String getImageURI() {
return imageURI;
}
/** Stores the image URI. */
public void setImageURI(String imageURI) {
this.imageURI = imageURI;
}
/** Returns the product name, or null if none has been set. */
public String getName() {
return name;
}
/** Stores the product name. */
public void setName(String name) {
this.name = name;
}
/** Returns the original URL, or null if none has been set. */
public String getOriginalUrl() {
return originalUrl;
}
/** Stores the original URL. */
public void setOriginalUrl(String originalUrl) {
this.originalUrl = originalUrl;
}
/** Returns the summary, or null if none has been set. */
public String getSummary() {
return summary;
}
/** Stores the summary. */
public void setSummary(String summary) {
this.summary = summary;
}
/** Returns the product type, or null if none has been set. */
public String getType() {
return type;
}
/** Stores the product type. */
public void setType(String type) {
this.type = type;
}
/** Returns the value of the updatedtime field, or null if none has been set. */
public String getUpdatetime() {
return updatedtime;
}
/** Stores the given value in the updatedtime field. */
public void setUpdatetime(String updatetime) {
this.updatedtime = updatetime;
}
}二:定義Lucene的Document格式(即用於搜索的field域)
package com.luceneheritrixbook.index;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import com.luceneheritrixbook.core.Product;
/**
 * Converts a {@link Product} into the Lucene {@link Document} stored in the index.
 *
 * <p>Each document carries the database id, the indexing timestamp, the product URL,
 * the category, name and type, plus one catch-all field that concatenates category,
 * name and type.
 */
public class ProductDocument {
    private static final String PRODUCT_ID = "productid";
    private static final String INDEX_TIME = "indextime";
    // NOTE(review): "productrul" looks like a typo for "producturl". It is kept unchanged
    // because existing indexes and search code may already use this field name - confirm
    // before renaming.
    private static final String PRODUCT_URL = "productrul";
    private static final String CATEGORY = "category";
    private static final String PRODUCT_NAME = "name";
    private static final String PRODUCT_TYPE = "type";
    // Name of the catch-all field. It previously reused PRODUCT_ID, which put a second,
    // tokenized value into the id field.
    // NOTE(review): search-side code must query this field name - confirm it matches.
    private static final String ALL = "all";

    /**
     * Builds the index document for one product.
     *
     * @param product the product to index; properties that are null are indexed as ""
     * @param id      the identifier stored in the "productid" field
     * @return a new document containing the seven fields described on the class
     */
    public static Document buildProductDocument(Product product, int id) {
        String categoryText = textOf(product.getCategory());
        String nameText = textOf(product.getName());
        String typeText = textOf(product.getType());

        Document doc = new Document();

        // Identifier, timestamp and URL are stored verbatim (not analyzed).
        doc.add(new Field(PRODUCT_ID, String.valueOf(id), Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        doc.add(new Field(INDEX_TIME, String.valueOf(System.currentTimeMillis()),
                Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.add(new Field(PRODUCT_URL, textOf(product.getOriginalUrl()), Field.Store.YES,
                Field.Index.UN_TOKENIZED));

        // Descriptive fields are run through the analyzer.
        doc.add(new Field(CATEGORY, categoryText, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field(PRODUCT_NAME, nameText, Field.Store.YES, Field.Index.TOKENIZED));
        doc.add(new Field(PRODUCT_TYPE, typeText, Field.Store.YES, Field.Index.TOKENIZED));

        // Catch-all field: category, name and type separated by single spaces.
        String text = categoryText + " " + nameText + " " + typeText;
        doc.add(new Field(ALL, text, Field.Store.YES, Field.Index.TOKENIZED));

        return doc;
    }

    /** Maps null to "" so a missing property neither breaks Field creation nor indexes "null". */
    private static String textOf(String value) {
        return value == null ? "" : value;
    }
}
三、對數據庫進行操作(即向數據庫中插入獲得的product對象)
package com.luceneheritrixbook.database;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;
import com.luceneheritrixbook.core.Product;
/**
 * Thin JDBC helper that inserts {@link Product} rows into the "product" table.
 *
 * <p>One instance owns one connection; call {@link #close()} when finished.
 * Statements and result sets are opened per call and always released.
 */
public class ProductJDBC {
    private Connection con = null;
    private boolean autoCommit = true;

    /**
     * Loads the MySQL driver and opens the connection.
     *
     * @param url JDBC URL of the database
     * @param usr database user
     * @param pwd database password
     * @throws Exception if the driver cannot be loaded or the connection cannot be opened
     */
    public ProductJDBC(String url, String usr, String pwd) throws Exception {
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        con = DriverManager.getConnection(url, usr, pwd);
        con.setAutoCommit(autoCommit);
    }

    /**
     * Inserts one product and returns the id of the new row.
     *
     * <p>The id reported by the driver for the inserted row is preferred. If the driver
     * reports none, the value of {@code max(id)+1} read before the insert is returned,
     * as in the original implementation.
     *
     * @param p the product to store
     * @return the id of the inserted row
     * @throws Exception if the next id cannot be determined or the insert fails
     */
    public int addProduct(Product p) throws Exception {
        int nextid = getNextId();
        if (nextid < 0) {
            throw new Exception("Can't get next id.");
        }

        String expr = "insert into product(content,abstractcontent,url," +
                "imageurl,category,name,type,updatedtime)values(?,?,?,?,?,?,?,?)";

        PreparedStatement insert = null;
        ResultSet keys = null;
        try {
            insert = con.prepareStatement(expr, Statement.RETURN_GENERATED_KEYS);
            insert.setString(1, p.getContent());
            insert.setString(2, p.getSummary());
            insert.setString(3, p.getOriginalUrl());
            insert.setString(4, p.getImageURI());
            insert.setString(5, p.getCategory());
            insert.setString(6, p.getName());
            insert.setString(7, p.getType());
            insert.setString(8, p.getUpdatetime());
            insert.executeUpdate();

            // max(id)+1 is only a guess: it is read before the insert and can be wrong
            // after deletes or with concurrent writers, so trust the driver when it can
            // tell us the real key.
            keys = insert.getGeneratedKeys();
            if (keys != null && keys.next()) {
                int generated = keys.getInt(1);
                if (!keys.wasNull() && generated > 0) {
                    return generated;
                }
            }
            return nextid;
        } finally {
            closeQuietly(keys);
            closeQuietly(insert);
        }
    }

    /**
     * Reads {@code max(id)+1} from the product table.
     *
     * @return the computed value; 0 when the table is empty (SQL NULL read through
     *         getInt), or -1 when the query yields no row at all
     */
    private int getNextId() throws Exception {
        int result = -1;
        Statement query = null;
        ResultSet rows = null;
        try {
            query = con.createStatement();
            rows = query.executeQuery("select max(id)+1 from product");
            while (rows.next()) {
                result = rows.getInt(1);
            }
            return result;
        } finally {
            closeQuietly(rows);
            closeQuietly(query);
        }
    }

    /** Closes the connection; failures are printed and the reference is always cleared. */
    public void close() {
        if (con != null) {
            try {
                con.close();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                con = null;
            }
        }
    }

    /** Releases a result set without letting a close failure mask the primary outcome. */
    private static void closeQuietly(ResultSet resultSet) {
        if (resultSet != null) {
            try {
                resultSet.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    /** Releases a statement without letting a close failure mask the primary outcome. */
    private static void closeQuietly(Statement statement) {
        if (statement != null) {
            try {
                statement.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
/*
 * 在這裏我發現了一個不好的地方,那就是完全相同的兩個產品信息可以同時存入數據庫
 */
四、對索引進行操作(其實就是把前面所構建的詞庫加入JE分詞,然後連同Document一起加入索引器)
package com.luceneheritrixbook.index;
import java.io.FileReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import com.luceneheritrixbook.core.Product;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
/**
 * Wraps a Lucene {@link IndexWriter} configured with the JE {@link MMAnalyzer} and
 * the word dictionary named by {@code PropertyConfiguration.getWordDictionary()}.
 *
 * <p>The writer is opened with {@code create=true}, so constructing an indexer starts
 * a fresh index at the given path.
 */
public class ProductIndexer
{
    private String indexPath = "";
    private IndexWriter writer = null;
    private Analyzer analyzer = null;
    private String dictionary_file = PropertyConfiguration.getWordDictionary();

    /**
     * Creates the analyzer and opens the index writer.
     *
     * @param indexPath directory in which the index is created
     * @throws Exception if the dictionary cannot be read or the writer cannot be opened
     */
    public ProductIndexer(String indexPath) throws Exception
    {
        this.indexPath = indexPath;
        initialize();
    }

    /** Loads the dictionary into the analyzer, then opens the writer. */
    private void initialize() throws Exception
    {
        analyzer = new MMAnalyzer();
        FileReader reader = new FileReader(dictionary_file);
        try
        {
            ((MMAnalyzer) analyzer).addDictionary(reader);
        }
        finally
        {
            // The reader was previously never closed.
            // NOTE(review): assumes addDictionary consumes the reader before returning - confirm.
            reader.close();
        }
        writer = new IndexWriter(indexPath, analyzer, true);
    }

    /**
     * Closes the writer. Failures are printed rather than thrown; the reference is
     * cleared in every case, so calling this more than once is harmless.
     */
    public void close()
    {
        if (writer == null)
        {
            return;
        }
        try
        {
            writer.close();
        }
        catch (Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            writer = null;
        }
    }

    /**
     * Adds one product to the index.
     *
     * @param product the product to index
     * @param id      the identifier stored with the document
     * @throws Exception if the indexer has been closed or the document cannot be added
     */
    public void addProduct(Product product, int id) throws Exception
    {
        openWriter().addDocument(ProductDocument.buildProductDocument(product, id));
    }

    /**
     * Optimizes the index.
     *
     * @throws Exception if the indexer has been closed or optimization fails
     */
    public void optimizeIndex() throws Exception
    {
        openWriter().optimize();
    }

    /** Returns the writer, failing with a clear message once close() has been called. */
    private IndexWriter openWriter()
    {
        if (writer == null)
        {
            throw new IllegalStateException("ProductIndexer has been closed");
        }
        return writer;
    }
}
五、調用數據庫處理類和索引處理類(這是建立數據庫和索引最主要的類,主要過程是這樣的:首先初始化數據庫和索引的實例,然後是從Heritrix中讀取的鏡像網頁,通過File的循環遍歷從中讀取每一個product的詳細信息,然後生成一個Product對象,這樣通過參數product就可以把數據存入數據庫和索引了)代碼如下:
package com.luceneheritrixbook.core;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
/**
 * First listing of the processor that owns a ProductJDBC and a ProductIndexer, both
 * configured from PropertyConfiguration.
 * NOTE(review): this listing is damaged inside process() (see the note there) and does
 * not compile as shown; the article repeats the class in full further down.
 */
public class ProductTextFileProcessor
{
/** Directories to process; process() returns early when this is null or empty. */
private String[] directionaries;
private static final String dbUrl=PropertyConfiguration.getDBUrl();
private static final String dbUsr=PropertyConfiguration.getDBUsr();
private static final String dbPwd=PropertyConfiguration.getDBPwd();
private static final String indexPath=PropertyConfiguration.getIndexStorePath();
private ProductJDBC productJDBC=null;
private ProductIndexer indexer=null;
public final static int SUMMARY_LENGTH=80;//maximum length of the content summary
/** Creates the processor and immediately calls initialize(). */
public ProductTextFileProcessor()
{
initialize();
}
/**
 * Creates the ProductJDBC and ProductIndexer instances. Any exception is printed and
 * swallowed, which leaves the affected field(s) null.
 */
public void initialize()
{
try
{
productJDBC=new ProductJDBC(dbUrl,dbUsr,dbPwd);
indexer=new ProductIndexer(indexPath);
}
catch(Exception e)
{
e.printStackTrace();
}
}
/** Sets the directories to process. */
public void setDirectionaries(String[] directionaries)
{
this.directionaries=directionaries;
}
/**
 * Verifies that the database helper and the indexer exist (throwing otherwise) and that
 * at least one directory is configured (printing a message and returning otherwise).
 * The remainder of the method is damaged in this listing.
 */
protected void process()throws Exception
{
if(productJDBC==null)
{
throw new Exception("Database connection failed,pls retry");
}
if(indexer==null)
{
throw new Exception("Lucene index failed,pls retry");
}
if(directionaries==null||directionaries.length==0)
{
System.out.print("失敗了");
return;
}
try
{
// NOTE(review): the listing is damaged from the next line on. Everything between the "<"
// of the loop condition and the ">" of a later "...>SUMMARY_LENGTH" comparison is missing,
// so the loop header runs straight into the summary handling. See the complete listing
// further down in this article.
for(int i=0;iSUMMARY_LENGTH)
{
p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
}
else
p.setSummary(contentstr);
p.setUpdatetime(updatetime);
//at this point the product object is fully populated
//store it in the database first and take the id that is returned
int nextid=insert2DB(p);//an error showed up here; the real cause was inside ProductJDBC.java
//use the returned id to add the Product to the index
buildIndex(p,nextid);
}
//optimize the index
optimizeindex();
/*This is only a function and cannot optimize the index by itself; not sure why
* indexer.optimizeIndex() is not called directly.
*/
}
/** Stores the product through ProductJDBC.addProduct and returns its result. */
protected int insert2DB(Product p)throws Exception
{
return productJDBC.addProduct(p);
}
/** Adds the product to the index under the given id. */
protected void buildIndex(Product p,int nextid)throws Exception
{
indexer.addProduct(p,nextid);
}
//optimize the index
private void optimizeindex()throws Exception
{
indexer.optimizeIndex();
}
/** Closes the indexer. */
private void closeIndex()throws Exception
{
indexer.close();
}
/** Closes the database helper. */
private void closeDB()
{
productJDBC.close();
}
/* public String getDbPwd()
{
return dbPwd;
}
public String getDbUrl()
{
return dbUrl;
}
public String getDbUsr()
{
return dbUsr;
}
public String getIndexPath()
{
return indexPath;
}*/
//The methods above are in the book, but they did not seem to serve any purpose, so they are commented out for now; nothing appears to break.
/** Entry point: processes the hard-coded directory below. */
public static void main(String[] args) throws Exception
{
// TODO Auto-generated method stub
ProductTextFileProcessor pro=new ProductTextFileProcessor();
pro.initialize();//already called by the constructor; possibly redundant
String path1="c://product//mobile//";
pro.setDirectionaries(new String[]{path1});//hands the directory list to the processor
pro.process();
}
}數據庫還好說,以後肯定要用到,可是這個索引到底有什麼用啊,好像後面沒用到,不過我猜肯定是我弄</STRONG>錯了,怎麼可能會用不到,開玩笑嘛,等着看吧 ……
注:後來發現第五部分的代碼有錯誤,修改如下:
package com.luceneheritrixbook.core;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;
/**
 * Reads product text files, stores each product in the database and adds it to the
 * Lucene index under the id returned by the database insert.
 *
 * <p>Expected file layout, as read by this class: line 1 is the URL, line 2 the name,
 * line 3 the type, followed by content lines up to a line equal to
 * {@code Extractor.SEPARATOR}, followed by one line holding the image URI. The update
 * time is taken from the file name (the text between the last "-" and the extension).
 */
public class ProductTextFileProcessor
{
    /** Directories whose files are processed by {@link #process()}. */
    private String[] directionaries;
    private static final String dbUrl=PropertyConfiguration.getDBUrl();
    private static final String dbUsr=PropertyConfiguration.getDBUsr();
    private static final String dbPwd=PropertyConfiguration.getDBPwd();
    private static final String indexPath=PropertyConfiguration.getIndexStorePath();
    private ProductJDBC productJDBC=null;
    private ProductIndexer indexer=null;
    /** Maximum number of characters kept in a product summary. */
    public final static int SUMMARY_LENGTH=80;
    /** Line break inserted between content lines (the listing used the literal text "/r/n"). */
    private static final String LINE_BREAK="\r\n";

    // The book's getters for the connection settings and the index path are left out:
    // nothing in this class calls them.

    /** Creates the processor and opens the database connection and the index. */
    public ProductTextFileProcessor()
    {
        initialize();
    }

    /**
     * Opens whichever of the database helper and the indexer is not open yet.
     * Safe to call repeatedly: an already-open resource is kept, so a second call no
     * longer opens a second connection or a second index writer on the same path.
     * Failures are printed; the affected field stays null and {@link #process()}
     * reports it.
     */
    public void initialize()
    {
        try
        {
            if(productJDBC==null)
            {
                productJDBC=new ProductJDBC(dbUrl,dbUsr,dbPwd);
            }
            if(indexer==null)
            {
                indexer=new ProductIndexer(indexPath);
            }
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }

    /** Sets the directories to process. */
    public void setDirectionaries(String[] directionaries)
    {
        this.directionaries=directionaries;
    }

    /**
     * Processes every configured directory, then closes the database and the index.
     * Errors raised while traversing are printed, not rethrown; the resources are
     * closed whether or not traversal succeeded.
     *
     * @throws Exception if the database connection or the indexer is not available
     */
    protected void process()throws Exception
    {
        if(productJDBC==null)
        {
            throw new Exception("Database connection failed,pls retry");
        }
        if(indexer==null)
        {
            throw new Exception("Lucene index failed,pls retry");
        }
        if(directionaries==null||directionaries.length==0)
        {
            System.out.print("失敗了");
            return;
        }
        try
        {
            for(int i=0;i<directionaries.length;i++)
            {
                traverse(new File(directionaries[i]));
            }
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Previously skipped whenever traversal threw.
            closeDB();
            closeIndex();
        }
    }

    /**
     * Imports every regular file directly inside the given directory, then optimizes
     * the index.
     *
     * @param file the directory to read
     * @throws Exception if the directory cannot be listed, or a file cannot be read,
     *         stored or indexed
     */
    protected void traverse(File file)throws Exception
    {
        String[] files=file.list();
        if(files==null)
        {
            // File.list() returns null for a missing path or a non-directory.
            throw new IllegalArgumentException("Not a readable directory: "+file);
        }
        for(int i=0;i<files.length;i++)
        {
            File productfile=new File(file,files[i]);
            if(!productfile.isFile())
            {
                // A sub-directory cannot be opened with FileReader; skip it instead of
                // aborting the whole run.
                continue;
            }
            System.out.println(productfile.getName());
            Product p=readProduct(productfile);
            // Store in the database first; the returned id keys the index document.
            int nextid=insert2DB(p);
            buildIndex(p,nextid);
        }
        optimizeindex();
    }

    /** Parses one product file into a Product, always closing the file. */
    private Product readProduct(File productfile)throws Exception
    {
        BufferedReader reader=new BufferedReader(new FileReader(productfile));
        try
        {
            String url=reader.readLine();
            String name=reader.readLine();
            String type=reader.readLine();
            StringBuffer content=new StringBuffer();
            String line=reader.readLine();
            // Content runs until the separator line (or the end of the file).
            while(line!=null&&!line.equals(Extractor.SEPARATOR))
            {
                content.append(line).append(LINE_BREAK);
                line=reader.readLine();
            }
            String imageURI=reader.readLine();
            String contentstr=content.toString();

            Product p=new Product();
            p.setCategory("手機");
            p.setName(name);
            p.setType(type);
            p.setImageURI(imageURI);
            p.setOriginalUrl(url);
            p.setContent(contentstr);
            p.setSummary(summarize(contentstr));
            p.setUpdatetime(extractUpdatetime(productfile.getName()));
            return p;
        }
        finally
        {
            reader.close();
        }
    }

    /** Returns at most SUMMARY_LENGTH leading characters (the listing kept only 79 when truncating). */
    private static String summarize(String contentstr)
    {
        if(contentstr.length()>SUMMARY_LENGTH)
        {
            return contentstr.substring(0,SUMMARY_LENGTH);
        }
        return contentstr;
    }

    /**
     * Returns the text between the last "-" and the extension of the file name;
     * tolerates names that have no extension or no "-".
     */
    private static String extractUpdatetime(String fname)
    {
        int end=fname.lastIndexOf(".");
        if(end<0)
        {
            end=fname.length();
        }
        // Search backwards from the extension so a "-" after the dot cannot invert the range.
        int start=fname.lastIndexOf("-",end)+1;
        return fname.substring(start,end);
    }

    /** Stores the product in the database and returns the id reported for it. */
    protected int insert2DB(Product p)throws Exception
    {
        return productJDBC.addProduct(p);
    }

    /** Adds the product to the index under the given id. */
    protected void buildIndex(Product p,int nextid)throws Exception
    {
        indexer.addProduct(p,nextid);
    }

    /** Optimizes the index. */
    private void optimizeindex()throws Exception
    {
        indexer.optimizeIndex();
    }

    /** Closes the indexer and clears the reference so initialize() can reopen it. */
    private void closeIndex()throws Exception
    {
        if(indexer!=null)
        {
            indexer.close();
            indexer=null;
        }
    }

    /** Closes the database helper and clears the reference so initialize() can reopen it. */
    private void closeDB()
    {
        if(productJDBC!=null)
        {
            productJDBC.close();
            productJDBC=null;
        }
    }

    /** Entry point: imports the product files found in the hard-coded directory. */
    public static void main(String[] args) throws Exception
    {
        // The constructor already calls initialize(); no second call is needed.
        ProductTextFileProcessor pro=new ProductTextFileProcessor();
        String path1="c://product//mobile//";
        pro.setDirectionaries(new String[]{path1});
        pro.process();
    }
}
本文來自CSDN博客,轉載請標明出處:http://blog.csdn.net/caoxu1987728/archive/2008/07/18/2673492.aspx