數據庫與索引結構

轉自 http://blog.csdn.net/caoxu1987728/archive/2008/07/18/2673492.aspx

 

由文章標題可知,我們要建立數據庫和索引。

一,定義Product類    
此類相當於MVC中的容器,裝載了數據庫和索引所需要的對象,例如:category、name、type、content、summary、imageURI、originalUrl、updatedtime。順序沒關係,代碼如下:

package com.luceneheritrixbook.core;

public class Product {
 private String category=null;
 private String name=null;
 private String type=null;
 private String content=null;
 private String summary=null;
 private String imageURI=null;
 private String updatedtime=null;
 private String originalUrl=null;
 public String getCategory() {
  return category;
 }
 public void setCategory(String category) {
  this.category = category;
 }
 public String getContent() {
  return content;
 }
 public void setContent(String content) {
  this.content = content;
 }
 public String getImageURI() {
  return imageURI;
 }
 public void setImageURI(String imageURI) {
  this.imageURI = imageURI;
 }
 public String getName() {
  return name;
 }
 public void setName(String name) {
  this.name = name;
 }
 public String getOriginalUrl() {
  return originalUrl;
 }
 public void setOriginalUrl(String originalUrl) {
  this.originalUrl = originalUrl;
 }
 public String getSummary() {
  return summary;
 }
 public void setSummary(String summary) {
  this.summary = summary;
 }
 public String getType() {
  return type;
 }
 public void setType(String type) {
  this.type = type;
 }
 public String getUpdatetime() {
  return updatedtime;
 }
 public void setUpdatetime(String updatetime) {
  this.updatedtime = updatetime;
 }

}二:定義Lucene的Document格式(即用於搜索的field域)

package com.luceneheritrixbook.index;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import com.luceneheritrixbook.core.Product;

public class ProductDocument {
 private static final String PRODUCT_ID="productid";
 private static final String INDEX_TIME="indextime";
 private static final String PRODUCT_URL="productrul";
 private static final String CATEGORY="category";
 private static final String PRODUCT_NAME="name";
 private static final String PRODUCT_TYPE="type";
 
 public static Document buildProductDocument(Product product,int id)
 {
  Document doc=new Document();
  
  Field identifier=new Field(PRODUCT_ID,id+"",Field.Store.YES,
    Field.Index.UN_TOKENIZED);
  
  long mills=System.currentTimeMillis();
  Field indextime=new Field(INDEX_TIME,mills+"",Field.Store.YES,
    Field.Index.UN_TOKENIZED);
  
  Field producturl=new Field(PRODUCT_URL,product.getOriginalUrl(),Field.Store.YES,
    Field.Index.UN_TOKENIZED);
  
  Field category=new Field(CATEGORY,product.getCategory(),Field.Store.YES,
    Field.Index.TOKENIZED);
  
  Field name=new Field(PRODUCT_NAME,product.getName(),Field.Store.YES,
    Field.Index.TOKENIZED);
  
  Field type=new Field(PRODUCT_TYPE,product.getType(),Field.Store.YES,
    Field.Index.TOKENIZED);
  
  String text=product.getCategory();
  text+=" "+product.getName();
  text+=" "+product.getType();
  Field all=new Field(PRODUCT_ID,text,Field.Store.YES,
    Field.Index.TOKENIZED);
  
  doc.add(identifier);
  doc.add(indextime);
  doc.add(producturl);
  doc.add(category);
  doc.add(name);
  doc.add(type);
  doc.add(all);
  
  return doc;
 }

}

三、對數據庫進行操作(即向數據庫中插入獲得的product對象)

package com.luceneheritrixbook.database;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.Statement;

import com.luceneheritrixbook.core.Product;

public class ProductJDBC {
 private Connection con = null;

 private Statement stmt = null;

 private ResultSet rs = null;

 private PreparedStatement pstmt = null;

 private boolean autoCommit = true;

 public ProductJDBC(String url, String usr, String pwd) throws Exception
 {
  Class.forName("com.mysql.jdbc.Driver").newInstance();
  con = DriverManager.getConnection(url, usr, pwd);
  
  con.setAutoCommit(autoCommit);
 }

 public int addProduct(Product p) throws Exception
 {

  int nextid = getNextId();

  if (nextid < 0) {
   throw new Exception("Can&apos;t get next id.");
  }
  
  String content=p.getContent();
  String summary=p.getSummary();
  String imageURI=p.getImageURI();
  String originalUrl=p.getOriginalUrl();
  String category=p.getCategory();
  String name=p.getName();
  String type=p.getType();
  String updatetime=p.getUpdatetime();
  
  String expr="insert into product(content,abstractcontent,url," +
    "imageurl,category,name,type,updatedtime)values(?,?,?,?,?,?,?,?)";
  
  pstmt=con.prepareStatement(expr);
  
  pstmt.setString(1, content);
  pstmt.setString(2, summary);
  pstmt.setString(3, originalUrl);
  pstmt.setString(4, imageURI);
  pstmt.setString(5, category);
  pstmt.setString(6, name);
  pstmt.setString(7, type);
  pstmt.setString(8, updatetime);
  
  pstmt.execute();
  
  return nextid;
 }
 
 private int getNextId() throws Exception {

  int result = -1;

  String sql = "select max(id)+1 from product";

  stmt = con.createStatement();
  rs = stmt.executeQuery(sql);

  while (rs.next()) {
   result = rs.getInt(1);
  }

  return result;
 }
 
 public void close()
 {
  if(con!=null)
  {
   try
   {
    con.close();
   }
   catch(Exception e)
   {
    e.printStackTrace();
   }
   finally
   {
    con=null;
   }
  }
 }
}
/*
 * 在這裏我發現了一個不好的地方,那就是完全相同的兩個產品信息可以同時存入數據庫
 */

四、對索引進行操作(其實就是把前面所構建的詞庫加入JE分詞,然後連同Document一起加入索引器)

package com.luceneheritrixbook.index;

import java.io.FileReader;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;

import com.luceneheritrixbook.core.Product;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;

public class ProductIndexer
{
     private String indexPath="";
     private IndexWriter writer=null;
     private Analyzer analyzer=null;
     private String dictionary_file=PropertyConfiguration.getWordDictionary();
    
     public ProductIndexer(String indexPath)throws Exception
     {
      this.indexPath=indexPath;
      initialize();
     }
    
     private void initialize() throws Exception
     {
      analyzer=new MMAnalyzer();
      FileReader reader=new FileReader(dictionary_file);
      ((MMAnalyzer)analyzer).addDictionary(reader);
      writer=new IndexWriter(indexPath,analyzer,true);
     }
    
     public void close()
     {
      try
      {
       writer.close();
      }
      catch(Exception e)
      {
       e.printStackTrace();
       writer=null;
      }
     }
    
     public void addProduct(Product product,int id)throws Exception
     {
      writer.addDocument(ProductDocument.buildProductDocument(product,id));
     }
    
     //優化索引
     public void optimizeIndex()throws Exception
     {
      writer.optimize();
     }
}
五、調用數據庫處理類和索引處理類(這是建立數據庫和索引最主要的類,主要過程是這樣的:首先初始化數據庫和索引的實例,然後是從Heritrix中讀取的鏡像網頁,通過File的循環遍歷從中讀取每一個product的詳細信息,然後生成一個Product對象,這樣通過參數product就可以把數據存入數據庫和索引了)代碼如下:

package com.luceneheritrixbook.core;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;

public class ProductTextFileProcessor
{
 /**
  * @param args
  */
 private String[] directionaries;
 
 private static final String dbUrl=PropertyConfiguration.getDBUrl();
 private static final String dbUsr=PropertyConfiguration.getDBUsr();
 private static final String dbPwd=PropertyConfiguration.getDBPwd();
 private static final String indexPath=PropertyConfiguration.getIndexStorePath();
 
 private ProductJDBC productJDBC=null;
 private ProductIndexer indexer=null;
 
 public final static int SUMMARY_LENGTH=80;//內容簡介的最大數量
 
 public ProductTextFileProcessor()
 {
  initialize();
 }
 
 public void initialize()
 {
  try
  {
   productJDBC=new ProductJDBC(dbUrl,dbUsr,dbPwd);
   indexer=new ProductIndexer(indexPath);
  }
  catch(Exception e)
  {
   e.printStackTrace();
  } 
 }
 
 public void setDirectionaries(String[] directionaries)
 {
  this.directionaries=directionaries;
 }
 
 protected void process()throws Exception
 {
  if(productJDBC==null)
  {
   throw new Exception("Database connection failed,pls retry");
  }
  
  if(indexer==null)
  {
   throw new Exception("Lucene index failed,pls retry");
  }
  
  if(directionaries==null||directionaries.length==0)
  {
   System.out.print("失敗了");
   return;
  }
  
  try
  {
   for(int i=0;iSUMMARY_LENGTH)
   {
    p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
   }
   else
    p.setSummary(contentstr);

   p.setUpdatetime(updatetime);
   //以上一個product對象已存在
   //先存入數據庫,然後h獲得返回的id值;
   int nextid=insert2DB(p);//這裏出現了錯誤,其實還是ProductJDBC.java裏面出現了錯誤
   
   //用剛返回的id值,向索引中加入Product對象
   buildIndex(p,nextid); 
  }
  //索引優化
   optimizeindex();
   /*這只是一個函數,不能直接用來優化索引,不知道爲什麼
    * 不直接用indexer.optimizeIndex();
    */
 }
 
 protected int insert2DB(Product p)throws Exception
 {
  return productJDBC.addProduct(p);
 }
 
 protected void buildIndex(Product p,int nextid)throws Exception
 {
  indexer.addProduct(p,nextid);
 }
 //優化所以你
 private void optimizeindex()throws Exception
 {
  indexer.optimizeIndex();
 }
 
 private void closeIndex()throws Exception
 {
  indexer.close();
 }
 
 private void closeDB()
 {
  productJDBC.close();
 }
 
 /* public String getDbPwd()
 {
  return dbPwd;
 }
 
 public String getDbUrl()
 {
  return dbUrl;
 }
 
 public String getDbUsr()
 {
  return dbUsr;
 }
 
 public String getIndexPath()
 {
  return indexPath;
 }*/
 //上述方法書上有,但我看來看去,發現它好像也沒什麼用,就暫時給凍結了,好像也沒報錯。
 
 public static void main(String[] args) throws Exception
 {
  // TODO Auto-generated method stub
  ProductTextFileProcessor pro=new ProductTextFileProcessor();
  pro.initialize();//前面已經有了,不知道是不是多此一舉。
  
  String path1="c://product//mobile//";
  pro.setDirectionaries(new String[]{path1});//這句到底是什麼意思
       
  pro.process();
 }
}數據庫還好說,以後肯定要用到,可是這個索引到底有什麼用啊,好像後面沒用到,不過我猜肯定是我弄</STRONG>錯了,怎麼可能會用不到,開玩笑嘛,等着看吧  ……

注:上面第五部分的代碼有錯誤,修改如下:

package com.luceneheritrixbook.core;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;

import com.luceneheritrixbook.database.ProductJDBC;
import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.index.ProductIndexer;
import com.luceneheritrixbook.searchengine.config.PropertyConfiguration;

public class ProductTextFileProcessor
{
    /**
     * @param args
     */
    private String[] directionaries;
    
    private static final String dbUrl=PropertyConfiguration.getDBUrl();
    private static final String dbUsr=PropertyConfiguration.getDBUsr();
    private static final String dbPwd=PropertyConfiguration.getDBPwd();
    private static final String indexPath=PropertyConfiguration.getIndexStorePath();
    
    private ProductJDBC productJDBC=null;
    private ProductIndexer indexer=null;
    
    public final static int SUMMARY_LENGTH=80;//到底有什麼用呢
    
    public ProductTextFileProcessor()
    {
        initialize();
    }
    
    public void initialize()
    {
        try
        {
            productJDBC=new ProductJDBC(dbUrl,dbUsr,dbPwd);
            indexer=new ProductIndexer(indexPath);
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }   
    }
    
    public void setDirectionaries(String[] directionaries)
    {
        this.directionaries=directionaries;
    }
    
    protected void process()throws Exception
    {
        if(productJDBC==null)
        {
            throw new Exception("Database connection failed,pls retry");
        }
        
        if(indexer==null)
        {
            throw new Exception("Lucene index failed,pls retry");
        }
        
        if(directionaries==null||directionaries.length==0)
        {
            System.out.print("失敗了");
            return;
        }
        
        try
        {
            for(int i=0;i<directionaries.length;i++)
            {
                File f=new File(directionaries[i]);
                traverse(f);
            }
            //處理完成後關閉數據庫
            closeDB();
            
            //處理完成後關閉索引器
            closeIndex();
        }
        catch(Exception e)
        {
            e.printStackTrace();
        }
    }
    
    protected void traverse(File file)throws Exception
    {
        String[] files=file.list();
        for(int i=0;i<files.length;i++)
        {
            File productfile=new File(file,files[i]);
            
            String fname=productfile.getName();
            System.out.println(fname);
            
            BufferedReader reader=new BufferedReader(new FileReader(productfile));
            
            String url=reader.readLine();
            String name=reader.readLine();
            String type=reader.readLine();
            String imageURI="";
            String updatetime=fname.substring(fname.lastIndexOf("-")+1,fname.lastIndexOf("."));
            
            StringBuffer content=new StringBuffer();
            String line=reader.readLine();
            while(line!=null&&!line.equals(Extractor.SEPARATOR))//&&!line.equals(Extractor.SEPARATOR) 難道調用了前面的東西
            {
                content.append(line).append("/r/n");
                line=reader.readLine();
            }
            
            imageURI=reader.readLine();
            
            //生成並設置"一個"product對象
            Product p=new Product();
            p.setCategory("手機");
            p.setName(name);
            p.setType(type);
            p.setImageURI(imageURI);
            //p.setContent(content);//爲什麼會出錯呢?
            p.setOriginalUrl(url);
            
            String contentstr=content.toString();
            p.setContent(contentstr);
            
            if(contentstr.length()>SUMMARY_LENGTH)
            {
                p.setSummary(contentstr.substring(0,SUMMARY_LENGTH-1));
            }
            else
                p.setSummary(contentstr);

            p.setUpdatetime(updatetime);
            //以上一個product對象已存在
            //先存入數據庫,然後h獲得返回的id值;
            int nextid=insert2DB(p);//這裏出現了錯誤,其實還是ProductJDBC.java裏面出現了錯誤
            
            //用剛返回的id值,向索引中加入Product對象
            buildIndex(p,nextid);   
        }
        //索引優化
         optimizeindex();
         /*這只是一個函數,不能直接用來優化索引,不知道爲什麼
          * 不直接用indexer.optimizeIndex();
          */
    }
    
    protected int insert2DB(Product p)throws Exception
    {
        return productJDBC.addProduct(p);
    }
    
    protected void buildIndex(Product p,int nextid)throws Exception
    {
        indexer.addProduct(p,nextid);
    }
    //優化所以你
    private void optimizeindex()throws Exception
    {
        indexer.optimizeIndex();
    }
    
    private void closeIndex()throws Exception
    {
        indexer.close();
    }
    
    private void closeDB()
    {
        productJDBC.close();
    }
    
 /* public String getDbPwd()
    {
        return dbPwd;
    }
    
    public String getDbUrl()
    {
        return dbUrl;
    }
    
    public String getDbUsr()
    {
        return dbUsr;
    }
    
    public String getIndexPath()
    {
        return indexPath;
    }*/
    //上述方法書上有,但我看來看去,發現它好像也沒什麼用,就暫時給凍結了,好像也沒報錯。
    
    public static void main(String[] args) throws Exception
    {
        // TODO Auto-generated method stub
        ProductTextFileProcessor pro=new ProductTextFileProcessor();
        pro.initialize();//前面已經有了,不知道是不是多此一舉。
        
        String path1="c://product//mobile//";
        pro.setDirectionaries(new String[]{path1});//這句到底是什麼意思
        
        pro.process();
    }
}


本文來自CSDN博客,轉載請標明出處:http://blog.csdn.net/caoxu1987728/archive/2008/07/18/2673492.aspx

發佈了20 篇原創文章 · 獲贊 2 · 訪問量 3萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章