Lucene.Net 全文索引筆記

Lucene.Net用了又忘...由於現在信息量爆炸，用過的東西用完就忘，只好自己寫個筆記來記錄一下了...

1：需要DLL

Lucene.Net.dll

PanGu.dll

PanGu.HighLight.dll

PanGu.Lucene.Analyzer.dll

沒有的話，可以去我的資源包裏面下，地址如下： http://download.csdn.net/download/kimizhou_blog/10016313

2：生成索引

// Physical folder (under App_Data) in which all index files are kept.
string indexPath = Context.Server.MapPath("~/App_Data/IndexData");
// Sub-folder that holds the points-mall product index.
string commonProductIndexPath = indexPath + "/" + "commonProduct";
// Build the points-mall product index in that folder.
CreateCommonProductIndex(commonProductIndexPath);

然後看看CreateCommonProductIndex方法

      /// <summary>
        /// Rebuilds the points-mall product index from scratch in the given folder.
        /// </summary>
        /// <remarks>
        /// Every existing document is deleted and one document per product returned by
        /// <c>Product.GetProductListByIndex()</c> is written. If anything fails while the
        /// index is being rebuilt, the pending changes are rolled back so the previously
        /// committed index stays searchable, and the writer and directory are released.
        /// </remarks>
        /// <param name="indexPath">Folder in which the Lucene index files are stored.</param>
        private void CreateCommonProductIndex(string indexPath)
        {
            // Bind the index folder.
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
            try
            {
                bool isExist = IndexReader.IndexExists(directory);
                // Force-release a write lock left on an existing index; otherwise
                // opening the writer below would fail.
                if (isExist && IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }

                // Create a new index only when none exists yet (third argument).
                IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isExist, IndexWriter.MaxFieldLength.UNLIMITED);
                try
                {
                    writer.DeleteAll(); // drop the previous contents before re-adding everything
                    IList<ProductInfoByIndex> list = Product.GetProductListByIndex();
                    foreach (var item in list)
                    {
                        Document document = new Document();
                        // Field values are always stored as strings. The id is an exact key that is
                        // also used for sorting, so it is indexed as a single, un-tokenized term.
                        document.Add(new Field("id", item.ProductID.ToString(), Field.Store.YES, Field.Index.NOT_ANALYZED));
                        // string.Format turns a null product name into an empty string.
                        string content = string.Format("{0}", item.ProductName);
                        document.Add(new Field("Content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));
                        writer.AddDocument(document); // write the document into the index
                    }
                    writer.Close(); // commits the changes and releases the write lock
                }
                catch
                {
                    // Discard the half-built state and release the write lock.
                    writer.Rollback();
                    throw;
                }
            }
            finally
            {
                directory.Close(); // always release the directory, even on failure
            }
        }

其中
IList<ProductInfoByIndex> list = Product.GetProductListByIndex();方法是去數據庫中讀取這個list對象，這裏代碼就不貼出來了。到這裏你的索引已經創建出來了，那麼接下來需要查詢和顯示

查詢是最困難的，各種匹配

3：查詢索引並且顯示出來

GetProductIndex方法就是獲取索引代碼如下：

    /// <summary>
        /// Searches the points-mall product index for the request's "SearchKey" and
        /// publishes the matches.
        /// </summary>
        /// <remarks>
        /// The keyword is split into words by <c>SplitContent.SplitWords</c>; every word must
        /// occur somewhere in the "Content" field (wildcard match, comparable to SQL
        /// <c>LIKE '%word%'</c>). Hits are sorted by "id" in descending order, stored in
        /// <c>productResultList</c>, and the total hit count is appended to <c>Message</c>.
        /// </remarks>
        private void GetProductIndex()
        {
            string indexPath = Context.Server.MapPath("~/App_Data/IndexData"); // folder holding all index files
            string commonProductIndexPath = string.Format("{0}/{1}", indexPath, "commonProduct"); // points-mall product index
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(commonProductIndexPath), new NoLockFactory());
            try
            {
                IndexReader reader = IndexReader.Open(directory, true); // true = read-only
                try
                {
                    IndexSearcher searcher = new IndexSearcher(reader);

                    BooleanQuery bQuery = new BooleanQuery();
                    foreach (string word in SplitContent.SplitWords(Request["SearchKey"]))
                    {
                        Query queryUseringNatrue = new WildcardQuery(new Term("Content", "*" + word + "*"));
                        bQuery.Add(queryUseringNatrue, BooleanClause.Occur.MUST); // MUST = every word is required
                    }

                    // The id is an integer, so sort it as INT (FLOAT loses precision on large ids).
                    // The final 'true' requests descending order.
                    Sort sort = new Sort(new SortField("id", SortField.INT, true));
                    TopDocs docs = searcher.Search(bQuery, (Filter)null, 9999999, sort);

                    List<ProductInfoByIndex> proList = new List<ProductInfoByIndex>();
                    // Walk the hits that were actually returned: scoreDocs can be shorter than totalHits.
                    foreach (ScoreDoc hit in docs.scoreDocs)
                    {
                        Document doc = searcher.Doc(hit.doc);
                        ProductInfoByIndex product = new ProductInfoByIndex();
                        product.ProductID = System.Convert.ToInt32(doc.Get("id"));
                        product.ProductName = doc.Get("Content");
                        //product.ProductName = SplitContent.HightLight(Request["SearchKey"], doc.Get("Content"));
                        proList.Add(product);
                    }
                    productResultList = proList;
                    this.Message += string.Format("|{0}條積分商城產品", docs.totalHits);

                    searcher.Close();
                }
                finally
                {
                    // The searcher was built on an existing reader, so the reader is closed here.
                    reader.Close();
                }
            }
            finally
            {
                directory.Close();
            }

            // Alternative implementation, kept for reference: a PhraseQuery with slop,
            // with the keyword highlighted by the PanGu highlighter.
            //
            //PhraseQuery query = new PhraseQuery();
            //foreach (string word in SplitContent.SplitWords(Request["SearchKey"]))
            //{
            //    query.Add(new Term("Content", word));
            //}
            //query.SetSlop(100);
            //TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
            //searcher.Search(query, null, collector);
            //ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

            //List<ProductInfoByIndex> proList = new List<ProductInfoByIndex>();
            //for (int i = 0; i < docs.Length; i++)
            //{
            //    int docId = docs[i].doc; // id of the hit (assigned internally by Lucene)
            //    Document doc = searcher.Doc(docId); // load the Document for that id
            //    ProductInfoByIndex product = new ProductInfoByIndex();
            //    product.ProductID = System.Convert.ToInt32(doc.Get("id"));
            //    //book.ContentDescription = doc.Get("content"); // without highlighting
            //    // highlight the search keyword with the PanGu highlight plug-in
            //    product.ProductName = SplitContent.HightLight(Request["SearchKey"], doc.Get("Content"));
            //    proList.Add(product);
            //}
            //    productResultList = proList;
            //this.Message += string.Format("|{0}條積分商城產品", docs.Length);
        }

其中我註釋掉的，是另外一種方法，這裏我用的是效率比較慢的模糊查詢
Query queryUseringNatrue = new WildcardQuery(new Term("Content", "*" + word + "*"));
這個類似數據庫的like '%關鍵字%'
到這裏就已經獲取到了所有的索引資料了，是不是很簡單，你get到了嗎？最後我再給大家介紹索引的幾種查詢方式：
第1種：

  //string keyWordUseringNatrue = "營運";
            //if (!string.IsNullOrWhiteSpace(keyWordUseringNatrue))
            //{
            //    QueryParser parseUseringNatrue = new QueryParser("UseringNatrue", new PanGuAnalyzer());
            //    parseUseringNatrue.SetDefaultOperator(QueryParser.Operator.AND);//必須在Parse之前設置，否則對本次解析不生效
            //    Query query = parseUseringNatrue.Parse(keyWordUseringNatrue);
            //    bQuery.Add(query, BooleanClause.Occur.MUST);
            //}
            //營運
            //Query queryUseringNatrue = new WildcardQuery(new Term("UseringNatrue", "營運")); 
            //bQuery.Add(queryUseringNatrue, BooleanClause.Occur.MUST);// MUST 必須

這個查詢是什麼呢？這是一般的分詞查詢，會查詢出與“營運”相關的結果。但是它和like不一樣，結果跟分詞有關：比如說，搜索“愛”就查詢不出“可愛”，因爲pangu有自己的分詞規則。這種方式比較常用，下面彙總一下其它的查詢：
其它查詢彙總：

//介紹各種Query
            //TermQuery： 首先介紹最基本的查詢，如果你想執行一個這樣的查詢：“在content字段中查詢包含‘劉備’的document”，那麼你可以用TermQuery：
            // Term t = new Term("content", "劉備");
            // Query query = new TermQuery(t);

            //BooleanQuery ：如果你想這麼查詢：“在content字段中包含‘劉備’並且在title字段包含‘三國’的document”，那麼你可以建立兩個TermQuery並把它們用BooleanQuery連接起來（“並且”對應Occur.MUST；如果是“或者”則用Occur.SHOULD）：
            //1             TermQuery termQuery1 = new TermQuery(new Term("content", "劉備"));
            //2             TermQuery termQuery2 = new TermQuery(new Term("title", "三國"));
            //3             BooleanQuery booleanQuery = new BooleanQuery();
            //4             booleanQuery.Add(termQuery1, BooleanClause.Occur.MUST);
            //5             booleanQuery.Add(termQuery2, BooleanClause.Occur.MUST);

            //WildcardQuery ：如果你想對某單詞進行通配符查詢，你可以用WildcardQuery，通配符包括’?’匹配一個任意字符和’*’匹配零個或多個任意字符，例如你搜索’三國*’，你可能找到’三國演義’或者’三國志’：
            //1             Query query = new WildcardQuery(new Term("content", "三國*"));

            //PhraseQuery ：你可能對中日關係比較感興趣，想查找‘中’和‘日’挨得比較近（5個字的距離內）的文章，超過這個距離的不予考慮，你可以：
            //1             PhraseQuery query = new PhraseQuery();
            //2             query.SetSlop(5);
            //3             query.Add(new Term("content", "中"));
            //4             query.Add(new Term("content", "日"));

            //那麼它可能搜到“中日合作……”、“中方和日方……”，但是搜不到“中國某高層領導說日本欠扁”
            //PrefixQuery ：如果你想搜以‘中’開頭的詞語，你可以用PrefixQuery：
            //1             PrefixQuery query = new PrefixQuery(new Term("content", "中"));

            //FuzzyQuery ：FuzzyQuery用來搜索相似的term，使用Levenshtein算法。假設你想搜索跟‘wuzza’相似的詞語，你可以：
            //1             Query query = new FuzzyQuery(new Term("content", "wuzza"));
            //你可能得到‘fuzzy’和‘wuzzy’。

            //RangeQuery： 另一個常用的Query是RangeQuery，你也許想搜索時間域從20060101到20060130之間的document，你可以用RangeQuery：
            //1             RangeQuery query = new RangeQuery(new Term("time","20060101"), new Term("time","20060130"), true);
            //最後的true表示用閉合區間。

因爲各個版本，他們使用的都不太一樣，下面介紹一種常用的讀取以後顯示的方式，其中Sort就是排序

// Time the search. StartNew() returns a stopwatch that is already running;
// a stopwatch created with "new Stopwatch()" and never started always reports 0 ms.
Stopwatch stopwatch = Stopwatch.StartNew();
// Sort by the "CarPrice" field (price) parsed as float; the final 'true' requests descending order.
// (SortField.DOC, by contrast, sorts by document number, i.e. index order.)
Sort sort = new Sort(new SortField("CarPrice", SortField.FLOAT, true));
TopDocs docs = searcher.Search(bQuery, (Filter)null, 9999999, sort);
stopwatch.Stop();
long lSearchTime = stopwatch.ElapsedMilliseconds; // time spent searching, in milliseconds

List<CarSourceInfoByIndex> carSourceResult = new List<CarSourceInfoByIndex>();
// Walk the hits that were actually returned: scoreDocs can be shorter than totalHits.
foreach (ScoreDoc hit in docs.scoreDocs)
{
    Document doc = searcher.Doc(hit.doc);
    CarSourceInfoByIndex carSource = new CarSourceInfoByIndex()
    {
        Id = int.Parse(doc.Get("Id")),
        CarPrice = System.Convert.ToDouble(doc.Get("CarPrice")),
        // Highlight the search keyword inside the stored content.
        Recommended = SplitContent.HightLight(Request["SearchKey"], doc.Get("Content"))
    };
    carSourceResult.Add(carSource);
}
carSourceResultList2 = carSourceResult;
this.Message += string.Format("{0}條測試", docs.totalHits);

就到這裏了，不懂的可以加我QQ 10200454諮詢

Lucene.Net 全文索引筆記

sql to linq 之存儲過程偏

C# wcf 添加引用的時候 "無從http://XXX/XXX.svc?wsdl獲取元數據”錯誤的解決方法

C# Windows服務的開發和部署,調試（運用Timer）

正則在C#中的使用

power導出HTML模板

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結