python一步一步實現簡單的搜索引擎


待測試文檔:

1.txt

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.

2.txt

This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. . . .

3.txt

And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God’s children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: “Free at last! Free at last! Thank God Almighty, we are free at last!”

1. 簡單的全文匹配

將待查找詞語直接在所有文本中進行全文搜索

import re

class SearchEngineBase(object):
    """Common skeleton shared by the search engines in this script.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying). ``addCorpus`` reads a file and hands its text to
    ``processCorpus``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read ``filePath`` and index its content under that path.

        Args:
            filePath: Path of the text file to load. It is also used as the
                document id passed to ``processCorpus``.
            encoding: Text encoding used to decode the file. It is given
                explicitly so decoding does not depend on the platform's
                locale default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        # NotImplementedError is still an Exception, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the documents matching ``query``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Plain full-text matching model
class TextSearchEngine(SearchEngineBase):
    """Engine that keeps every document's raw text and matches by substring."""

    def __init__(self):
        super(TextSearchEngine, self).__init__()
        # Document id -> full text of that document.
        self.__idToTexts = {}

    def processCorpus(self, id, text):
        """Store ``text`` unchanged under document ``id``."""
        self.__idToTexts[id] = text

    def search(self, query):
        """Return the ids of all stored documents whose text contains ``query``."""
        return [
            docId
            for docId, body in self.__idToTexts.items()
            if query in body
        ]
    
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the given files, then answer queries typed on stdin.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Files to index before prompting. Defaults to the three
            sample documents.

    The prompt loop runs until input ends (EOF) or the user presses Ctrl-C.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: leave the loop instead of dying with a traceback.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry point: build the full-text engine and start the prompt loop.
engine = TextSearchEngine()
main(engine)

在這裏插入圖片描述

2. 詞袋模型

將每個文本中的詞語去重後放入詞袋(集合),查詢時在各個文本的詞袋中查找待查詞語。這樣做是因爲一個文本往往包含大量重複的單詞,保存全文會浪費空間。

import re

class SearchEngineBase(object):
    """Common skeleton shared by the search engines in this script.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying). ``addCorpus`` reads a file and hands its text to
    ``processCorpus``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read ``filePath`` and index its content under that path.

        Args:
            filePath: Path of the text file to load. It is also used as the
                document id passed to ``processCorpus``.
            encoding: Text encoding used to decode the file. It is given
                explicitly so decoding does not depend on the platform's
                locale default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        # NotImplementedError is still an Exception, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the documents matching ``query``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Bag-of-words model
class BowEngine(SearchEngineBase):
    """Engine that stores each document as the set of its distinct words."""

    def __init__(self):
        super(BowEngine, self).__init__()
        # Document id -> set of words occurring in that document.
        self.__idToWords = {}

    def processCorpus(self, id, text):
        """Store the word set of ``text`` under document ``id``."""
        self.__idToWords[id] = self.textToWords(text)

    @staticmethod
    def textToWords(text):
        """Split ``text`` on non-word characters and return the distinct words."""
        pieces = re.split(r'[^\w]', text)
        # Adjacent separators yield empty strings; drop them.
        return {piece for piece in pieces if piece}

    def search(self, querys):
        """Return ids of documents whose word set contains every query word."""
        wanted = self.textToWords(querys)
        return [
            docId
            for docId, bag in self.__idToWords.items()
            if wanted.issubset(bag)
        ]
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the given files, then answer queries typed on stdin.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Files to index before prompting. Defaults to the three
            sample documents.

    The prompt loop runs until input ends (EOF) or the user presses Ctrl-C.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: leave the loop instead of dying with a traceback.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry point: build the bag-of-words engine and start the prompt loop.
engine = BowEngine()
main(engine)

在這裏插入圖片描述

3. 詞袋倒排索引模型

上面的做法是逐個文件檢測其中是否包含查詢詞語;這裏改爲建立倒排索引,即記錄每個詞語出現在哪些文件中,查詢時再對各個查詢詞語對應的文件列表求交集。

import re

class SearchEngineBase(object):
    """Common skeleton shared by the search engines in this script.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying). ``addCorpus`` reads a file and hands its text to
    ``processCorpus``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read ``filePath`` and index its content under that path.

        Args:
            filePath: Path of the text file to load. It is also used as the
                document id passed to ``processCorpus``.
            encoding: Text encoding used to decode the file. It is given
                explicitly so decoding does not depend on the platform's
                locale default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        # NotImplementedError is still an Exception, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the documents matching ``query``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Bag-of-words inverted-index model
class BOWInvertedIndexEngine(SearchEngineBase):
    """Engine backed by an inverted index (word -> ids of documents using it).

    A document matches a query when it contains every word of the query.
    """

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Word -> list of document ids containing it, in the order the
        # documents were indexed.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Add every distinct word of ``text`` to the index under ``id``."""
        for word in self.textToWords(text):
            postings = self.invertedIndex.setdefault(word, [])
            # Indexing the same document again must not list it twice.
            if id not in postings:
                postings.append(id)

    def search(self, query):
        """Return the ids of documents that contain all words of ``query``.

        Ids are returned in the order the documents were indexed. A query
        with no words (empty or punctuation only) matches nothing and
        returns an empty list.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            return []

        postingLists = []
        for word in queryWords:
            postings = self.invertedIndex.get(word)
            if postings is None:
                # One missing word means no document can match the query.
                return []
            postingLists.append(postings)

        # Walk the shortest list and test membership in the others. Unlike a
        # pointer merge, this does not require the lists to be sorted, so it
        # stays correct whatever order the documents were added in.
        postingLists.sort(key=len)
        candidates = postingLists[0]
        others = [set(postings) for postings in postingLists[1:]]
        return [
            docId
            for docId in candidates
            if all(docId in ids for ids in others)
        ]

    @staticmethod
    def textToWords(text):
        """Split ``text`` on non-word characters and return the distinct words."""
        # Adjacent separators yield empty strings; drop them.
        return {piece for piece in re.split(r'[^\w]', text) if piece}
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the given files, then answer queries typed on stdin.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Files to index before prompting. Defaults to the three
            sample documents.

    The prompt loop runs until input ends (EOF) or the user presses Ctrl-C.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: leave the loop instead of dying with a traceback.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry point: build the inverted-index engine and start the prompt loop.
engine = BOWInvertedIndexEngine()
main(engine)

在這裏插入圖片描述

4. 增加緩存

可以考慮使用LRU算法,此處只是示意了一個緩存。

import re
import pylru

class SearchEngineBase(object):
    """Common skeleton shared by the search engines in this script.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying). ``addCorpus`` reads a file and hands its text to
    ``processCorpus``.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read ``filePath`` and index its content under that path.

        Args:
            filePath: Path of the text file to load. It is also used as the
                document id passed to ``processCorpus``.
            encoding: Text encoding used to decode the file. It is given
                explicitly so decoding does not depend on the platform's
                locale default.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index ``text`` under document ``id``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        # NotImplementedError is still an Exception, so existing
        # ``except Exception`` handlers keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the documents matching ``query``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Bag-of-words inverted-index model
class BOWInvertedIndexEngine(SearchEngineBase):
    """Engine backed by an inverted index (word -> ids of documents using it).

    A document matches a query when it contains every word of the query.
    """

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Word -> list of document ids containing it, in the order the
        # documents were indexed.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Add every distinct word of ``text`` to the index under ``id``."""
        for word in self.textToWords(text):
            postings = self.invertedIndex.setdefault(word, [])
            # Indexing the same document again must not list it twice.
            if id not in postings:
                postings.append(id)

    def search(self, query):
        """Return the ids of documents that contain all words of ``query``.

        Ids are returned in the order the documents were indexed. A query
        with no words (empty or punctuation only) matches nothing and
        returns an empty list.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            return []

        postingLists = []
        for word in queryWords:
            postings = self.invertedIndex.get(word)
            if postings is None:
                # One missing word means no document can match the query.
                return []
            postingLists.append(postings)

        # Walk the shortest list and test membership in the others. Unlike a
        # pointer merge, this does not require the lists to be sorted, so it
        # stays correct whatever order the documents were added in.
        postingLists.sort(key=len)
        candidates = postingLists[0]
        others = [set(postings) for postings in postingLists[1:]]
        return [
            docId
            for docId in candidates
            if all(docId in ids for ids in others)
        ]

    @staticmethod
    def textToWords(text):
        """Split ``text`` on non-word characters and return the distinct words."""
        # Adjacent separators yield empty strings; drop them.
        return {piece for piece in re.split(r'[^\w]', text) if piece}
    
class LRUCache(object):
    """Small key/value cache facade over a ``pylru.lrucache`` instance."""

    def __init__(self, size=32):
        # Backing store, created with the requested size.
        store = pylru.lrucache(size)
        self.cache = store

    def has(self, key):
        """Return whether ``key`` is currently present in the cache."""
        store = self.cache
        return key in store

    def get(self, key):
        """Return the value stored under ``key`` (plain item lookup)."""
        store = self.cache
        return store[key]

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        store = self.cache
        store[key] = value
        
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that remembers the results of recent queries."""

    def __init__(self):
        super(BOWInvertedIndexEngineWithCache, self).__init__()
        # The engine's __init__ chain does not reach LRUCache, so set up the
        # cache explicitly.
        LRUCache.__init__(self)

    def processCorpus(self, id, text):
        """Index the document, then drop every cached query result."""
        super(BOWInvertedIndexEngineWithCache, self).processCorpus(id, text)
        # Cached answers were computed without this document and may now be
        # incomplete, so they must not be served again.
        self.cache.clear()

    def search(self, query):
        """Return the matching document ids, served from the cache if present.

        The returned list is a fresh copy, so callers may modify it without
        affecting later results for the same query.
        """
        if self.has(query):
            return list(self.get(query))
        result = super(BOWInvertedIndexEngineWithCache, self).search(query)
        self.set(query, list(result))
        return result
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the given files, then answer queries typed on stdin.

    Args:
        engine: Object providing ``addCorpus(path)`` and ``search(query)``.
        filePaths: Files to index before prompting. Defaults to the three
            sample documents.

    The prompt loop runs until input ends (EOF) or the user presses Ctrl-C.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: leave the loop instead of dying with a traceback.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry point: build the cached inverted-index engine and start the prompt loop.
engine = BOWInvertedIndexEngineWithCache()
main(engine)

在這裏插入圖片描述

轉自:https://www.zhenxiangsimple.com/2020/03/09/tech/python-spider/

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章