python一步一步實現簡單的搜索引擎

文章目錄

待測試文檔：

1.txt

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.

2.txt

This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. . . .

3.txt

And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God’s children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: “Free at last! Free at last! Thank God Almighty, we are free at last!”

1. 簡單的全文匹配

將待查找詞語直接在所有文本中進行全文搜索

import re

class SearchEngineBase(object):
    """Abstract base for the search engines in this tutorial.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying); ``addCorpus`` takes care of reading a document from disk.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one document from disk and hand it to ``processCorpus``.

        Args:
            filePath: Path of the text file; it is also used as the document id.
            encoding: Text encoding of the file. Defaults to UTF-8 so that
                reading does not depend on the platform's locale encoding.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index the document ``text`` under the identifier ``id``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching ``query``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('search not implemented')
        
# 文本對比模型
class TextSearchEngine(SearchEngineBase):
    """Brute-force engine: keeps every document's full text and scans it."""

    def __init__(self):
        super(TextSearchEngine, self).__init__()
        # Maps document id -> complete document text.
        self.__idToTexts = {}

    def processCorpus(self, id, text):
        """Store ``text`` under ``id``, replacing any earlier entry for that id."""
        self.__idToTexts.update({id: text})

    def search(self, query):
        """Return ids of the documents whose text contains ``query`` as a substring."""
        return [docId for docId, body in self.__idToTexts.items() if query in body]
    
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries read from standard input.

    Args:
        engine: Object providing ``addCorpus(filePath)`` and ``search(query)``.
        filePaths: Files to index; defaults to the three sample documents.

    The prompt loop ends cleanly on end-of-input or a keyboard interrupt
    instead of terminating with a traceback.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and leave the loop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Run the demo: build the full-text engine and start the interactive prompt.
engine = TextSearchEngine()
main(engine)

2. 詞袋模型

將每個文本中的詞語去重後放入詞袋（集合），查詢時在各個文本的詞袋中查找待查詞語。這樣做是因爲一個文本中往往有大量重複的單詞，直接保存全文會浪費空間。

import re

class SearchEngineBase(object):
    """Abstract base for the search engines in this tutorial.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying); ``addCorpus`` takes care of reading a document from disk.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one document from disk and hand it to ``processCorpus``.

        Args:
            filePath: Path of the text file; it is also used as the document id.
            encoding: Text encoding of the file. Defaults to UTF-8 so that
                reading does not depend on the platform's locale encoding.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index the document ``text`` under the identifier ``id``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching ``query``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('search not implemented')
        
# 詞袋模型
class BowEngine(SearchEngineBase):
    """Bag-of-words engine: stores each document as the set of its words."""

    def __init__(self):
        super(BowEngine, self).__init__()
        # Maps document id -> set of distinct words found in that document.
        self.__idToWords = {}

    def processCorpus(self, id, text):
        """Record the distinct words of ``text`` under ``id``."""
        bag = self.textToWords(text)
        self.__idToWords[id] = bag

    @staticmethod
    def textToWords(text):
        r"""Return the set of maximal runs of ``\w`` characters in ``text``.

        No case folding is applied, so words differing only in case are distinct.
        """
        return set(re.findall(r'\w+', text))

    def search(self, querys):
        """Return ids of the documents that contain every word of ``querys``."""
        wanted = self.textToWords(querys)
        return [docId for docId, bag in self.__idToWords.items()
                if wanted.issubset(bag)]
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries read from standard input.

    Args:
        engine: Object providing ``addCorpus(filePath)`` and ``search(query)``.
        filePaths: Files to index; defaults to the three sample documents.

    The prompt loop ends cleanly on end-of-input or a keyboard interrupt
    instead of terminating with a traceback.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and leave the loop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Run the demo: build the bag-of-words engine and start the interactive prompt.
engine = BowEngine()
main(engine)

3. 詞袋倒排索引模型

上面的做法是逐個文件檢測查詢詞語是否存在；這裏改爲建立倒排索引，即記錄每個詞語出現在哪些文件中，查詢時對各個查詢詞語對應的文件列表求交集。

import re

class SearchEngineBase(object):
    """Abstract base for the search engines in this tutorial.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying); ``addCorpus`` takes care of reading a document from disk.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one document from disk and hand it to ``processCorpus``.

        Args:
            filePath: Path of the text file; it is also used as the document id.
            encoding: Text encoding of the file. Defaults to UTF-8 so that
                reading does not depend on the platform's locale encoding.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index the document ``text`` under the identifier ``id``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching ``query``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('search not implemented')
        
# 詞袋反向索引模型
class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words engine backed by an inverted index (word -> document ids)."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps word -> list of ids of the documents containing that word,
        # in the order the documents were added.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Append document ``id`` to the posting list of each of its distinct words."""
        for word in self.textToWords(text):
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of the documents containing every word of ``query``.

        Ids are returned in the order the documents were indexed, without
        duplicates. A query with no word characters (empty or punctuation
        only) yields an empty list.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # Nothing to look up; avoids indexing into an empty id list.
            return []

        postingLists = []
        for word in queryWords:
            postings = self.invertedIndex.get(word)
            if postings is None:
                # One missing word means no document can match the whole query.
                return []
            postingLists.append(postings)

        # Walk the shortest posting list and keep the ids present in all the
        # others. This needs neither sorted posting lists nor orderable ids,
        # so the result is correct for any order of addCorpus calls.
        postingLists.sort(key=len)
        candidates = postingLists[0]
        others = [set(postings) for postings in postingLists[1:]]

        result = []
        seen = set()
        for docId in candidates:
            if docId in seen:
                continue
            if all(docId in other for other in others):
                seen.add(docId)
                result.append(docId)
        return result

    @staticmethod
    def textToWords(text):
        r"""Return the set of maximal runs of ``\w`` characters in ``text``."""
        return set(re.findall(r'\w+', text))
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries read from standard input.

    Args:
        engine: Object providing ``addCorpus(filePath)`` and ``search(query)``.
        filePaths: Files to index; defaults to the three sample documents.

    The prompt loop ends cleanly on end-of-input or a keyboard interrupt
    instead of terminating with a traceback.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and leave the loop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Run the demo: build the inverted-index engine and start the interactive prompt.
engine = BOWInvertedIndexEngine()
main(engine)

4. 增加緩存

可以考慮使用LRU算法，此處只是示意了一個緩存。

import re
import pylru

class SearchEngineBase(object):
    """Abstract base for the search engines in this tutorial.

    Subclasses implement ``processCorpus`` (indexing) and ``search``
    (querying); ``addCorpus`` takes care of reading a document from disk.
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one document from disk and hand it to ``processCorpus``.

        Args:
            filePath: Path of the text file; it is also used as the document id.
            encoding: Text encoding of the file. Defaults to UTF-8 so that
                reading does not depend on the platform's locale encoding.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index the document ``text`` under the identifier ``id``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids of the documents matching ``query``.

        Raises:
            NotImplementedError: Always; subclasses must override this method.
        """
        raise NotImplementedError('search not implemented')
        
# 詞袋反向索引模型
class BOWInvertedIndexEngine(SearchEngineBase):
    """Bag-of-words engine backed by an inverted index (word -> document ids)."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # Maps word -> list of ids of the documents containing that word,
        # in the order the documents were added.
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Append document ``id`` to the posting list of each of its distinct words."""
        for word in self.textToWords(text):
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of the documents containing every word of ``query``.

        Ids are returned in the order the documents were indexed, without
        duplicates. A query with no word characters (empty or punctuation
        only) yields an empty list.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # Nothing to look up; avoids indexing into an empty id list.
            return []

        postingLists = []
        for word in queryWords:
            postings = self.invertedIndex.get(word)
            if postings is None:
                # One missing word means no document can match the whole query.
                return []
            postingLists.append(postings)

        # Walk the shortest posting list and keep the ids present in all the
        # others. This needs neither sorted posting lists nor orderable ids,
        # so the result is correct for any order of addCorpus calls.
        postingLists.sort(key=len)
        candidates = postingLists[0]
        others = [set(postings) for postings in postingLists[1:]]

        result = []
        seen = set()
        for docId in candidates:
            if docId in seen:
                continue
            if all(docId in other for other in others):
                seen.add(docId)
                result.append(docId)
        return result

    @staticmethod
    def textToWords(text):
        r"""Return the set of maximal runs of ``\w`` characters in ``text``."""
        return set(re.findall(r'\w+', text))
    
class LRUCache(object):
    """Small has/get/set facade over a ``pylru.lrucache`` created with ``size``."""

    def __init__(self, size=32):
        self.cache = pylru.lrucache(size)

    def has(self, key):
        """Tell whether ``key`` is currently stored in the cache."""
        present = key in self.cache
        return present

    def get(self, key):
        """Return the value stored under ``key`` (plain item lookup on the cache)."""
        value = self.cache[key]
        return value

    def set(self, key, value):
        """Store ``value`` under ``key``."""
        store = self.cache
        store[key] = value
        
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that memoises query results in an LRU cache."""

    def __init__(self):
        super(BOWInvertedIndexEngineWithCache, self).__init__()
        # SearchEngineBase.__init__ does not call super(), so the cooperative
        # chain never reaches LRUCache; initialise the cache explicitly.
        LRUCache.__init__(self)

    def processCorpus(self, id, text):
        """Index the document, then drop cached results, which may now be stale."""
        super(BOWInvertedIndexEngineWithCache, self).processCorpus(id, text)
        self.cache.clear()

    def search(self, query):
        """Return the matching ids for ``query``, served from the cache when possible.

        The cache keeps its own copy of each result list, so callers that
        mutate the returned list cannot corrupt later cache hits.
        """
        if self.has(query):
            return list(self.get(query))
        result = super(BOWInvertedIndexEngineWithCache, self).search(query)
        self.set(query, list(result))
        return result
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries read from standard input.

    Args:
        engine: Object providing ``addCorpus(filePath)`` and ``search(query)``.
        filePaths: Files to index; defaults to the three sample documents.

    The prompt loop ends cleanly on end-of-input or a keyboard interrupt
    instead of terminating with a traceback.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and leave the loop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Run the demo: build the cached inverted-index engine and start the interactive prompt.
engine = BOWInvertedIndexEngineWithCache()
main(engine)

轉自：https://www.zhenxiangsimple.com/2020/03/09/tech/python-spider/

python一步一步實現簡單的搜索引擎

文章目錄

待測試文檔：

1. 簡單的全文匹配

2. 詞袋模型

3. 詞袋倒排索引模型

4. 增加緩存

容器中nginx無法使用同一個網絡下的容器域名

Python: SunMoonTimeCalculator

「Pygors跨平臺GUI」1：Pygors跨平臺GUI應用研究

NETCore中實現一個輕量無負擔的極簡任務調度ScheduleTask

docker使用特定的網絡

使用c#強大的表達式樹實現對象的深克隆之解決循環引用的問題

「Pygors跨平臺GUI」2：安裝MinGW-w64、MSYS2還是WSL2

nodejs學習07——API

避免DbContext同時在多個線程調用

GPT-4o 引領人機交互新風向，向量數據庫賽道沸騰了

項目中常用的審計類型概述

4種解法 - 最小的k個數

python一步一步實現簡單的搜索引擎

項目過程發生變更的處理流程

兩種解法 - 判斷字符串的子串

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結