python一步一步实现简单的搜索引擎

文章目录

待测试文档：

1.txt

I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. I have a dream today.

2.txt

This is our hope. . . With this faith we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. . . .

3.txt

And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God’s children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: “Free at last! Free at last! Thank God Almighty, we are free at last!”

1. 简单的全文匹配

将待查找词语直接在所有文本中进行全文搜索

import re

class SearchEngineBase(object):
    """Base class that loads corpus files and defines the engine interface.

    Subclasses must override processCorpus() and search().
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one corpus file and pass its text to processCorpus().

        The file path itself is used as the document id.

        Args:
            filePath: path of the text file to index.
            encoding: encoding used to decode the file. It is explicit
                (UTF-8 by default) so that non-ASCII text, such as the
                curly quotes in the sample documents, decodes the same
                way on every platform instead of depending on the locale.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index one document; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is still an Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids matching query; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Plain full-text matching engine
class TextSearchEngine(SearchEngineBase):
    """Keep every document's raw text and answer queries by substring test."""

    def __init__(self):
        super(TextSearchEngine, self).__init__()
        # document id -> full text of that document
        self.__idToTexts = {}

    def processCorpus(self, id, text):
        """Remember text under its document id."""
        self.__idToTexts[id] = text

    def search(self, query):
        """Return the ids of all stored documents whose text contains query."""
        return [
            docId
            for docId, body in self.__idToTexts.items()
            if query in body
        ]
    
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries typed on stdin.

    The loop ends cleanly when input is exhausted (EOF) or the user
    presses Ctrl-C, instead of terminating with a traceback.

    Args:
        engine: object providing addCorpus(filePath) and search(query).
        filePaths: corpus files to index; defaults to the three sample
            documents.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and stop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry: build the full-text engine and start the interactive loop.
engine = TextSearchEngine()
main(engine)

2. 词袋模型

将每个文本中的词语去重后放入词袋（集合），查询时再用待查找的词语在各个文本的词袋中逐一查找。这样做是因为一个文本中往往包含大量重复的单词，直接保存全文会浪费空间。

import re

class SearchEngineBase(object):
    """Base class that loads corpus files and defines the engine interface.

    Subclasses must override processCorpus() and search().
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one corpus file and pass its text to processCorpus().

        The file path itself is used as the document id.

        Args:
            filePath: path of the text file to index.
            encoding: encoding used to decode the file. It is explicit
                (UTF-8 by default) so that non-ASCII text, such as the
                curly quotes in the sample documents, decodes the same
                way on every platform instead of depending on the locale.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index one document; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is still an Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids matching query; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Bag-of-words engine
class BowEngine(SearchEngineBase):
    """Index each document as the set of distinct words it contains."""

    def __init__(self):
        super(BowEngine, self).__init__()
        # document id -> set of words appearing in that document
        self.__idToWords = {}

    def processCorpus(self, id, text):
        """Store the word set of text under its document id."""
        bag = self.textToWords(text)
        self.__idToWords[id] = bag

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words found in text.

        Every character that is not a word character is turned into a
        space first; the empty pieces produced by consecutive spaces are
        discarded.
        """
        spaced = re.sub(r'[^\w]',' ', text)
        return {piece for piece in spaced.split(' ') if piece}

    def search(self,querys):
        """Return the ids of documents containing every word of querys."""
        wanted = self.textToWords(querys)
        matches = []
        for docId, bag in self.__idToWords.items():
            # A document matches only if no query word is missing from it.
            if wanted.issubset(bag):
                matches.append(docId)
        return matches
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries typed on stdin.

    The loop ends cleanly when input is exhausted (EOF) or the user
    presses Ctrl-C, instead of terminating with a traceback.

    Args:
        engine: object providing addCorpus(filePath) and search(query).
        filePaths: corpus files to index; defaults to the three sample
            documents.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and stop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry: build the bag-of-words engine and start the interactive loop.
engine = BowEngine()
main(engine)

3. 词袋倒排索引模型

上面的做法是逐个文件检测查询词是否出现；这里改为建立倒排索引，即记录每个词语出现在哪些文件中，查询时对各个查询词对应的文件列表求交集。

import re

class SearchEngineBase(object):
    """Base class that loads corpus files and defines the engine interface.

    Subclasses must override processCorpus() and search().
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one corpus file and pass its text to processCorpus().

        The file path itself is used as the document id.

        Args:
            filePath: path of the text file to index.
            encoding: encoding used to decode the file. It is explicit
                (UTF-8 by default) so that non-ASCII text, such as the
                curly quotes in the sample documents, decodes the same
                way on every platform instead of depending on the locale.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index one document; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is still an Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids matching query; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Bag-of-words inverted-index engine
class BOWInvertedIndexEngine(SearchEngineBase):
    """Map each word to the documents containing it and intersect on query."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # word -> list of ids of the documents containing it, in the order
        # the documents were indexed
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Add document id to the posting list of every word in text."""
        for word in self.textToWords(text):
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of documents containing every word of query.

        Results follow the order in which the documents were indexed.
        A query without any word (blank or punctuation only) returns an
        empty list.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # The previous pointer-merge read currentIds[0] of an empty
            # list here and raised IndexError.
            return []

        postings = []
        for word in queryWords:
            # If one word is indexed nowhere, no document can match.
            if word not in self.invertedIndex:
                return []
            postings.append(self.invertedIndex[word])

        # Walk the shortest posting list and keep the ids present in all
        # the others. Unlike a sorted-list merge, this does not require
        # documents to have been indexed in ascending id order.
        postings.sort(key=len)
        shortest = postings[0]
        others = [set(docIds) for docIds in postings[1:]]
        return [
            docId
            for docId in shortest
            if all(docId in other for other in others)
        ]

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words found in text.

        Every character that is not a word character is turned into a
        space first; empty pieces between consecutive spaces are dropped.
        """
        text = re.sub(r'[^\w]', ' ', text)
        return set(filter(None, text.split(' ')))
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries typed on stdin.

    The loop ends cleanly when input is exhausted (EOF) or the user
    presses Ctrl-C, instead of terminating with a traceback.

    Args:
        engine: object providing addCorpus(filePath) and search(query).
        filePaths: corpus files to index; defaults to the three sample
            documents.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and stop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry: build the inverted-index engine and start the interactive loop.
engine = BOWInvertedIndexEngine()
main(engine)

4. 增加缓存

对重复出现的查询，可以把结果缓存起来以避免重复计算。这里借助第三方库 pylru 提供的 LRU 缓存做一个简单示意。

import re
import pylru

class SearchEngineBase(object):
    """Base class that loads corpus files and defines the engine interface.

    Subclasses must override processCorpus() and search().
    """

    def __init__(self):
        pass

    def addCorpus(self, filePath, encoding='utf-8'):
        """Read one corpus file and pass its text to processCorpus().

        The file path itself is used as the document id.

        Args:
            filePath: path of the text file to index.
            encoding: encoding used to decode the file. It is explicit
                (UTF-8 by default) so that non-ASCII text, such as the
                curly quotes in the sample documents, decodes the same
                way on every platform instead of depending on the locale.
        """
        with open(filePath, 'r', encoding=encoding) as fin:
            text = fin.read()
        self.processCorpus(filePath, text)

    def processCorpus(self, id, text):
        """Index one document; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # NotImplementedError is still an Exception, so callers that
        # catch Exception keep working.
        raise NotImplementedError('processCorpus not implemented')

    def search(self, query):
        """Return the ids matching query; must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('search not implemented')
        
# Bag-of-words inverted-index engine
class BOWInvertedIndexEngine(SearchEngineBase):
    """Map each word to the documents containing it and intersect on query."""

    def __init__(self):
        super(BOWInvertedIndexEngine, self).__init__()
        # word -> list of ids of the documents containing it, in the order
        # the documents were indexed
        self.invertedIndex = {}

    def processCorpus(self, id, text):
        """Add document id to the posting list of every word in text."""
        for word in self.textToWords(text):
            self.invertedIndex.setdefault(word, []).append(id)

    def search(self, query):
        """Return the ids of documents containing every word of query.

        Results follow the order in which the documents were indexed.
        A query without any word (blank or punctuation only) returns an
        empty list.
        """
        queryWords = self.textToWords(query)
        if not queryWords:
            # The previous pointer-merge read currentIds[0] of an empty
            # list here and raised IndexError.
            return []

        postings = []
        for word in queryWords:
            # If one word is indexed nowhere, no document can match.
            if word not in self.invertedIndex:
                return []
            postings.append(self.invertedIndex[word])

        # Walk the shortest posting list and keep the ids present in all
        # the others. Unlike a sorted-list merge, this does not require
        # documents to have been indexed in ascending id order.
        postings.sort(key=len)
        shortest = postings[0]
        others = [set(docIds) for docIds in postings[1:]]
        return [
            docId
            for docId in shortest
            if all(docId in other for other in others)
        ]

    @staticmethod
    def textToWords(text):
        """Return the set of distinct words found in text.

        Every character that is not a word character is turned into a
        space first; empty pieces between consecutive spaces are dropped.
        """
        text = re.sub(r'[^\w]', ' ', text)
        return set(filter(None, text.split(' ')))
    
class LRUCache(object):
    """Small has/get/set facade over a pylru.lrucache instance."""

    def __init__(self, size=32):
        # size is handed straight to pylru.lrucache.
        store = pylru.lrucache(size)
        self.cache = store

    def has(self, key):
        """Tell whether key is currently held in the cache."""
        present = key in self.cache
        return present

    def get(self, key):
        """Return the value stored under key, looked up by subscript."""
        value = self.cache[key]
        return value

    def set(self, key, value):
        """Store value under key."""
        self.cache[key] = value
        
class BOWInvertedIndexEngineWithCache(BOWInvertedIndexEngine, LRUCache):
    """Inverted-index engine that remembers query results in an LRU cache."""

    def __init__(self):
        super(BOWInvertedIndexEngineWithCache, self).__init__()
        # SearchEngineBase.__init__ does not chain to the next class, so
        # the cache has to be initialised explicitly.
        LRUCache.__init__(self)

    def processCorpus(self, id, text):
        """Index the document and drop every cached result.

        Results cached before this document was indexed may no longer be
        complete, so the whole cache is cleared.
        """
        super(BOWInvertedIndexEngineWithCache, self).processCorpus(id, text)
        self.cache.clear()

    def search(self, query):
        """Return the matching ids, served from the cache when possible.

        Lists are copied on the way into and out of the cache so that a
        caller mutating a result cannot corrupt later answers.
        """
        if self.has(query):
            return list(self.get(query))
        result = super(BOWInvertedIndexEngineWithCache, self).search(query)
        self.set(query, list(result))
        return result
    
def main(engine, filePaths=('1.txt', '2.txt', '3.txt')):
    """Index the corpus files, then answer queries typed on stdin.

    The loop ends cleanly when input is exhausted (EOF) or the user
    presses Ctrl-C, instead of terminating with a traceback.

    Args:
        engine: object providing addCorpus(filePath) and search(query).
        filePaths: corpus files to index; defaults to the three sample
            documents.
    """
    for filePath in filePaths:
        engine.addCorpus(filePath)

    while True:
        try:
            query = input('searching the words: ')
        except (EOFError, KeyboardInterrupt):
            # No more input: finish the prompt line and stop.
            print()
            return
        results = engine.search(query)
        print('found {} results:'.format(len(results)))
        for item in results:
            print(item)
    
# Script entry: build the cached inverted-index engine and start the interactive loop.
engine = BOWInvertedIndexEngineWithCache()
main(engine)

转自：https://www.zhenxiangsimple.com/2020/03/09/tech/python-spider/

python一步一步实现简单的搜索引擎

文章目录

待测试文档：

1. 简单的全文匹配

2. 词袋模型

3. 词袋倒排索引模型

4. 增加缓存

項目中常用的審計類型概述

4種解法 - 最小的k個數

python一步一步實現簡單的搜索引擎

項目過程發生變更的處理流程

兩種解法 - 判斷字符串的子串

Mac下配置sublime實現LaTeX

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結