Build a search service on top of Elasticsearch (ES), with the data stored in MongoDB.
1: Elasticsearch
Download:
Elasticsearch download page: https://www.elastic.co/downloads/elasticsearch
Setup:
Edit elasticsearch-5.5.1/config/elasticsearch.yml:
# cluster name
cluster.name: myElasticsearch
# node name
node.name: node001
# 0.0.0.0 lets other machines reach the node
network.host: 0.0.0.0
# port
http.port: 9200
Start it: elasticsearch-5.5.1/bin/elasticsearch
Browser: 127.0.0.1:9200
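If the node is up, the root endpoint answers with the cluster information as JSON, so the browser check can also be scripted. A minimal sketch in Python, assuming the host/port configured above and that the requests package is installed:

import requests

# the root endpoint returns node name, cluster name, and version as JSON
info = requests.get("http://127.0.0.1:9200").json()
print(info["cluster_name"])  # should print: myElasticsearch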
2: Elasticsearch-head
Edit elasticsearch-5.5.1/config/elasticsearch.yml again:
# new settings so the head plugin can reach ES across origins
http.cors.enabled: true
http.cors.allow-origin: "*"
git clone git://github.com/mobz/elasticsearch-head.git
Install grunt (requires node and npm):
npm install -g grunt-cli
npm install -g grunt
Edit the head source. In elasticsearch-head/Gruntfile.js, add hostname: '*' to the connect.server.options block so the server accepts connections from any host:
connect: {
    server: {
        options: {
            port: 9100,
            hostname: '*',
            base: '.',
            keepalive: true
        }
    }
}
In elasticsearch-head/_site/app.js, change head's connection address:
this.base_uri = this.config.base_uri || this.prefs.get("app-base_uri") || "http://localhost:9200";
Replace localhost with your ES server address, e.g.:
this.base_uri = this.config.base_uri || this.prefs.get("app-base_uri") || "http://x.x.x.x:9200";
Install elasticsearch-head's dependencies (requires node and npm):
cd elasticsearch-head/
npm install
Start it:
grunt server
Browser: 127.0.0.1:9100
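head is essentially a UI over the ES REST API, so the cluster state it shows can also be fetched directly. A quick sketch against the standard _cluster/health endpoint (again assuming requests is installed):

import requests

# the same health summary head displays: status color and node count
health = requests.get("http://127.0.0.1:9200/_cluster/health").json()
print(health["status"], health["number_of_nodes"])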
3: mongo-connector
mongo-connector requires MongoDB to run as a replica set.
Create three data directories, then start three mongod nodes:
# --replSet names the replica set, --port sets the port, --dbpath the data directory
# first node
sudo mongod --dbpath=/Users/zjl/mongodbdata/data1 --port 27018 --replSet rs0
# second node
sudo mongod --dbpath=/Users/zjl/mongodbdata/data2 --port 27019 --replSet rs0
# third node
sudo mongod --dbpath=/Users/zjl/mongodbdata/data3 --port 27020 --replSet rs0
Connect to the first node with the mongo shell and define the replica-set config:
mongo 127.0.0.1:27018
config = {
    "_id": "rs0",
    "members": [
        { "_id": 0, "host": "127.0.0.1:27018" },
        { "_id": 1, "host": "127.0.0.1:27019" },
        { "_id": 2, "host": "127.0.0.1:27020", "arbiterOnly": true }
    ]
}
# arbiterOnly: true makes this node an arbiter; it stores no data and only votes in elections.
# Reportedly of limited use: with an odd number of data-bearing nodes you don't need one.
# initialize the replica set:
rs.initiate(config);
# view the configuration
rs.conf();
# view the status
rs.status();
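The same check can be done from Python with pymongo; a minimal sketch, assuming pymongo is installed and the three nodes above are running:

from pymongo import MongoClient

# connect through the replica set; the driver discovers all members
client = MongoClient("127.0.0.1", 27018, replicaSet="rs0")
print(client.primary)      # e.g. ('127.0.0.1', 27018)
print(client.secondaries)  # e.g. {('127.0.0.1', 27019)}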
https://github.com/mongodb-labs/mongo-connector
pip install 'mongo-connector[elastic5]'
Sync command: mongo-connector -m 127.0.0.1:27018 -t 127.0.0.1:9200 -d elastic2_doc_manager
(-m is the source MongoDB, -t the target Elasticsearch, -d the doc manager.)
If it prints Logging to /xx/xx/mongo-connector.log. everything is working.
Now create a database on the mongo primary (27018, as the shell prompt shows): it is replicated to the secondaries immediately,
and synced to Elasticsearch as well, where a MongoDB database becomes an Elasticsearch index and a collection becomes an Elasticsearch type.
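A minimal end-to-end sketch of that mapping (the database, collection, and field names here are made up for illustration):

from pymongo import MongoClient
from elasticsearch import Elasticsearch
import time

client = MongoClient("127.0.0.1", 27018, replicaSet="rs0")
# writing to the primary creates database "testdb" and collection "people"
client.testdb.people.insert_one({"name": "zjl", "age": 25})

time.sleep(2)  # give mongo-connector a moment to ship the operation
es = Elasticsearch("127.0.0.1:9200")
# the database became the index, the collection became the type
print(es.search(index="testdb", doc_type="people",
                body={"query": {"match": {"name": "zjl"}}}))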
4: Operating Elasticsearch from Python
https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/index.html
pip install elasticsearch
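Basic usage of the official client looks like this (a minimal sketch; the index, type, and field names are made up):

from elasticsearch import Elasticsearch

es = Elasticsearch("127.0.0.1:9200")
# index a document
es.index(index="test", doc_type="doc", id=1, body={"title": "hello world"})
# full-text search on the title field
print(es.search(index="test", body={"query": {"match": {"title": "hello"}}}))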
Search engines like Baidu and Google run keyword extraction, deduplication, and English spelling correction on the user's input.
First add some data to mongo.
Elasticsearch reportedly scores string similarity with TF-IDF internally, but it does not segment Chinese out of the box. The ik analyzer plugin would handle segmentation without any hand-rolling, but here I use jieba instead. Since ES already encapsulates its scoring, and string-similarity algorithms mostly follow the same patterns, post-processing the search results wouldn't add much. What I can do is process the keywords before they reach ES: keyword extraction, spelling correction, and so on. It feels limited; I don't really know how full search engines do it internally.
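For instance, jieba's segmentation and built-in TF-IDF keyword extraction look like this (a minimal sketch; the sample sentence is arbitrary):

import jieba
import jieba.analyse

text = "Elasticsearch是一個分佈式的搜索引擎"
# plain segmentation
print(list(jieba.cut(text)))
# top-5 keywords ranked by jieba's built-in TF-IDF
print(jieba.analyse.extract_tags(text, topK=5))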
estest
|----enchant_py.py (spelling correction, found online)
|----EsQuery.py (Elasticsearch operations)
|----flaskrun.py (Flask service)
|----dict.txt (jieba user dictionary; when segmentation is unsatisfactory, add words and frequencies here)
|----stop_words.txt (jieba stop words, used during keyword extraction)
|----big.txt (corpus for spelling correction; use an English dictionary or an English novel, and add the words you need if results are poor)
enchant_py.py
# -*- coding: utf-8 -*-
# __author__ = "ZJL"
# spelling corrector found online (Peter Norvig's classic approach)
import re, collections

def words(text): return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model

NWORDS = train(words(open('big.txt').read()))
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def edits1(word):
    n = len(word)
    return set([word[0:i] + word[i + 1:] for i in range(n)] +  # deletion
               [word[0:i] + word[i + 1] + word[i] + word[i + 2:] for i in range(n - 1)] +  # transposition
               [word[0:i] + c + word[i + 1:] for i in range(n) for c in alphabet] +  # alteration
               [word[0:i] + c + word[i:] for i in range(n + 1) for c in alphabet])  # insertion

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=lambda w: NWORDS[w])

# print('thew => ' + correct('thew'))
# print('spak => ' + correct('spak'))
# print('goof => ' + correct('goof'))
# print('babyu => ' + correct('babyu'))
# print('spalling => ' + correct('spalling'))
# print("Hello => " + correct('Hello'))
EsQuery.py
# -*- coding: utf-8 -*-
# __author__ = "ZJL"
from elasticsearch import Elasticsearch
import jieba
import jieba.analyse
import re
import enchant_py
import json

class ESQuery(object):
    def __init__(self):
        self.es = Elasticsearch("127.0.0.1:9200")

    def ES_Query(self, es_index, es_doc_type, query_key_list, strs, num, size_num):
        # pagination: "from" is the offset of the first hit, "size" the page size
        from_num = (num - 1) * size_num
        esstrs = " ".join(strs.get("key_list", ""))
        str_key = strs.get("key_str", "")
        # any numbers among the keywords are also matched against the age field
        re_nums = re.findall(r'[0-9]+', esstrs)
        re_nums_list = []
        if re_nums:
            for re_num in re_nums:
                re_nums_list.append({"match": {"age": re_num}})
        # match the keyword string against every field we want to query
        for query_key in query_key_list:
            re_nums_list.append({"match": {query_key: esstrs}})
        print(re_nums_list)
        body = {
            "query": {
                "bool": {
                    "must": [],
                    "must_not": [],
                    "should": re_nums_list
                }
            },
            "from": from_num,
            "size": size_num,
            "sort": [],
            "aggs": {},
            # highlight the keywords
            "highlight": {
                "fields": {
                    "school": {},
                    "name": {}
                }
            }
        }
        a = self.es.search(index=es_index, doc_type=es_doc_type, body=body)
        aa = a["hits"]
        aa["key_str"] = str_key
        data_json = json.dumps(aa)
        print(data_json)
        return data_json

    def Check_Keyword(self, key_str):
        # user dictionary
        file_name = "dict.txt"
        # stop words: stop_words.txt
        stop_file_name = "stop_words.txt"
        # load the user dictionary
        jieba.load_userdict(file_name)
        # load the stop words
        jieba.analyse.set_stop_words(stop_file_name)
        key_str_copy = key_str
        # find all English words with a regex
        result_list = re.findall(r'[a-zA-Z]+', key_str_copy)
        # key_str_list = list(jieba.cut(key_str.strip()))
        # print(key_str_list)
        # Fewer than 3 English words (Baidu also stops correcting beyond two):
        # run spelling correction and map each original word to its correction
        corr_dict = {}
        if 0 < len(result_list) < 3:
            for restr in result_list:
                strd = enchant_py.correct(restr)
                if restr != strd:
                    corr_dict[restr] = strd
            # substitute the corrected words back into the query
            for corr in corr_dict:
                key_str_copy = key_str_copy.replace(corr, corr_dict.get(corr, ""))
            # extract keywords with jieba's TF-IDF
            tagstr = jieba.analyse.extract_tags(key_str_copy, topK=20, withWeight=False, allowPOS=())
        # Short English phrases mostly fall in this range and rarely contain
        # stop words, so mixed Chinese/English input still gets its Chinese
        # stop words removed
        elif 3 <= len(result_list) <= 5:
            tagstr = jieba.analyse.extract_tags(key_str_copy, topK=20, withWeight=False, allowPOS=())
        # too many English words: output them as-is
        else:
            # segment
            key_str_list = list(jieba.cut(key_str_copy))
            # strip special characters that show up in all-English input
            stop_key = [" ", "(", ")", ".", ",", "\'", "\"", "*", "+",
                        "-", "\\", "/", "`", "~", "@", "#", "$", "%", "^",
                        "&", '[', ']', "{", "}", ";", "?", "!", "\t", "\n", ":"]
            for key in stop_key:
                if key in key_str_list:
                    key_str_list.remove(key)
            tagstr = key_str_list
        # if no word was corrected, don't return the original query string
        if corr_dict:
            data_str = key_str
        else:
            data_str = ""
        data = {
            "key_list": tagstr,
            "key_str": data_str
        }
        print(data)
        return data
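Roughly how the two methods chain together (a sketch; index test99911 and the title/body fields match what flaskrun.py below passes in):

from EsQuery import ESQuery

eq = ESQuery()
# extract/correct the keywords first, then hand them to the ES query
keys = eq.Check_Keyword("pyton 教程")  # "pyton" gets corrected if big.txt knows "python"
print(eq.ES_Query(["test99911"], [], ["title", "body"], keys, 1, 10))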
flaskrun.py
# -*- coding: utf-8 -*-
__author__ = "ZJL"
from flask import Flask
from flask import request
from EsQuery import ESQuery
from werkzeug.contrib.fixers import ProxyFix

app = Flask(__name__)

"""
@api {get} / home page
@apiName index
@apiGroup indexx
"""
@app.route("/")
def index():
    return "hello world"

"""
@api {get} /query query
@apiName query
@apiGroup queryxx
@apiParam {string} strs keywords
@apiParam {string} num page number
@apiParam {string} size_num results per page
"""
@app.route('/query', methods=['GET'])
def es_query():
    # use .get() so a missing parameter falls through to "no"
    # instead of raising BadRequestKeyError (an HTTP 400)
    if request.method == 'GET' and request.args.get('strs') and request.args.get('num') and request.args.get('size_num'):
        num = int(request.args['num'])
        size_num = int(request.args['size_num'])
        strs = request.args['strs']
        eq = ESQuery()
        key_str_dict = eq.Check_Keyword(strs)
        es_index = ["test99911"]
        es_type = []
        es_query_list = ["title", "body"]
        data_json = eq.ES_Query(es_index, es_type, es_query_list, key_str_dict, num, size_num)
        return data_json
    else:
        return "no"

app.wsgi_app = ProxyFix(app.wsgi_app)

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5123)  # ,debug=True,threaded=True

# parameters can be read in three ways: request.form, request.args, request.values
# postForm = request.form
# getArgs = request.args
# postValues = request.values
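Once the service is running, the endpoint can be exercised like this (a sketch, assuming requests is installed):

import requests

resp = requests.get("http://127.0.0.1:5123/query",
                    params={"strs": "pyton 教程", "num": 1, "size_num": 10})
print(resp.text)  # the JSON "hits" block from ES, or "no" if a parameter is missing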
dict.txt
相似度 5
stop_words.txt
的
了
和
是
就
都
而
及
與
著
或
一個
沒有
我們
你們
妳們
他們
她們
是否
着
它們
big.txt is too long to paste here.