最近遇到個需求,索引遷移,本來應該用es的reindex,不過由於業務限制等原因,沒法使用,使用了 scroll 和 bulk 完成了遷移,記錄代碼
scroll 拉取到csv文件
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import time
import json
# 188 -> 線上
class Get_es:
    """Stream every document of the source ES index into a local CSV file.

    Each document is appended to ``new_es_all.csv`` as one line of the form
    ``<_id>,<json-encoded _source>``.

    NOTE(review): neither the id nor the JSON payload is CSV-escaped, and the
    JSON itself contains commas — any consumer of this file must split each
    line on the *first* comma only, not parse it as regular CSV.
    """

    def __init__(self):
        # "188" cluster -> production. Very large timeout because a full
        # scroll over the whole index can run for a long time.
        self.es188 = Elasticsearch(hosts="xxx", port=9200,
                                   http_auth=('xxx', 'xxx'), timeout=100000)

    def GetValue(self):
        """Yield every hit of the source index via the scroll/scan helper.

        Pulls 5000 docs per scroll page and keeps the scroll context alive
        for 3 minutes between fetches.
        """
        query = {
            "query": {"match_all": {}}
        }
        yield from helpers.scan(client=self.es188, query=query, size=5000,
                                scroll="3m", index='xxx',
                                doc_type='xx',
                                timeout="3m")

    def write_file(self, k):
        """Append one hit to new_es_all.csv as an "<_id>,<json>" line.

        ``k`` is a single scan hit; only its ``_id`` and ``_source`` fields
        are written. The file is opened in binary append mode so repeated
        calls accumulate lines.
        """
        with open('new_es_all.csv', 'ab') as f:
            k = dict(k)
            f.write(k['_id'].encode('utf-8'))
            f.write(b',')
            f.write(json.dumps(k['_source']).encode('utf-8'))
            f.write(b'\n')

    def run(self):
        """Drain the scroll generator, writing each document to the CSV."""
        # The original enumerated the hits but never used the counter.
        for hit in self.GetValue():
            self.write_file(hit)
if __name__ == "__main__":
    # Kick off the full index export (scroll -> CSV).
    Get_es().run()
bulk 從 csv 上傳到es
import json
from json import JSONDecodeError
import csv
from elasticsearch import helpers
from elasticsearch import Elasticsearch
# Target cluster for the bulk upload. verify_certs=False: certificate
# checking is deliberately disabled for this internal migration job.
es = Elasticsearch(host="xxx",
                   port=9200,
                   http_auth=('xx', 'xxx!'),
                   use_ssl=True,
                   verify_certs=False,
                   ssl_assert_hostname=False,
                   timeout=10000
                   )

# Each line of the export file is "<doc_id>,<json _source>". The JSON
# payload itself contains commas and is NOT CSV-quoted, so the file must
# be split on the FIRST comma only. (The previous csv.reader approach
# re-joined row[1] + "," + row[2], which crashed with IndexError when the
# JSON had no comma and silently truncated documents with two or more.)
# NOTE(review): this assumes the ES _id never contains a comma — the
# exporter does not escape it; confirm against the source index.
with open('new_es_all.csv', 'r') as f:
    actions = []
    for line in f:
        doc_id, _, payload = line.rstrip('\n').partition(',')
        # On a malformed payload, fall back to an empty source so the
        # document id is still uploaded (best-effort, as before).
        source = {}
        try:
            source = json.loads(payload)
        except JSONDecodeError as e:
            print(e)
        source["id"] = doc_id
        action = {
            "_index": "xxx",
            "_type": "_doc",
            "_source": source
        }
        actions.append(action)
        if len(actions) >= 5000:
            # Flush a full batch to ES and reset the buffer.
            helpers.bulk(es, actions)
            actions = []
    # Flush the final partial batch, if any.
    if actions:
        helpers.bulk(es, actions)
效率還是可以接受的,平均一天7200w數據,對於當前的需求夠用了