# 以下是個導出es數據到csv文件的簡單腳本,腳本簡單易懂,主要解決了兩個問題:
# - Elasticsearch數據含中文寫入csv文件會亂碼
# - unicode編碼的中文數據寫入csv文件會亂碼
from elasticsearch import Elasticsearch
import csv
import sys
import json
import codecs
# Python 2 hack: force the process-wide default encoding to UTF-8 so
# implicit str<->unicode conversions don't raise UnicodeDecodeError.
# (reload is needed because site.py deletes setdefaultencoding at startup.)
reload(sys)
sys.setdefaultencoding('utf-8')
# ES address: local localhost:9200, remote https://your_remote_es_url/
es = Elasticsearch(["localhost:9200"], timeout=9999)
# Index name and document type to export from
es_index = "server-log"
es_type = "doc"
# Fields to export; if you are unsure which fields exist but want them all,
# remove the "_source" section from the query body below.
csv_header = ["time", "api", "parameterMap", "response", "sessionId", "page"]
# NOTE(review): size=1000 caps the export at the first 1000 documents —
# a full export would need the scroll / search_after API; confirm intended.
res = es.search(index=es_index, doc_type=es_type, body={
    "query": {
        "match_all": {}
    },
    "_source": {
        "includes": csv_header,
        "excludes": []
    },
}, size=1000)
def export(file_name):
    """
    Export the Elasticsearch documents already fetched into ``res`` to a
    CSV file.

    Column order is taken from the module-level ``csv_header`` when it is
    non-empty; otherwise every field found in the index mapping is used.
    The CSV header row is only written once the first non-empty document
    is seen, matching the original behavior (an empty result set produces
    a file containing only the BOM).

    :param file_name: path of the CSV file to write
    :return: None
    """
    if csv_header:
        fields = csv_header
    else:
        # Only hit the cluster for the mapping when no explicit header was
        # configured — the original fetched it unconditionally and threw
        # the result away.
        mapping = es.indices.get_mapping(index=es_index, doc_type=es_type)
        fields = list(mapping[es_index]['mappings'][es_type]['properties'])
    with open(file_name, 'w') as f:
        # UTF-8 BOM up front so Excel detects the encoding and Chinese
        # text is not garbled.
        f.write(codecs.BOM_UTF8)
        w = None  # DictWriter, created lazily on the first non-empty doc
        for doc in res['hits']['hits']:
            my_dict = doc['_source']
            if not my_dict:
                continue
            if w is None:
                w = csv.DictWriter(f, fields)
                w.writeheader()
            # Re-serialize dict-valued fields so Chinese text survives.
            deal_chinese_words(my_dict)
            w.writerow(my_dict)
# 對於字典類型數據做特殊處理,轉json並把unicode做decode防止亂碼。如果文檔沒有中文那直接註釋掉好了
def deal_chinese_words(my_dict):
    """
    Serialize dict-valued fields of a document to JSON strings, in place.

    ``ensure_ascii=False`` keeps non-ASCII (e.g. Chinese) characters as-is
    instead of ``\\uXXXX`` escapes. This replaces the original
    ``json.dumps(...).decode('unicode_escape')`` round-trip, which corrupts
    legitimate backslash sequences inside the data and fails outright on
    Python 3 (``str`` has no ``decode``).

    :param my_dict: document ``_source`` dict; mutated in place
    :return: None
    """
    for key in ('parameterMap', 'response'):
        if my_dict.get(key):
            my_dict[key] = json.dumps(my_dict[key], ensure_ascii=False)
# Script entry point: run the export only when executed directly, not when
# this module is imported (the original called export() unconditionally).
if __name__ == "__main__":
    export("/Users/zhuhuiyuan/Downloads/data.csv")