遇到這樣一個需求:有幾百萬條數據,需要替換掉其中不同種類的敏感語句;先在測試庫中進行替換,成功後再替換生產庫。
難點1:要既適用於開發環境,又適用於生產環境;解決辦法:使用Python(簡潔、方便)
難點2:數據量太大;解決辦法:查詢時使用elasticsearch
難點3:替換種類多;解決辦法:進行一定的封裝
寫得不算細緻,可以替換符合部分規則的敏感語句。代碼顯示時沒有格式(縮進丟失),需要自行 Reformat Code。
具體代碼如下:
# -*- coding: UTF-8 -*-
from elasticsearch import Elasticsearch
import pymysql
import re
import time
# Look up the documents that match a query in Elasticsearch
def getElasticsearchList(query_string='', size=2):
    """Search the ``test-questions`` index and return the hits as a mapping.

    Runs a ``query_string`` query against the ``test-question.content``
    field on the Elasticsearch node at ``http://192.168.1.166:9200`` and
    prints the total hit count reported by the server.

    Args:
        query_string: Query text in Elasticsearch ``query_string`` syntax.
        size: Maximum number of hits to fetch, starting at offset 0.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        A dict ``{id: content}`` built from the ``_source`` of each hit.
    """
    es = Elasticsearch(
        [
            'http://192.168.1.166:9200',
        ]
    )
    res = es.search(index="test-questions", body={"query": {
        "bool": {"must": [{"query_string": {"default_field": "test-question.content", "query": query_string}}],
                 "must_not": [],
                 "should": []}}, "from": 0, "size": size})

    # Elasticsearch 7+ reports hits.total as {"value": n, "relation": ...};
    # older servers return a plain integer. Accept both so "%d" never fails.
    total = res['hits']['total']
    if isinstance(total, dict):
        total = total['value']
    print("Got %d Hits:" % total)

    # Collect the matching documents as {id: content}
    return {
        hit["_source"]['id']: hit["_source"]['content']
        for hit in res['hits']['hits']
    }
# Replace the text that matches the rule
def deleteString(dict):
    """Return a copy of ``dict`` with matching ``alt`` attributes blanked out.

    Every occurrence of ``alt="...!"`` (an ``alt`` attribute whose quoted
    value ends with ``!``) inside a value is replaced by a single space.

    Args:
        dict: Mapping of id -> content string. The name shadows the builtin
            but is kept so existing callers keep working.

    Returns:
        A new dict with the same keys and the cleaned content. The input
        mapping is not modified.
    """
    # ``[^"]*`` keeps each match inside a single attribute value. The former
    # greedy ``.*`` ran from the first ``alt="`` to the last ``!"`` in the
    # content and wiped out everything in between.
    # Compiled once here instead of once per entry.
    alt_attr = re.compile(r'alt="[^"]*!"')  # rule: strip the 'alt' attribute
    return {key: alt_attr.sub(' ', content) for key, content in dict.items()}
# Update step: search Elasticsearch, clean the hits, write them back to MySQL.
# NOTE(review): the original code passed the builtin ``dict`` type to
# deleteString(), which raises TypeError, and never ran the search. The query
# text was not specified anywhere; 'alt' is a placeholder chosen because the
# replacement rule targets alt attributes -- confirm the intended query.
QUERY_STRING = 'alt'


def main():
    """Fetch matching rows, strip the sensitive text and update MySQL.

    Each cleaned row is written to ``sh_testquestion_bak`` together with a
    fresh ``updateTime``. All updates are committed together after the loop;
    the connection is closed even when a statement fails, in which case
    nothing is committed.
    """
    resultList = deleteString(getElasticsearchList(QUERY_STRING))
    if not resultList:
        # Nothing matched, so there is no reason to open a DB connection.
        return

    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from configuration or the environment.
    conn = pymysql.connect(host='192.168.1.225', user='admin', passwd='neptune@admin', db='teachresource', port=3306,
                           charset='utf8')
    try:
        cmd = "update sh_testquestion_bak set updateTime=%s,content =%s where id=%s"
        with conn.cursor() as cur:
            for key, content in resultList.items():
                # Values are bound by the driver, never formatted into the SQL.
                # updateTime is the current epoch second scaled by 1000.
                cur.execute(
                    cmd, (int(time.time()) * 1000, content, key))
        conn.commit()
    finally:
        conn.close()


if __name__ == "__main__":
    main()