最近在做中文的情感分析,但國內沒有比較好的數據集(我指的不是評論數據集,而是類似推文的數據集)。後來在 Kaggle 競賽平臺上看到了一個推特推文的數據集,於是想先把它翻譯爲中文,再用來進行訓練。
下面直接貼上代碼並進行分析:
from importlib import reload
import http.client
import hashlib
import urllib
import urllib.parse
import time
import random
import json
import sys
# NOTE(review): reloading ``sys`` looks like a Python 2 leftover (it was commonly
# paired with sys.setdefaultencoding, which is not called anywhere in this file);
# nothing below appears to depend on it -- confirm before removing.
reload(sys)
#
def _request_translation(text, appid, secret_key, from_lang, to_lang):
    """Request a translation of ``text`` and return it, or ``None`` on any failure.

    Sends one GET request to ``api.fanyi.baidu.com/api/trans/vip/translate``.
    ``None`` is returned (after printing the error, where there is one) when
    the request fails, the body is not valid JSON, or the decoded JSON has no
    usable ``trans_result[0]["dst"]`` string.
    """
    salt = random.randint(32768, 65536)
    # The request signature is the MD5 hex digest of
    # appid + raw (un-encoded) query text + salt + secret key.
    raw_sign = appid + text + str(salt) + secret_key
    sign = hashlib.md5(raw_sign.encode('utf-8')).hexdigest()

    # Let urlencode percent-encode every value instead of concatenating by hand;
    # quote_via=quote keeps spaces as %20, as the original quote() call did.
    query = urllib.parse.urlencode(
        {
            'appid': appid,
            'q': text,
            'from': from_lang,
            'to': to_lang,
            'salt': salt,
            'sign': sign,
        },
        quote_via=urllib.parse.quote,
    )

    conn = None
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        conn = http.client.HTTPConnection('api.fanyi.baidu.com', timeout=10)
        conn.request('GET', '/api/trans/vip/translate?' + query)
        payload = conn.getresponse().read().decode('utf-8')
    except (OSError, http.client.HTTPException, UnicodeDecodeError) as e:
        print(e)
        return None
    finally:
        if conn is not None:
            conn.close()

    try:
        js = json.loads(payload)  # decode the JSON body into Python objects
    except ValueError as e:
        print('loads Json error.')
        print(e)
        return None

    try:
        dst = js['trans_result'][0]['dst']
    except (KeyError, IndexError, TypeError):
        # No translation in the response (e.g. the API answered with an error object).
        return None
    return dst if isinstance(dst, str) else None


def transf(q, output_path='training.txt'):
    """Translate one ``label<TAB>text`` line from English to Chinese and append it to a file.

    The line is split on tabs; the second field is sent to the Baidu translate
    API and ``label<TAB>translation`` is appended to ``output_path``. When no
    translation can be obtained (the line has no tab, the request fails, or the
    response has no ``trans_result``), the original line is appended unchanged,
    so the output keeps one line per input line.

    Args:
        q: One input line; a trailing newline is ignored.
        output_path: File to append to (UTF-8). Defaults to ``'training.txt'``
            in the current working directory.

    Returns:
        None.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    # NOTE(review): the API credentials are hard-coded in source. They are kept
    # here only so existing callers keep working; they should be moved to
    # configuration / environment and rotated.
    appid = '20190930000338551'  # Baidu translate appid
    secretKey = 'Xf5oc6ENptZOCEeK6goa'  # Baidu translate secret key
    fromLang = 'en'  # source language
    toLang = 'zh'  # target language

    # Drop the line terminator so it is neither signed/sent to the API nor
    # written twice in the fallback case.
    line = q.rstrip('\r\n')
    fields = line.split('\t')

    translated = None
    if len(fields) >= 2:
        translated = _request_translation(
            fields[1], appid, secretKey, fromLang, toLang)

    if translated is None:
        out_str = line
    else:
        out_str = fields[0] + '\t' + translated

    # Append to the output file; the context manager guarantees it is closed,
    # and an explicit encoding makes writing Chinese text platform-independent.
    with open(output_path, 'a', encoding='utf-8') as out_file:
        out_file.write(out_str + '\n')
# Read the whole tab-separated training file to translate. The context manager
# closes it even if reading fails.
with open('E:\\deepLearning\\si650winter11\\training.txt', 'r', encoding='UTF-8') as f:
    lines = f.readlines()

total = len(lines)
for done, line in enumerate(lines, start=1):
    transf(line)
    # Throttle to one call per second; calling more frequently makes the API
    # report errors.
    time.sleep(1)
    # Progress indicator. The original `print` / `f.tell()` pair was a no-op
    # under Python 3, and the offset would be constant after readlines() anyway.
    print('{}/{}'.format(done, total))