本文參考鏈接:
- https://blog.csdn.net/wangliuqi123/article/details/80527417
- https://blog.csdn.net/wangliuqi123/article/details/80537635
之前分別用了百度和科大訊飛的接口來實現音頻文件的轉寫。 百度的幾乎是免費的,但是效果實在有點不盡如人意,科大訊飛的效果倒是很不錯,但是有一點,“貴”。於是乎,想着有沒有一個折中的方案:識別效果不能太差,但是價錢也最好不要太貴。轉來轉去,於是轉到騰訊這兒來了。
騰訊的語音識別也是有 Python SDK的,鏈接在此:https://cloud.tencent.com/document/product/1093/35731,Demo的下載地址在這個頁面就有,不過暫時只有Python2的,雖說它自己也說後續會支持Python3,但是如果能早點用Python3的話,又何樂而不爲?只不過Demo改改確實需要踩很多坑。
它的Demo裏面主要有三個Python文件:Config.py, OfflineCLient.py, offlineSdk.py。針對在Python3環境下需要做的修改,我進行部分說明:
Config.py 其實沒什麼好改的,主要是 SECRET_KEY, SECRETID, APPID 用自己申請的填好就是
# -*- coding:utf-8 -*-
'''
Created on 2019-4-28
@author: iantang
'''
class Config:
    """Global configuration; change the values below to match your own account."""

    # ------------- Required ---------------
    # How to obtain AppId, secretId and secretKey (with screenshots):
    # https://cloud.tencent.com/document/product/441/6203
    # Console path: account menu (top right) --> Access Management -->
    # Access Keys (left-hand menu) --> API Key Management
    SECRET_KEY = '***'
    SECRETID = '*****'
    APPID = '***'
    # Recognition results are POSTed to this URL, so a service able to receive
    # that POST must be running there first. This setting is mandatory.
    CALLBACK_URL = "http://xxx.xxxxx.xxx"

    # ------------- Optional, set according to your needs ---------------
    # Recognition engine: 8k_0, 16k_0 or 8k_6. 8k_6 supports speaker
    # separation, which is why it is selected here.
    ENGINE_MODEL_TYPE = '8k_6'
    # 1 or 2. Number of audio channels. The 8k telephone model supports 1 and
    # 2; every other model supports 1 channel only.
    CHANNEL_NUM = 1
    # Text encoding of the result. 0: UTF-8, 1: GB2312, 2: GBK, 3: BIG5
    RES_TEXT_FORMAT = 0
    # Audio source. 0: audio URL; 1: audio data in the POST body
    SOURCE_TYPE = 0

    # ------------- Optional, the defaults are normally fine ---------------
    # Tencent Cloud project ID; use 0, or the value configured under
    # Console - Account Center - Project Management.
    PROJECT_ID = 0
    # Sub-service type. 0: offline speech recognition.
    SUB_SERVICE_TYPE = 0
    # How the result is returned. 0: synchronous; 1: asynchronous.
    # Only asynchronous is supported at present.
    RES_TYPE = 1
    # URL of the Tencent server; normally left unchanged.
    REQUEST_URL = "https://aai.qcloud.com/asr/v1/"
    # URL used when building the signature; normally left unchanged.
    SIGN_URL = "aai.qcloud.com/asr/v1/"

    # ------------- Initialisation and validation (can be skipped) ---------------
    def __init__(self):
        print("")

    def verifyProperties(self):
        """Print a message for the first invalid setting found, then stop.

        Settings are checked in a fixed order and only the first problem is
        reported. Nothing is raised and nothing is returned; a fully valid
        configuration produces no output.
        """
        # Credentials and callback URL only need to be non-empty.
        for name in ('SECRET_KEY', 'SECRETID', 'APPID', 'CALLBACK_URL'):
            if len(str(getattr(self, name))) == 0:
                print(name + ' can not empty')
                return

        # An empty value is never in the allowed tuples, so the membership
        # test also covers the "empty" case.
        if str(self.ENGINE_MODEL_TYPE) not in ('8k_0', '16k_0', '8k_6'):
            print('ENGINE_MODEL_TYPE is not right')
            return
        # The channel count is 1 or 2 (see the CHANNEL_NUM comment above);
        # the previous check accepted 0 and rejected 2.
        if str(self.CHANNEL_NUM) not in ('1', '2'):
            print('CHANNEL_NUM is not right')
            return
        if str(self.RES_TEXT_FORMAT) not in ('0', '1', '2', '3'):
            print('RES_TEXT_FORMAT is not right')
            return
        if str(self.SOURCE_TYPE) not in ('0', '1'):
            print('SOURCE_TYPE is not right')
            return
        if len(str(self.PROJECT_ID)) == 0:
            print('self.PROJECT_ID can not empty')
            return
        if str(self.SUB_SERVICE_TYPE) not in ('0', '1'):
            print('SUB_SERVICE_TYPE is not right')
            return
        if str(self.RES_TYPE) not in ('0', '1'):
            print('RES_TYPE is not right')
            return
# Module-level instance that importers read as ``Config.config``; its
# settings are validated once, when this module is imported.
config = Config()
config.verifyProperties()
offlineSdk.py 中修改的地方有一部分,我已經在源碼中進行了註釋
# -*- coding:utf-8 -*-
import requests
import hashlib
import time
import hmac
import base64
import urllib
import urllib.parse # 新增
import Config
def task_process(audio_url, timeout=30):
    """Submit an offline recognition task for an audio file given by URL.

    Args:
        audio_url: URL of the audio file to transcribe.
        timeout: Seconds to wait for the HTTP call before ``requests`` raises
            a timeout error. The original call had no timeout and could block
            indefinitely.

    Returns:
        The response body as text.
    """
    import random  # local import: this module's import list has no ``random``

    # Take the time once so that ``expired`` is exactly one hour after
    # ``timestamp``.
    now = int(time.time())
    # Key order matters: generate_request() emits the query string in this
    # insertion order.
    request_data = {
        'channel_num': Config.config.CHANNEL_NUM,
        'secretid': Config.config.SECRETID,
        'engine_model_type': Config.config.ENGINE_MODEL_TYPE,
        'timestamp': now,
        'expired': now + 3600,
        # A nonce should differ between requests; it used to be the constant
        # 6666. NOTE(review): the upper bound keeps it within 10 digits --
        # confirm the limit against the API documentation.
        'nonce': random.randint(1, 2 ** 31 - 1),
        'projectid': Config.config.PROJECT_ID,
        'callback_url': Config.config.CALLBACK_URL,
        'res_text_format': Config.config.RES_TEXT_FORMAT,
        'res_type': Config.config.RES_TYPE,
        'source_type': Config.config.SOURCE_TYPE,
        'sub_service_type': Config.config.SUB_SERVICE_TYPE,
        # The Python 2 demo sent urllib.quote(audio_url); the URL is sent
        # unquoted here.
        'url': audio_url,
    }
    authorization = generate_sign(request_data)
    task_req_url = generate_request(request_data)
    header = {
        # NOTE(review): the header declares JSON although ``data=`` with a
        # dict is sent form-encoded by requests; kept as it was.
        "Content-Type": "application/json",
        # generate_sign() returns bytes. The Python 2 demo wrapped it in
        # str(), which on Python 3 yields "b'...'" and fails authentication.
        "Authorization": authorization,
    }
    r = requests.post(task_req_url, headers=header, data=request_data,
                      timeout=timeout)
    return r.text
def generate_sign(request_data):
    """Build the value for the ``Authorization`` header.

    The text to sign is ``"POST"`` + ``SIGN_URL`` + ``APPID`` + ``"?"``
    followed by ``key=value`` pairs in sorted key order, each value
    URL-unquoted and the pairs separated by ``&``. It is signed with HMAC-SHA1
    keyed by ``SECRET_KEY`` and the digest is Base64-encoded.

    Args:
        request_data: Mapping of request parameter names to values.

    Returns:
        The Base64-encoded signature as ``bytes``.
    """
    cfg = Config.config
    pieces = ["POST", cfg.SIGN_URL, str(cfg.APPID), "?"]
    for name in sorted(request_data.keys()):
        # The Python 2 demo called urllib.unquote; on Python 3 it lives in
        # urllib.parse.
        pieces.append(name + "=" + urllib.parse.unquote(str(request_data[name])) + '&')
    # Every pair carries a trailing '&'; drop the final character.
    plain_text = "".join(pieces)[:-1]
    # hmac on Python 3 needs bytes for key and message (the Python 2 demo
    # passed the strings directly).
    mac = hmac.new(bytes(cfg.SECRET_KEY, 'utf-8'), bytes(plain_text, 'utf-8'), hashlib.sha1)
    return base64.b64encode(mac.digest())
def generate_request(request_data):
    """Return the full request URL for the given parameters.

    The URL is ``REQUEST_URL`` + ``APPID`` + ``"?"`` followed by
    ``key=value`` pairs in the mapping's iteration order, separated by ``&``.
    Values are inserted with ``str()`` and are not URL-encoded.

    Args:
        request_data: Mapping of request parameter names to values.

    Returns:
        The request URL as a string.
    """
    base = Config.config.REQUEST_URL + str(Config.config.APPID) + "?"
    query = "".join(
        key + "=" + str(value) + '&' for key, value in request_data.items()
    )
    # Each pair ends with '&'; strip the final character.
    return (base + query)[:-1]
if __name__ == '__main__':
    # Audio URL, downloadable from the public internet. Required when
    # source_type is 0 and omitted when it is 1; its length must be greater
    # than 0 and less than 2048.
    audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
    task_process(audio_url)
OfflineCLient.py也沒什麼要修改的,audio_url得用公網可訪問的url地址,然後運行它就好了。
# -*- coding:utf-8 -*-
import offlineSdk
import Config
# Note: set the options in Config.py to your own values before using this script.
# Audio file location. Each call to task_process sends one request.
# Audio URL, downloadable from the public internet. Required when source_type is 0 and omitted when it is 1; length greater than 0 and less than 2048.
audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# Submit the recognition request and print the text that task_process returns
result = offlineSdk.task_process(audio_url)
print (result)
# ------------------------------------------------------------------------------------
# To change a setting part-way through, modify it directly and then send another request, e.g.:
# Config.config.CALLBACK_URL = ""
# Config.config.ENGINE_MODEL_TYPE = "16k_0"
# ......
# audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# result = offlineSdk.task_process(audio_url)
# print (result)
然後你會發現運行結果是這樣的,看起來有個success,好像是成功了:
這個時候你肯定會問:我錄音的識別結果在哪兒呢?怎麼沒返回回來?
你肯定還記得 Config.py 的配置裏面有一個 CALLBACK_URL = "http://xxx.xxxxx.xxx" 參數,而且根據騰訊的語音識別API文檔,這個還是必填的參數,那怎麼辦? 騰訊是以回調的方式返回識別結果的,而不是像訊飛那樣以輪詢的方式查看是否有識別結果返回,所以得自己搭建服務處理騰訊回調的POST請求。當然,解決這個問題最簡單的方法就是,你可以用 flask 寫一個處理回調的路由函數,我把我自己做的數據解析放這兒了,如果覺得有用請自取:
import json
from flask import Flask, request
# Flask application object; the callback route below is registered on it.
app = Flask(__name__)
def _to_millis(stamp):
    """Convert a ``"M:S.mmm"`` timestamp to milliseconds, returned as a string."""
    parts = stamp.split(":")
    # Removing the dot turns "3.540" into "3540"; this is only a millisecond
    # count if the seconds field has exactly three decimals.
    # NOTE(review): confirm the service always sends three decimals.
    return str(int(parts[0]) * 60000 + int(parts[1].replace(".", "")))


def _parse_sentence(sentence):
    """Split one result line into ``(speaker, content, begin_ms, end_ms)``.

    A line is expected to look like ``"[0:1.230,0:4.560,0] text"``
    (presumably -- inferred from the slicing below; verify against real
    callback data).
    """
    # Split on the first space only, so text that itself contains spaces is
    # kept whole (the previous ``split(' ')[1]`` kept just its first word).
    header, _, content = sentence.partition(' ')
    fields = header.split(',')
    begin_ms = _to_millis(fields[0][1:])  # drop the leading '['
    end_ms = _to_millis(fields[1])
    speaker = fields[-1][:-1]  # drop the trailing ']'
    return speaker, content.strip(), begin_ms, end_ms


@app.route('/data', methods=['POST'])
def testpost():
    """Handle the recognition callback and save the transcript as a text file.

    Reads ``message``, ``audioUrl`` and ``text`` from the posted form. When
    ``message`` is ``'成功'`` every sentence is printed and written, tab
    separated, to ``<audio file name>.txt`` in the working directory.

    Returns:
        A JSON string with ``code`` 0 on success or 1 otherwise.
    """
    # The route accepts POST only, so no further method check is needed.
    print(request.form)
    if request.form['message'] != '成功':
        response = {
            "code": 1,
            "message": "失敗"
        }
        print(response)
        return json.dumps(response)

    # File name without directory or extension.
    # NOTE(review): this value comes from the request and is used to build a
    # path; consider validating it before exposing the endpoint publicly.
    filename = request.form['audioUrl'].split('/')[-1].split('.')[0]
    txt_file = filename + ".txt"  # transcript output file
    # The last element after splitting is an empty string, so it is dropped.
    sentence_list = request.form['text'].split('\n')[0:-1]
    # ``with`` closes the file even if a line fails to parse.
    with open(txt_file, 'w', encoding='utf-8') as doc:
        for sentence in sentence_list:
            speaker, content, begin_time, end_time = _parse_sentence(sentence)
            line = speaker + "\t" + content + '\t' + filename + '\t' + begin_time + '\t' + end_time
            print(line)
            print(line, file=doc)
    response = {
        "code": 0,
        "message": "成功"
    }
    print(response)
    return json.dumps(response)
if __name__ == '__main__':
    # Serve on every network interface, port 9979, with threading enabled.
    app.run(host="0.0.0.0", port=9979, threaded=True)
當然,如果你有公網可訪問的 IP 地址或者域名的話,在Linux上執行上面這段程序:
然後修改CALLBACK_URL 的值爲CALLBACK_URL = 'http://你的公網IP:9979/data' ,然後用 Python3 運行 OfflineCLient.py ,過一會兒你就可以在 Linux 這邊收到錄音識別的結果了。
如果沒有公網可訪問的 IP 或者域名,請參考:https://blog.csdn.net/wangliuqi123/article/details/80537635 進行配置。
如果你想上傳本地的語音文件進行識別,請參考下面的demo:
# -*- coding: utf-8 -*-
"""
@author: Looking
@email: [email protected]
"""
import os
import requests
import hashlib
import time
import hmac
import base64
import urllib
import urllib.parse
import json
import base64
import Config
import random
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.asr.v20190614 import asr_client, models
# Account settings and endpoint URLs used by the request helpers in this script.
appid = Config.config.APPID
req_url = "https://aai.qcloud.com/asr/v1/"
callback_url = "http://*******:9979/data" # a service that receives the POSTed data must be set up at this address
sign_url = "aai.qcloud.com/asr/v1/"
secret_id = Config.config.SECRETID
secret_key = Config.config.SECRET_KEY
def task_process(audio_url, timeout=60):
    """Upload a local audio file and submit it as an offline recognition task.

    Args:
        audio_url: Despite the name, a local file path; the file is read in
            binary mode and sent as the POST body (``source_type`` is 1).
        timeout: Seconds to wait for the HTTP call before ``requests`` raises
            a timeout error. The original call had no timeout and could block
            indefinitely.

    Returns:
        The response body as text.
    """
    # Take the time once so that ``expired`` is exactly one hour after
    # ``timestamp``.
    now = int(time.time())
    # Key order matters: generate_request() emits the query string in this
    # insertion order.
    request_data = {
        'channel_num': 1,
        'secretid': secret_id,
        'engine_model_type': "8k_6",
        'timestamp': now,
        'expired': now + 3600,
        # A nonce should differ between requests; it used to be the constant
        # 1559. NOTE(review): the upper bound keeps it within 10 digits --
        # confirm the limit against the API documentation.
        'nonce': random.randint(1, 2 ** 31 - 1),
        'projectid': 0,
        'callback_url': callback_url,
        'res_text_format': 0,
        'res_type': 1,
        'source_type': 1,
        'sub_service_type': 0,
    }
    with open(audio_url, 'rb') as f:
        body_data = f.read()
    authorization = generate_sign(request_data, appid)
    task_req_url = generate_request(request_data, appid)
    header = {
        "Authorization": authorization,
        "Content-Length": str(len(body_data)),
    }
    r = requests.post(task_req_url, headers=header, data=body_data,
                      timeout=timeout)
    return r.text
def generate_sign(request_data, appid):
    """Build the value for the ``Authorization`` header.

    The text to sign is ``"POST"`` + ``sign_url`` + ``appid`` + ``"?"``
    followed by ``key=value`` pairs in sorted key order, each value
    URL-unquoted and the pairs separated by ``&``. It is signed with HMAC-SHA1
    keyed by ``Config.config.SECRET_KEY`` and the digest is Base64-encoded.

    Args:
        request_data: Mapping of request parameter names to values.
        appid: Application ID placed in the signed path.

    Returns:
        The Base64-encoded signature as ``bytes``.
    """
    pieces = ["POST", sign_url, str(appid), "?"]
    for name in sorted(request_data.keys()):
        pieces.append(name + "=" + urllib.parse.unquote(str(request_data[name])) + '&')
    # Every pair carries a trailing '&'; drop the final character.
    plain_text = "".join(pieces)[:-1]
    # hmac on Python 3 needs bytes for both the key and the message.
    key_bytes = bytes(Config.config.SECRET_KEY, 'utf-8')
    mac = hmac.new(key_bytes, bytes(plain_text, 'utf-8'), hashlib.sha1)
    return base64.b64encode(mac.digest())
def generate_request(request_data, appid):
    """Return the full request URL for the given parameters.

    The URL is ``req_url`` + ``appid`` + ``"?"`` followed by ``key=value``
    pairs in the mapping's iteration order, separated by ``&``. Values are
    inserted with ``str()`` and are not URL-encoded.

    Args:
        request_data: Mapping of request parameter names to values.
        appid: Application ID appended to the base URL.

    Returns:
        The request URL as a string.
    """
    base = req_url + str(appid) + "?"
    query = "".join(
        key + "=" + str(value) + '&' for key, value in request_data.items()
    )
    # Each pair ends with '&'; strip the final character.
    return (base + query)[:-1]
def get_requestId(audio_file_path):
    """Submit a local audio file and return the ``requestId`` from the reply.

    Args:
        audio_file_path: Path of the audio file passed to ``task_process``.

    Returns:
        The ``requestId`` value found in the response.

    Raises:
        ValueError: If the response text is not valid JSON
            (``json.JSONDecodeError`` is a subclass).
        KeyError: If the response has no ``requestId`` field.
    """
    request_result = task_process(audio_file_path)
    print(request_result)
    # SECURITY: this text comes from the network. The original passed it to
    # eval(), which would execute whatever expression the reply contained;
    # it is parsed as JSON instead.
    return json.loads(request_result)["requestId"]
def _to_millis(stamp):
    """Convert a ``"M:S.mmm"`` timestamp to milliseconds, returned as a string."""
    parts = stamp.split(":")
    # Removing the dot turns "3.540" into "3540"; this is only a millisecond
    # count if the seconds field has exactly three decimals.
    # NOTE(review): confirm the service always sends three decimals.
    return str(int(parts[0]) * 60000 + int(parts[1].replace(".", "")))


def _parse_sentence(sentence):
    """Split one result line into ``(speaker, content, begin_ms, end_ms)``.

    A line is expected to look like ``"[0:1.230,0:4.560,0] text"``
    (presumably -- inferred from the slicing below; verify against real
    results).
    """
    # Split on the first space only, so text that itself contains spaces is
    # kept whole (the previous ``split(' ')[1]`` kept just its first word).
    header, _, content = sentence.partition(' ')
    fields = header.split(',')
    begin_ms = _to_millis(fields[0][1:])  # drop the leading '['
    end_ms = _to_millis(fields[1])
    speaker = fields[-1][:-1]  # drop the trailing ']'
    return speaker, content.strip(), begin_ms, end_ms


def get_recognition_result(requestId):
    """Poll a recognition task until it finishes and print its sentences.

    The task status is queried once per second. On success each sentence is
    printed as ``speaker<TAB>content<TAB>begin_ms<TAB>end_ms``. SDK errors,
    including a task whose status is "failed", are caught and printed rather
    than propagated.

    Args:
        requestId: Task identifier returned when the task was submitted.

    Returns:
        None.
    """
    try:
        cred = credential.Credential(Config.config.SECRETID, Config.config.SECRET_KEY)
        httpProfile = HttpProfile()
        httpProfile.endpoint = "asr.tencentcloudapi.com"
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = asr_client.AsrClient(cred, "ap-guangzhou", clientProfile)

        # Serialise with json.dumps instead of formatting the JSON by hand;
        # int() keeps TaskId a JSON number, as the original '%s' formatting did.
        params = json.dumps({"TaskId": int(requestId)})
        # NOTE(review): polling has no upper bound; a task that never reaches
        # "success" or "failed" keeps this loop running.
        while True:
            req = models.DescribeTaskStatusRequest()
            req.from_json_string(params)
            resp = client.DescribeTaskStatus(req)
            task = json.loads(resp.to_json_string())['Data']
            status = task['StatusStr']
            if status == "success":
                print(task['TaskId'], "識別成功!")
                break
            if status == "failed":
                # Give the error a code and message so the printed output
                # says what failed (it used to be raised with neither).
                raise TencentCloudSDKException(
                    "TaskFailed", "recognition task %s failed" % requestId)
            time.sleep(1)

        # The last element after splitting is an empty string, so it is dropped.
        for sentence in task['Result'].split('\n')[0:-1]:
            speaker, content, begin_time, end_time = _parse_sentence(sentence)
            print(speaker + "\t" + content + '\t' + begin_time + '\t' + end_time)
    except TencentCloudSDKException as err:
        print(err)
if __name__ == '__main__':
    # Local recording to upload and transcribe.
    audio_file_path = r"D:\MyProject\Python\audio_recognition\audio\o2020031309513910300127.wav"
    requestId = get_requestId(audio_file_path)
    get_recognition_result(requestId)
通過回調返回的結果除了沒有 audioUrl 參數之外,其他部分與直接使用錄音的 url 返回的數據是一樣的。