Python3使用騰訊雲API接口實現音頻文件轉寫(支持本地音頻或者url音頻)

本文參考鏈接:

  1. https://blog.csdn.net/wangliuqi123/article/details/80527417
  2. https://blog.csdn.net/wangliuqi123/article/details/80537635

之前分別用了百度和科大訊飛的接口來實現音頻文件的轉寫。百度的幾乎是免費的,但是效果實在有點不盡如人意,科大訊飛的效果倒是很不錯,但是有一點,“貴”。於是乎,想着有沒有一個折中的方案:識別效果不能太差,但是價錢也最好不要太貴。轉來轉去,於是轉到騰訊這兒來了。

騰訊的語音識別也是有 Python SDK的,鏈接在此:https://cloud.tencent.com/document/product/1093/35731,Demo的下載地址在這個頁面就有,不過暫時只有Python2的,雖說它自己也說後續會支持Python3,但是如果能早點用Python3的話,又何樂而不爲?只不過Demo改改確實需要踩很多坑。

它的Demo裏面主要有三個Python文件:Config.py、OfflineCLient.py、offlineSdk.py。針對在Python3環境下需要做的修改,我進行部分說明:

Config.py 其實沒什麼好改的,主要是 SECRET_KEY,  SECRETID, APPID 用自己申請的填好就是

# -*- coding:utf-8 -*-
'''
Created on 2019-4-28
@author: iantang
'''

class Config:
    """Global configuration for the offline speech-recognition demo.

    Replace the placeholder values below with your own settings before
    sending any request.
    """

    # ------------- Required ---------------
    # How to obtain AppId, secretId and secretKey (see the screenshots at):
    # https://cloud.tencent.com/document/product/441/6203
    # Console path: account menu (top right) --> Access Management -->
    # Access Keys (left menu) --> API Key Management
    SECRET_KEY = '***'
    SECRETID = '*****'
    APPID = '***'
    # Recognition results are sent to this URL with a POST request, so a
    # service able to receive that POST has to be running there first.
    CALLBACK_URL = "http://xxx.xxxxx.xxx"  # required

    # ------------- Optional, set according to your needs ---------------
    # Recognition engine: 8k_0, 16k_0 or 8k_6. 8k_6 supports speaker
    # separation, which is why it is selected here.
    ENGINE_MODEL_TYPE = '8k_6'
    # Number of audio channels, 1 or 2. The 8k telephone model supports
    # 1 and 2; other models support a single channel only.
    CHANNEL_NUM = 1
    # Text encoding of the result. 0: UTF-8, 1: GB2312, 2: GBK, 3: BIG5
    RES_TEXT_FORMAT = 0
    # Audio source. 0: audio URL; 1: audio data in the POST body
    SOURCE_TYPE = 0

    # ------------- Optional, defaults are normally fine ---------------
    # Tencent Cloud project ID; use 0, or a project configured under
    # Console > Account Center > Project Management.
    PROJECT_ID = 0
    # Sub-service type. 0: offline speech recognition.
    SUB_SERVICE_TYPE = 0
    # How results are returned. 0: synchronous; 1: asynchronous.
    # Only asynchronous return is currently supported.
    RES_TYPE = 1
    # Tencent server URL; normally no need to change.
    REQUEST_URL = "https://aai.qcloud.com/asr/v1/"
    # URL used when computing the signature; normally no need to change.
    SIGN_URL = "aai.qcloud.com/asr/v1/"

    # ------------- Initialisation and validation ---------------
    def __init__(self):
        # Kept from the original demo: emits a single blank line.
        print("")

    def verifyProperties(self):
        """Print a message for the first invalid setting, if any.

        Settings are checked in a fixed order and checking stops at the
        first problem found. Nothing is printed when every setting is
        acceptable. Always returns None.
        """
        # Credentials and callback URL only have to be non-empty.
        for name in ('SECRET_KEY', 'SECRETID', 'APPID', 'CALLBACK_URL'):
            if not str(getattr(self, name)):
                print(name + ' can not empty')
                return

        # Settings restricted to a fixed set of values, compared as strings
        # so that both int and str settings are accepted.
        # CHANNEL_NUM is a channel count: 1 or 2 (0 is not a valid count).
        choices_before_project = (
            ('ENGINE_MODEL_TYPE', ('8k_0', '16k_0', '8k_6')),
            ('CHANNEL_NUM', ('1', '2')),
            ('RES_TEXT_FORMAT', ('0', '1', '2', '3')),
            ('SOURCE_TYPE', ('0', '1')),
        )
        for name, allowed in choices_before_project:
            if str(getattr(self, name)) not in allowed:
                print(name + ' is not right')
                return

        if not str(self.PROJECT_ID):
            print('self.PROJECT_ID can not empty')
            return

        choices_after_project = (
            ('SUB_SERVICE_TYPE', ('0', '1')),
            ('RES_TYPE', ('0', '1')),
        )
        for name, allowed in choices_after_project:
            if str(getattr(self, name)) not in allowed:
                print(name + ' is not right')
                return
        
# Module-level instance; other modules read the settings as Config.config.
# The settings are checked once, when this module is imported.
config = Config()
config.verifyProperties()

offlineSdk.py 中修改的地方有一部分,我已經在源碼中進行了註釋

# -*- coding:utf-8 -*-

import requests
import hashlib
import time
import hmac
import base64
import urllib
import urllib.parse  # 新增
import Config

def task_process(audio_url):
    """Submit one recognition request for *audio_url* and return the reply text.

    The request parameters are taken from ``Config.config``, signed with
    ``generate_sign`` and sent with a POST to the URL built by
    ``generate_request``.
    """
    settings = Config.config
    # Key order matters: generate_request emits the query string in
    # insertion order.
    params = {
        'channel_num': settings.CHANNEL_NUM,
        'secretid': settings.SECRETID,
        'engine_model_type': settings.ENGINE_MODEL_TYPE,
        'timestamp': int(time.time()),
        'expired': int(time.time()) + 3600,
        'nonce': 6666,
        'projectid': settings.PROJECT_ID,
        'callback_url': settings.CALLBACK_URL,
        'res_text_format': settings.RES_TEXT_FORMAT,
        'res_type': settings.RES_TYPE,
        'source_type': settings.SOURCE_TYPE,
        'sub_service_type': settings.SUB_SERVICE_TYPE,
        # The Python 2 demo passed urllib.quote(audio_url) here; this
        # Python 3 port sends the URL as-is.
        'url': audio_url,
    }

    signature = generate_sign(params)
    endpoint = generate_request(params)
    headers = {
        "Content-Type": "application/json",
        # Passed without str(): per the port's author, wrapping the
        # signature in str() under Python 3 makes authentication fail.
        "Authorization": signature,
    }
    response = requests.post(endpoint, headers=headers, data=params)
    return response.text


def generate_sign(request_data):
    """Return the base64-encoded HMAC-SHA1 signature (bytes) for the request.

    The signed text is "POST" + SIGN_URL + APPID + "?" followed by
    ``key=value`` pairs in sorted key order joined with "&". Each value is
    converted to ``str`` and URL-unquoted before signing.
    """
    prefix = "POST" + Config.config.SIGN_URL + str(Config.config.APPID) + "?"
    # Python 3 port: urllib.parse.unquote replaces Python 2's urllib.unquote.
    pairs = "".join(
        name + "=" + urllib.parse.unquote(str(request_data[name])) + '&'
        for name in sorted(request_data)
    )
    # Drop the final character: the trailing "&", or the "?" when there are
    # no parameters at all.
    text_to_sign = (prefix + pairs)[:-1]
    # hmac needs bytes for both key and message in Python 3.
    mac = hmac.new(
        bytes(Config.config.SECRET_KEY, 'utf-8'),
        bytes(text_to_sign, 'utf-8'),
        hashlib.sha1,
    )
    return base64.b64encode(mac.digest())


def generate_request(request_data):
    """Return REQUEST_URL + APPID + "?" + the parameters as ``key=value&...``.

    Parameters appear in the dict's iteration order; values are converted
    with ``str`` and are not URL-encoded.
    """
    base = Config.config.REQUEST_URL + str(Config.config.APPID) + "?"
    query = "".join(
        name + "=" + str(value) + '&' for name, value in request_data.items()
    )
    # Drop the final character: the trailing "&", or the "?" when there are
    # no parameters at all.
    return (base + query)[:-1]

if __name__ == '__main__':
    # Audio URL; must be publicly downloadable. Required when source_type
    # is 0 and omitted when it is 1; length greater than 0 and less than 2048.
    audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
    # The returned response text is neither printed nor stored here.
    task_process(audio_url)

OfflineCLient.py也沒什麼修改的, audio_url得用公網可訪問的url地址,然後運行它就好了。

# -*- coding:utf-8 -*-
import offlineSdk
import Config

# Note: set the options in Config.py to your own values before using this script.

# Audio location. Each call to task_process sends one request.
# Audio URL; must be publicly downloadable. Required when source_type is 0
# and omitted when it is 1; length greater than 0 and less than 2048.
audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# Submit the recognition request; the value returned is the text of the
# service's reply to the submission.
result = offlineSdk.task_process(audio_url)
print (result)


# ------------------------------------------------------------------------------------
# To change settings between requests, modify them directly and send again, e.g.:
# Config.config.CALLBACK_URL = ""
# Config.config.ENGINE_MODEL_TYPE = "16k_0"
# ......
# audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# result = offlineSdk.task_process(audio_url)
# print (result)

然後你會發現運行結果是這樣的,看起來有個success,好像是成功了:

這個時候你肯定會問:我錄音的識別結果在哪兒呢?怎麼沒返回回來?

你肯定還記得 Config.py 的配置裏面有一個 CALLBACK_URL = "http://xxx.xxxxx.xxx" 參數,而且根據騰訊的語音識別API文檔,這個還是必填的參數,那怎麼辦? 騰訊是以回調的方式返回識別結果的,而不是像訊飛那樣以輪詢的方式查看是否有識別結果返回,所以得自己搭建服務處理騰訊回調的POST請求。當然,解決這個問題最簡單的方法就是,你可以用 flask 寫一個處理回調的路由函數,我把我自己做的數據解析放這兒了,如果覺得有用請自取:

import json

from flask import Flask, request

app = Flask(__name__)


def _timestamp_to_ms(stamp):
    """Convert a "minutes:seconds" stamp to a millisecond count, as a string.

    The seconds part is turned into milliseconds by deleting its ".", so
    it is assumed to carry exactly three fractional digits -- TODO confirm
    against the service's output format.
    """
    parts = stamp.split(":")
    return str(int(parts[0]) * 60000 + int(parts[1].replace(".", "")))


@app.route('/data', methods=['POST'])
def testpost():
    """Receive a recognition-result callback and save the transcript.

    When the posted ``message`` field equals '成功', every line of the
    ``text`` field is parsed and written as a tab-separated row (speaker,
    content, file stem, begin ms, end ms) to ``<file stem>.txt``, where
    the stem comes from the last path segment of ``audioUrl``. Each row
    is also printed.

    Returns a JSON string: code 0 for a stored result, code 1 otherwise.
    """
    print(request.form)
    if request.form['message'] == '成功':
        # File stem of the audio URL, e.g. ".../test.wav" -> "test".
        filename = request.form['audioUrl'].split('/')[-1].split('.')[0]
        recognition_text = request.form['text']
        # The context manager closes the file even if a line fails to parse.
        with open(filename + ".txt", 'w', encoding='utf-8') as doc:
            # The element after the final newline is an empty string; skip it.
            for sentence in recognition_text.split('\n')[0:-1]:
                # Presumably "[begin,end,speaker]  content": a bracketed
                # header, two spaces, then the spoken text.
                fields = sentence.split('  ')
                header_parts = fields[0].split(',')
                content = fields[1]
                begin_time = _timestamp_to_ms(header_parts[0][1:])  # drop first char
                end_time = _timestamp_to_ms(header_parts[1])
                speaker = header_parts[-1][:-1]  # drop last char
                row = (speaker + "\t" + content + '\t' + filename + '\t'
                       + begin_time + '\t' + end_time)
                print(row)
                print(row, file=doc)
        reply = {
            "code": 0,
            "message": "成功"
        }
    else:
        reply = {
            "code": 1,
            "message": "失敗"
        }
    print(reply)
    return json.dumps(reply)


if __name__ == '__main__':
    # Bind to every interface on port 9979 so the callback request can
    # reach the /data route from outside this machine.
    app.run(host="0.0.0.0", port=9979, threaded=True)

當然,如果你有公網可訪問的 IP 地址或者域名的話,在Linux上執行上面這段程序:

然後修改CALLBACK_URL 的值爲CALLBACK_URL =  'http://你的公網IP:9979/data' ,然後用 Python3 運行 OfflineCLient.py ,過一會兒你就可以在 Linux 這邊收到錄音識別的結果了。

如果沒有公網可訪問的 IP 或者域名,請參考:https://blog.csdn.net/wangliuqi123/article/details/80537635 進行配置。

如果你想上傳本地的語音文件進行識別,請參考下面的demo:

# -*- coding: utf-8 -*-
"""
@author: Looking
@email: [email protected]
"""
import os
import requests
import hashlib
import time
import hmac
import base64
import urllib
import urllib.parse
import json
import base64
import Config
import random

from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.asr.v20190614 import asr_client, models

# Module-level settings for this script: credentials come from Config,
# the service URLs and the callback URL are set here.
appid = Config.config.APPID
req_url = "https://aai.qcloud.com/asr/v1/"
callback_url = "http://*******:9979/data"  # a service that accepts the POSTed result must be running here
sign_url = "aai.qcloud.com/asr/v1/"
secret_id = Config.config.SECRETID
secret_key = Config.config.SECRET_KEY


def task_process(audio_url):
    """Upload a local audio file for recognition and return the reply text.

    Despite its name, *audio_url* is used as a local file path: the file
    is read in binary mode and its bytes are sent as the POST body
    (``source_type`` 1). The other parameters travel in the query string.
    """
    # Key order matters: generate_request emits the query string in
    # insertion order.
    params = {
        'channel_num': 1,
        'secretid': secret_id,
        'engine_model_type': "8k_6",
        'timestamp': int(time.time()),
        'expired': int(time.time()) + 3600,
        'nonce': 1559,
        'projectid': 0,
        'callback_url': callback_url,
        'res_text_format': 0,
        'res_type': 1,
        'source_type': 1,
        'sub_service_type': 0,
    }

    with open(audio_url, 'rb') as audio_file:
        payload = audio_file.read()
    payload_size = str(len(payload))

    signature = generate_sign(params, appid)
    endpoint = generate_request(params, appid)
    headers = {
        "Authorization": signature,
        "Content-Length": payload_size,
    }

    response = requests.post(endpoint, headers=headers, data=payload)
    return response.text


def generate_sign(request_data, appid):
    """Return the base64-encoded HMAC-SHA1 signature (bytes) for the request.

    The signed text is "POST" + sign_url + appid + "?" followed by
    ``key=value`` pairs in sorted key order joined with "&". Each value is
    converted to ``str`` and URL-unquoted before signing. The key is
    ``Config.config.SECRET_KEY``.
    """
    prefix = "POST" + sign_url + str(appid) + "?"
    pairs = "".join(
        name + "=" + urllib.parse.unquote(str(request_data[name])) + '&'
        for name in sorted(request_data)
    )
    # Drop the final character: the trailing "&", or the "?" when there are
    # no parameters at all.
    text_to_sign = (prefix + pairs)[:-1]
    # hmac needs bytes for both key and message in Python 3.
    mac = hmac.new(
        bytes(Config.config.SECRET_KEY, 'utf-8'),
        bytes(text_to_sign, 'utf-8'),
        hashlib.sha1,
    )
    return base64.b64encode(mac.digest())


def generate_request(request_data, appid):
    """Return req_url + appid + "?" + the parameters as ``key=value&...``.

    Parameters appear in the dict's iteration order; values are converted
    with ``str`` and are not URL-encoded.
    """
    base = req_url + str(appid) + "?"
    query = "".join(
        name + "=" + str(value) + '&' for name, value in request_data.items()
    )
    # Drop the final character: the trailing "&", or the "?" when there are
    # no parameters at all.
    return (base + query)[:-1]


def get_requestId(audio_file_path):
    """Submit *audio_file_path* for recognition and return its request id.

    The raw reply text is printed, then parsed as JSON.

    Raises:
        ValueError: the reply is not valid JSON (json.JSONDecodeError).
        KeyError: the reply has no "requestId" field.
    """
    request_result = task_process(audio_file_path)
    print(request_result)
    # Parse with json.loads rather than eval(): the text comes from the
    # network and must never be executed as Python code. This also accepts
    # JSON literals (true/false/null) that eval() would choke on.
    return json.loads(request_result)["requestId"]


def _clock_to_ms(stamp):
    """Convert a "minutes:seconds" stamp to a millisecond count, as a string.

    The seconds part is turned into milliseconds by deleting its ".", so
    it is assumed to carry exactly three fractional digits -- TODO confirm
    against the service's output format.
    """
    parts = stamp.split(":")
    return str(int(parts[0]) * 60000 + int(parts[1].replace(".", "")))


def get_recognition_result(requestId):
    """Poll the task status once per second and print the transcript rows.

    Polling continues until the status string is "success" or "failed".
    On success each line of the result is printed as a tab-separated row:
    speaker, content, begin ms, end ms. On failure, or on any other
    TencentCloudSDKException, the exception is printed and the function
    returns normally.
    """
    try:
        cred = credential.Credential(Config.config.SECRETID, Config.config.SECRET_KEY)
        httpProfile = HttpProfile()
        httpProfile.endpoint = "asr.tencentcloudapi.com"

        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = asr_client.AsrClient(cred, "ap-guangzhou", clientProfile)

        while True:
            req = models.DescribeTaskStatusRequest()
            req.from_json_string('{"TaskId":%s}' % requestId)
            resp = client.DescribeTaskStatus(req)
            task = json.loads(resp.to_json_string())['Data']
            status = task['StatusStr']
            if status == "success":
                print(task['TaskId'], "識別成功!")
                break
            if status == "failed":
                # Raise with a code and message so the handler below prints
                # something meaningful instead of empty fields.
                raise TencentCloudSDKException(
                    "TaskFailed",
                    "recognition task %s failed: %s" % (requestId, task.get('ErrorMsg')),
                )
            time.sleep(1)

        # The element after the final newline is an empty string; skip it.
        for sentence in task['Result'].split('\n')[0:-1]:
            # Presumably "[begin,end,speaker]  content": a bracketed header,
            # two spaces, then the spoken text.
            fields = sentence.split('  ')
            header_parts = fields[0].split(',')
            content = fields[1]
            begin_time = _clock_to_ms(header_parts[0][1:])  # drop first char
            end_time = _clock_to_ms(header_parts[1])
            speaker = header_parts[-1][:-1]  # drop last char
            print(speaker + "\t" + content + '\t' + begin_time + '\t' + end_time)
    except TencentCloudSDKException as err:
        print(err)


if __name__ == '__main__':
    # Path of the local audio file to upload; replace with your own file.
    audio_file_path = r"D:\MyProject\Python\audio_recognition\audio\o2020031309513910300127.wav"
    # Submit the file, then poll for and print the recognition result.
    requestId = get_requestId(audio_file_path)
    get_recognition_result(requestId)

通過回調返回的結果除了沒有 audioUrl 參數之外,其他部分與直接使用錄音的 url 返回的數據是一樣的。

 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章