Python: Wake-Up + VOSK + DeepSeek + Offline Synthesis, a Fully Offline LLM Q&A Assistant

First, the technologies involved: offline voice wake-up, offline dictation with VOSK, the offline DeepSeek large language model, and offline simulated streaming speech synthesis.

Second, the result is a smooth voice interaction loop, and the model's conversation context is persisted to a JSON file with no length limit, giving it a permanent memory of past exchanges.

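For reference, the DeepSeek module below persists that context to res/knowledge.txt as a JSON array of role/content pairs. A minimal sketch of what the file might contain after one exchange (the wording is illustrative, not output from a real run):

[
    {
        "role": "user",
        "content": "你是谁?"
    },
    {
        "role": "assistant",
        "content": "我是一名中学生导师,很高兴为你解答问题。"
    }
]
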
Finally, beginners are welcome to search for and follow my WeChat official account: AI新视野-拾光纪. I will take you from zero to one through Java, web development, and artificial intelligence.

Bilibili video: PYTHON-唤醒+VOSK+DEEPSEEK+离线合成-实现纯离线大模型问答_哔哩哔哩_bilibili

The full code follows: the wake-up entry script, plus the three modules it imports, iat (VOSK dictation), deepseek (the local LLM call), and tts (offline synthesis).

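1) Wake-up entry script: loads the iFlytek ivw_msc_x64.dll, streams microphone audio into the wake-up engine, and fires a callback when the wake word is detected.
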
import ctypes
import os

import pyaudio

import iat

# Constants
APPID = "xxx"
WORK_DIR = "res"
IVW_DLL_PATH = os.path.join(WORK_DIR, "ivw_msc_x64.dll")
IVW_LOGIN_PARAMS = f"appid = {APPID}, work_dir = {WORK_DIR}"
IVW_SSB_PARAMS = "ivw_threshold=0:1450,sst=wakeup,ivw_shot_word=1,ivw_res_path=fo|res/ivw/wakeupresource.jet"
IVW_FRAME_SIZE = 6400  # bytes per write: 200 ms of 16 kHz, 16-bit mono audio
IVW_AUDIO_STATUS = 1

# Load the iFlytek MSC wake-up DLL
ivw_dll = ctypes.WinDLL(IVW_DLL_PATH)

# Declare function prototypes
ivw_dll.MSPLogin.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p]
ivw_dll.MSPLogin.restype = ctypes.c_int

ivw_dll.QIVWSessionBegin.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.POINTER(ctypes.c_int)]
ivw_dll.QIVWSessionBegin.restype = ctypes.c_char_p

ivw_dll.QIVWAudioWrite.argtypes = [ctypes.c_char_p, ctypes.POINTER(ctypes.c_ubyte), ctypes.c_uint, ctypes.c_int]
ivw_dll.QIVWAudioWrite.restype = ctypes.c_int

ivw_dll.QIVWSessionEnd.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
ivw_dll.QIVWSessionEnd.restype = ctypes.c_int

ivw_dll.QIVWRegisterNotify.argtypes = [ctypes.c_char_p,
                                       ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_int,
                                                        ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p),
                                       ctypes.c_void_p]
ivw_dll.QIVWRegisterNotify.restype = ctypes.c_int

ivw_dll.MSPLogout.argtypes = []
ivw_dll.MSPLogout.restype = ctypes.c_int


# Wake-up callback: invoked by the engine when the wake word is detected
def ivw_callback(sessionID, msg, param1, param2, info, userData):
    print(f"回调函数返回的唤醒结果...:{info.decode('utf-8')}")
    # Hand off to offline dictation (VOSK) once woken up
    iat.start_iat()
    return 0


# Wrap the Python callback as a C function pointer
CALLBACK = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_char_p, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_char_p,
                            ctypes.c_void_p)
callback_func = CALLBACK(ivw_callback)

# Initialize microphone capture (16 kHz, 16-bit mono)
p = pyaudio.PyAudio()
# IVW_FRAME_SIZE is a byte count; PyAudio counts frames (2 bytes each at paInt16)
stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True,
                frames_per_buffer=IVW_FRAME_SIZE // 2)


def start_ivw():
    ret = ivw_dll.MSPLogin(None, None, IVW_LOGIN_PARAMS.encode('utf-8'))
    if ret != 0:
        print(f"唤醒登录失败...:{ret}")
        return

    error_code = ctypes.c_int(-100)
    session_id = ivw_dll.QIVWSessionBegin(None, IVW_SSB_PARAMS.encode('utf-8'), ctypes.byref(error_code))
    if error_code.value != 0:
        print(f"开启唤醒会话失败...:{error_code.value}")
        return

    ret = ivw_dll.QIVWRegisterNotify(session_id, callback_func, None)
    if ret != 0:
        print(f"注册唤醒回调函数失败...:{ret}")
        return

    try:
        while True:
            # read() takes a frame count; IVW_FRAME_SIZE // 2 frames equals IVW_FRAME_SIZE bytes, about 200 ms of audio
            audio_data = stream.read(IVW_FRAME_SIZE // 2)
            audio_data_array = (ctypes.c_ubyte * len(audio_data)).from_buffer_copy(audio_data)
            ret = ivw_dll.QIVWAudioWrite(session_id, audio_data_array, len(audio_data), IVW_AUDIO_STATUS)
            if ret != 0:
                print(f"唤醒音频写入失败...:{ret}")
            # The blocking read above already paces the loop in real time, so no extra sleep is needed
    except Exception as e:
        print(e)
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()


if __name__ == "__main__":
    start_ivw()
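
2) iat.py (the module imported above as `iat`): after wake-up it greets the user through offline TTS, then feeds microphone audio to the VOSK model until a final result is recognized and handed to the DeepSeek module.
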
import ctypes
import os
import pyaudio
import json

from pydub import AudioSegment
from pydub.playback import play
from vosk import Model, KaldiRecognizer

import deepseek
import tts

# Constants
APPID = ""
WORK_DIR = "src/main/resources"


class IatConstants:
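    # Only IVW_ASR_AUDIO_FORMAT and IVW_FRAME_SIZE are referenced elsewhere in this file;
    # the remaining engine parameters below are not used by this module.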
    # 1. Wake-up related parameters
    IVW_ASR_AUDIO_FORMAT = pyaudio.paInt16
    IVW_DLL_PATH = os.path.join(WORK_DIR, "ivw_msc_x64.dll")
    IVW_LOGIN_PARAMS = f"appid={APPID}, work_dir={WORK_DIR}"
    IVW_SSB_PARAMS = "ivw_threshold=0:1450,sst=wakeup,ivw_shot_word=1,ivw_res_path=fo|res/ivw/wakeupresource.jet"
    IVW_ERROR_CODE = ctypes.c_int(-100)
    IVW_FRAME_SIZE = 6400
    IVW_AUDIO_STATUS = 1

    # 2. Synthesis related parameters
    TTS_DLL_PATH = os.path.join(WORK_DIR, "tts_msc_x64.dll")
    TTS_LOGIN_PARAMS = f"appid={APPID}, work_dir={WORK_DIR}"
    TTS_SESSION_BEGIN_PARAMS = "engine_type=local, voice_name=xiaoyuan, text_encoding=UTF8, tts_res_path=fo|res/tts/xiaoyuan.jet;fo|res/tts/common.jet, sample_rate=16000, speed=50, volume=50, pitch=50, rdn=2"
    TTS_ERROR_CODE = ctypes.c_int(-100)
    TTS_AUDIO_LEN = ctypes.c_int(-100)
    TTS_SYNTH_STATUS = ctypes.c_int(-100)
    TTS_TEXT = ""
    TTS_TOTAL_AUDIO_LENGTH = 0
    TTS_BYTE_ARRAY_OUTPUT_STREAM = bytearray()

    # 3. Offline command word related parameters
    ASR_DLL_PATH = os.path.join(WORK_DIR, "asr_msc_x64.dll")
    ASR_LOGIN_PARAMS = f"appid={APPID}, work_dir={WORK_DIR}"
    ASR_CALL_BNF_PATH = os.path.join(WORK_DIR, "msc/res/asr/call.bnf")
    ASR_BUILD_PARAMS = "engine_type=local,asr_res_path=fo|res/asr/common.jet,sample_rate=16000,grm_build_path=res/asr/GrmBuilld_x64"
    ASR_LEX_PARAMS = "engine_type=local,asr_res_path=fo|res/asr/common.jet,sample_rate=16000,grm_build_path=res/asr/GrmBuilld_x64,grammar_list=call"
    ASR_ERROR_CODE = ctypes.c_int(-100)
    ASR_SESSION_PARAMS = "vad_bos=3000,vad_eos=10000,engine_type=local,asr_res_path=fo|res/asr/common.jet,sample_rate=16000,grm_build_path=res/asr/GrmBuilld_x64,local_grammar=call,result_type=json,result_encoding=UTF8"
    ASR_EP_STATUS = ctypes.c_int(-100)
    ASR_RECOG_STATUS = ctypes.c_int(-100)
    ASR_AUDIO_STATUS = 1
    ASR_FRAME_SIZE = 640
    ASR_GRAMMAR_CONTENT = ""
    ASR_RESULT_STATUS = ctypes.c_int(-100)


# Initialize the capture stream once at import time; it is reused across wake-ups
p1 = pyaudio.PyAudio()
IVW_ASR_TARGET_DATA_LINE = p1.open(format=IatConstants.IVW_ASR_AUDIO_FORMAT,
                                   channels=1,
                                   rate=16000,
                                   input=True,
                                   frames_per_buffer=IatConstants.IVW_FRAME_SIZE)


# Offline dictation (VOSK), triggered after wake-up
def start_iat():
    # Alternative prompt: play a pre-recorded WAV instead of synthesizing one
    # wav_file_path = "res/ivw.wav"
    # audio = AudioSegment.from_wav(wav_file_path)
    # play(audio)
    tts.TtsService.start_tts("主人,您请说[p1000]")
    print("开始调用听写")
    model = Model("res/vosk-model-small-cn-0.22")
    recognizer = KaldiRecognizer(model, 16000)
    try:
        while True:
            buffer = IVW_ASR_TARGET_DATA_LINE.read(IatConstants.IVW_FRAME_SIZE)
            if len(buffer) == 0:
                break

            if recognizer.AcceptWaveform(buffer):
                result = recognizer.Result()
                json_result = json.loads(result)
                print("最终识别结果==》", json_result.get("text"))
                deepseek.start_model(json_result.get("text"))
                break  # Only handle one utterance per wake-up
            else:
                partial_result = recognizer.PartialResult()
                json_partial = json.loads(partial_result)
                print("中间识别结果==》", json_partial.get("partial"))
    except Exception as e:
        print(e)


if __name__ == "__main__":
    start_iat()
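
3) deepseek.py: sends the recognized question to a local Ollama server through its OpenAI-compatible /v1/chat/completions endpoint, streams the answer into TTS one sentence at a time, and appends both question and answer to res/knowledge.txt so the context survives across runs.
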
import json
import re

import requests

import tts

BASE_URL = "http://localhost:11434/v1/"
API_KEY = "ollama"  # required but ignored
FILE_PATH = "res/knowledge.txt"


class ModelHistory:
    def __init__(self, role: str, content: str):
        self.role = role
        self.content = content

    def to_dict(self):
        return {"role": self.role, "content": self.content}


def read_file_content(file_path: str) -> str:
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except IOError as e:
        print(f"Error reading file: {e}")
        return ""


def write_file_content(file_path: str, content: str):
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)
    except IOError as e:
        print(f"Error writing file: {e}")


def start_model(user_question: str):
    # Load the persisted conversation history (a JSON list of {"role": ..., "content": ...} dicts)
    content = read_file_content(FILE_PATH)
    model_history_list = json.loads(content) if content else []

    # Rebuild the message list from the saved history so the model keeps its long-term context
    messages = [{"role": history["role"], "content": history["content"]} for history in model_history_list]

    messages.append({"role": "system",
                     "content": "你的角色是:中学生问题回答导师。当用户问你中学问题时,要以专业、敬业、友好的态度问答。当问到你是谁的时候,你应该回答你是一名中学生导师,并给与修饰解释。"})
    messages.append({"role": "user", "content": user_question})
    print(messages)

    model_history_list.append(ModelHistory("user", user_question).to_dict())

    request_body = {
        "messages": messages,
        "model": "deepseek-r1:14b",
        "stream": True,
        "temperature": 0.0
    }

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    response = requests.post(f"{BASE_URL}chat/completions", json=request_body, headers=headers, stream=True)

    if response.status_code != 200:
        raise Exception(f"Unexpected code {response.status_code}")

    temp_res = ""   # buffer for the sentence currently being spoken
    out_res = ""    # full answer, written to the history file afterwards
    for line in response.iter_lines():
        if line:
            decoded_line = line.decode('utf-8')
            if decoded_line.startswith("data: ") and "[DONE]" not in decoded_line:
                json_data = json.loads(decoded_line.replace("data: ", ""))
                for choice in json_data["choices"]:
                    delta_content = choice["delta"].get("content") or ""
                    print(delta_content, end="", flush=True)
                    temp_res += delta_content
                    out_res += delta_content

                # Speak each complete sentence (up to the first "。") as soon as it arrives,
                # simulating streaming synthesis on top of a sentence-level TTS call
                mark_1 = temp_res.find("。")
                if mark_1 > 0:
                    tts.TtsService.start_tts(temp_res[:mark_1 + 1])
                    temp_res = temp_res[mark_1 + 1:]

    # Speak whatever is left after the last full stop
    tts.TtsService.start_tts(temp_res)

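    # DeepSeek-R1 wraps its reasoning in <think>...</think>; strip it so only the final answer is stored in the history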
    out_res = re.sub(r'<think>[\s\S]*?</think>', '', out_res)
    model_history_list.append(ModelHistory("assistant", out_res).to_dict())

    write_file_content(FILE_PATH, json.dumps(model_history_list, indent=4, ensure_ascii=False))


if __name__ == "__main__":
    start_model("你是谁?")
import ctypes
import os
from ctypes import byref, c_int, c_char_p, c_void_p, POINTER

import pyaudio


# Constants
class Constants:
    APPID = "xxx"
    WORK_DIR = "res"
    TTS_DLL_PATH = os.path.join(WORK_DIR, "tts_msc_x64.dll")
    TTS_LOGIN_PARAMS = f"appid = {APPID}, work_dir = {WORK_DIR}"
    TTS_SESSION_BEGIN_PARAMS = (
        "engine_type = local, voice_name = xiaoyuan, text_encoding = UTF8, "
        "tts_res_path = fo|res/tts/xiaoyuan.jet;fo|res/tts/common.jet, sample_rate = 16000, "
        "speed = 50, volume = 50, pitch = 50, rdn = 2"
    )
    TTS_ERROR_CODE = c_int(-100)
    TTS_AUDIO_LEN = c_int(-100)
    TTS_SYNTH_STATUS = c_int(-100)
    TTS_TEXT = ""
    TTS_TOTAL_AUDIO_LENGTH = 0
    TTS_BYTE_ARRAY_OUTPUT_STREAM = bytearray()
    TTS_AUDIO_FORMAT = pyaudio.paInt16
    TTS_CHANNELS = 1
    TTS_RATE = 16000
    TTS_FRAME_SIZE = 640


# Load the iFlytek local TTS DLL
tts_dll = ctypes.WinDLL(Constants.TTS_DLL_PATH)

# Declare function prototypes
tts_dll.MSPLogin.argtypes = [c_char_p, c_char_p, c_char_p]
tts_dll.MSPLogin.restype = c_int

tts_dll.QTTSSessionBegin.argtypes = [c_char_p, POINTER(c_int)]
tts_dll.QTTSSessionBegin.restype = c_char_p

tts_dll.QTTSTextPut.argtypes = [c_char_p, c_char_p, c_int, c_char_p]
tts_dll.QTTSTextPut.restype = c_int

tts_dll.QTTSAudioGet.argtypes = [c_char_p, POINTER(c_int), POINTER(c_int), POINTER(c_int)]
tts_dll.QTTSAudioGet.restype = c_void_p

tts_dll.QTTSSessionEnd.argtypes = [c_char_p, c_char_p]
tts_dll.QTTSSessionEnd.restype = c_int

tts_dll.MSPLogout.argtypes = []
tts_dll.MSPLogout.restype = c_int


# TTS service
class TtsService:
    @staticmethod
    def start_tts(tts_text):
        Constants.TTS_TEXT = tts_text
        Constants.TTS_BYTE_ARRAY_OUTPUT_STREAM = bytearray()
        Constants.TTS_TOTAL_AUDIO_LENGTH = 0

        ret = tts_dll.MSPLogin(None, None, Constants.TTS_LOGIN_PARAMS.encode('utf-8'))
        if ret != 0:
            print(f"合成登录失败...:{ret}")
            return

        session_id = tts_dll.QTTSSessionBegin(Constants.TTS_SESSION_BEGIN_PARAMS.encode('utf-8'),
                                              byref(Constants.TTS_ERROR_CODE))
        if Constants.TTS_ERROR_CODE.value != 0:
            print(f"合成开启会话失败...:{Constants.TTS_ERROR_CODE.value}")
            return

        ret = tts_dll.QTTSTextPut(session_id, Constants.TTS_TEXT.encode('utf-8'),
                                  len(Constants.TTS_TEXT.encode('utf-8')), None)
        if ret != 0:
            print(f"合成音频失败...:{ret}")
            return

        # Initialize audio playback (16 kHz, 16-bit mono)
        p = pyaudio.PyAudio()
        stream = p.open(format=Constants.TTS_AUDIO_FORMAT,
                        channels=Constants.TTS_CHANNELS,
                        rate=Constants.TTS_RATE,
                        output=True)

        while True:
            audio_data = tts_dll.QTTSAudioGet(session_id, byref(Constants.TTS_AUDIO_LEN),
                                              byref(Constants.TTS_SYNTH_STATUS), byref(Constants.TTS_ERROR_CODE))
            if Constants.TTS_ERROR_CODE.value != 0:
                print(f"获取音频数据失败...:{Constants.TTS_ERROR_CODE.value}")
                break

            # Play (and buffer) any audio returned by this call before checking the status,
            # otherwise a final chunk delivered together with the "finished" flag would be dropped
            if Constants.TTS_AUDIO_LEN.value > 0 and audio_data:
                audio_buffer = ctypes.string_at(audio_data, Constants.TTS_AUDIO_LEN.value)
                Constants.TTS_BYTE_ARRAY_OUTPUT_STREAM.extend(audio_buffer)
                stream.write(audio_buffer)

            if Constants.TTS_SYNTH_STATUS.value == 2:  # synthesis finished
                break

        stream.stop_stream()
        stream.close()
        p.terminate()

        ret = tts_dll.QTTSSessionEnd(session_id, None)
        if ret != 0:
            print(f"结束会话失败...:{ret}")

        ret = tts_dll.MSPLogout()
        if ret != 0:
            print(f"退出失败...:{ret}")


# Call the TTS service with a test sentence
if __name__ == "__main__":
    TtsService.start_tts("你好,这是一个测试文本。")
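
To run the pipeline end to end: make sure a local Ollama instance is serving deepseek-r1:14b at http://localhost:11434/v1/, place the iFlytek offline resources (the DLLs, wakeupresource.jet, xiaoyuan.jet, common.jet) and the unpacked vosk-model-small-cn-0.22 model under res/, then start the wake-up entry script. Saying the wake word fires the callback, which starts VOSK dictation, forwards the recognized question to DeepSeek, and speaks the streamed answer sentence by sentence while appending the exchange to res/knowledge.txt.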