Voice Chat AI for Mac
import asyncio
from dotenv import load_dotenv
import shutil
import subprocess
import requests
import time
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
#from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
load_dotenv()
class LanguageModelProcessor:
def __init__(self):
        self.llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", groq_api_key=os.getenv("GROQ_API_KEY"))  # alternative model: mixtral-8x7b-32768
# self.llm = ChatOpenAI(temperature=0, model_name="gpt-4-0125-preview", openai_api_key=os.getenv("OPENAI_API_KEY"))
# self.llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo-0125", openai_api_key=os.getenv("OPENAI_API_KEY"))
self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Load the system prompt from a file
with open('system_prompt.txt', 'r') as file:
system_prompt = file.read().strip()
self.prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(system_prompt),
MessagesPlaceholder(variable_name="chat_history"),
HumanMessagePromptTemplate.from_template("{text}")
])
self.conversation = LLMChain(
llm=self.llm,
prompt=self.prompt,
memory=self.memory
)
    def process(self, text):
        start_time = time.time()

        # Get the response from the LLM. LLMChain saves the user message and
        # the AI reply to memory automatically, so adding them manually here
        # would duplicate every turn in the chat history.
        response = self.conversation.invoke({"text": text})

        end_time = time.time()
        elapsed_time = int((end_time - start_time) * 1000)
        print(f"LLM ({elapsed_time}ms): {response['text']}")
        return response['text']
class TextToSpeech:
# Set your Deepgram API Key and desired voice model
DG_API_KEY = os.getenv("DEEPGRAM_API_KEY")
MODEL_NAME = "aura-helios-en" # Example model name, change as needed
@staticmethod
def is_installed(lib_name: str) -> bool:
lib = shutil.which(lib_name)
return lib is not None
def speak(self, text):
ffplay_path = "/opt/homebrew/bin/ffplay" # ffplayのパスを指定する
if not self.is_installed(ffplay_path):
raise ValueError("ffplay not found at the specified path. Please specify the correct path to ffplay.")
DEEPGRAM_URL = f"https://api.deepgram.com/v1/speak?model={self.MODEL_NAME}&performance=some&encoding=linear16&sample_rate=24000"
headers = {
"Authorization": f"Token {self.DG_API_KEY}",
"Content-Type": "application/json"
}
payload = {
"text": text
}
player_command = [ffplay_path, "-autoexit", "-", "-nodisp"]
player_process = subprocess.Popen(
player_command,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
start_time = time.time() # Record the time before sending the request
first_byte_time = None # Initialize a variable to store the time when the first byte is received
with requests.post(DEEPGRAM_URL, stream=True, headers=headers, json=payload) as r:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
if first_byte_time is None: # Check if this is the first chunk received
first_byte_time = time.time() # Record the time when the first byte is received
ttfb = int((first_byte_time - start_time)*1000) # Calculate the time to first byte
print(f"TTS Time to First Byte (TTFB): {ttfb}ms\n")
player_process.stdin.write(chunk)
player_process.stdin.flush()
if player_process.stdin:
player_process.stdin.close()
player_process.wait()
class TranscriptCollector:
def __init__(self):
self.reset()
def reset(self):
self.transcript_parts = []
def add_part(self, part):
self.transcript_parts.append(part)
def get_full_transcript(self):
return ' '.join(self.transcript_parts)
transcript_collector = TranscriptCollector()
async def get_transcript(callback):
transcription_complete = asyncio.Event() # Event to signal transcription completion
try:
# example of setting up a client config. logging values: WARNING, VERBOSE, DEBUG, SPAM
config = DeepgramClientOptions(options={"keepalive": "true"})
        deepgram: DeepgramClient = DeepgramClient("", config)  # an empty key makes the SDK fall back to the DEEPGRAM_API_KEY environment variable
dg_connection = deepgram.listen.asynclive.v("1")
print ("Listening...")
async def on_message(self, result, **kwargs):
sentence = result.channel.alternatives[0].transcript
if not result.speech_final:
transcript_collector.add_part(sentence)
else:
# This is the final part of the current sentence
transcript_collector.add_part(sentence)
full_sentence = transcript_collector.get_full_transcript()
# Check if the full_sentence is not empty before printing
if len(full_sentence.strip()) > 0:
full_sentence = full_sentence.strip()
print(f"Human: {full_sentence}")
callback(full_sentence) # Call the callback with the full_sentence
transcript_collector.reset()
transcription_complete.set() # Signal to stop transcription and exit
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
options = LiveOptions(
model="nova-2",
punctuate=True,
language="en-US",
encoding="linear16",
channels=1,
sample_rate=16000,
endpointing=300,
smart_format=True,
)
await dg_connection.start(options)
# Open a microphone stream on the default input device
microphone = Microphone(dg_connection.send)
microphone.start()
await transcription_complete.wait() # Wait for the transcription to complete instead of looping indefinitely
# Wait for the microphone to close
microphone.finish()
# Indicate that we've finished
await dg_connection.finish()
except Exception as e:
print(f"Could not open socket: {e}")
return
class ConversationManager:
def __init__(self):
self.transcription_response = ""
self.llm = LanguageModelProcessor()
async def main(self):
def handle_full_sentence(full_sentence):
self.transcription_response = full_sentence
# Loop indefinitely until "goodbye" is detected
while True:
await get_transcript(handle_full_sentence)
# Check for "goodbye" to exit the loop
if "goodbye" in self.transcription_response.lower():
break
llm_response = self.llm.process(self.transcription_response)
tts = TextToSpeech()
tts.speak(llm_response)
# Reset transcription_response for the next loop iteration
self.transcription_response = ""
if __name__ == "__main__":
manager = ConversationManager()
asyncio.run(manager.main())
The code above is a complete Python implementation of a voice chat AI for Mac.
1. Setup
Create a .env file for the API keys.
Before running, create a .env file like the one below in the same directory as the Python script (you will need API keys for Deepgram and Groq):
DEEPGRAM_API_KEY=XXXXXXXXXXXXXX
GROQ_API_KEY=XXXXXXXXXXXXXX
The code also reads its system prompt from a file named system_prompt.txt, so create that file (containing your assistant's instructions) in the same directory as well.
Installation (run the following in the terminal)
(1) Create a new conda environment.
conda create -n deepgram python=3.11
(2) Activate the new conda environment.
conda activate deepgram
(3) Install the necessary packages.
pip install deepgram-sdk
pip install python-dotenv
pip install PyAudio
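The script also imports requests and several LangChain packages. Assuming the packages keep their current names on PyPI, you will likely need these as well:
pip install requests
pip install langchain
pip install langchain-groq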
2. Here is a step-by-step explanation of the code:
Import Statements
import asyncio
from dotenv import load_dotenv
import shutil
import subprocess
import requests
import time
import os
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
#from langchain_openai import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts import (
ChatPromptTemplate,
MessagesPlaceholder,
SystemMessagePromptTemplate,
HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain
from deepgram import (
DeepgramClient,
DeepgramClientOptions,
LiveTranscriptionEvents,
LiveOptions,
Microphone,
)
The code starts by importing the necessary libraries and modules.
These include standard libraries (`asyncio`, `shutil`, `subprocess`, `requests`, `time`, `os`) and external libraries (`dotenv`, `langchain_core`, `langchain_groq`, `langchain`, `deepgram`).
Load Environment Variables
load_dotenv()
This loads environment variables from a `.env` file.
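As a quick check, once load_dotenv() has run, the keys from the .env file become readable through os.getenv, which is how the classes below pick them up:
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory

# Both should print True if the .env file was found and complete
print(os.getenv("DEEPGRAM_API_KEY") is not None)
print(os.getenv("GROQ_API_KEY") is not None)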
LanguageModelProcessor Class
class LanguageModelProcessor:
def __init__(self):
self.llm = ChatGroq(temperature=0, model_name="llama3-70b-8192", groq_api_key=os.getenv("GROQ_API_KEY"))
self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
with open('system_prompt.txt', 'r') as file:
system_prompt = file.read().strip()
self.prompt = ChatPromptTemplate.from_messages([
SystemMessagePromptTemplate.from_template(system_prompt),
MessagesPlaceholder(variable_name="chat_history"),
HumanMessagePromptTemplate.from_template("{text}")
])
self.conversation = LLMChain(
llm=self.llm,
prompt=self.prompt,
memory=self.memory
)
    def process(self, text):
        start_time = time.time()
        # LLMChain saves both sides of the exchange to memory automatically
        response = self.conversation.invoke({"text": text})
        end_time = time.time()
        elapsed_time = int((end_time - start_time) * 1000)
        print(f"LLM ({elapsed_time}ms): {response['text']}")
        return response['text']
This class initializes a language model processor using the `ChatGroq` model.
It sets up a prompt template and a conversation chain.
The `process` method sends user input to the model and prints the response time and the model's response.
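A minimal usage sketch (assuming system_prompt.txt exists and GROQ_API_KEY is set; the question is just an example):
llm_processor = LanguageModelProcessor()
reply = llm_processor.process("Tell me a fun fact about whales.")
# process() prints the latency and the reply, and also returns the reply text
print(reply)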
TextToSpeech Class
class TextToSpeech:
DG_API_KEY = os.getenv("DEEPGRAM_API_KEY")
MODEL_NAME = "aura-helios-en"
@staticmethod
def is_installed(lib_name: str) -> bool:
lib = shutil.which(lib_name)
return lib is not None
def speak(self, text):
ffplay_path = "/opt/homebrew/bin/ffplay"
if not self.is_installed(ffplay_path):
raise ValueError("ffplay not found at the specified path. Please specify the correct path to ffplay.")
DEEPGRAM_URL = f"https://api.deepgram.com/v1/speak?model={self.MODEL_NAME}&performance=some&encoding=linear16&sample_rate=24000"
headers = {
"Authorization": f"Token {self.DG_API_KEY}",
"Content-Type": "application/json"
}
payload = {"text": text}
player_command = [ffplay_path, "-autoexit", "-", "-nodisp"]
player_process = subprocess.Popen(
player_command,
stdin=subprocess.PIPE,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
)
start_time = time.time()
first_byte_time = None
with requests.post(DEEPGRAM_URL, stream=True, headers=headers, json=payload) as r:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
if first_byte_time is None:
first_byte_time = time.time()
ttfb = int((first_byte_time - start_time)*1000)
print(f"TTS Time to First Byte (TTFB): {ttfb}ms\n")
player_process.stdin.write(chunk)
player_process.stdin.flush()
if player_process.stdin:
player_process.stdin.close()
player_process.wait()
This class handles text-to-speech using Deepgram's API.
It checks if `ffplay` is installed and uses it to play the generated speech audio.
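A minimal usage sketch (assuming DEEPGRAM_API_KEY is set and ffplay is installed at the hard-coded Homebrew path):
tts = TextToSpeech()
# Audio starts playing through ffplay as soon as the first chunks arrive
tts.speak("Hello! This is a streaming text-to-speech test.")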
TranscriptCollector Class
class TranscriptCollector:
def __init__(self):
self.reset()
def reset(self):
self.transcript_parts = []
def add_part(self, part):
self.transcript_parts.append(part)
def get_full_transcript(self):
return ' '.join(self.transcript_parts)
transcript_collector = TranscriptCollector()
This class collects transcript parts and combines them into a full transcript.
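For example, the collector simply accumulates interim parts and joins them with spaces:
collector = TranscriptCollector()
collector.add_part("Hello,")
collector.add_part("how are you?")
print(collector.get_full_transcript())  # "Hello, how are you?"
collector.reset()  # clear the parts for the next utterance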
get_transcript Function
async def get_transcript(callback):
transcription_complete = asyncio.Event()
try:
config = DeepgramClientOptions(options={"keepalive": "true"})
deepgram = DeepgramClient("", config)
dg_connection = deepgram.listen.asynclive.v("1")
print ("Listening...")
async def on_message(self, result, **kwargs):
sentence = result.channel.alternatives[0].transcript
if not result.speech_final:
transcript_collector.add_part(sentence)
else:
transcript_collector.add_part(sentence)
full_sentence = transcript_collector.get_full_transcript()
if len(full_sentence.strip()) > 0:
full_sentence = full_sentence.strip()
print(f"Human: {full_sentence}")
callback(full_sentence)
transcript_collector.reset()
transcription_complete.set()
dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
options = LiveOptions(
model="nova-2",
punctuate=True,
language="en-US",
encoding="linear16",
channels=1,
sample_rate=16000,
endpointing=300,
smart_format=True,
)
await dg_connection.start(options)
microphone = Microphone(dg_connection.send)
microphone.start()
await transcription_complete.wait()
microphone.finish()
await dg_connection.finish()
except Exception as e:
print(f"Could not open socket: {e}")
return
This asynchronous function sets up a Deepgram client to get live transcription from a microphone input.
It handles partial and final transcription results and invokes a callback with the full sentence.
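A minimal sketch of running it on its own (here the callback just prints; in the full app, ConversationManager supplies the callback):
import asyncio

def handle_sentence(sentence):
    print(f"Callback received: {sentence}")

# Listens on the default microphone until one full sentence is finalized, then returns
asyncio.run(get_transcript(handle_sentence))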
ConversationManager Class
class ConversationManager:
def __init__(self):
self.transcription_response = ""
self.llm = LanguageModelProcessor()
async def main(self):
def handle_full_sentence(full_sentence):
self.transcription_response = full_sentence
while True:
await get_transcript(handle_full_sentence)
if "goodbye" in self.transcription_response.lower():
break
llm_response = self.llm.process(self.transcription_response)
tts = TextToSpeech()
tts.speak(llm_response)
self.transcription_response = ""
if __name__ == "__main__":
manager = ConversationManager()
asyncio.run(manager.main())
This class manages the conversation by looping to get transcriptions, processing them with the language model, and then using text-to-speech to speak the responses.
The loop continues until the word "goodbye" is detected in the transcription.
Summary
The code initializes a language model and text-to-speech system, sets up live transcription from a microphone, and manages a conversation by continuously processing user input and generating responses.
ーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーー
Please share your thoughts on Twitter, and follows are appreciated 🙇
旅人 (Tabibito) on Twitter:
https://twitter.com/Tomoto1234567