# app.py — Voice-to-voice chatbot (Hugging Face Space by sal-maq)
# Note: the lines previously here ("raw / history blame / 4.24 kB") were
# Hugging Face web-page residue from a copy-paste, not Python, and would
# be syntax errors; they have been replaced by this comment header.
import os
import tempfile
import numpy as np
import gradio as gr
import whisper
from gtts import gTTS
from groq import Groq
import soundfile as sf
# Configure the Groq client.
# SECURITY: the API key must be supplied via the environment (e.g. a Space
# secret named GROQ_API_KEY) — never hard-coded in source. The key that was
# previously committed here is compromised and must be rotated/revoked.
groq_client = Groq(api_key=os.environ.get('GROQ_API_KEY'))

# Load the Whisper speech-to-text model once at startup ("base" trades
# accuracy for speed/memory; load is expensive, so do it at module level).
whisper_model = whisper.load_model("base")
def process_audio(audio_file_path):
    """Transcribe an uploaded audio file, generate an LLM reply, and voice it.

    Args:
        audio_file_path: Filesystem path to the user's uploaded/recorded audio
            (Gradio passes a path because the input uses type="filepath").

    Returns:
        Tuple of (response_text, response_audio_path) on success, or
        ("Error: <message>", None) on any failure so the UI shows the error
        instead of crashing.
    """
    try:
        # Guard: Gradio sends None/"" when no file was provided.
        if not audio_file_path:
            raise ValueError("No audio file provided")
        print(f"Received audio file path: {audio_file_path}")

        # Whisper can read the uploaded file directly; the previous version
        # copied its bytes into a second temporary WAV first, which was
        # redundant I/O with no behavioral benefit.
        result = whisper_model.transcribe(audio_file_path)
        user_text = result['text']
        print(f"Transcribed text: {user_text}")

        # Generate a reply with the Llama 3 8B model via the Groq API.
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": user_text,
                }
            ],
            model="llama3-8b-8192",
        )
        response_text = chat_completion.choices[0].message.content
        print(f"Response text: {response_text}")

        # Synthesize the reply to an MP3. delete=False keeps the file on disk
        # so Gradio can serve it after this function returns. Saving happens
        # after the handle is closed to avoid writing to an open NamedTemporaryFile
        # (which fails on some platforms, e.g. Windows).
        tts = gTTS(text=response_text, lang='en')
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio_file:
            response_audio_path = temp_audio_file.name
        tts.save(response_audio_path)

        return response_text, response_audio_path
    except Exception as e:
        # Broad catch is deliberate at this UI boundary: surface the message
        # in the textbox rather than taking down the app.
        return f"Error: {str(e)}", None
# Create Gradio interface with updated layout.
# Layout: custom CSS (injected via Markdown), a title/description, then a
# two-column row — inputs on the left, outputs on the right.
with gr.Blocks() as demo:
    # NOTE(review): CSS inside gr.Markdown relies on the HTML not being
    # sanitized; newer Gradio versions may strip <style> tags — the
    # supported route is gr.Blocks(css=...). Left as-is to preserve behavior.
    gr.Markdown(
        """
    <style>
        .gradio-container {
            font-family: Arial, sans-serif;
            background-color: #e0f7fa; /* Changed background color */
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0 4px 12px rgba(0,0,0,0.2);
        }
        .gradio-input, .gradio-output {
            border-radius: 6px;
            border: 1px solid #ddd;
            padding: 10px;
        }
        .gradio-button {
            background-color: #28a745;
            color: white;
            border-radius: 6px;
            border: none;
            padding: 8px 16px; /* Adjusted padding */
            font-size: 16px; /* Adjusted font size */
        }
        .gradio-button:hover {
            background-color: #218838;
        }
        .gradio-title {
            font-size: 24px;
            font-weight: bold;
            margin-bottom: 20px;
        }
        .gradio-description {
            font-size: 14px;
            margin-bottom: 20px;
            color: #555;
        }
    </style>
    """
    )
    # Page heading and usage instructions.
    gr.Markdown("# Voice-to-Voice Chatbot\nDeveloped by Salman Maqbool")
    gr.Markdown("Upload an audio file to interact with the voice-to-voice chatbot. The chatbot will transcribe the audio, generate a response, and provide a spoken reply.")
    with gr.Row():
        with gr.Column():
            # type="filepath" makes Gradio hand process_audio a path string.
            audio_input = gr.Audio(type="filepath", label="Upload Audio File")
            submit_button = gr.Button("Submit")
        with gr.Column():
            # Outputs: the LLM's text reply and the synthesized MP3 path.
            response_text = gr.Textbox(label="Response Text", placeholder="Generated response will appear here")
            response_audio = gr.Audio(label="Response Audio", type="filepath")
    # Wire the button: process_audio returns (text, audio_path) matching outputs.
    submit_button.click(process_audio, inputs=audio_input, outputs=[response_text, response_audio])
# Launch the Gradio app
demo.launch()