File size: 4,242 Bytes
e4c39da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import tempfile
import numpy as np
import gradio as gr
import whisper
from gtts import gTTS
from groq import Groq
import soundfile as sf

# Set up Groq API key.
# SECURITY: the key was previously hardcoded in this file — a leaked,
# revocable credential. Read it from the environment instead; export
# GROQ_API_KEY before launching the app.
groq_api_key = os.environ.get('GROQ_API_KEY')
if not groq_api_key:
    raise RuntimeError("GROQ_API_KEY environment variable is not set")
groq_client = Groq(api_key=groq_api_key)

# Load the Whisper speech-to-text model once at startup ("base" trades
# accuracy for speed/memory; loading per-request would be far too slow).
whisper_model = whisper.load_model("base")

def process_audio(audio_file_path):
    """Transcribe an uploaded audio file, generate a chat reply, and speak it.

    Parameters
    ----------
    audio_file_path : str | None
        Filesystem path to the user's recording, as supplied by the
        Gradio ``Audio(type="filepath")`` input.

    Returns
    -------
    tuple[str, str | None]
        ``(response_text, path_to_mp3)`` on success, or
        ``("Error: ...", None)`` on any failure — the error string is
        displayed in the UI instead of raising.
    """
    try:
        # Guard: Gradio passes None when no file was recorded/uploaded.
        if not audio_file_path:
            raise ValueError("No audio file provided")

        print(f"Received audio file path: {audio_file_path}")

        # Whisper accepts a file path directly — the original code copied
        # the upload into a second NamedTemporaryFile that was never
        # deleted, leaking one .wav per request for no benefit.
        result = whisper_model.transcribe(audio_file_path)
        user_text = result['text']
        print(f"Transcribed text: {user_text}")

        # Generate a reply with the Llama 3 8B model via the Groq API.
        chat_completion = groq_client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": user_text,
                }
            ],
            model="llama3-8b-8192",
        )
        response_text = chat_completion.choices[0].message.content
        print(f"Response text: {response_text}")

        # Convert the reply to speech. delete=False is required because
        # Gradio reads the file after this function returns. Save AFTER the
        # handle is closed: writing to a still-open NamedTemporaryFile (as
        # the original did) fails on Windows, where the file cannot be
        # reopened while the first handle is held.
        tts = gTTS(text=response_text, lang='en')
        with tempfile.NamedTemporaryFile(delete=False, suffix='.mp3') as temp_audio_file:
            response_audio_path = temp_audio_file.name
        tts.save(response_audio_path)

        return response_text, response_audio_path
    except Exception as e:
        # Broad catch is deliberate: surface any failure (bad file, API
        # error, network) as text in the UI instead of crashing the worker.
        return f"Error: {str(e)}", None

# Create Gradio interface with updated layout
# Build the Gradio UI: custom CSS injected through a Markdown block, then a
# two-column layout (audio input + submit button on the left, generated text
# and spoken reply on the right). Clicking Submit runs process_audio.
# NOTE(review): raw <style> tags inside gr.Markdown may be sanitized by newer
# Gradio versions — verify the styling actually applies, or pass css= to
# gr.Blocks instead.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        <style>
        .gradio-container {
            font-family: Arial, sans-serif;
            background-color: #e0f7fa;  /* Changed background color */
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0 4px 12px rgba(0,0,0,0.2);
        }
        .gradio-input, .gradio-output {
            border-radius: 6px;
            border: 1px solid #ddd;
            padding: 10px;
        }
        .gradio-button {
            background-color: #28a745;
            color: white;
            border-radius: 6px;
            border: none;
            padding: 8px 16px;  /* Adjusted padding */
            font-size: 16px;  /* Adjusted font size */
        }
        .gradio-button:hover {
            background-color: #218838;
        }
        .gradio-title {
            font-size: 24px;
            font-weight: bold;
            margin-bottom: 20px;
        }
        .gradio-description {
            font-size: 14px;
            margin-bottom: 20px;
            color: #555;
        }
        </style>
        """
    )
    
    # Page title and usage instructions shown above the controls.
    gr.Markdown("# Voice-to-Voice Chatbot\nDeveloped by Salman Maqbool")
    gr.Markdown("Upload an audio file to interact with the voice-to-voice chatbot. The chatbot will transcribe the audio, generate a response, and provide a spoken reply.")
    
    with gr.Row():
        # Left column: the user's input. type="filepath" means process_audio
        # receives a path string, not raw samples.
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Upload Audio File")
            submit_button = gr.Button("Submit")
        
        # Right column: outputs — the model's text reply and its TTS audio.
        with gr.Column():
            response_text = gr.Textbox(label="Response Text", placeholder="Generated response will appear here")
            response_audio = gr.Audio(label="Response Audio", type="filepath")

    # Wire the button: process_audio returns (text, mp3_path), matching the
    # two output components in order.
    submit_button.click(process_audio, inputs=audio_input, outputs=[response_text, response_audio])

# Launch the Gradio app
# Blocks here serving HTTP until interrupted.
demo.launch()