Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -1,14 +1,227 @@
+import os
+import re
+import requests
 import gradio as gr
-
+import tempfile
+from pydub import AudioSegment
+from pydub.utils import which
+import edge_tts
+import asyncio
+import nest_asyncio
+nest_asyncio.apply()
+from openai import OpenAI

-
+sync_client = OpenAI(
+    base_url="https://t2t.fanheroapi.com/v1",
+    api_key="tela"
+)

-
-
-    return text
+# Ensure pydub can locate ffmpeg
+AudioSegment.converter = which("ffmpeg")

+# TELA endpoint for text-to-text generation
+TELA_API_URL = "https://t2t.fanheroapi.com/v1/chat/completions"
+
+# Headers for API request
+headers = {
+    "Content-Type": "application/json",
+    "Accept": "application/json"
+}
+
+# TELA endpoint for speech-to-text generation
+TELA_TRANSCRIPT_AUDIO_URL = "http://104.171.203.212:8000/speech-to-text/"
+
+# System prompt (in Portuguese): reply in a friendly, concise, clear and open way,
+# in a normal conversational tone, without unnecessary introductions.
+system_instruction = """
+Responda e mantenha a conversa de forma amigável, concisa, clara e aberta.
+Evite qualquer introdução desnecessária.
+Responda em um tom normal, de conversação, sempre amigável e acolhedor.
+"""
+
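+# Example call to the speech-to-text endpoint (hypothetical; mirrors the
+# requests.post upload in transcript below):
+#   curl -X POST -F "file=@sample.mp3" http://104.171.203.212:8000/speech-to-text/
+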
+# Function to convert audio to mp3 using pydub
+def convert_to_mp3(audio_file_path):
+    temp_mp3 = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+    try:
+        audio = AudioSegment.from_file(audio_file_path)
+        audio.export(temp_mp3.name, format="mp3")
+        return temp_mp3.name
+    except Exception as e:
+        print(f"Error converting audio: {e}")
+        return None
+
+# Function to send audio to the speech-to-text endpoint
+def transcript(audio_file_path):
+    if audio_file_path is None:
+        return {"data": "failed", "error": "No audio file provided."}
+
+    mp3_file_path = convert_to_mp3(audio_file_path)
+    if not mp3_file_path:
+        return {"data": "failed", "error": "Failed to convert audio to mp3."}
+
+    try:
+        print(f"Transcription API URL: {TELA_TRANSCRIPT_AUDIO_URL}")
+        with open(mp3_file_path, 'rb') as f:
+            files = {'file': f}
+            response = requests.post(TELA_TRANSCRIPT_AUDIO_URL, files=files)
+
+        print(f"Response Status: {response.status_code}")
+        print(f"Response Text: {response.text}")
+
+        if response.status_code == 200:
+            return response.json()
+        else:
+            return {"data": "failed", "error": f"Error {response.status_code}: {response.text}"}
+
+    except Exception as e:
+        return {"data": "failed", "error": str(e)}
+    finally:
+        if mp3_file_path and os.path.exists(mp3_file_path):
+            try:
+                os.remove(mp3_file_path)
+            except OSError as e:
+                print(f"Error deleting temporary file: {e}")
+
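+# Example (hypothetical payload): the endpoint is expected to return JSON like
+#   {"result": [{"text": "Olá"}, {"text": " tudo bem"}]}
+# which extract_user_input below joins into "Olá tudo bem".
+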
+# Function to extract user input from transcription
+def extract_user_input(transcription_response):
+    try:
+        transcript_segments = transcription_response.get('result', [])
+        user_input = "".join([segment['text'] for segment in transcript_segments])
+        return user_input.strip()
+    except KeyError:
+        return ""
+
+
+# Function to format the AI response (takes the accumulated generated text)
+def format_generated_response(generated_text):
+    if generated_text is None:
+        return "Error: No valid response received."
+    # Strip markup tags and markdown headings so the text reads well as speech
+    partial_text = re.sub(r'<.*?>', '', generated_text)
+    cleaned_text = re.sub(r'#.*?\n', '', partial_text)
+    return cleaned_text.strip()
+
+# Function to generate speech using edge_tts
+def generate_speech(text):
+    # edge-tts produces MP3 audio, so use a matching .mp3 suffix
+    tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    async def generate_tts():
+        tts = edge_tts.Communicate(text, voice="pt-BR-AntonioNeural")
+        await tts.save(tts_file.name)
+
+    try:
+        asyncio.run(generate_tts())
+        print(f"TTS audio saved to: {tts_file.name}")
+        return tts_file.name
+    except Exception as e:
+        print(f"Error generating TTS: {e}")
+        return None
+
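+# Note: asyncio.run inside generate_speech can be called even if an event loop
+# is already running, because nest_asyncio.apply() patched the loop at import time.
+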
+# Conversation history, kept at module level as (user, assistant) pairs
+history = []
+
+# Main chatbot conversation function
+def chatbot_conversation(audio_file_path):
+    try:
+        transcription = transcript(audio_file_path)
+        user_input = extract_user_input(transcription)
+
+        if not user_input:
+            return "I could not generate the text. Please try again.", None
+
+        # Build the message list: system prompt, prior turns, then the new input
+        messages = [{"role": "system", "content": system_instruction}]
+        for user_turn, assistant_turn in history:
+            if user_turn:
+                messages.append({"role": "user", "content": user_turn})
+            if assistant_turn:
+                messages.append({"role": "assistant", "content": assistant_turn})
+        messages.append({"role": "user", "content": user_input})
+
+        response = ""
+
+        # Stream the completion and accumulate the tokens into one reply;
+        # the Interface expects a (text, audio) tuple, so do not yield here
+        for chunk in sync_client.chat.completions.create(
+            model="tela-gpt4o",
+            messages=messages,
+            stream=True,
+            max_tokens=1024,  # keep the response concise
+            temperature=0,  # deterministic output
+            response_format={"type": "text"}
+        ):
+            token = chunk.choices[0].delta.content
+            if token:
+                response += token
+
+        if response:
+            history.append((user_input, response))
+            formatted_output = format_generated_response(response)
+            tts_file_name = generate_speech(formatted_output)
+            if tts_file_name:
+                return formatted_output, tts_file_name
+            else:
+                return formatted_output, None
+        else:
+            return "I could not synthesize the audio. Please try again.", None
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return "I could not understand you. Please try again.", None
+
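+# chatbot_conversation returns a (reply_text, tts_audio_path) tuple, matching the
+# Textbox and Audio outputs of the gr.Interface below.
+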
+# Legacy respond() handler, kept commented out for reference
+#def respond(
+#    message,
+#    history: list[tuple[str, str]],
+#    system_message,
+#    max_tokens,
+#    temperature,
+#    top_p,
+#):
+#    messages = []
+#
+#    if history is None:
+#        history = []
+#    else:
+#
+#    messages.append({"role": "user", "content": message})
+#
+#    response = ""
+#
+#    for message in client.chat_completion(
+#        messages,
+#        max_tokens=max_tokens,
+#        stream=True,
+#        temperature=temperature,
+#        top_p=top_p,
+#    ):
+#        token = message.choices[0].delta.content
+#
+#        response += token
+#        yield response
+#
+#    if response:
+#        history.append([
+#            {"role": "user", "content": message},
+#            {"role": "assistant", "content": response}
+#        ])
+#        tts_file_name = generate_speech(response)
+#        if tts_file_name:
+#            return formatted_output, tts_file_name
+#        else:
+#            return formatted_output, None
+#    else:
+#        return "I could not synthesize the audio. Please try again.", None
+
+# Gradio interface setup
 gr.Interface(
-    fn=
-    inputs=
-    outputs=[
-
+    fn=chatbot_conversation,
+    inputs=gr.Audio(label="User", type="filepath", streaming=True, container=True),
+    outputs=[
+        gr.Textbox(label="Transcription"),
+        gr.Audio(type="filepath", autoplay=True, label="MAGIC Chat")
+    ],
+    title="MAGIC VoiceChat",
+    description="A simple example of audio conversational AI",
+    theme="sudeepshouche/minimalist",
+    live=True
+).launch()
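
Two follow-up notes, since the Space status above is "Runtime error".

First, the code relies on the requests package (now imported) and on a system
ffmpeg binary for pydub. On Hugging Face Spaces those are declared in
requirements.txt (pip) and packages.txt (apt); a minimal, unpinned sketch:

    requirements.txt:
        gradio
        pydub
        edge-tts
        nest-asyncio
        openai
        requests

    packages.txt:
        ffmpeg

Second, the TTS path can be smoke-tested outside Gradio. A hypothetical
standalone script (not part of this commit; it assumes only the edge-tts
package and network access):

    import asyncio
    import edge_tts

    async def main():
        # edge-tts emits MP3 audio, matching the .mp3 suffix in generate_speech
        communicate = edge_tts.Communicate("Olá, tudo bem?", voice="pt-BR-AntonioNeural")
        await communicate.save("test.mp3")

    asyncio.run(main())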