rodrigomasini committed on
Commit
167bfa7
·
verified ·
1 Parent(s): f2b8c24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +222 -9
app.py CHANGED
@@ -1,14 +1,227 @@
 
 
1
  import gradio as gr
2
- from transformers import pipeline
 
 
 
 
 
 
 
3
 
4
- modelo = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-large-960h-lv60-self")
 
 
 
5
 
6
- def transcribe(audio):
7
- text = modelo(audio)["text"]
8
- return text
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  gr.Interface(
11
- fn=transcribe,
12
- inputs=[gr.Audio(sources="microphone", type="filepath")],
13
- outputs=["textbox"]
14
- ).launch()
 
 
 
 
 
 
 
 
import os
import re
import asyncio
import tempfile

import requests  # FIX: used by transcript() below but was never imported
import gradio as gr
from pydub import AudioSegment
from pydub.utils import which
import edge_tts
import nest_asyncio
from openai import OpenAI

# Allow asyncio.run() inside an already-running event loop (Gradio runs one).
nest_asyncio.apply()

# Synchronous OpenAI-compatible client pointed at the TELA gateway.
sync_client = OpenAI(
    base_url="https://t2t.fanheroapi.com/v1",
    api_key="tela"
)

# Ensure pydub can locate the ffmpeg binary used for audio conversion.
AudioSegment.converter = which("ffmpeg")

# TELA endpoint for text-to-text generation
TELA_API_URL = "https://t2t.fanheroapi.com/v1/chat/completions"

# Headers for API request
headers = {
    "Content-Type": "application/json",
    "Accept": "application/json"
}

# TELA endpoint for speech-to-text generation
TELA_TRANSCRIPT_AUDIO_URL = "http://104.171.203.212:8000/speech-to-text/"

# System prompt (Portuguese): keep replies friendly, concise, clear and open,
# in a normal conversational tone, avoiding unnecessary introductions.
system_instruction = """
Responda e mantenha a conversa de forma amigavel, concisa, clara e aberta.
Evite qualquer desnecessaria introducao.
Responda em um tom normal, de conversacao e sempre amigavel e suportivo.
"""
38
+
# Function to convert audio to mp3 using pydub
def convert_to_mp3(audio_file_path):
    """Convert an arbitrary audio file to mp3.

    Returns the path of a temporary .mp3 file on success, or None on failure.
    The caller is responsible for deleting the returned file.
    """
    temp_mp3 = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    temp_mp3.close()  # release the handle; pydub writes to the path itself
    try:
        audio = AudioSegment.from_file(audio_file_path)
        audio.export(temp_mp3.name, format="mp3")
        return temp_mp3.name
    except Exception as e:
        print(f"Error converting audio: {e}")
        # Don't leak the temp file when conversion fails.
        try:
            os.remove(temp_mp3.name)
        except OSError:
            pass
        return None
49
+
# Function to send audio to the speech-to-text endpoint
def transcript(audio_file_path):
    """Send an audio file to the TELA speech-to-text endpoint.

    Converts the input to mp3 first. Returns the endpoint's JSON payload on
    success, or a ``{"data": "failed", "error": ...}`` dict on any failure.
    The temporary mp3 is always removed, success or failure.
    """
    # Local import: the module-level `import requests` is missing in this file.
    import requests

    if audio_file_path is None:
        return {"data": "failed", "error": "No audio file provided."}

    mp3_file_path = convert_to_mp3(audio_file_path)
    if not mp3_file_path:
        return {"data": "failed", "error": "Failed to convert audio to mp3."}

    try:
        print(f"Transcription API URL: {TELA_TRANSCRIPT_AUDIO_URL}")
        with open(mp3_file_path, 'rb') as f:
            files = {'file': f}
            # Timeout prevents the UI from hanging forever on a dead endpoint.
            response = requests.post(TELA_TRANSCRIPT_AUDIO_URL, files=files,
                                     timeout=60)

        print(f"Response Status: {response.status_code}")
        print(f"Response Text: {response.text}")

        if response.status_code == 200:
            return response.json()
        else:
            return {"data": "failed", "error": f"Error {response.status_code}: {response.text}"}

    except Exception as e:
        return {"data": "failed", "error": str(e)}
    finally:
        # Best-effort cleanup of the temporary mp3.
        if mp3_file_path and os.path.exists(mp3_file_path):
            try:
                os.remove(mp3_file_path)
            except OSError as e:
                print(f"Error deleting temporary file: {e}")
81
+
# Function to extract user input from transcription
def extract_user_input(transcription_response):
    """Concatenate the text of every transcript segment into one string.

    Returns the stripped concatenation, or "" when the payload has no
    'result' list or a segment is missing its 'text' key.
    """
    try:
        segments = transcription_response.get('result', [])
        pieces = [segment['text'] for segment in segments]
    except KeyError:
        return ""
    return "".join(pieces).strip()
90
+
91
+
# Function to format the AI response
def format_generated_response(response):
    """Pull the assistant text out of a chat-completions payload.

    Strips HTML-like <...> tags and markdown heading lines, then returns the
    trimmed text. Returns an error string when the payload is None or lacks
    the expected choices/message structure.
    """
    if response is None:
        return "Error: No valid response received."
    try:
        raw_text = response['choices'][0]['message']['content']
        # Drop inline <...> tags first, then whole '#...' heading lines.
        without_tags = re.sub(r'<.*?>', '', raw_text)
        without_headings = re.sub(r'#.*?\n', '', without_tags)
    except (KeyError, IndexError) as e:
        return f"Error: Missing key or index {e} in response."
    return without_headings.strip()
104
+
# Function to generate speech using edge_tts
def generate_speech(text):
    """Synthesize `text` with edge-tts (pt-BR-AntonioNeural voice).

    Returns the path of the generated audio file, or None on failure.
    NOTE(review): edge-tts emits mp3 data but the original ".wav" suffix is
    kept for compatibility — players mostly sniff content; confirm downstream.
    """
    tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tts_file.close()  # FIX: release the handle; edge-tts writes to the path

    async def generate_tts():
        tts = edge_tts.Communicate(text, voice="pt-BR-AntonioNeural")
        await tts.save(tts_file.name)

    try:
        asyncio.run(generate_tts())
        print(f"TTS audio saved to: {tts_file.name}")
        return tts_file.name
    except Exception as e:
        print(f"Error generating TTS: {e}")
        # Don't leak the temp file when synthesis fails.
        try:
            os.remove(tts_file.name)
        except OSError:
            pass
        return None
119
+
# Main chatbot conversation function
def chatbot_conversation(audio_file_path):
    """Run one full voice-chat turn.

    Transcribes the user's audio, asks the TELA chat model for a reply, and
    synthesizes that reply as speech.

    Returns:
        (reply_text, tts_audio_path) — the audio path is None whenever
        synthesis (or any earlier step) fails.

    Fixes over the previous version: the old body referenced undefined names
    (`history`, `messages`, `system_message`, `formatted_output`), never sent
    the user's utterance to the model, and mixed `yield` with value-bearing
    `return` so Gradio never received the (text, audio) tuple.
    """
    try:
        transcription = transcript(audio_file_path)
        user_input = extract_user_input(transcription)

        if not user_input:
            return "I could not generate the text. Please try again.", None

        # Stateless single-turn exchange: system prompt + current utterance.
        messages = [
            {"role": "system", "content": system_instruction},
            {"role": "user", "content": user_input},
        ]

        response = ""
        for chunk in sync_client.chat.completions.create(
            model="tela-gpt4o",
            messages=messages,
            stream=True,
            max_tokens=1024,   # keep the spoken reply concise
            temperature=0,     # deterministic output
            response_format={"type": "text"},
        ):
            token = chunk.choices[0].delta.content
            if token:  # final stream chunk carries content=None
                response += token

        if not response:
            return "I could not synthesize the audio. Please try again.", None

        tts_file_name = generate_speech(response)
        # tts_file_name is None when TTS failed; still return the text.
        return response, tts_file_name

    except Exception as e:
        print(f"Error: {e}")
        return "I could not understand you. Please try again.", None
214
+
# Gradio interface setup
# NOTE(review): streaming=True on the input delivers partial audio while
# live=True re-invokes the fn on every change — confirm chatbot_conversation
# tolerates being called repeatedly with intermediate files.
gr.Interface(
    fn=chatbot_conversation,
    # Audio input handed to the fn as a filesystem path.
    inputs=gr.Audio(label="User", type="filepath", streaming=True, container=True),
    outputs=[
        # First return value: the model's reply text.
        gr.Textbox(label="Transcription"),
        # Second return value: path to the synthesized speech; auto-plays.
        gr.Audio(type="filepath", autoplay=True, label="MAGIC Chat")
    ],
    title="MAGIC VoiceChat",
    description="A simple example of audio conversational AI",
    theme="sudeepshouche/minimalist",
    live=True
).launch()