audio-to-text

Runtime error

App Files Community

rodrigomasini committed on Dec 12, 2024

Commit

d3a9b11

verified ·

1 Parent(s): 84837eb

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -55

app.py CHANGED Viewed

@@ -91,21 +91,6 @@ def extract_user_input(transcription_response):
         print(f"[ERROR] KeyError in transcription response: {e}")
         return ""
-#def format_generated_response(response):
-#    print("[DEBUG] Formatting the generated response.")
-#    if response is None:
-#        print("[ERROR] No response to format.")
-#        return "Error: No valid response received."
-#    try:
-#        generated_text = response['choices'][0]['message']['content']
-#        partial_text = re.sub(r'<.*?>', '', generated_text)
-#        cleaned_text = re.sub(r'#.*?\n', '', partial_text)
-#        print(f"[DEBUG] Formatted response: {cleaned_text.strip()}")
-#        return cleaned_text.strip()
-#    except (KeyError, IndexError) as e:
-#        print(f"[ERROR] Error formatting response: {e}")
-#        return f"Error: Missing key or index {e} in response."
 def generate_speech(text):
     print("[DEBUG] Generating speech from text.")
     tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
@@ -121,7 +106,7 @@ def generate_speech(text):
         print(f"[ERROR] Error generating TTS: {e}")
         return None
-def chatbot_conversation(audio_file_path):
     print("[DEBUG] Starting chatbot conversation.")
     try:
         transcription = transcript(audio_file_path)
@@ -129,23 +114,27 @@ def chatbot_conversation(audio_file_path):
         if not user_input:
             print("[ERROR] No user input extracted from transcription.")
-            yield "I could not generate the text. Please try again.", None
             return
         system_message = system_instruction
-        history = []  # If history is meant to persist, consider storing it externally
-        messages = []
-        # Reconstruct history if needed (currently empty)
-        for val in history:
-            if val[0]:
-                messages.append({"role": "user", "content": val[0]})
-            if val[1]:
-                messages.append({"role": "assistant", "content": val[1]})
-        # Current user input
         messages.append({"role": "user", "content": user_input})
-        messages.insert(0, {"role": "system", "content": system_message})
         print("[DEBUG] Sending request to sync_client for chat completion.")
         print(f"[DEBUG] Messages: {messages}")
@@ -163,47 +152,49 @@ def chatbot_conversation(audio_file_path):
             ):
                 token = message.choices[0].delta.content
                 response += token
-                # Yield partial text, no audio yet
-                # The first output is the transcription (assistant message),
-                # second output is audio, which we pass as None for now
-                yield (response, None)
         except Exception as e:
             print(f"[ERROR] Error during streaming response: {e}")
-            yield ("I could not understand you. Please try again.", None)
             return
-        # Now that we have the full response, generate TTS
-        if response:
-            history.append([
-                {"role": "user", "content": user_input},
-                {"role": "assistant", "content": response}
-            ])
-            print("[DEBUG] Generating TTS for full response.")
-            tts_file_name = generate_speech(response)
-            if tts_file_name:
-                print("[DEBUG] Returning final response and TTS file.")
-                # Now yield again with final text and audio
-                yield (response, tts_file_name)
-            else:
-                print("[ERROR] Failed to generate TTS.")
-                yield (response, None)
         else:
-            print("[ERROR] No response generated.")
-            yield ("I could not synthesize the audio. Please try again.", None)
     except Exception as e:
         print(f"[ERROR] Exception in chatbot_conversation: {e}")
-        yield ("I could not understand you. Please try again.", None)
-gr.Interface(
     fn=chatbot_conversation,
-    inputs=gr.Audio(label="User", type="filepath", streaming=False, container=True),
     outputs=[
         gr.Textbox(label="Transcription"),
-        gr.Audio(type="filepath", autoplay=True, label="MAGIC Chat")
     ],
     title="MAGIC VoiceChat",
     description="A simple example of audio conversational AI",
     theme="sudeepshouche/minimalist",
-    live=True
-).launch()

         print(f"[ERROR] KeyError in transcription response: {e}")
         return ""
 def generate_speech(text):
     print("[DEBUG] Generating speech from text.")
     tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
         print(f"[ERROR] Error generating TTS: {e}")
         return None
+def chatbot_conversation(audio_file_path, history):
     print("[DEBUG] Starting chatbot conversation.")
     try:
         transcription = transcript(audio_file_path)
         if not user_input:
             print("[ERROR] No user input extracted from transcription.")
+            yield "I could not generate the text. Please try again.", None, history
             return
+        # Ensure we have a system_message
         system_message = system_instruction
+        if history is None:
+            history = []
+        # Reconstruct messages from history
+        messages = [{"role": "system", "content": system_message}]
+        for turn in history:
+            user_msg = turn[0].get("content") if turn[0] else ""
+            assistant_msg = turn[1].get("content") if turn[1] else ""
+            if user_msg:
+                messages.append({"role": "user", "content": user_msg})
+            if assistant_msg:
+                messages.append({"role": "assistant", "content": assistant_msg})
+        # Add the current user input
         messages.append({"role": "user", "content": user_input})
         print("[DEBUG] Sending request to sync_client for chat completion.")
         print(f"[DEBUG] Messages: {messages}")
             ):
                 token = message.choices[0].delta.content
                 response += token
+                # Yield partial text updates, no audio yet, history unchanged yet
+                yield (response, None, history)
         except Exception as e:
             print(f"[ERROR] Error during streaming response: {e}")
+            yield ("I could not understand you. Please try again.", None, history)
             return
+        # Now that we have the full response, update history
+        history.append([
+            {"role": "user", "content": user_input},
+            {"role": "assistant", "content": response}
+        ])
+        # Generate TTS now
+        print("[DEBUG] Generating TTS for full response.")
+        tts_file_name = generate_speech(response)
+        if tts_file_name:
+            print("[DEBUG] Returning final response and TTS file with updated history.")
+            # Now yield again with final text, audio, and updated history
+            yield (response, tts_file_name, history)
         else:
+            print("[ERROR] Failed to generate TTS.")
+            yield (response, None, history)
     except Exception as e:
         print(f"[ERROR] Exception in chatbot_conversation: {e}")
+        yield ("I could not understand you. Please try again.", None, history)
+# We now have three outputs: transcription text, audio, and the updated history
+interface = gr.Interface(
     fn=chatbot_conversation,
+    inputs=[
+        gr.Audio(label="User", type="filepath", streaming=False, container=True),
+        gr.State([])  # State holds the conversation history
+    ],
     outputs=[
         gr.Textbox(label="Transcription"),
+        gr.Audio(type="filepath", autoplay=True, label="MAGIC Chat"),
+        gr.State([])  # Return updated history
     ],
     title="MAGIC VoiceChat",
     description="A simple example of audio conversational AI",
     theme="sudeepshouche/minimalist",
+)
+interface.queue().launch()