Spaces: Runtime error
Update app.py
app.py
CHANGED
@@ -1,14 +1,227 @@
+import os
+import re
+import requests
 import gradio as gr
-
+import tempfile
+from pydub import AudioSegment
+from pydub.utils import which
+import edge_tts
+import asyncio
+import nest_asyncio
+nest_asyncio.apply()
+from openai import OpenAI

-
+sync_client = OpenAI(
+    base_url="https://t2t.fanheroapi.com/v1",
+    api_key="tela"
+)

-
-
-    return text
+# Ensure pydub can locate ffmpeg
+AudioSegment.converter = which("ffmpeg")

+# TELA endpoint for text-to-text generation
+TELA_API_URL = "https://t2t.fanheroapi.com/v1/chat/completions"
+
+# Headers for API request
+headers = {
+    "Content-Type": "application/json",
+    "Accept": "application/json"
+}
+
+# TELA endpoint for speech-to-text generation
+TELA_TRANSCRIPT_AUDIO_URL = "http://104.171.203.212:8000/speech-to-text/"
+
+# System prompt (in Portuguese): reply in a friendly, concise, clear and open way,
+# in a normal conversational tone, without unnecessary introductions.
+system_instruction = """
+Responda e mantenha a conversa de forma amigável, concisa, clara e aberta.
+Evite qualquer introdução desnecessária.
+Responda em um tom normal, de conversação, sempre amigável e acolhedor.
+"""
+
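+# Example call to the speech-to-text endpoint (hypothetical; mirrors the
+# requests.post upload in transcript below):
+#   curl -X POST -F "file=@sample.mp3" http://104.171.203.212:8000/speech-to-text/
+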
+# Function to convert audio to mp3 using pydub
+def convert_to_mp3(audio_file_path):
+    temp_mp3 = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
+    try:
+        audio = AudioSegment.from_file(audio_file_path)
+        audio.export(temp_mp3.name, format="mp3")
+        return temp_mp3.name
+    except Exception as e:
+        print(f"Error converting audio: {e}")
+        return None
+
+# Function to send audio to the speech-to-text endpoint
+def transcript(audio_file_path):
+    if audio_file_path is None:
+        return {"data": "failed", "error": "No audio file provided."}
+
+    mp3_file_path = convert_to_mp3(audio_file_path)
+    if not mp3_file_path:
+        return {"data": "failed", "error": "Failed to convert audio to mp3."}
+
+    try:
+        print(f"Transcription API URL: {TELA_TRANSCRIPT_AUDIO_URL}")
+        with open(mp3_file_path, 'rb') as f:
+            files = {'file': f}
+            response = requests.post(TELA_TRANSCRIPT_AUDIO_URL, files=files)
+
+        print(f"Response Status: {response.status_code}")
+        print(f"Response Text: {response.text}")
+
+        if response.status_code == 200:
+            return response.json()
+        else:
+            return {"data": "failed", "error": f"Error {response.status_code}: {response.text}"}
+
+    except Exception as e:
+        return {"data": "failed", "error": str(e)}
+    finally:
+        if mp3_file_path and os.path.exists(mp3_file_path):
+            try:
+                os.remove(mp3_file_path)
+            except OSError as e:
+                print(f"Error deleting temporary file: {e}")
+
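+# Example (hypothetical payload): the endpoint is expected to return JSON like
+#   {"result": [{"text": "Olá"}, {"text": " tudo bem"}]}
+# which extract_user_input below joins into "Olá tudo bem".
+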
+# Function to extract user input from transcription
+def extract_user_input(transcription_response):
+    try:
+        transcript_segments = transcription_response.get('result', [])
+        user_input = "".join([segment['text'] for segment in transcript_segments])
+        return user_input.strip()
+    except KeyError:
+        return ""
+
+
+# Function to format the AI response (takes the accumulated generated text)
+def format_generated_response(generated_text):
+    if generated_text is None:
+        return "Error: No valid response received."
+    # Strip markup tags and markdown headings so the text reads well as speech
+    partial_text = re.sub(r'<.*?>', '', generated_text)
+    cleaned_text = re.sub(r'#.*?\n', '', partial_text)
+    return cleaned_text.strip()
+
+# Function to generate speech using edge_tts
+def generate_speech(text):
+    # edge-tts produces MP3 audio, so use a matching .mp3 suffix
+    tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
+    async def generate_tts():
+        tts = edge_tts.Communicate(text, voice="pt-BR-AntonioNeural")
+        await tts.save(tts_file.name)
+
+    try:
+        asyncio.run(generate_tts())
+        print(f"TTS audio saved to: {tts_file.name}")
+        return tts_file.name
+    except Exception as e:
+        print(f"Error generating TTS: {e}")
+        return None
+
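+# Note: asyncio.run inside generate_speech can be called even if an event loop
+# is already running, because nest_asyncio.apply() patched the loop at import time.
+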
+# Conversation history, kept at module level as (user, assistant) pairs
+history = []
+
+# Main chatbot conversation function
+def chatbot_conversation(audio_file_path):
+    try:
+        transcription = transcript(audio_file_path)
+        user_input = extract_user_input(transcription)
+
+        if not user_input:
+            return "I could not generate the text. Please try again.", None
+
+        # Build the message list: system prompt, prior turns, then the new input
+        messages = [{"role": "system", "content": system_instruction}]
+        for user_turn, assistant_turn in history:
+            if user_turn:
+                messages.append({"role": "user", "content": user_turn})
+            if assistant_turn:
+                messages.append({"role": "assistant", "content": assistant_turn})
+        messages.append({"role": "user", "content": user_input})
+
+        response = ""
+
+        # Stream the completion and accumulate the tokens into one reply;
+        # the Interface expects a (text, audio) tuple, so do not yield here
+        for chunk in sync_client.chat.completions.create(
+            model="tela-gpt4o",
+            messages=messages,
+            stream=True,
+            max_tokens=1024,  # keep the response concise
+            temperature=0,  # deterministic output
+            response_format={"type": "text"}
+        ):
+            token = chunk.choices[0].delta.content
+            if token:
+                response += token
+
+        if response:
+            history.append((user_input, response))
+            formatted_output = format_generated_response(response)
+            tts_file_name = generate_speech(formatted_output)
+            if tts_file_name:
+                return formatted_output, tts_file_name
+            else:
+                return formatted_output, None
+        else:
+            return "I could not synthesize the audio. Please try again.", None
+
+    except Exception as e:
+        print(f"Error: {e}")
+        return "I could not understand you. Please try again.", None
+
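+# chatbot_conversation returns a (reply_text, tts_audio_path) tuple, matching the
+# Textbox and Audio outputs of the gr.Interface below.
+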
+# Legacy respond() handler, kept commented out for reference
+#def respond(
+#    message,
+#    history: list[tuple[str, str]],
+#    system_message,
+#    max_tokens,
+#    temperature,
+#    top_p,
+#):
+#    messages = []
+#
+#    if history is None:
+#        history = []
+#    else:
+#
+#    messages.append({"role": "user", "content": message})
+#
+#    response = ""
+#
+#    for message in client.chat_completion(
+#        messages,
+#        max_tokens=max_tokens,
+#        stream=True,
+#        temperature=temperature,
+#        top_p=top_p,
+#    ):
+#        token = message.choices[0].delta.content
+#
+#        response += token
+#        yield response
+#
+#    if response:
+#        history.append([
+#            {"role": "user", "content": message},
+#            {"role": "assistant", "content": response}
+#        ])
+#        tts_file_name = generate_speech(response)
+#        if tts_file_name:
+#            return formatted_output, tts_file_name
+#        else:
+#            return formatted_output, None
+#    else:
+#        return "I could not synthesize the audio. Please try again.", None
+
+# Gradio interface setup
 gr.Interface(
-    fn=
-    inputs=
-    outputs=[
-
+    fn=chatbot_conversation,
+    inputs=gr.Audio(label="User", type="filepath", streaming=True, container=True),
+    outputs=[
+        gr.Textbox(label="Transcription"),
+        gr.Audio(type="filepath", autoplay=True, label="MAGIC Chat")
+    ],
+    title="MAGIC VoiceChat",
+    description="A simple example of audio conversational AI",
+    theme="sudeepshouche/minimalist",
+    live=True
+).launch()
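
Two follow-up notes, since the Space status above is "Runtime error".

First, the code relies on the requests package (now imported) and on a system
ffmpeg binary for pydub. On Hugging Face Spaces those are declared in
requirements.txt (pip) and packages.txt (apt); a minimal, unpinned sketch:

    requirements.txt:
        gradio
        pydub
        edge-tts
        nest-asyncio
        openai
        requests

    packages.txt:
        ffmpeg

Second, the TTS path can be smoke-tested outside Gradio. A hypothetical
standalone script (not part of this commit; it assumes only the edge-tts
package and network access):

    import asyncio
    import edge_tts

    async def main():
        # edge-tts emits MP3 audio, matching the .mp3 suffix in generate_speech
        communicate = edge_tts.Communicate("Olá, tudo bem?", voice="pt-BR-AntonioNeural")
        await communicate.save("test.mp3")

    asyncio.run(main())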