"""Chagas — Brazilian Portuguese voice health assistant.

Pipeline: recorded audio -> mp3 conversion (pydub/ffmpeg) -> external
speech-to-text service -> streamed LLM chat completion (OpenAI-compatible
endpoint) -> text-to-speech reply (edge-tts) -> Gradio UI.
"""

import asyncio
import os
import re
import tempfile

import edge_tts
import gradio as gr
import nest_asyncio
import requests
from openai import OpenAI
from pydub import AudioSegment
from pydub.utils import which

# Allow nested event loops so asyncio.run() works even when Gradio's own
# loop is already running in this thread.
nest_asyncio.apply()

# All service endpoints and credentials come from environment variables.
secret = os.getenv("SECRET")
url = os.getenv("SRVC")
url_audio = os.getenv("TRANSCRIPTION")
key = os.getenv("KEY")

description = """
Primeiro assistente de IA de voz do Brasil
"""

OPENAI_API_KEY = secret

# Synchronous OpenAI-compatible client pointed at the configured service.
sync_client = OpenAI(
    base_url=url,
    api_key=key,
)

# Ensuring pydub can locate ffmpeg
AudioSegment.converter = which("ffmpeg")

# TELA endpoint for speech-to-text generation
TELA_TRANSCRIPT_AUDIO_URL = url_audio

# System prompt (Portuguese): defines the assistant persona "Chagas" and
# forbids Markdown/list formatting so the reply is suitable for TTS.
system_instruction = """ A partir de agora, o seu nome é Chagas, um assistente virtual de saúde que fala português. Durante a interação com o usuário, você deve responder e manter a conversa de forma amigável, concisa, clara e aberta. Evite qualquer introdução desnecessária. Responda em um tom amigável de conversação e sempre empático e suportivo. Nunca retorne a sua resposta em formato Markdown. E sempre, sempre retorne na forma de frases, mesmo se a sua resposta for uma lista. Novamente, apenas frases, mesmo que você queira realçar várias etapas como uma lista numerada e colocando markdown com asteriscos, não o faça! Apenas frases. """

# Timeout for the transcription HTTP call; without one requests can hang forever.
_TRANSCRIPTION_TIMEOUT_S = 120


def convert_to_mp3(audio_file_path):
    """Convert an arbitrary audio file to mp3.

    Args:
        audio_file_path: Path to the input audio file (any format ffmpeg reads).

    Returns:
        Path to the converted .mp3 temp file, or ``None`` on failure.
        The caller is responsible for deleting the returned file.
    """
    print("[DEBUG] Starting audio conversion to mp3.")
    temp_mp3 = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    # Close our handle immediately: pydub writes by path, and an open handle
    # would block re-opening the file on Windows.
    temp_mp3.close()
    try:
        audio = AudioSegment.from_file(audio_file_path)
        audio.export(temp_mp3.name, format="mp3")
        print(f"[DEBUG] Successfully converted to mp3: {temp_mp3.name}")
        return temp_mp3.name
    except Exception as e:
        print(f"[ERROR] Error converting audio: {e}")
        return None


def transcript(audio_file_path):
    """Send the recorded audio to the transcription service.

    Args:
        audio_file_path: Path to the user's recording, or ``None``.

    Returns:
        The service's JSON response on success, otherwise a dict of the form
        ``{"data": "failed", "error": <message>}``.
    """
    print("[DEBUG] Starting transcription process.")
    if audio_file_path is None:
        print("[ERROR] No audio file provided.")
        return {"data": "failed", "error": "No audio file provided."}

    mp3_file_path = convert_to_mp3(audio_file_path)
    if not mp3_file_path:
        print("[ERROR] Failed to convert audio to mp3.")
        return {"data": "failed", "error": "Failed to convert audio to mp3."}

    try:
        print("[DEBUG] Sending mp3 to transcription endpoint.")
        print(f"[DEBUG] Transcription API URL: {TELA_TRANSCRIPT_AUDIO_URL}")
        with open(mp3_file_path, 'rb') as f:
            files = {'file': f}
            response = requests.post(
                TELA_TRANSCRIPT_AUDIO_URL,
                files=files,
                timeout=_TRANSCRIPTION_TIMEOUT_S,
            )
        print(f"[DEBUG] Response Status Code: {response.status_code}")
        print(f"[DEBUG] Response Text: {response.text}")
        if response.status_code == 200:
            print("[DEBUG] Successfully received transcription.")
            return response.json()
        print(f"[ERROR] Unexpected status code {response.status_code}: {response.text}")
        return {"data": "failed", "error": f"Error {response.status_code}: {response.text}"}
    except Exception as e:
        print(f"[ERROR] Exception during transcription: {e}")
        return {"data": "failed", "error": str(e)}
    finally:
        # Always clean up the temporary mp3, success or failure.
        if mp3_file_path and os.path.exists(mp3_file_path):
            try:
                os.remove(mp3_file_path)
                print("[DEBUG] Temporary mp3 file deleted.")
            except OSError as e:
                print(f"[ERROR] Error deleting temporary file: {e}")


def extract_user_input(transcription_response):
    """Join the text of every transcript segment into one user utterance.

    Assumes the service returns ``{"result": [{"text": ...}, ...]}`` —
    a failed transcription (no ``result`` key) yields an empty string.

    Args:
        transcription_response: JSON dict returned by :func:`transcript`.

    Returns:
        The concatenated, stripped transcript text, or ``""`` on failure.
    """
    print("[DEBUG] Extracting user input from transcription response.")
    try:
        transcript_segments = transcription_response.get('result', [])
        user_input = "".join(segment['text'] for segment in transcript_segments)
        print(f"[DEBUG] Extracted user input: {user_input.strip()}")
        return user_input.strip()
    except KeyError as e:
        print(f"[ERROR] KeyError in transcription response: {e}")
        return ""


def generate_speech(text):
    """Synthesize ``text`` to speech with edge-tts (pt-BR male voice).

    Args:
        text: The assistant reply to vocalize.

    Returns:
        Path to the generated audio temp file, or ``None`` on failure.
        The caller is responsible for the returned file's lifetime.
    """
    print("[DEBUG] Generating speech from text.")
    tts_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    # edge-tts writes by path; close our handle so the file is not held open.
    tts_file.close()

    async def generate_tts():
        tts = edge_tts.Communicate(text, voice="pt-BR-AntonioNeural")
        await tts.save(tts_file.name)

    try:
        # nest_asyncio.apply() above makes this safe inside a running loop.
        asyncio.run(generate_tts())
        print(f"[DEBUG] TTS audio saved to: {tts_file.name}")
        return tts_file.name
    except Exception as e:
        print(f"[ERROR] Error generating TTS: {e}")
        # Don't leak the temp file when synthesis fails.
        try:
            os.remove(tts_file.name)
        except OSError:
            pass
        return None


def chatbot_conversation(audio_file_path, history):
    """Full voice-chat turn: transcribe, stream the LLM reply, then speak it.

    Generator used by the Gradio interface. Yields ``(text, audio, history)``
    tuples: partial text while the completion streams (audio ``None``), then
    a final yield with the TTS file and the updated history.

    Args:
        audio_file_path: Path to the user's recording (Gradio filepath input).
        history: Conversation history as a list of
            ``[{"role": "user", ...}, {"role": "assistant", ...}]`` pairs,
            or ``None`` on the first turn.
    """
    print("[DEBUG] Starting chatbot conversation.")
    try:
        transcription = transcript(audio_file_path)
        user_input = extract_user_input(transcription)
        if not user_input:
            print("[ERROR] No user input extracted from transcription.")
            yield "I could not generate the text. Please try again.", None, history
            return

        # Ensure we have a system_message
        system_message = system_instruction
        if history is None:
            history = []

        # Reconstruct messages from history
        messages = [{"role": "system", "content": system_message}]
        for turn in history:
            user_msg = turn[0].get("content") if turn[0] else ""
            assistant_msg = turn[1].get("content") if turn[1] else ""
            if user_msg:
                messages.append({"role": "user", "content": user_msg})
            if assistant_msg:
                messages.append({"role": "assistant", "content": assistant_msg})

        # Add the current user input
        messages.append({"role": "user", "content": user_input})

        print("[DEBUG] Sending request to sync_client for chat completion.")
        print(f"[DEBUG] Messages: {messages}")

        response = ""
        # Stream partial responses
        try:
            for message in sync_client.chat.completions.create(
                model="marco-o1",
                messages=messages,
                stream=True,
                max_tokens=1024,
                temperature=0,
                response_format={"type": "text"},
            ):
                token = message.choices[0].delta.content
                if token:
                    # Strip chat-template control tokens the model may emit.
                    token = token.replace("<|im_start|>", "").replace("<|im_end|>", "")
                    print(token, end="")
                    response += token
                    # Yield partial text updates, no audio yet, history unchanged yet
                    yield (response, None, history)
        except Exception as e:
            print(f"[ERROR] Error during streaming response: {e}")
            yield ("I could not understand you. Please try again.", None, history)
            return

        # Now that we have the full response, update history
        history.append([
            {"role": "user", "content": user_input},
            {"role": "assistant", "content": response},
        ])

        # Generate TTS now
        print("[DEBUG] Generating TTS for full response.")
        tts_file_name = generate_speech(response)
        if tts_file_name:
            print("[DEBUG] Returning final response and TTS file with updated history.")
            # Now yield again with final text, audio, and updated history
            yield (response, tts_file_name, history)
        else:
            print("[ERROR] Failed to generate TTS.")
            yield (response, None, history)
    except Exception as e:
        print(f"[ERROR] Exception in chatbot_conversation: {e}")
        yield ("I could not understand you. Please try again.", None, history)


# Three outputs here: transcription text, audio, and the updated history
interface = gr.Interface(
    fn=chatbot_conversation,
    inputs=[
        gr.Audio(label="Usuário", type="filepath", streaming=False, container=True),
        gr.State([]),  # State holds the conversation history
    ],
    outputs=[
        gr.Textbox(label="Resposta do Chagas"),
        gr.Audio(type="filepath", autoplay=True, label="Chagas"),
        gr.State([]),  # Return updated history
    ],
    title="Chagas - assistente de saúde",
    description=description,
    theme="sudeepshouche/minimalist",
    live=True,
)

if __name__ == "__main__":
    interface.queue().launch()