"""Gradio language-tutor app: chat reply from OpenAI, spoken via Speechify."""

import base64
import os
import uuid

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI
from speechify import Speechify

# Detect Hugging Face environment
RUNNING_IN_SPACES = os.getenv("SYSTEM") == "spaces"

# Load API keys (a local .env file is only consulted outside of Spaces)
if not RUNNING_IN_SPACES:
    load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
speechify_api_key = os.getenv("SPEECHIFY_API_KEY")

# Sanity check (but don't print full keys)
print(f"✅ OPENAI_API_KEY loaded: {'✅' if openai_api_key else '❌ MISSING'}")
print(f"✅ SPEECHIFY_API_KEY loaded: {'✅' if speechify_api_key else '❌ MISSING'}")

# Initialize clients
openai_client = OpenAI(api_key=openai_api_key)
speechify_client = Speechify(token=speechify_api_key)

# Voice config, keyed by the language names offered in the UI dropdown.
# NOTE(review): the "language" entry is not currently sent to Speechify;
# only voice_id, model and audio_format are used below — confirm whether
# it should be passed along.
language_config = {
    "Portuguese": {
        "voice_id": "joao",
        "language": "pt-PT",
        "model": "simba-multilingual",
        "audio_format": "mp3",
    },
    "French": {
        "voice_id": "leo",
        "language": "fr-FR",
        "model": "simba-multilingual",
        "audio_format": "mp3",
    },
    "Spanish": {
        "voice_id": "danna-sofia",
        "language": "es-MX",
        "model": "simba-multilingual",
        "audio_format": "mp3",
    },
}


def _save_audio(audio_b64, audio_format):
    """Decode base64 audio and write it to a uniquely named file.

    Args:
        audio_b64: Base64-encoded audio payload.
        audio_format: File extension to use for the saved file (no dot).

    Returns:
        The path of the file that was written.
    """
    audio_bytes = base64.b64decode(audio_b64)
    output_dir = "/tmp" if RUNNING_IN_SPACES else "speech_files"
    os.makedirs(output_dir, exist_ok=True)
    # A random hex name keeps separate requests from overwriting each other.
    audio_path = os.path.join(
        output_dir, f"speech_{uuid.uuid4().hex}.{audio_format}"
    )
    with open(audio_path, "wb") as f:
        f.write(audio_bytes)
    return audio_path


def chat_and_speak(user_input, language_choice):
    """Produce a tutor reply for the user's text and synthesize it as audio.

    Args:
        user_input: Text typed by the user.
        language_choice: Language name; interpolated into the system prompt
            and used as the key into ``language_config``.

    Returns:
        A ``(audio_path, text)`` tuple. ``audio_path`` is the path of the
        saved audio file, or ``None`` when no audio could be produced.
        ``text`` is the model's reply; when a step fails, a warning is
        appended to whatever reply text was obtained (or returned alone).
    """
    gpt_response = ""
    try:
        if not user_input or not user_input.strip():
            return None, "Please enter some text to process."

        print(f"🧠 User input: {user_input}")
        print(f"🗣️ Language choice: {language_choice}")

        # Step 1: Get GPT response
        system_message = (
            f"You are a friendly {language_choice} language tutor. "
            f"Respond only in {language_choice}."
        )
        completion = openai_client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_input},
            ],
        )
        # The message content may be None; normalise so it is never passed
        # on to the TTS call as a non-string.
        gpt_response = completion.choices[0].message.content or ""
        print(f"💬 GPT response: {gpt_response}")
        if not gpt_response.strip():
            return None, "⚠️ The model returned an empty response."

        # Step 2: Use Speechify to generate audio
        config = language_config.get(language_choice)
        if not config:
            error_msg = f"⚠️ Language '{language_choice}' not supported."
            print(error_msg)
            return None, f"{gpt_response}\n\n{error_msg}"

        tts_response = speechify_client.tts.audio.speech(
            input=gpt_response,
            voice_id=config["voice_id"],
            model=config["model"],
            audio_format=config["audio_format"],
        )

        audio_data = getattr(tts_response, "audio_data", None)
        if not (isinstance(audio_data, str) and audio_data):
            print("⚠️ No audio data received from Speechify or audio_data is not a string.")
            return None, f"{gpt_response}\n\n⚠️ No audio data received from Speechify."

        try:
            audio_output_path = _save_audio(audio_data, config["audio_format"])
        except Exception as audio_err:
            # Still hand back the text reply when only the audio step failed.
            print(f"🔥 Error processing audio data: {audio_err}")
            return None, f"{gpt_response}\n\n⚠️ Error saving audio: {audio_err}"

        return audio_output_path, gpt_response

    except Exception as e:
        # UI boundary: report the failure in the text box instead of raising.
        print(f"🔥 An unexpected error occurred: {e}")
        error_message = f"⚠️ An unexpected error occurred: {e}"
        if gpt_response:
            return None, f"{gpt_response}\n\n{error_message}"
        return None, error_message


def _load_css(path):
    """Return the stylesheet text at *path*, or ``None`` if it is missing."""
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        # A missing cosmetic stylesheet should not stop the app from starting.
        print(f"⚠️ Stylesheet '{path}' not found; using default styling.")
        return None


custom_css = _load_css("custom.css")

with gr.Blocks(css=custom_css) as demo:
    # NOTE(review): this header literal contains only a newline here; the
    # intended HTML markup looks to be missing — confirm and restore it.
    gr.HTML('\n')

    with gr.Column(elem_classes="main-card"):
        with gr.Row():
            with gr.Column():
                user_input = gr.Textbox(
                    label="Type in whatever language you prefer",
                    placeholder="Type here...",
                    lines=4,
                )
                language_choice = gr.Dropdown(
                    choices=["Portuguese", "French", "Spanish"],
                    value="Portuguese",
                    label="Language",
                )
                submit_btn = gr.Button("Submit")

            with gr.Column():
                audio_output = gr.Audio(
                    label="Audio Playback", type="filepath", autoplay=True
                )
                gpt_output = gr.Textbox(label="The Response")

    submit_btn.click(
        fn=chat_and_speak,
        inputs=[user_input, language_choice],
        outputs=[audio_output, gpt_output],
    )

if __name__ == "__main__":
    demo.launch()