"""Moroccan Darija conversation generator (Gradio app).

Uses Gemini to write a short Darija dialogue about a user-supplied topic,
then a Hugging Face TTS Space to synthesize each line in one of two
uploaded reference voices.
"""
import os
import shutil
import tempfile

import google.generativeai as genai
import gradio as gr
from gradio_client import Client, handle_file

genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
model = genai.GenerativeModel('gemini-2.0-flash')

# Load TTS client - using the correct Space URL
tts_client = Client("https://medmac01-darija-arabic-tts.hf.space/")

# The UI exposes exactly this many per-line audio players; the handler
# must always return this many audio outputs (padded with None).
NUM_AUDIO_LINES = 4


def generate_conversation(subject, speaker1_audio, speaker2_audio):
    """Generate a short Darija conversation about *subject* and voice it.

    Parameters
    ----------
    subject : str
        Discussion topic typed by the user.
    speaker1_audio, speaker2_audio : str
        Filepaths of the two reference voice clips (Gradio ``type="filepath"``).

    Returns
    -------
    list
        ``[conversation_text, audio1, audio2, audio3, audio4]`` where each
        audio entry is a temp-file path, or ``None`` for unused slots.
        On failure the first element is an ``"Error: ..."`` string and all
        audio entries are ``None``.
    """
    try:
        prompt = f"""
        Generate a natural Moroccan Darija conversation in Arabic script only between two people about: "{subject}".
        Rules:
        - Use only Arabic script for Darija
        - Do not include any transliterations or translations
        - Do not include any Latin characters or parentheses
        - Use "Speaker 1" and "Speaker 2" as the speaker names
        Format:
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        Speaker 1: [Arabic Darija text only]
        Speaker 2: [Arabic Darija text only]
        Keep it short and casual (4 lines).
        """
        print("Sending prompt to Gemini API...")
        response = model.generate_content(prompt)
        print(f"Gemini API Response: {response}")

        if not response or not response.text:
            print("No response text received from Gemini API")
            return ["Error: No response from the model"] + [None] * NUM_AUDIO_LINES

        generated_text = response.text
        print(f"Generated text: {generated_text}")

        # Keep only "label: text" lines and force a strict 1/2 alternation
        # based on position, regardless of how Gemini labelled each line.
        lines = []
        for raw_line in generated_text.split('\n'):
            raw_line = raw_line.strip()
            if ':' not in raw_line:
                continue
            text = raw_line.split(':', 1)[1].strip()
            speaker = 1 if len(lines) % 2 == 0 else 2
            lines.append(f"Speaker {speaker}: {text}")
        print(f"Processed lines: {lines}")

        if not lines:
            print("No valid lines found in the response")
            return ["Error: No valid conversation generated"] + [None] * NUM_AUDIO_LINES

        # Synthesize each line with the matching reference voice.  Only the
        # first NUM_AUDIO_LINES slots are shown in the UI, so don't waste
        # remote TTS calls on any extra lines the model produced.
        audio_paths = []
        for line in lines[:NUM_AUDIO_LINES]:
            speaker_audio = speaker1_audio if line.startswith("Speaker 1") else speaker2_audio
            text = line.split(":", 1)[1].strip()
            tts_result = tts_client.predict(
                text=text,
                speaker_audio_path=handle_file(speaker_audio),
                temperature=0.75,
                api_name="/infer_EGTTS",
            )
            # Copy the Space's output into our own temp file so the path we
            # hand to Gradio outlives the client's cache.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                tmp_path = tmp.name
            shutil.copyfile(tts_result, tmp_path)
            audio_paths.append(tmp_path)

        # Full transcript, including any lines beyond the 4 voiced ones.
        conversation_text = "\n".join(lines)

        # Pad so the UI always receives exactly NUM_AUDIO_LINES audio outputs.
        audio_paths += [None] * (NUM_AUDIO_LINES - len(audio_paths))

        return [conversation_text] + audio_paths

    except Exception as e:  # UI boundary: surface any failure as text, never crash the app
        print(f"Error occurred: {str(e)}")
        return [f"Error: {str(e)}"] + [None] * NUM_AUDIO_LINES


with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Moroccan Darija Conversation Generator")
    gr.Markdown("Enter a discussion topic and upload 2 speaker voices. We'll generate a Darija conversation!")

    with gr.Row():
        subject = gr.Textbox(label="Subject of the discussion", placeholder="e.g. Going to the souk")
    with gr.Row():
        speaker1 = gr.Audio(label="Speaker 1 Reference (4-5 sec)", type="filepath")
        speaker2 = gr.Audio(label="Speaker 2 Reference (4-5 sec)", type="filepath")

    btn = gr.Button("🎤 Generate Conversation")

    # Text transcript of the generated conversation.
    conversation_output = gr.Textbox(label="Generated Conversation", lines=6)
    # One audio player per voiced line.
    audio_outputs = [gr.Audio(label=f"Line {i+1}") for i in range(4)]

    btn.click(
        generate_conversation,
        inputs=[subject, speaker1, speaker2],
        outputs=[conversation_output] + audio_outputs,
    )

demo.launch()