"""Gradio app that turns speaker-tagged text into a multi-voice podcast."""

import re

import gradio as gr
import numpy as np
from transformers import pipeline

# Available voices and their corresponding models
VOICES = {
    "Amy (Female)": "microsoft/vits-piper-en-us-amy",
    "Joe (Male)": "microsoft/vits-piper-en-us-joe",
    "Clara (Female)": "microsoft/vits-piper-en-us-clb",
    "Ryan (Male)": "microsoft/vits-piper-en-us-jvs",
}

# Sample rate reported together with the empty clip returned on failure.
DEFAULT_SAMPLE_RATE = 22050

# Number of silent samples appended after every spoken segment.
PAUSE_SAMPLES = 5000

# Matches "[Speaker]text[/Speaker]". The closing tag must repeat the opening
# speaker name; DOTALL lets the text of a segment span several lines.
_SEGMENT_PATTERN = re.compile(
    r"\[(?P<speaker>[^\]]+)\](?P<text>.*?)\[/(?P=speaker)\]",
    re.DOTALL,
)


def parse_segments(text: str):
    """Parse input text for speaker segments using regex.

    A segment is written as ``[Speaker]spoken text[/Speaker]``. Anything
    outside such a tag pair is ignored.

    Args:
        text: Raw script entered by the user.

    Returns:
        A list of ``(speaker, text)`` tuples in order of appearance, with
        surrounding whitespace stripped from each text. The list is empty
        when no well-formed segment is found.
    """
    return [
        (match.group("speaker"), match.group("text").strip())
        for match in _SEGMENT_PATTERN.finditer(text)
    ]


def _empty_result(message):
    """Build the ``((sample_rate, empty_audio), status)`` failure result."""
    return (DEFAULT_SAMPLE_RATE, np.zeros(0)), message


def generate_podcast(input_text: str):
    """Convert text to podcast with multiple voices.

    Each ``[Speaker]...[/Speaker]`` segment is synthesised with the model
    registered for that speaker in ``VOICES``; the clips are joined in order,
    each followed by ``PAUSE_SAMPLES`` samples of silence.

    Args:
        input_text: Script containing speaker-tagged segments.

    Returns:
        ``((sampling_rate, audio), status)``. On any failure (no segments,
        unknown speaker, or an exception while synthesising) ``audio`` is an
        empty array and ``status`` describes the problem; no exception
        propagates to the caller.
    """
    try:
        segments = parse_segments(input_text)
        if not segments:
            return _empty_result("No valid speaker segments found")

        # Reject unknown speakers before any model is loaded, so a typo late
        # in the script does not cost the earlier model loads.
        for speaker, _ in segments:
            if speaker not in VOICES:
                return _empty_result(f"Invalid speaker: {speaker}")

        pause = np.zeros(PAUSE_SAMPLES)
        clips = []
        sampling_rate = DEFAULT_SAMPLE_RATE
        current_pipe = None
        current_model = ""

        for speaker, text in segments:
            model_name = VOICES[speaker]

            # Load a model only when the voice changes between segments.
            if current_model != model_name:
                # Release the reference to the previous pipeline before the
                # next one is created.
                current_pipe = None
                current_pipe = pipeline("text-to-speech", model=model_name)
                current_model = model_name

            # Generate audio for this segment
            output = current_pipe(text)

            # The pipeline may return audio with a leading batch/channel
            # axis; flatten it so the 1-D pause can be appended.
            clips.append(np.ravel(output["audio"]))
            clips.append(pause)

            # NOTE(review): the rate of the last segment is used for the whole
            # clip, which assumes every voice model shares one sampling rate.
            sampling_rate = output["sampling_rate"]

        # Combine all audio segments with short pauses
        final_audio = np.concatenate(clips)
        return (sampling_rate, final_audio), "Podcast generated successfully!"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of
        # letting the request crash.
        return _empty_result(f"Error: {e}")


# Create Gradio interface
def podcast_interface(text):
    """Adapt ``generate_podcast`` to the two Gradio output components.

    Returns the ``(sampling_rate, audio)`` pair and the status message. When
    no audio was produced, the audio output receives ``gr.update()`` instead
    of an empty clip.
    """
    (sr, audio), status = generate_podcast(text)
    if audio.size > 0:
        return (sr, audio), status
    return gr.update(), status


demo = gr.Interface(
    fn=podcast_interface,
    inputs=gr.Textbox(
        label="Input Text with Speaker Tags",
        lines=12,
        placeholder=(
            "Example format: "
            "[Amy (Female)]Hello and welcome to today's episode![/Amy (Female)] "
            "[Joe (Male)]Excited to have you here![/Joe (Male)]"
        ),
    ),
    outputs=[
        gr.Audio(label="Generated Podcast", type="numpy"),
        gr.Textbox(label="Status", value="Ready"),
    ],
    examples=[
        [
            "[Amy (Female)]Welcome to our podcast![/Amy (Female)] "
            "[Joe (Male)]Today we're discussing AI innovations.[/Joe (Male)]"
        ]
    ],
    title="🎙️ Multi-Voice Podcast Generator",
    description=(
        "Generate podcasts with multiple free AI voices using Microsoft's "
        "Piper TTS models. Use [SpeakerName] tags to assign different voices "
        "to different text segments."
    ),
    theme="soft",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()