"""Gradio app that turns speaker-tagged text into a multi-voice audio clip.

Input text is written as ``[Speaker Name]text[/Speaker Name]`` segments; each
segment is synthesized with the text-to-speech model mapped to that speaker in
``VOICES`` and the resulting clips are joined with short silent pauses.
"""

import re

import gradio as gr
import numpy as np
from transformers import pipeline

# Available voices and their corresponding models
VOICES = {
    "Amy (Female)": "microsoft/vits-piper-en-us-amy",
    "Joe (Male)": "microsoft/vits-piper-en-us-joe",
    "Clara (Female)": "microsoft/vits-piper-en-us-clb",
    "Ryan (Male)": "microsoft/vits-piper-en-us-jvs",
}

# Matches "[Speaker]text[/Speaker]"; the closing tag must repeat the opening
# speaker name exactly. DOTALL lets a segment's text span several lines.
_SEGMENT_PATTERN = re.compile(
    r"\[(?P<speaker>[^\]]+?)\](?P<text>.*?)\[/(?P=speaker)\]",
    re.DOTALL,
)

# Sampling rate reported alongside the empty array when no audio is produced.
_FALLBACK_SAMPLE_RATE = 22050

# Number of zero samples appended after every clip as a pause.
_PAUSE_SAMPLES = 5000


def parse_segments(text):
    """Split speaker-tagged text into ``(speaker, text)`` segments.

    Args:
        text: Input containing ``[Speaker]...[/Speaker]`` segments. ``None``
            is treated as empty input.

    Returns:
        A ``(segments, warning)`` tuple. ``segments`` is a list of
        ``(speaker, stripped_text)`` pairs, in input order, for every tagged
        segment whose speaker is a key of ``VOICES``. ``warning`` is a message
        string when the input also contains text outside any tagged segment
        or segments with an unknown speaker, otherwise ``None``.
    """
    text = text or ""

    segments = []
    has_unknown_speaker = False
    has_untagged_text = False
    cursor = 0  # End offset of the previous match.

    for match in _SEGMENT_PATTERN.finditer(text):
        # Anything other than whitespace between two tagged segments is
        # content that will not be spoken.
        if text[cursor:match.start()].strip():
            has_untagged_text = True
        cursor = match.end()

        speaker = match.group("speaker")
        if speaker in VOICES:
            segments.append((speaker, match.group("text").strip()))
        else:
            has_unknown_speaker = True

    # Trailing content after the last tagged segment.
    if text[cursor:].strip():
        has_untagged_text = True

    if has_untagged_text or has_unknown_speaker:
        warning = (
            f"Warning: Found {len(segments)} valid segments, but text "
            "contains untagged content or invalid speaker names"
        )
        return segments, warning
    return segments, None


def generate_podcast(input_text):
    """Convert speaker-tagged text into one audio clip with multiple voices.

    Args:
        input_text: Text made of ``[Speaker]...[/Speaker]`` segments.

    Returns:
        A ``((sampling_rate, audio), status)`` tuple. ``audio`` is a 1-D
        NumPy array holding every synthesized segment followed by a short
        pause. When nothing could be synthesized, or an error occurred,
        ``audio`` is empty and ``status`` explains why. Errors are reported
        through ``status`` instead of being raised.
    """
    no_audio = (_FALLBACK_SAMPLE_RATE, np.zeros(0))
    no_segments_status = (
        "No valid speaker segments found. Please use the format: "
        "[Speaker Name]text[/Speaker Name]"
    )

    try:
        segments, warning = parse_segments(input_text)
        if not segments:
            return no_audio, no_segments_status

        pieces = []
        sample_rate = None
        current_pipe = None
        current_model = ""

        for speaker, text in segments:
            if not text:
                # Nothing to speak for an empty segment.
                continue

            model_name = VOICES[speaker]

            # Load a model only when the voice changes; drop the reference to
            # the previous pipeline first so it can be freed before loading.
            if current_model != model_name:
                current_pipe = None
                current_pipe = pipeline("text-to-speech", model=model_name)
                current_model = model_name

            # Generate audio for this segment.
            output = current_pipe(text)

            # Flatten so the clip can be joined with the 1-D pause below.
            # NOTE(review): assumes mono output that may carry a leading
            # channel axis, e.g. shape (1, n) - confirm for these models.
            clip = np.ravel(np.asarray(output["audio"]))

            # NOTE(review): clips are joined without resampling, so this
            # assumes every voice model reports the same rate - confirm.
            sample_rate = output["sampling_rate"]

            pieces.append(clip)
            pieces.append(np.zeros(_PAUSE_SAMPLES, dtype=clip.dtype))

        if not pieces:
            # Every valid segment had empty text.
            return no_audio, no_segments_status

        final_audio = np.concatenate(pieces)

        status = "Podcast generated successfully!"
        if warning:
            status += " " + warning
        return (sample_rate, final_audio), status
    except Exception as e:
        # UI boundary: surface any failure (model download, synthesis, ...)
        # in the status box rather than crashing the request.
        return no_audio, f"Error: {str(e)}"


# Create Gradio interface
def podcast_interface(text):
    """Gradio callback: return the audio output value and a status message.

    When no audio was generated, ``gr.update()`` is returned for the audio
    component instead of an empty clip.
    """
    (sr, audio), status = generate_podcast(text)
    audio_value = (sr, audio) if audio.size > 0 else gr.update()
    return audio_value, status


demo = gr.Interface(
    fn=podcast_interface,
    inputs=gr.Textbox(
        label="Input Text with Speaker Tags",
        lines=12,
        placeholder=(
            "Example format: "
            "[Amy (Female)]Welcome to our podcast![/Amy (Female)] "
            "[Joe (Male)]Today we're discussing AI innovations.[/Joe (Male)]"
        ),
    ),
    outputs=[
        gr.Audio(label="Generated Podcast", type="numpy"),
        gr.Textbox(label="Status", value="Ready"),
    ],
    examples=[
        [
            "[Amy (Female)]Welcome to our podcast![/Amy (Female)] "
            "[Joe (Male)]Today we're discussing AI innovations.[/Joe (Male)] "
            "[Clara (Female)]This is Clara speaking![/Clara (Female)]"
        ]
    ],
    title="🎙️ Multi-Voice Podcast Generator",
    description=(
        "Generate podcasts with multiple free AI voices using Microsoft's "
        "Piper TTS models. Use [SpeakerName] tags to assign different voices "
        "to different text segments."
    ),
    theme="soft",
    allow_flagging="never",
)

if __name__ == "__main__":
    demo.launch()