# app.py - Main Gradio application
import asyncio
import logging
import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Generator, List, Optional, Tuple

import gradio as gr

# Import our custom modules
from segmenter import TextSegmenter
# --- CHANGE START ---
from tts_engine import CPUMultiSpeakerTTS  # Updated class name
# --- CHANGE END ---
from audio_utils import AudioProcessor

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Maximum number of characters accepted for a single script.
MAX_TEXT_CHARS = 50_000

# Rough speaking rate behind the duration estimate: 1000 chars ≈ 1 minute.
CHARS_PER_MINUTE = 1000

# Header shown at the top of the page.
# NOTE(review): this literal contains no HTML tags even though the CSS below
# defines a `.header` class — confirm whether markup was lost from this string.
HEADER_HTML = """

🎙️ PodXplainClone

From script to story — voice it like never before, even on CPU.

This space allows you to transform written dialogue into natural-sounding multi-speaker audio, optimized for CPU hardware. It serves as a **CPU-friendly alternative and development sandbox** while the main PodXplain project awaits GPU resources for more advanced models.

"""


def _format_char_count(text: Optional[str]) -> str:
    """Return the "Characters: N / limit" label for the current input text."""
    count = len(text) if text else 0
    return f"Characters: {count:,} / {MAX_TEXT_CHARS:,}"


class PodXplainApp:
    """Text-to-podcast pipeline: segment the script, synthesize, merge to MP3.

    One instance owns a segmenter, a TTS engine, an audio processor and a
    single working directory that is replaced on every generation run.
    """

    def __init__(self):
        self.segmenter = TextSegmenter()
        # --- CHANGE START ---
        self.tts_engine = CPUMultiSpeakerTTS()  # Updated class instantiation
        # --- CHANGE END ---
        self.audio_processor = AudioProcessor()
        # Working directory of the most recent run; None before the first run.
        self.temp_dir = None

    def create_temp_directory(self) -> str:
        """Create a fresh temporary directory for processing.

        Any directory left over from a previous run (including its final MP3)
        is deleted first.

        Returns:
            Path of the newly created directory.
        """
        if self.temp_dir:
            shutil.rmtree(self.temp_dir, ignore_errors=True)
        self.temp_dir = tempfile.mkdtemp(prefix="podxplain_")
        return self.temp_dir

    def cleanup_temp_directory(self):
        """Delete the current working directory, if any, and forget its path."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)
        # Reset even when the directory was already gone, so no stale path
        # is kept around.
        self.temp_dir = None

    def generate_podcast(
        self,
        text: str,
        speaker_detection_mode: str = "auto",
        progress=gr.Progress()
    ) -> Tuple[Optional[str], str]:
        """
        Main function to convert text to podcast audio.

        Args:
            text: Input text (up to 50,000 characters)
            speaker_detection_mode: How to detect speaker changes
            progress: Gradio progress tracker

        Returns:
            Tuple of (audio_path, status_message). audio_path is None whenever
            validation or any processing step fails; status_message then
            describes the problem. Exceptions are logged and reported through
            the status message rather than propagated.
        """
        # Declared before the try so the finally clause can always see them.
        segment_paths: List[str] = []
        final_audio_path: Optional[str] = None

        try:
            # Validate input
            if not text or not text.strip():
                return None, "❌ Please provide some text to convert."

            if len(text) > MAX_TEXT_CHARS:
                return None, (
                    f"❌ Text too long ({len(text)} chars). "
                    f"Maximum is {MAX_TEXT_CHARS:,} characters."
                )

            # Create temporary directory
            temp_dir = self.create_temp_directory()

            progress(0, desc="🚀 Starting podcast generation...")

            # Step 1: Segment text and assign speakers
            progress(0.1, desc="📝 Analyzing text and assigning speakers...")
            segments = self.segmenter.segment_and_assign_speakers(
                text,
                mode=speaker_detection_mode
            )

            if not segments:
                return None, "❌ Could not process the text. Please check the input."

            logger.info("Generated %d segments", len(segments))

            # Step 2: Generate audio for each segment
            progress(0.2, desc="🎤 Generating audio segments...")
            audio_files = []
            segment_total = len(segments)

            for i, (speaker, segment_text) in enumerate(segments):
                # Synthesis occupies the 20%..90% band of the progress bar.
                progress(
                    0.2 + (0.7 * i / segment_total),
                    desc=f"🎵 Processing segment {i+1}/{segment_total} (Speaker {speaker})"
                )

                # Remember the requested path before synthesizing so that a
                # partially written file is still cleaned up on failure.
                segment_path = os.path.join(temp_dir, f"segment_{i:03d}.wav")
                segment_paths.append(segment_path)

                # Generate audio for this segment
                audio_path = self.tts_engine.synthesize_segment(
                    segment_text,
                    speaker,
                    segment_path
                )

                if audio_path:
                    audio_files.append(audio_path)
                else:
                    logger.warning("Failed to generate audio for segment %d", i)

            if not audio_files:
                return None, "❌ Failed to generate any audio segments."

            # Step 3: Merge audio files and convert to MP3
            progress(0.9, desc="🔧 Merging segments and converting to MP3...")
            final_audio_path = self.audio_processor.merge_and_convert_to_mp3(
                audio_files,
                os.path.join(temp_dir, "podcast_output.mp3")
            )

            if not final_audio_path:
                return None, "❌ Failed to merge audio segments."

            progress(1.0, desc="✅ Podcast generated successfully!")

            return final_audio_path, self._build_status_message(text, segments)

        except Exception as e:
            # Top-level boundary for the UI callback: keep the traceback in
            # the log and show the user a short message instead of crashing.
            logger.exception("Error generating podcast: %s", e)
            return None, f"❌ Error: {str(e)}"

        finally:
            # Clean up temporary files (except the final output).
            # Note: We keep the final MP3 for download; the working directory
            # itself is removed by the next create_temp_directory() call.
            self._remove_segment_files(segment_paths, keep=final_audio_path)

    @staticmethod
    def _build_status_message(text: str, segments) -> str:
        """Build the Markdown summary shown after a successful run."""
        total_segments = len(segments)
        speakers_used = len({speaker for speaker, _ in segments})
        # Rough estimate: 1000 chars ≈ 1 minute. The value is displayed in
        # minutes, so it must not be scaled to seconds.
        duration_estimate = len(text) / CHARS_PER_MINUTE

        # NOTE(review): this Markdown is emitted on a single line, so the "-"
        # items will not render as a list — confirm whether line breaks were
        # intended before changing the text.
        return (
            " ✅ **Podcast Generated Successfully!**"
            " 📊 **Statistics:**"
            f" - Total segments: {total_segments}"
            f" - Speakers used: {speakers_used}"
            f" - Estimated duration: {duration_estimate:.1f} minutes"
            f" - Character count: {len(text):,}"
            " 🎧 **Your podcast is ready for download!** "
        )

    @staticmethod
    def _remove_segment_files(paths: List[str], keep: Optional[str] = None) -> None:
        """Best-effort removal of intermediate segment files.

        Only paths this application asked the TTS engine to write are passed
        in; *keep* is skipped so the final output is never deleted.
        """
        for path in paths:
            if path == keep:
                continue
            try:
                os.remove(path)
            except FileNotFoundError:
                # Synthesis failed or never ran for this segment.
                continue
            except OSError as err:
                logger.warning("Could not remove temporary file %s: %s", path, err)


def create_gradio_interface():
    """Create the Gradio interface.

    Builds the Blocks layout (script input and options on the left, status
    and audio player on the right) and wires the character counter and the
    generate button to a new PodXplainApp instance.

    Returns:
        The configured, not yet launched, gr.Blocks interface.
    """
    app = PodXplainApp()

    # Custom CSS for better styling
    css = (
        " .main-container { max-width: 1200px; margin: 0 auto; }"
        " .header { text-align: center; padding: 20px;"
        " background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);"
        " color: white; border-radius: 10px; margin-bottom: 20px; }"
        " .footer { text-align: center; padding: 20px; color: #666;"
        " font-size: 0.9em; } "
    )

    with gr.Blocks(css=css, title="PodXplainClone - CPU Podcast Generator") as interface:  # Updated title
        # Header
        gr.HTML(HEADER_HTML)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📝 Input Your Script")

                text_input = gr.Textbox(
                    label="Podcast Script",
                    placeholder=(
                        "Enter your podcast script here "
                        f"(up to {MAX_TEXT_CHARS:,} characters).\n\n"
                        "Tip: Use paragraph breaks to help with speaker detection."
                    ),
                    lines=15,
                    max_lines=20,
                    show_label=True
                )

                char_count = gr.HTML(_format_char_count(""))

                # Options
                speaker_mode = gr.Radio(
                    choices=["auto", "paragraph", "dialogue"],
                    value="auto",
                    label="Speaker Detection Mode",
                    info="How to detect when speakers change"
                )

                generate_btn = gr.Button(
                    "🎤 Generate Podcast",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Output section
                gr.Markdown("## 🎧 Your Podcast")

                status_output = gr.Markdown("Ready to generate your podcast!")

                audio_output = gr.Audio(
                    label="Generated Podcast",
                    show_download_button=True,
                    interactive=False
                )

        # Footer with instructions
        # NOTE(review): the footer body is a single space — confirm whether
        # the instructions markup is missing.
        gr.HTML(""" """)

        # Character counting (a Python callback run on every text change)
        text_input.change(
            fn=_format_char_count,
            inputs=[text_input],
            outputs=[char_count]
        )

        # Main generation function
        generate_btn.click(
            fn=app.generate_podcast,
            inputs=[text_input, speaker_mode],
            outputs=[audio_output, status_output],
            show_progress=True
        )

    return interface


if __name__ == "__main__":
    # Create and launch the interface
    interface = create_gradio_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )