# Source: Hugging Face Space "Nick021402/PodXplainClone" (status: Running).
# File size: 10,174 bytes.

# app.py - Main Gradio application
import asyncio
import logging
import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Generator, List, Optional, Tuple

import gradio as gr

# Import our custom modules
from segmenter import TextSegmenter
# --- CHANGE START ---
from tts_engine import CPUMultiSpeakerTTS # Updated class name
# --- CHANGE END ---
from audio_utils import AudioProcessor

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PodXplainApp:
    """Turn a text script into a single multi-speaker podcast MP3.

    The class wires together three project components: a ``TextSegmenter``
    that splits the script into ``(speaker, text)`` pairs, a
    ``CPUMultiSpeakerTTS`` engine that renders one audio file per segment,
    and an ``AudioProcessor`` that merges those files into the final MP3.
    All intermediate and final files live in one temporary directory that
    is replaced at the start of every generation run.
    """

    # Longest script accepted by generate_podcast, in characters.
    MAX_TEXT_CHARS = 50_000
    # Rough heuristic for the duration estimate: 1000 chars ≈ 1 minute.
    CHARS_PER_MINUTE = 1000

    def __init__(self):
        self.segmenter = TextSegmenter()
        self.tts_engine = CPUMultiSpeakerTTS()
        self.audio_processor = AudioProcessor()
        # Path of the working directory of the most recent run, if any.
        self.temp_dir: Optional[str] = None

    def create_temp_directory(self) -> str:
        """Create a fresh working directory, discarding the previous one.

        Returns:
            Path of the newly created temporary directory.
        """
        # Drop whatever the previous run left behind (including its MP3).
        self.cleanup_temp_directory()
        self.temp_dir = tempfile.mkdtemp(prefix="podxplain_")
        return self.temp_dir

    def cleanup_temp_directory(self):
        """Remove the current working directory, if one is recorded."""
        if self.temp_dir:
            # ignore_errors also covers a directory that is already gone, so
            # the stored path is always reset instead of lingering as stale.
            shutil.rmtree(self.temp_dir, ignore_errors=True)
            self.temp_dir = None

    def _synthesize_segments(self, segments, temp_dir: str, progress) -> List[str]:
        """Render each (speaker, text) segment to audio; return the paths that succeeded."""
        audio_files = []
        total = len(segments)

        for i, (speaker, segment_text) in enumerate(segments):
            # Synthesis occupies the 20%-90% span of the progress bar.
            progress(
                0.2 + (0.7 * i / total),
                desc=f"🎵 Processing segment {i+1}/{total} (Speaker {speaker})"
            )

            audio_path = self.tts_engine.synthesize_segment(
                segment_text,
                speaker,
                os.path.join(temp_dir, f"segment_{i:03d}.wav")
            )

            if audio_path:
                audio_files.append(audio_path)
            else:
                # A failed segment is skipped rather than aborting the run.
                logger.warning("Failed to generate audio for segment %d", i)

        return audio_files

    def generate_podcast(
        self,
        text: str,
        speaker_detection_mode: str = "auto",
        progress=gr.Progress()
    ) -> Tuple[Optional[str], str]:
        """
        Main function to convert text to podcast audio.

        Args:
            text: Input text (up to 50,000 characters)
            speaker_detection_mode: How to detect speaker changes; passed
                to the segmenter as ``mode``
            progress: Gradio progress tracker (the ``gr.Progress()`` default
                is the idiom Gradio uses to inject a tracker)

        Returns:
            Tuple of (audio_path, status_message). ``audio_path`` is None
            whenever generation fails or the input is rejected; the status
            message then describes the problem.
        """
        try:
            # Validate input
            if not text or not text.strip():
                return None, "❌ Please provide some text to convert."

            if len(text) > self.MAX_TEXT_CHARS:
                return None, f"❌ Text too long ({len(text)} chars). Maximum is {self.MAX_TEXT_CHARS:,} characters."

            # Create temporary directory
            temp_dir = self.create_temp_directory()
            progress(0, desc="🚀 Starting podcast generation...")

            # Step 1: Segment text and assign speakers
            progress(0.1, desc="📝 Analyzing text and assigning speakers...")
            segments = self.segmenter.segment_and_assign_speakers(
                text, mode=speaker_detection_mode
            )

            if not segments:
                return None, "❌ Could not process the text. Please check the input."

            logger.info("Generated %d segments", len(segments))

            # Step 2: Generate audio for each segment
            progress(0.2, desc="🎤 Generating audio segments...")
            audio_files = self._synthesize_segments(segments, temp_dir, progress)

            if not audio_files:
                return None, "❌ Failed to generate any audio segments."

            # Step 3: Merge audio files and convert to MP3
            progress(0.9, desc="🔧 Merging segments and converting to MP3...")
            final_audio_path = self.audio_processor.merge_and_convert_to_mp3(
                audio_files,
                os.path.join(temp_dir, "podcast_output.mp3")
            )

            if not final_audio_path:
                return None, "❌ Failed to merge audio segments."

            progress(1.0, desc="✅ Podcast generated successfully!")

            # Generate summary
            total_segments = len(segments)
            speakers_used = len({speaker for speaker, _ in segments})
            # The value is reported in minutes, so it must not be scaled to
            # seconds (the previous "* 60" inflated the figure sixty-fold).
            duration_estimate = len(text) / self.CHARS_PER_MINUTE

            status_message = f"""
            ✅ **Podcast Generated Successfully!**
            
            📊 **Statistics:**
            - Total segments: {total_segments}
            - Speakers used: {speakers_used}
            - Estimated duration: {duration_estimate:.1f} minutes
            - Character count: {len(text):,}
            
            🎧 **Your podcast is ready for download!**
            """

            # The working directory is intentionally NOT removed here: the
            # final MP3 inside it must remain available for download. It is
            # cleaned up when the next run calls create_temp_directory().
            return final_audio_path, status_message

        except Exception as e:
            # UI boundary: report the failure to the user instead of raising,
            # but keep the full traceback in the log for debugging.
            logger.exception("Error generating podcast: %s", e)
            return None, f"❌ Error: {str(e)}"

def create_gradio_interface():
    """Build the Gradio Blocks UI for PodXplainClone.

    The layout has a header, a two-column row (script input and options on
    the left, status and audio player on the right) and a footer with usage
    instructions. Two events are wired: a live character counter on the
    script textbox, and the generate button, which calls
    ``PodXplainApp.generate_podcast``.

    Returns:
        The constructed ``gr.Blocks`` interface; the caller is responsible
        for launching it.
    """
    app = PodXplainApp()

    def _format_char_count(text):
        """Return the character-counter label for the current script text."""
        return f"Characters: {len(text) if text else 0:,} / 50,000"

    # Custom CSS for better styling
    css = """
    .main-container {
        max-width: 1200px;
        margin: 0 auto;
    }
    .header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .footer {
        text-align: center;
        padding: 20px;
        color: #666;
        font-size: 0.9em;
    }
    """

    with gr.Blocks(css=css, title="PodXplainClone - CPU Podcast Generator") as interface:
        # Header. This is raw HTML, so emphasis must use <strong>; Markdown
        # "**...**" markers would be shown literally.
        gr.HTML("""
        <div class="header">
            <h1>🎙️ PodXplainClone</h1>
            <p><em>From script to story — voice it like never before, even on CPU.</em></p>
            <p style="font-size: 0.9em; margin-top: 10px;">
                This space allows you to transform written dialogue into natural-sounding multi-speaker audio, optimized for CPU hardware.
                It serves as a <strong>CPU-friendly alternative and development sandbox</strong> while the main PodXplain project awaits GPU resources for more advanced models.
            </p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📝 Input Your Script")

                text_input = gr.Textbox(
                    label="Podcast Script",
                    placeholder="Enter your podcast script here (up to 50,000 characters).\n\nTip: Use paragraph breaks to help with speaker detection.",
                    lines=15,
                    max_lines=20,
                    show_label=True
                )

                char_count = gr.HTML("Characters: 0 / 50,000")

                # Options
                speaker_mode = gr.Radio(
                    choices=["auto", "paragraph", "dialogue"],
                    value="auto",
                    label="Speaker Detection Mode",
                    info="How to detect when speakers change"
                )

                generate_btn = gr.Button(
                    "🎤 Generate Podcast",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Output section
                gr.Markdown("## 🎧 Your Podcast")

                status_output = gr.Markdown("Ready to generate your podcast!")

                audio_output = gr.Audio(
                    label="Generated Podcast",
                    show_download_button=True,
                    interactive=False
                )

        # Footer with instructions
        gr.HTML("""
        <div class="footer">
            <h3>📋 How to Use PodXplainClone</h3>
            <ol>
                <li><strong>Write your script:</strong> Enter up to 50,000 characters of text</li>
                <li><strong>Choose speaker mode:</strong> Auto-detect, paragraph-based, or dialogue-based</li>
                <li><strong>Generate:</strong> Click the button and wait for processing</li>
                <li><strong>Listen & Download:</strong> Your MP3 podcast will be ready!</li>
            </ol>
            <p><strong>💡 Tips:</strong> Use clear paragraph breaks for better speaker detection. 
            Write naturally as if speaking to an audience.</p>
            <p style="font-size: 0.8em; color: #999;">Powered by PodXplainClone &bull; Developed by Nick021402</p>
            <p style="font-size: 0.7em; color: #aaa;">This space runs on CPU hardware for accessibility. For the original project and GPU-powered advanced models, visit the main PodXplain space.</p>
        </div>
        """)

        # Character counter: a Python callback re-renders the label every
        # time the textbox content changes.
        text_input.change(
            fn=_format_char_count,
            inputs=[text_input],
            outputs=[char_count]
        )

        # Main generation function: outputs map to (audio path, status text).
        generate_btn.click(
            fn=app.generate_podcast,
            inputs=[text_input, speaker_mode],
            outputs=[audio_output, status_output],
            show_progress=True
        )

    return interface

if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on every network
    # interface at port 7860 with a share link requested and show_error on.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_options)