File size: 10,174 Bytes
1f6c376
 
 
 
 
 
 
 
 
 
 
 
 
6ba9626
 
 
1f6c376
 
 
 
 
 
 
 
 
6ba9626
 
 
1f6c376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ba9626
1f6c376
 
 
6ba9626
 
 
 
 
1f6c376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6ba9626
1f6c376
 
 
 
 
 
 
 
6ba9626
 
1f6c376
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# app.py - Main Gradio application
import asyncio
import logging
import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import List, Tuple, Generator, Optional

import gradio as gr

# Import our custom modules
from segmenter import TextSegmenter
# --- CHANGE START ---
from tts_engine import CPUMultiSpeakerTTS # Updated class name
# --- CHANGE END ---
from audio_utils import AudioProcessor

# Configure logging: set the root logger to INFO and create the module-level
# logger used by the rest of this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class PodXplainApp:
    """Turn a text script into a single multi-speaker podcast audio file.

    The pipeline is delegated to three collaborators created in ``__init__``:
    the segmenter (``segment_and_assign_speakers``), the TTS engine
    (``synthesize_segment``) and the audio processor
    (``merge_and_convert_to_mp3``).

    NOTE(review): every run deletes the previous run's working directory
    (see ``create_temp_directory``), so one shared instance should not be
    relied on to serve overlapping requests.
    """

    # Upper bound on the accepted script length, in characters.
    MAX_TEXT_LENGTH = 50_000
    # Rough speaking-rate heuristic used only for the duration estimate.
    CHARS_PER_MINUTE = 1000

    def __init__(self):
        self.segmenter = TextSegmenter()
        self.tts_engine = CPUMultiSpeakerTTS()
        self.audio_processor = AudioProcessor()
        # Working directory of the most recent run; None until the first run
        # or after cleanup_temp_directory().
        self.temp_dir: Optional[str] = None

    def create_temp_directory(self) -> str:
        """Create a fresh working directory, discarding the previous one.

        Returns:
            Path of the newly created directory (also stored in
            ``self.temp_dir``).
        """
        self.cleanup_temp_directory()
        self.temp_dir = tempfile.mkdtemp(prefix="podxplain_")
        return self.temp_dir

    def cleanup_temp_directory(self):
        """Remove the current working directory (best effort) and forget it."""
        if self.temp_dir:
            shutil.rmtree(self.temp_dir, ignore_errors=True)
        # Always reset, even if the directory had already disappeared, so a
        # stale path is never kept around.
        self.temp_dir = None

    def generate_podcast(
        self,
        text: str,
        speaker_detection_mode: str = "auto",
        progress=gr.Progress()
    ) -> Tuple[Optional[str], str]:
        """Convert ``text`` into podcast audio.

        Args:
            text: Input script (at most ``MAX_TEXT_LENGTH`` characters).
            speaker_detection_mode: Passed to the segmenter as ``mode``.
            progress: Gradio progress tracker, called with a fraction in
                ``[0, 1]`` and a description.

        Returns:
            ``(audio_path, status_message)``. ``audio_path`` is ``None``
            whenever generation fails or the input is rejected; the status
            message then explains why.
        """
        try:
            # Validate input
            if not text or not text.strip():
                return None, "❌ Please provide some text to convert."

            if len(text) > self.MAX_TEXT_LENGTH:
                return None, (
                    f"❌ Text too long ({len(text)} chars). "
                    f"Maximum is {self.MAX_TEXT_LENGTH:,} characters."
                )

            temp_dir = self.create_temp_directory()
            progress(0, desc="🚀 Starting podcast generation...")

            # Step 1: Segment text and assign speakers
            progress(0.1, desc="📝 Analyzing text and assigning speakers...")
            segments = self.segmenter.segment_and_assign_speakers(
                text, mode=speaker_detection_mode
            )

            if not segments:
                return None, "❌ Could not process the text. Please check the input."

            total_segments = len(segments)
            logger.info("Generated %d segments", total_segments)

            # Step 2: Generate audio for each segment
            progress(0.2, desc="🎤 Generating audio segments...")
            audio_files = []

            for i, (speaker, segment_text) in enumerate(segments):
                # Synthesis occupies the 20%..90% span of the progress bar.
                progress(
                    0.2 + (0.7 * i / total_segments),
                    desc=f"🎵 Processing segment {i+1}/{total_segments} (Speaker {speaker})"
                )

                audio_path = self.tts_engine.synthesize_segment(
                    segment_text,
                    speaker,
                    os.path.join(temp_dir, f"segment_{i:03d}.wav")
                )

                if audio_path:
                    audio_files.append(audio_path)
                else:
                    # A failed segment is skipped rather than aborting the run.
                    logger.warning("Failed to generate audio for segment %d", i)

            if not audio_files:
                return None, "❌ Failed to generate any audio segments."

            # Step 3: Merge audio files and convert to MP3
            progress(0.9, desc="🔧 Merging segments and converting to MP3...")
            final_audio_path = self.audio_processor.merge_and_convert_to_mp3(
                audio_files,
                os.path.join(temp_dir, "podcast_output.mp3")
            )

            if not final_audio_path:
                return None, "❌ Failed to merge audio segments."

            progress(1.0, desc="✅ Podcast generated successfully!")

            # Summary statistics for the status panel.
            speakers_used = len({speaker for speaker, _ in segments})
            # Rough estimate: CHARS_PER_MINUTE characters ≈ 1 minute of audio.
            duration_minutes = len(text) / self.CHARS_PER_MINUTE

            status_message = "\n".join([
                "✅ **Podcast Generated Successfully!**",
                "",
                "📊 **Statistics:**",
                f"- Total segments: {total_segments}",
                f"- Speakers used: {speakers_used}",
                f"- Estimated duration: {duration_minutes:.1f} minutes",
                f"- Character count: {len(text):,}",
                "",
                "🎧 **Your podcast is ready for download!**",
            ])

            # The working directory is intentionally left in place: it holds
            # the final MP3. It is removed by create_temp_directory() at the
            # start of the next run, or by cleanup_temp_directory().
            return final_audio_path, status_message

        except Exception as e:
            # Top-level boundary for the UI callback: log with traceback and
            # report the failure in the status panel instead of raising.
            logger.exception("Error generating podcast: %s", e)
            return None, f"❌ Error: {e}"

def create_gradio_interface():
    """Build the Gradio Blocks UI and wire it to a ``PodXplainApp`` instance.

    The layout has a script input column (textbox, live character counter,
    speaker-mode selector, generate button) and an output column (status
    text and audio player), framed by a static HTML header and footer.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    app = PodXplainApp()

    # Custom CSS for better styling
    css = """
    .main-container {
        max-width: 1200px;
        margin: 0 auto;
    }
    .header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .footer {
        text-align: center;
        padding: 20px;
        color: #666;
        font-size: 0.9em;
    }
    """

    def format_char_count(text):
        """Render the counter label; an empty or missing value counts as 0."""
        return f"Characters: {len(text) if text else 0:,} / 50,000"

    with gr.Blocks(css=css, title="PodXplainClone - CPU Podcast Generator") as interface:
        # Header (raw HTML, so emphasis uses <strong>, not Markdown asterisks)
        gr.HTML("""
        <div class="header">
            <h1>🎙️ PodXplainClone</h1>
            <p><em>From script to story — voice it like never before, even on CPU.</em></p>
            <p style="font-size: 0.9em; margin-top: 10px;">
                This space allows you to transform written dialogue into natural-sounding multi-speaker audio, optimized for CPU hardware.
                It serves as a <strong>CPU-friendly alternative and development sandbox</strong> while the main PodXplain project awaits GPU resources for more advanced models.
            </p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📝 Input Your Script")

                text_input = gr.Textbox(
                    label="Podcast Script",
                    placeholder="Enter your podcast script here (up to 50,000 characters).\n\nTip: Use paragraph breaks to help with speaker detection.",
                    lines=15,
                    max_lines=20,
                    show_label=True
                )

                char_count = gr.HTML("Characters: 0 / 50,000")

                # Options
                speaker_mode = gr.Radio(
                    choices=["auto", "paragraph", "dialogue"],
                    value="auto",
                    label="Speaker Detection Mode",
                    info="How to detect when speakers change"
                )

                generate_btn = gr.Button(
                    "🎤 Generate Podcast",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Output section
                gr.Markdown("## 🎧 Your Podcast")

                status_output = gr.Markdown("Ready to generate your podcast!")

                audio_output = gr.Audio(
                    label="Generated Podcast",
                    show_download_button=True,
                    interactive=False
                )

        # Footer with instructions
        gr.HTML("""
        <div class="footer">
            <h3>📋 How to Use PodXplainClone</h3>
            <ol>
                <li><strong>Write your script:</strong> Enter up to 50,000 characters of text</li>
                <li><strong>Choose speaker mode:</strong> Auto-detect, paragraph-based, or dialogue-based</li>
                <li><strong>Generate:</strong> Click the button and wait for processing</li>
                <li><strong>Listen &amp; Download:</strong> Your MP3 podcast will be ready!</li>
            </ol>
            <p><strong>💡 Tips:</strong> Use clear paragraph breaks for better speaker detection.
            Write naturally as if speaking to an audience.</p>
            <p style="font-size: 0.8em; color: #999;">Powered by PodXplainClone &bull; Developed by Nick021402</p>
            <p style="font-size: 0.7em; color: #aaa;">This space runs on CPU hardware for accessibility. For the original project and GPU-powered advanced models, visit the main PodXplain space.</p>
        </div>
        """)

        # Live character counter: a Python callback run whenever the textbox
        # content changes.
        text_input.change(
            fn=format_char_count,
            inputs=[text_input],
            outputs=[char_count]
        )

        # Main generation function: outputs map to (audio path, status text),
        # the tuple returned by PodXplainApp.generate_podcast.
        generate_btn.click(
            fn=app.generate_podcast,
            inputs=[text_input, speaker_mode],
            outputs=[audio_output, status_output],
            show_progress=True
        )

    return interface

if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server with
    # the options below.
    launch_settings = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_settings)