# PodXplainClone / app.py
# Hugging Face Space file by Nick021402.
# Last commit: "Update app.py" (6ba9626, verified).
# app.py - Main Gradio application
import asyncio
import logging
import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Generator, List, Optional, Tuple

import gradio as gr
# Import our custom modules
from segmenter import TextSegmenter
# --- CHANGE START ---
from tts_engine import CPUMultiSpeakerTTS # Updated class name
# --- CHANGE END ---
from audio_utils import AudioProcessor
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class PodXplainApp:
    """Turn a text script into a multi-speaker podcast MP3.

    Pipeline: the segmenter splits the script into ``(speaker, text)``
    segments, the TTS engine writes one WAV file per segment, and the audio
    processor merges those files into a single MP3.

    NOTE: the instance keeps a single working directory in ``self.temp_dir``
    and replaces it at the start of every generation, so one instance must
    not serve overlapping ``generate_podcast`` calls.
    """

    # Upper bound on the accepted script length, in characters.
    MAX_TEXT_LENGTH = 50_000
    # Rough speaking rate behind the duration estimate shown to the user.
    CHARS_PER_MINUTE = 1000

    def __init__(self):
        """Instantiate the segmenter, TTS engine and audio processor."""
        self.segmenter = TextSegmenter()
        self.tts_engine = CPUMultiSpeakerTTS()
        self.audio_processor = AudioProcessor()
        # Working directory of the most recent generation; None until used.
        self.temp_dir = None

    def create_temp_directory(self) -> str:
        """Create a fresh working directory, discarding the previous one.

        Returns:
            Path of the newly created directory (also stored on the instance).
        """
        self.cleanup_temp_directory()
        self.temp_dir = tempfile.mkdtemp(prefix="podxplain_")
        return self.temp_dir

    def cleanup_temp_directory(self):
        """Delete the current working directory, if any, and forget it."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)
        self.temp_dir = None

    def generate_podcast(
        self,
        text: str,
        speaker_detection_mode: str = "auto",
        # Gradio injects its progress tracker through this default value.
        progress=gr.Progress()
    ) -> Tuple[Optional[str], str]:
        """Convert a script into a podcast audio file.

        Args:
            text: Input script (at most ``MAX_TEXT_LENGTH`` characters).
            speaker_detection_mode: Passed to the segmenter as ``mode``.
            progress: Gradio progress tracker.

        Returns:
            ``(audio_path, status_message)``. ``audio_path`` is ``None``
            whenever generation fails; ``status_message`` then explains why.
        """
        audio_files = []
        final_audio_path = None
        try:
            # Validate input
            if not text or not text.strip():
                return None, "❌ Please provide some text to convert."
            if len(text) > self.MAX_TEXT_LENGTH:
                return None, (
                    f"❌ Text too long ({len(text)} chars). "
                    f"Maximum is {self.MAX_TEXT_LENGTH:,} characters."
                )

            temp_dir = self.create_temp_directory()
            progress(0, desc="🚀 Starting podcast generation...")

            # Step 1: Segment text and assign speakers
            progress(0.1, desc="📝 Analyzing text and assigning speakers...")
            segments = self.segmenter.segment_and_assign_speakers(
                text, mode=speaker_detection_mode
            )
            if not segments:
                return None, "❌ Could not process the text. Please check the input."
            segment_count = len(segments)
            logger.info("Generated %d segments", segment_count)

            # Step 2: Generate audio for each segment (20% -> 90% of the bar)
            progress(0.2, desc="🎤 Generating audio segments...")
            for i, (speaker, segment_text) in enumerate(segments):
                progress(
                    0.2 + (0.7 * i / segment_count),
                    desc=f"🎵 Processing segment {i+1}/{segment_count} (Speaker {speaker})"
                )
                audio_path = self.tts_engine.synthesize_segment(
                    segment_text,
                    speaker,
                    os.path.join(temp_dir, f"segment_{i:03d}.wav")
                )
                if audio_path:
                    audio_files.append(audio_path)
                else:
                    # A failed segment is skipped rather than aborting the run.
                    logger.warning("Failed to generate audio for segment %d", i)
            if not audio_files:
                return None, "❌ Failed to generate any audio segments."

            # Step 3: Merge audio files and convert to MP3
            progress(0.9, desc="🔧 Merging segments and converting to MP3...")
            final_audio_path = self.audio_processor.merge_and_convert_to_mp3(
                audio_files,
                os.path.join(temp_dir, "podcast_output.mp3")
            )
            if not final_audio_path:
                return None, "❌ Failed to merge audio segments."

            progress(1.0, desc="✅ Podcast generated successfully!")
            return final_audio_path, self._build_status_message(segments, text)
        except Exception as e:
            # UI boundary: report the failure to the user, keep the traceback
            # in the log.
            logger.exception("Error generating podcast: %s", e)
            return None, f"❌ Error: {str(e)}"
        finally:
            # Per-segment files are no longer needed once merging has been
            # attempted; the final MP3 is kept for download.
            self._remove_intermediate_files(audio_files, keep=final_audio_path)

    def _build_status_message(self, segments, text: str) -> str:
        """Return the Markdown success summary for a finished generation."""
        speakers_used = len({speaker for speaker, _ in segments})
        # Rough estimate: CHARS_PER_MINUTE characters ≈ 1 minute of audio.
        duration_minutes = len(text) / self.CHARS_PER_MINUTE
        # Joined line by line so no source indentation leaks into the
        # Markdown (indented lines would render as a code block).
        return "\n".join([
            "",
            "✅ **Podcast Generated Successfully!**",
            "",
            "📊 **Statistics:**",
            f"- Total segments: {len(segments)}",
            f"- Speakers used: {speakers_used}",
            f"- Estimated duration: {duration_minutes:.1f} minutes",
            f"- Character count: {len(text):,}",
            "",
            "🎧 **Your podcast is ready for download!**",
            "",
        ])

    @staticmethod
    def _remove_intermediate_files(paths, keep=None):
        """Best-effort removal of per-segment files, sparing ``keep``."""
        for path in paths:
            if path == keep:
                continue
            try:
                os.remove(path)
            except OSError:
                logger.debug("Could not remove intermediate file %s", path)
def create_gradio_interface():
    """Build the Gradio Blocks UI and wire it to a ``PodXplainApp``.

    Layout: a header banner, a two-column row (script input and options on
    the left, status and audio player on the right) and a footer with usage
    instructions.

    Returns:
        The ``gr.Blocks`` interface, not yet launched.
    """
    app = PodXplainApp()

    # Character limit advertised in the UI; formatted once so the counter,
    # the placeholder and the instructions cannot drift apart.
    max_chars = 50_000
    max_chars_label = f"{max_chars:,}"

    def format_char_count(text):
        """Return the live counter text for the current textbox content."""
        return f"Characters: {len(text) if text else 0:,} / {max_chars_label}"

    # Custom CSS for better styling
    css = """
    .main-container {
        max-width: 1200px;
        margin: 0 auto;
    }
    .header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .footer {
        text-align: center;
        padding: 20px;
        color: #666;
        font-size: 0.9em;
    }
    """

    with gr.Blocks(css=css, title="PodXplainClone - CPU Podcast Generator") as interface:
        # Header. This is raw HTML, so emphasis must use <strong>; Markdown
        # asterisks would be shown literally.
        gr.HTML("""
        <div class="header">
            <h1>🎙️ PodXplainClone</h1>
            <p><em>From script to story — voice it like never before, even on CPU.</em></p>
            <p style="font-size: 0.9em; margin-top: 10px;">
                This space allows you to transform written dialogue into natural-sounding multi-speaker audio, optimized for CPU hardware.
                It serves as a <strong>CPU-friendly alternative and development sandbox</strong> while the main PodXplain project awaits GPU resources for more advanced models.
            </p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## 📝 Input Your Script")
                text_input = gr.Textbox(
                    label="Podcast Script",
                    placeholder=(
                        f"Enter your podcast script here (up to {max_chars_label} characters)."
                        "\n\nTip: Use paragraph breaks to help with speaker detection."
                    ),
                    lines=15,
                    max_lines=20,
                    show_label=True
                )
                char_count = gr.HTML(format_char_count(""))

                # Options
                speaker_mode = gr.Radio(
                    choices=["auto", "paragraph", "dialogue"],
                    value="auto",
                    label="Speaker Detection Mode",
                    info="How to detect when speakers change"
                )
                generate_btn = gr.Button(
                    "🎤 Generate Podcast",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Output section
                gr.Markdown("## 🎧 Your Podcast")
                status_output = gr.Markdown("Ready to generate your podcast!")
                audio_output = gr.Audio(
                    label="Generated Podcast",
                    show_download_button=True,
                    interactive=False
                )

        # Footer with instructions
        gr.HTML(f"""
        <div class="footer">
            <h3>📋 How to Use PodXplainClone</h3>
            <ol>
                <li><strong>Write your script:</strong> Enter up to {max_chars_label} characters of text</li>
                <li><strong>Choose speaker mode:</strong> Auto-detect, paragraph-based, or dialogue-based</li>
                <li><strong>Generate:</strong> Click the button and wait for processing</li>
                <li><strong>Listen &amp; Download:</strong> Your MP3 podcast will be ready!</li>
            </ol>
            <p><strong>💡 Tips:</strong> Use clear paragraph breaks for better speaker detection.
            Write naturally as if speaking to an audience.</p>
            <p style="font-size: 0.8em; color: #999;">Powered by PodXplainClone &bull; Developed by Nick021402</p>
            <p style="font-size: 0.7em; color: #aaa;">This space runs on CPU hardware for accessibility. For the original project and GPU-powered advanced models, visit the main PodXplain space.</p>
        </div>
        """)

        # Live character counter: a Python callback run on every change of
        # the textbox.
        text_input.change(
            fn=format_char_count,
            inputs=[text_input],
            outputs=[char_count]
        )

        # Main generation function
        generate_btn.click(
            fn=app.generate_podcast,
            inputs=[text_input, speaker_mode],
            outputs=[audio_output, status_output],
            show_progress=True
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it.
    launch_options = {
        # NOTE(review): share=True requests a public Gradio share link;
        # confirm this is wanted wherever the script is deployed.
        "share": True,
        # Bind on all network interfaces, fixed port 7860.
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    create_gradio_interface().launch(**launch_options)