Spaces:
Running
Running
# app.py - Main Gradio application | |
import asyncio
import logging
import os
import shutil
import tempfile
from datetime import datetime
from pathlib import Path
from typing import Generator, List, Optional, Tuple

import gradio as gr

# Import our custom modules
from audio_utils import AudioProcessor
from segmenter import TextSegmenter
from tts_engine import CPUMultiSpeakerTTS  # Updated class name
# Configure logging | |
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
class PodXplainApp:
    """Turn a text script into a single multi-speaker MP3 file.

    The class wires together three collaborators: a ``TextSegmenter`` that
    splits the script into ``(speaker, text)`` segments, a
    ``CPUMultiSpeakerTTS`` engine that renders each segment to a WAV file,
    and an ``AudioProcessor`` that merges the WAV files into one MP3.
    """

    # Upper bound on the accepted script length, in characters.
    MAX_TEXT_LENGTH = 50000

    # Rough speaking rate used for the duration shown in the status message.
    CHARS_PER_MINUTE = 1000

    def __init__(self):
        self.segmenter = TextSegmenter()
        self.tts_engine = CPUMultiSpeakerTTS()
        self.audio_processor = AudioProcessor()
        # Working directory of the most recent run; None until a run starts.
        self.temp_dir = None

    def create_temp_directory(self) -> str:
        """Create a fresh working directory, discarding the previous one.

        Returns:
            Path of the newly created directory (also stored in
            ``self.temp_dir``).
        """
        self.cleanup_temp_directory()
        self.temp_dir = tempfile.mkdtemp(prefix="podxplain_")
        return self.temp_dir

    def cleanup_temp_directory(self):
        """Delete the current working directory, if any, and forget it."""
        if self.temp_dir and os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)
        self.temp_dir = None

    def generate_podcast(
        self,
        text: str,
        speaker_detection_mode: str = "auto",
        progress=gr.Progress()
    ) -> Tuple[Optional[str], str]:
        """Convert a script to podcast audio.

        Args:
            text: Input script; at most ``MAX_TEXT_LENGTH`` characters.
            speaker_detection_mode: Passed to the segmenter as ``mode``.
            progress: Gradio progress tracker, called with a fraction and a
                description at each stage.

        Returns:
            ``(audio_path, status_message)``. ``audio_path`` is the merged
            MP3 on success and ``None`` on any failure; ``status_message``
            always describes the outcome. Exceptions are caught, logged and
            reported through ``status_message`` rather than raised.
        """
        segment_paths: List[str] = []
        final_audio_path = None
        try:
            # Validate input
            if not text or not text.strip():
                return None, "β Please provide some text to convert."
            if len(text) > self.MAX_TEXT_LENGTH:
                return None, (
                    f"β Text too long ({len(text)} chars). "
                    f"Maximum is {self.MAX_TEXT_LENGTH:,} characters."
                )

            temp_dir = self.create_temp_directory()
            progress(0, desc="π Starting podcast generation...")

            # Step 1: Segment text and assign speakers
            progress(0.1, desc="π Analyzing text and assigning speakers...")
            segments = self.segmenter.segment_and_assign_speakers(
                text, mode=speaker_detection_mode
            )
            if not segments:
                return None, "β Could not process the text. Please check the input."
            logger.info(f"Generated {len(segments)} segments")

            # Step 2: Generate audio for each segment
            progress(0.2, desc="π€ Generating audio segments...")
            segment_count = len(segments)
            for i, (speaker, segment_text) in enumerate(segments):
                # Synthesis occupies the 0.2 - 0.9 span of the progress bar.
                progress(
                    0.2 + (0.7 * i / segment_count),
                    desc=f"π΅ Processing segment {i+1}/{segment_count} (Speaker {speaker})"
                )
                audio_path = self.tts_engine.synthesize_segment(
                    segment_text,
                    speaker,
                    os.path.join(temp_dir, f"segment_{i:03d}.wav")
                )
                if audio_path:
                    segment_paths.append(audio_path)
                else:
                    # A failed segment is skipped, not fatal.
                    logger.warning(f"Failed to generate audio for segment {i}")
            if not segment_paths:
                return None, "β Failed to generate any audio segments."

            # Step 3: Merge audio files and convert to MP3
            progress(0.9, desc="π§ Merging segments and converting to MP3...")
            final_audio_path = self.audio_processor.merge_and_convert_to_mp3(
                segment_paths,
                os.path.join(temp_dir, "podcast_output.mp3")
            )
            if not final_audio_path:
                return None, "β Failed to merge audio segments."
            progress(1.0, desc="β Podcast generated successfully!")

            status_message = self._format_status(
                total_segments=segment_count,
                speakers_used=len({speaker for speaker, _ in segments}),
                # Already in minutes: do not multiply by 60.
                duration_minutes=len(text) / self.CHARS_PER_MINUTE,
                char_count=len(text),
            )
            return final_audio_path, status_message
        except Exception as e:
            # Top-level boundary for the UI callback: keep the traceback in
            # the log and show a short message to the user.
            logger.exception("Error generating podcast: %s", e)
            return None, f"β Error: {str(e)}"
        finally:
            # Drop the per-segment files; the final MP3 is kept for download.
            self._remove_segment_files(segment_paths, keep=final_audio_path)

    @staticmethod
    def _format_status(total_segments, speakers_used, duration_minutes, char_count):
        """Build the Markdown summary shown after a successful run."""
        return "\n".join([
            "",
            "β **Podcast Generated Successfully!**",
            "π **Statistics:**",
            f"- Total segments: {total_segments}",
            f"- Speakers used: {speakers_used}",
            f"- Estimated duration: {duration_minutes:.1f} minutes",
            f"- Character count: {char_count:,}",
            "π§ **Your podcast is ready for download!**",
            "",
        ])

    def _remove_segment_files(self, segment_paths, keep=None):
        """Best-effort removal of intermediate segment files.

        Only files located directly inside ``self.temp_dir`` are deleted, and
        ``keep`` (the final output) is never touched. Failures are logged and
        otherwise ignored so cleanup can never mask the real result.
        """
        if not self.temp_dir:
            return
        for path in segment_paths:
            try:
                candidate = os.path.abspath(path)
                if keep and candidate == os.path.abspath(keep):
                    continue
                if os.path.dirname(candidate) != os.path.abspath(self.temp_dir):
                    continue
                os.remove(candidate)
            except (OSError, TypeError, ValueError) as err:
                logger.warning("Could not remove intermediate file %r: %s", path, err)
def create_gradio_interface():
    """Build the Gradio Blocks UI and wire it to a ``PodXplainApp``.

    The layout has a header, an input column (script textbox, live character
    counter, speaker-mode selector, generate button), an output column
    (status text and audio player) and a footer with usage instructions.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    app = PodXplainApp()

    # Custom CSS for better styling
    css = """
    .main-container {
        max-width: 1200px;
        margin: 0 auto;
    }
    .header {
        text-align: center;
        padding: 20px;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        border-radius: 10px;
        margin-bottom: 20px;
    }
    .footer {
        text-align: center;
        padding: 20px;
        color: #666;
        font-size: 0.9em;
    }
    """

    def format_char_count(text):
        """Return the counter label for the current textbox content."""
        return f"Characters: {len(text) if text else 0:,} / 50,000"

    with gr.Blocks(css=css, title="PodXplainClone - CPU Podcast Generator") as interface:
        # Header. This is raw HTML, so emphasis uses <strong>; Markdown
        # asterisks would be shown literally.
        gr.HTML("""
        <div class="header">
        <h1>ποΈ PodXplainClone</h1>
        <p><em>From script to story β voice it like never before, even on CPU.</em></p>
        <p style="font-size: 0.9em; margin-top: 10px;">
        This space allows you to transform written dialogue into natural-sounding multi-speaker audio, optimized for CPU hardware.
        It serves as a <strong>CPU-friendly alternative and development sandbox</strong> while the main PodXplain project awaits GPU resources for more advanced models.
        </p>
        </div>
        """)

        with gr.Row():
            with gr.Column(scale=2):
                # Input section
                gr.Markdown("## π Input Your Script")
                text_input = gr.Textbox(
                    label="Podcast Script",
                    placeholder="Enter your podcast script here (up to 50,000 characters).\n\nTip: Use paragraph breaks to help with speaker detection.",
                    lines=15,
                    max_lines=20,
                    show_label=True
                )
                char_count = gr.HTML("Characters: 0 / 50,000")

                # Options
                speaker_mode = gr.Radio(
                    choices=["auto", "paragraph", "dialogue"],
                    value="auto",
                    label="Speaker Detection Mode",
                    info="How to detect when speakers change"
                )
                generate_btn = gr.Button(
                    "π€ Generate Podcast",
                    variant="primary",
                    size="lg"
                )

            with gr.Column(scale=1):
                # Output section
                gr.Markdown("## π§ Your Podcast")
                status_output = gr.Markdown("Ready to generate your podcast!")
                audio_output = gr.Audio(
                    label="Generated Podcast",
                    show_download_button=True,
                    interactive=False
                )

        # Footer with instructions
        gr.HTML("""
        <div class="footer">
        <h3>π How to Use PodXplainClone</h3>
        <ol>
        <li><strong>Write your script:</strong> Enter up to 50,000 characters of text</li>
        <li><strong>Choose speaker mode:</strong> Auto-detect, paragraph-based, or dialogue-based</li>
        <li><strong>Generate:</strong> Click the button and wait for processing</li>
        <li><strong>Listen &amp; Download:</strong> Your MP3 podcast will be ready!</li>
        </ol>
        <p><strong>π‘ Tips:</strong> Use clear paragraph breaks for better speaker detection.
        Write naturally as if speaking to an audience.</p>
        <p style="font-size: 0.8em; color: #999;">Powered by PodXplainClone • Developed by Nick021402</p>
        <p style="font-size: 0.7em; color: #aaa;">This space runs on CPU hardware for accessibility. For the original project and GPU-powered advanced models, visit the main PodXplain space.</p>
        </div>
        """)

        # Character counter: a Python callback run on every textbox change.
        text_input.change(
            fn=format_char_count,
            inputs=[text_input],
            outputs=[char_count]
        )

        # Main generation function
        generate_btn.click(
            fn=app.generate_podcast,
            inputs=[text_input, speaker_mode],
            outputs=[audio_output, status_output],
            show_progress=True
        )

    return interface
if __name__ == "__main__":
    # Script entry point: build the UI, then start the server with the
    # launch settings gathered below.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    interface = create_gradio_interface()
    interface.launch(**launch_options)