# NOTE: The following lines are Hugging Face file-viewer page metadata
# (author, commit message, commit hash, file size) that were captured along
# with the raw file; kept as comments so the module parses as valid Python.
# PierreBrunelle's picture
# Update src/interface.py
# f0a88ca verified
# raw
# history blame
# 8.95 kB
import gradio as gr
from .processor import process_document
# Preset configuration for each audio-synthesis output mode, keyed by the
# mode name shown in the "Output Mode" radio. Each entry provides:
#   description    - blurb rendered under the mode selector in the UI
#   styles         - style choices offered for the mode (first one is the
#                    default selection)
#   default_temp   - initial value for the temperature slider
#   default_chunks - initial value for the chunk-size slider
#   system_prompt  - instruction string forwarded to process_document as
#                    the LLM system prompt for rewriting the document text
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration.
Format:
- Clear sentence structures
- Natural pauses (...)
- Term definitions when needed
- Proper transitions"""
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech."""
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format."""
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story."""
    }
}
def create_interface() -> "gr.Blocks":
    """Build and return the Gradio Blocks app for document-to-audio synthesis.

    Layout (top to bottom): header HTML, two overview accordions, an input
    row (OpenAI API key + PDF upload), mode selection with a live
    description, voice/style and processing-parameter controls, a process
    button with a status box, tabbed outputs (content table + audio player),
    and a footer. Two callbacks are wired up: mode changes re-seed the
    style/chunk/temperature controls from ``SYNTHESIS_MODES``, and the
    process button forwards everything to ``process_document``.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` demo.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header: project logo + page title.
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
            <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
            alt="Pixeltable" style="max-width: 150px;" />
            <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )
        # Overview Row: two side-by-side accordions describing the app.
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                    - 📄 Document processing - 🧠 Content transformation
                    - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                    1. 📑 **Processing:** Token-based segmentation
                    2. 🔍 **Analysis:** LLM optimization & scripts
                    3. 🎵 **Synthesis:** Multiple voice options
                    """)
        # NOTE(review): this State holds the initial mode config but is never
        # read or updated by the callbacks below (they look modes up by name
        # in SYNTHESIS_MODES instead) — appears to be dead state; confirm.
        synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])
        # Main Input Row
        with gr.Row():
            # Left Column - Core Inputs (API key masked as a password field,
            # file input restricted to PDFs).
            with gr.Column(scale=1):
                with gr.Row():
                    api_key = gr.Textbox(
                        label="🔑 OpenAI API Key",
                        placeholder="sk-...",
                        type="password",
                        scale=2
                    )
                    file_input = gr.File(
                        label="📁 Input PDF",
                        file_types=[".pdf"],
                        scale=1
                    )
            # Right Column - Mode Selection; the Markdown below it mirrors
            # the selected mode's description (kept in sync by update_mode).
            with gr.Column(scale=1):
                mode_select = gr.Radio(
                    choices=list(SYNTHESIS_MODES.keys()),
                    value="narration",
                    label="🎭 Output Mode",
                    info="Select output type"
                )
                mode_description = gr.Markdown(
                    SYNTHESIS_MODES["narration"]["description"],
                    elem_classes=["mode-description"]
                )
        # Parameters Row
        # NOTE(review): gr.Box was removed in Gradio 4.x (replaced by
        # gr.Group) — confirm the pinned gradio version still provides it.
        with gr.Row():
            # Voice and Style Column
            with gr.Column():
                with gr.Box():
                    gr.Markdown("### 🎛️ Voice & Style")
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    # Style choices are mode-dependent; update_mode swaps
                    # them whenever the mode radio changes.
                    style_select = gr.Radio(
                        choices=SYNTHESIS_MODES["narration"]["styles"],
                        value=SYNTHESIS_MODES["narration"]["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )
            # Processing Parameters Column: chunk size and temperature are
            # seeded from the mode's defaults; max_tokens starts fixed at 300.
            with gr.Column():
                with gr.Box():
                    gr.Markdown("### ⚙️ Processing")
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=SYNTHESIS_MODES["narration"]["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=SYNTHESIS_MODES["narration"]["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Tokens"
                        )
        # Process Button Row
        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)
        # Output Tabs Row: segment/script table in one tab, audio in the other.
        with gr.Tabs():
            with gr.Tab("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.Tab("🎧 Audio"):
                with gr.Row():
                    audio_output = gr.Audio(
                        label="🔊 Output",
                        type="filepath",
                        show_download_button=True
                    )
                    with gr.Column():
                        gr.Markdown("""
                        ### 📚 Quick Tips
                        - 🎯 Lower temperature = more consistent output
                        - 📏 Smaller chunks = more precise control
                        - 🎙️ Try different voices for best results
                        """)
        # Footer
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
            <p style="margin: 0; color: #666; font-size: 0.8em;">
            🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
            | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
            | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
            </p>
            </div>
            """
        )

        def update_mode(mode_name: str):
            """Re-seed mode-dependent controls when the mode radio changes.

            Returns updates for, in order: style choices (reset to the
            mode's first style), chunk-size slider, temperature slider, and
            the description Markdown.
            """
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )

        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            """Resolve the selected mode's system prompt and run the pipeline.

            Thin wrapper around ``process_document``; its return value is
            routed to (output_table, audio_output, status_output), so it is
            expected to yield a 3-tuple — confirm against processor module.
            """
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"]
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo