import gradio as gr

from .processor import process_document
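
# Preset synthesis modes. Each entry defines the UI description, the available
# delivery styles, a default sampling temperature, a default chunk size for
# token-based segmentation, and the system prompt passed to process_document.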
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration."""
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech."""
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format."""
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story."""
    }
}


def create_interface():
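    """Build and return the Gradio Blocks UI for document-to-audio synthesis."""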
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )

        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                    - 📄 Document processing - 🧠 Content transformation
                    - 🎧 Audio synthesis - ⚙️ Multiple output styles
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                    1. 📑 **Processing:** Token-based segmentation
                    2. 🔍 **Analysis:** LLM optimization & scripts
                    3. 🎵 **Synthesis:** Multiple voice options
                    """)
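
        # Per-session state seeded with the default ("narration") mode configuration.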
        synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])
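
        # Core inputs: OpenAI API key, PDF upload, and output mode selection.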
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🔑 Core Settings", open=True):
                    with gr.Row():
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            scale=2
                        )
                        file_input = gr.File(
                            label="PDF Document",
                            file_types=[".pdf"],
                            scale=1
                        )

            with gr.Column():
                with gr.Accordion("🎭 Output Mode", open=True):
                    mode_select = gr.Radio(
                        choices=list(SYNTHESIS_MODES.keys()),
                        value="narration",
                        label="Select Mode",
                        info="Choose output style"
                    )
                    mode_description = gr.Markdown(
                        SYNTHESIS_MODES["narration"]["description"]
                    )
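
        # Voice, delivery style, and generation parameters (chunking, temperature, tokens).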
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎛️ Voice & Style", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="🎙️ Voice",
                        interactive=True
                    )
                    style_select = gr.Radio(
                        choices=SYNTHESIS_MODES["narration"]["styles"],
                        value=SYNTHESIS_MODES["narration"]["styles"][0],
                        label="💫 Style",
                        interactive=True
                    )

            with gr.Column():
                with gr.Accordion("⚙️ Processing Parameters", open=True):
                    with gr.Row():
                        chunk_size = gr.Slider(
                            minimum=100, maximum=1000,
                            value=SYNTHESIS_MODES["narration"]["default_chunks"],
                            step=50,
                            label="📏 Chunk Size"
                        )
                        temperature = gr.Slider(
                            minimum=0, maximum=1,
                            value=SYNTHESIS_MODES["narration"]["default_temp"],
                            step=0.1,
                            label="🌡️ Temperature"
                        )
                        max_tokens = gr.Slider(
                            minimum=100, maximum=1000,
                            value=300,
                            step=50,
                            label="📊 Max Tokens"
                        )

        with gr.Row():
            process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
            status_output = gr.Textbox(label="📋 Status", scale=1)
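
        # Results: the per-segment content/script table and the synthesized audio track.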
        with gr.Tabs():
            with gr.TabItem("📝 Content"):
                output_table = gr.Dataframe(
                    headers=["🔍 Segment", "📄 Content", "🎭 Script"],
                    wrap=True
                )
            with gr.TabItem("🎧 Audio"):
                with gr.Row():
                    with gr.Column(scale=2):
                        audio_output = gr.Audio(
                            label="🔊 Synthesized Audio",
                            type="filepath",
                            show_download_button=True
                        )
                    with gr.Column(scale=1):
                        with gr.Accordion("📚 Quick Tips", open=True):
                            gr.Markdown("""
                            - 🎯 Lower temperature = more consistent
                            - 📏 Smaller chunks = more precise
                            - 🎙️ Try different voices for best fit
                            - 💫 Match style to content type
                            """)

        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
                </p>
            </div>
            """
        )
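
        # Keep the style choices, chunk-size and temperature defaults, and the
        # description text in sync with the selected synthesis mode.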
        def update_mode(mode_name):
            mode = SYNTHESIS_MODES[mode_name]
            return (
                gr.update(choices=mode["styles"], value=mode["styles"][0]),
                gr.update(value=mode["default_chunks"]),
                gr.update(value=mode["default_temp"]),
                mode["description"]
            )

        mode_select.change(
            update_mode,
            inputs=[mode_select],
            outputs=[style_select, chunk_size, temperature, mode_description]
        )
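
        # Look up the selected mode's system prompt and delegate to
        # process_document, which is expected to return values matching the
        # (output_table, audio_output, status_output) components wired below.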
        def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
            mode = SYNTHESIS_MODES[mode_name]
            return process_document(
                pdf_file=pdf_file,
                api_key=api_key,
                voice_choice=voice,
                style_choice=style,
                chunk_size=chunk_size,
                temperature=temperature,
                max_tokens=max_tokens,
                system_prompt=mode["system_prompt"]
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, mode_select, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo
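

# Minimal usage sketch (assuming this file sits next to processor.py inside a
# package; the module path and launch call below are illustrative, not part of
# this file):
#
#     from app.gradio_interface import create_interface
#
#     demo = create_interface()
#     demo.launch()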