|
import gradio as gr |
|
from .processor import process_document |
|
|
|
def create_interface():
    """Build the Gradio Blocks UI for the document-to-audio demo.

    The page is laid out top to bottom as: a logo/title banner, two
    explanatory accordions, the input controls (API key, PDF upload,
    synthesis and processing parameters), a button with a status box,
    result tabs (table and audio), technical notes, and a footer.

    Clicking the button forwards the control values to
    ``process_document`` and routes its result to the table, the audio
    player and the status box, in that order.

    Returns:
        The assembled ``gr.Blocks`` object (not launched).
    """
    # Option lists offered by the two radio groups.
    voice_choices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    style_choices = ["Technical", "Narrative", "Instructional", "Descriptive"]
    table_headers = ["🔍 Segment", "📄 Processed Content", "🎭 Audio Script"]

    with gr.Blocks(theme=gr.themes.Base()) as app:
        # --- Banner -------------------------------------------------------
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>📄 Document to Audio Synthesis 🎧</h1>
            </div>
            """
        )

        # --- Explanatory accordions, side by side ---------------------------
        with gr.Row():
            with gr.Column():
                with gr.Accordion("🎯 What does it do?", open=True):
                    gr.Markdown("""
                        - 📄 PDF document processing and text extraction
                        - 🧠 Intelligent content transformation and summarization
                        - 🎧 High-quality audio synthesis with voice selection
                        - ⚙️ Configurable processing parameters
                        - ⬇️ Downloadable audio output
                    """)
            with gr.Column():
                with gr.Accordion("⚡ How does it work?", open=True):
                    gr.Markdown("""
                        1. 📑 **Document Processing**
                           - 📊 Chunks document using token-based segmentation
                           - 🔄 Maintains document structure and context

                        2. 🔍 **Content Processing**
                           - 🤖 Transforms text using LLM optimization
                           - 📝 Generates optimized audio scripts

                        3. 🎵 **Audio Synthesis**
                           - 🗣️ Converts scripts to natural speech
                           - 🎙️ Multiple voice models available
                    """)

        # --- Input controls -------------------------------------------------
        with gr.Row():
            with gr.Column():
                key_box = gr.Textbox(
                    label="🔑 OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                )
                pdf_upload = gr.File(
                    label="📁 Input Document (PDF)",
                    file_types=[".pdf"],
                )

                with gr.Accordion("🎛️ Synthesis Parameters", open=True):
                    voice_radio = gr.Radio(
                        choices=voice_choices,
                        value="onyx",
                        label="🎙️ Voice Model",
                        info="TTS voice model selection",
                    )
                    style_radio = gr.Radio(
                        choices=style_choices,
                        value="Technical",
                        label="💫 Processing Style",
                        info="Content processing approach",
                    )

                with gr.Accordion("⚙️ Processing Parameters", open=False):
                    chunk_slider = gr.Slider(
                        minimum=100,
                        maximum=1000,
                        value=300,
                        step=50,
                        label="📏 Chunk Size (tokens)",
                        info="Text segmentation size",
                    )
                    temperature_slider = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="🌡️ Temperature",
                        info="LLM randomness factor",
                    )
                    token_slider = gr.Slider(
                        minimum=100,
                        maximum=1000,
                        value=300,
                        step=50,
                        label="📊 Max Tokens",
                        info="Maximum output token limit",
                    )

        # --- Trigger and status ----------------------------------------------
        run_button = gr.Button("🚀 Process Document", variant="primary")
        status_box = gr.Textbox(label="📋 Status")

        # --- Results ----------------------------------------------------------
        with gr.Tabs():
            with gr.TabItem("📝 Content Processing"):
                results_table = gr.Dataframe(
                    headers=table_headers,
                    wrap=True,
                )
            with gr.TabItem("🎧 Audio Output"):
                audio_player = gr.Audio(
                    label="🔊 Synthesized Audio",
                    type="filepath",
                    show_download_button=True,
                )

        # --- Notes and footer ---------------------------------------------------
        gr.Markdown("""
            ### 📚 Technical Notes
            - ⚡ Token limit affects processing speed and memory usage
            - 🎯 Temperature values > 0.8 may introduce content variations
            - 🔊 Audio synthesis has a 4096 token limit per segment

            ### ⚙️ Performance Considerations
            - 📊 Chunk size directly impacts processing time
            - 🔄 Higher temperatures increase LLM compute time
            - ⏱️ Audio synthesis scales with script length
        """)

        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
                    | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                </p>
            </div>
            """
        )

        # --- Event wiring ---------------------------------------------------------
        # The handler's name and parameter names are kept as-is: Gradio may
        # surface them in the generated API, so they are part of the interface.
        def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
            """Forward the current control values to ``process_document``."""
            outcome = process_document(
                pdf_file,
                api_key,
                voice,
                style,
                chunk_size,
                temperature,
                max_tokens,
            )
            return outcome

        # Order must match the positional parameters of ``update_interface``.
        handler_inputs = [
            pdf_upload,
            key_box,
            voice_radio,
            style_radio,
            chunk_slider,
            temperature_slider,
            token_slider,
        ]
        # Order in which the handler's returned values are assigned.
        handler_outputs = [results_table, audio_player, status_box]

        run_button.click(
            update_interface,
            inputs=handler_inputs,
            outputs=handler_outputs,
        )

    return app