|
import gradio as gr |
|
from .processor import process_document |
|
|
|
def create_interface():
    """Build the Gradio Blocks app for the document-to-audio workflow.

    The page is laid out top to bottom as: a logo/title banner, two
    explanatory accordions, the input controls (API key, PDF upload,
    synthesis and processing parameters), result tabs for the processed
    content and the synthesized audio, and finally static notes and a
    footer. Clicking "Process Document" forwards the current control values
    to ``process_document`` and routes its return value to the results
    table, the audio player and the status box.

    Returns:
        The assembled ``gr.Blocks`` instance; it is not launched here.
    """
    # NOTE(review): the reviewed copy of this function had lost its
    # indentation, so the container nesting below (in particular which
    # components sit inside the second Row/Column) and the leading
    # whitespace inside the multi-line strings are a reconstruction --
    # confirm against the rendered layout.
    voice_options = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    style_options = ["Technical", "Narrative", "Instructional", "Descriptive"]
    base_theme = gr.themes.Base()

    # Thin pass-through to process_document; the positional order here must
    # stay in step with the `controls` list wired to the button below.
    def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
        return process_document(
            pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
        )

    with gr.Blocks(theme=base_theme) as demo:
        # --- Banner: logo and page title ---
        gr.HTML(
            value="""
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>Document to Audio Synthesis</h1>
            </div>
            """
        )

        # --- Two side-by-side explanatory panels ---
        with gr.Row():
            with gr.Column():
                with gr.Accordion(label="What does it do?", open=True):
                    gr.Markdown(
                        value="""
                        - PDF document processing and text extraction
                        - Intelligent content transformation and summarization
                        - High-quality audio synthesis with voice selection
                        - Configurable processing parameters
                        - Downloadable audio output
                        """
                    )
            with gr.Column():
                with gr.Accordion(label="How does it work?", open=True):
                    gr.Markdown(
                        value="""
                        1. **Document Processing**
                           - Chunks document using token-based segmentation
                           - Maintains document structure and context

                        2. **Content Processing**
                           - Transforms text using LLM optimization
                           - Generates optimized audio scripts

                        3. **Audio Synthesis**
                           - Converts scripts to natural speech
                           - Multiple voice models available
                        """
                    )

        # --- Inputs and results ---
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                )
                file_input = gr.File(
                    label="Input Document (PDF)",
                    file_types=[".pdf"],
                )

                with gr.Accordion(label="Synthesis Parameters", open=True):
                    voice_select = gr.Radio(
                        choices=voice_options,
                        value="onyx",
                        label="Voice Model",
                        info="TTS voice model selection",
                    )
                    style_select = gr.Radio(
                        choices=style_options,
                        value="Technical",
                        label="Processing Style",
                        info="Content processing approach",
                    )

                with gr.Accordion(label="Processing Parameters", open=False):
                    chunk_size = gr.Slider(
                        minimum=100,
                        maximum=1000,
                        value=300,
                        step=50,
                        label="Chunk Size (tokens)",
                        info="Text segmentation size",
                    )
                    temperature = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                        info="LLM randomness factor",
                    )
                    max_tokens = gr.Slider(
                        minimum=100,
                        maximum=1000,
                        value=300,
                        step=50,
                        label="Max Tokens",
                        info="Maximum output token limit",
                    )

                process_btn = gr.Button(value="Process Document", variant="primary")
                status_output = gr.Textbox(label="Status")

            with gr.Tabs():
                with gr.TabItem(label="Content Processing"):
                    output_table = gr.Dataframe(
                        headers=["Segment", "Processed Content", "Audio Script"],
                        wrap=True,
                    )
                with gr.TabItem(label="Audio Output"):
                    audio_output = gr.Audio(
                        label="Synthesized Audio",
                        type="filepath",
                        show_download_button=True,
                    )

        # --- Static usage notes ---
        gr.Markdown(
            value="""
            ### Technical Notes
            - Token limit affects processing speed and memory usage
            - Temperature values > 0.8 may introduce content variations
            - Audio synthesis has a 4096 token limit per segment

            ### Performance Considerations
            - Chunk size directly impacts processing time
            - Higher temperatures increase LLM compute time
            - Audio synthesis scales with script length
            """
        )

        # --- Footer links ---
        gr.HTML(
            value="""
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                </p>
            </div>
            """
        )

        # Component values are handed to the callback positionally, in this
        # order; the callback's three results fill `results` in order.
        controls = [
            file_input,
            api_key,
            voice_select,
            style_select,
            chunk_size,
            temperature,
            max_tokens,
        ]
        results = [output_table, audio_output, status_output]
        process_btn.click(fn=update_interface, inputs=controls, outputs=results)

    return demo