import gradio as gr
from .processor import process_document
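
# `process_document` (imported from the sibling processor module) is assumed to
# implement the pipeline described in the UI below: extract text from the PDF,
# chunk it by token count, rewrite each chunk into an audio script with an LLM,
# and synthesize speech for the result.
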
def create_interface():
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>Document to Audio Synthesis</h1>
            </div>
            """
        )
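
        # Overview panels: a feature summary on the left and a step-by-step
        # description of the processing pipeline on the right.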
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown("""
                    - PDF document processing and text extraction
                    - Intelligent content transformation and summarization
                    - High-quality audio synthesis with voice selection
                    - Configurable processing parameters
                    - Downloadable audio output
                    """)
            with gr.Column():
                with gr.Accordion("How does it work?", open=True):
                    gr.Markdown("""
                    1. **Document Processing**
                       - Chunks the document using token-based segmentation
                       - Maintains document structure and context
                    2. **Content Processing**
                       - Transforms the text with an LLM
                       - Generates optimized audio scripts
                    3. **Audio Synthesis**
                       - Converts scripts to natural speech
                       - Multiple voice models available
                    """)
        with gr.Row():
            with gr.Column():
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password"
                )
                file_input = gr.File(
                    label="Input Document (PDF)",
                    file_types=[".pdf"]
                )

                with gr.Accordion("Synthesis Parameters", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="Voice Model",
                        info="TTS voice model selection"
                    )
                    style_select = gr.Radio(
                        choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                        value="Technical",
                        label="Processing Style",
                        info="Content processing approach"
                    )

                with gr.Accordion("Processing Parameters", open=False):
                    chunk_size = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="Chunk Size (tokens)",
                        info="Text segmentation size"
                    )
                    temperature = gr.Slider(
                        minimum=0, maximum=1, value=0.7, step=0.1,
                        label="Temperature",
                        info="LLM randomness factor"
                    )
                    max_tokens = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="Max Tokens",
                        info="Maximum output token limit"
                    )
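
                # Kick off processing; the status box shows the message returned
                # as the third output of `process_document`.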
                process_btn = gr.Button("Process Document", variant="primary")
                status_output = gr.Textbox(label="Status")

        with gr.Tabs():
            with gr.TabItem("Content Processing"):
                output_table = gr.Dataframe(
                    headers=["Segment", "Processed Content", "Audio Script"],
                    wrap=True
                )
            with gr.TabItem("Audio Output"):
                audio_output = gr.Audio(
                    label="Synthesized Audio",
                    type="filepath",
                    show_download_button=True
                )
gr.Markdown("""
### Technical Notes
- Token limit affects processing speed and memory usage
- Temperature values > 0.8 may introduce content variations
- Audio synthesis has a 4096 token limit per segment
### Performance Considerations
- Chunk size directly impacts processing time
- Higher temperatures increase LLM compute time
- Audio synthesis scales with script length
""")
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                </p>
            </div>
            """
        )
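
        # Thin wrapper so the click handler's argument order matches the
        # `inputs` list below. `process_document` must return three values
        # matching `outputs`: rows for the content table, a path to the
        # synthesized audio file, and a status message.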
        def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
            return process_document(
                pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
            )

        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo
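

# ---------------------------------------------------------------------------
# Illustrative sketch only (not called by the app): roughly what the per-chunk
# "Content Processing" and "Audio Synthesis" steps described above could look
# like against the OpenAI SDK directly. The real implementation lives in
# `.processor` and may differ substantially (for example, it may use Pixeltable
# tables and computed columns). The function name, model choice, and prompt
# below are hypothetical.
def _example_synthesize_chunk(chunk: str, api_key: str, voice: str, style: str,
                              temperature: float, max_tokens: int,
                              out_path: str) -> str:
    from openai import OpenAI  # assumed available, since the app targets the OpenAI API

    client = OpenAI(api_key=api_key)

    # Content Processing: rewrite the chunk into an audio-friendly script.
    script = client.chat.completions.create(
        model="gpt-4o-mini",  # hypothetical model choice
        messages=[
            {"role": "system", "content": f"Rewrite the text as a {style.lower()} audio script."},
            {"role": "user", "content": chunk},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    ).choices[0].message.content or ""

    # Audio Synthesis: convert the script to speech; the TTS input is capped at
    # 4096 characters, so truncate defensively.
    speech = client.audio.speech.create(model="tts-1", voice=voice, input=script[:4096])
    speech.stream_to_file(out_path)
    return out_path


# Entry point: this guard is an assumption, since the actual launcher may live
# in a separate module. Because of the relative import above, run this as a
# package module (e.g. `python -m <package>.app`) rather than as a plain script.
if __name__ == "__main__":
    create_interface().launch()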