Spaces:

Pixeltable
/

Document-to-Audio-Synthesis

Sleeping

App Files Files Community

PierreBrunelle committed on Oct 23, 2024

Commit

45c1057

verified ·

1 Parent(s): 7024de8

Create interface.py

Browse files

Files changed (1) hide show

interface.py +139 -0

interface.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import gradio as gr
+from .processor import process_document
+def create_interface():
+    with gr.Blocks(theme=gr.themes.Base()) as demo:
+        gr.HTML(
+            """
+            <div style="margin-bottom: 1rem;">
+                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
+                     alt="Pixeltable" style="max-width: 150px;" />
+                <h1>Document to Audio Synthesis</h1>
+            </div>
+            """
+        )
+        with gr.Row():
+            with gr.Column():
+                with gr.Accordion("What does it do?", open=True):
+                    gr.Markdown("""
+                        - PDF document processing and text extraction
+                        - Intelligent content transformation and summarization
+                        - High-quality audio synthesis with voice selection
+                        - Configurable processing parameters
+                        - Downloadable audio output
+                    """)
+            with gr.Column():
+                with gr.Accordion("How does it work?", open=True):
+                    gr.Markdown("""
+                        1. **Document Processing**
+                           - Chunks document using token-based segmentation
+                           - Maintains document structure and context
+                        2. **Content Processing**
+                           - Transforms text using LLM optimization
+                           - Generates optimized audio scripts
+                        3. **Audio Synthesis**
+                           - Converts scripts to natural speech
+                           - Multiple voice models available
+                    """)
+        with gr.Row():
+            with gr.Column():
+                api_key = gr.Textbox(
+                    label="OpenAI API Key",
+                    placeholder="sk-...",
+                    type="password"
+                )
+                file_input = gr.File(
+                    label="Input Document (PDF)",
+                    file_types=[".pdf"]
+                )
+                with gr.Accordion("Synthesis Parameters", open=True):
+                    voice_select = gr.Radio(
+                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
+                        value="onyx",
+                        label="Voice Model",
+                        info="TTS voice model selection"
+                    )
+                    style_select = gr.Radio(
+                        choices=["Technical", "Narrative", "Instructional", "Descriptive"],
+                        value="Technical",
+                        label="Processing Style",
+                        info="Content processing approach"
+                    )
+                with gr.Accordion("Processing Parameters", open=False):
+                    chunk_size = gr.Slider(
+                        minimum=100, maximum=1000, value=300, step=50,
+                        label="Chunk Size (tokens)",
+                        info="Text segmentation size"
+                    )
+                    temperature = gr.Slider(
+                        minimum=0, maximum=1, value=0.7, step=0.1,
+                        label="Temperature",
+                        info="LLM randomness factor"
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=100, maximum=1000, value=300, step=50,
+                        label="Max Tokens",
+                        info="Maximum output token limit"
+                    )
+                process_btn = gr.Button("Process Document", variant="primary")
+                status_output = gr.Textbox(label="Status")
+        with gr.Tabs():
+            with gr.TabItem("Content Processing"):
+                output_table = gr.Dataframe(
+                    headers=["Segment", "Processed Content", "Audio Script"],
+                    wrap=True
+                )
+            with gr.TabItem("Audio Output"):
+                audio_output = gr.Audio(
+                    label="Synthesized Audio",
+                    type="filepath",
+                    show_download_button=True
+                )
+        gr.Markdown("""
+            ### Technical Notes
+            - Token limit affects processing speed and memory usage
+            - Temperature values > 0.8 may introduce content variations
+            - Audio synthesis has a 4096 token limit per segment
+            ### Performance Considerations
+            - Chunk size directly impacts processing time
+            - Higher temperatures increase LLM compute time
+            - Audio synthesis scales with script length
+        """)
+        gr.HTML(
+            """
+            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
+                <p style="margin: 0; color: #666; font-size: 0.8em;">
+                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
+                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
+                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
+                </p>
+            </div>
+            """
+        )
+        def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
+            return process_document(
+                pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
+            )
+        process_btn.click(
+            update_interface,
+            inputs=[
+                file_input, api_key, voice_select, style_select,
+                chunk_size, temperature, max_tokens
+            ],
+            outputs=[output_table, audio_output, status_output]
+        )
+    return demo