Spaces:

Pixeltable
/

Document-to-Audio-Synthesis

Sleeping

File size: 6,073 Bytes

45c1057

import gradio as gr
from .processor import process_document

def create_interface():
    """Build the Gradio Blocks UI for the document-to-audio demo.

    The layout consists of a branded header, two explanatory accordions,
    the input controls (API key, PDF upload, synthesis and processing
    parameters), a tabbed results area, technical notes, and a footer.
    The "Process Document" button is wired to ``process_document``.

    Returns:
        The assembled ``gr.Blocks`` app. It is not launched here.
    """
    # Option lists for the radio groups and the results table header.
    voice_choices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    style_choices = ["Technical", "Narrative", "Instructional", "Descriptive"]
    result_headers = ["Segment", "Processed Content", "Audio Script"]

    def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
        # Forwards the UI values to process_document unchanged and in the
        # same order. Its result feeds three output components, so it is
        # presumably a 3-item return -- confirm against the processor module.
        result = process_document(
            pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
        )
        return result

    base_theme = gr.themes.Base()

    with gr.Blocks(theme=base_theme) as demo:
        # --- Header -------------------------------------------------------
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" 
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>Document to Audio Synthesis</h1>
            </div>
            """
        )

        # --- Explanatory accordions (side by side) ------------------------
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown("""
                        - PDF document processing and text extraction
                        - Intelligent content transformation and summarization
                        - High-quality audio synthesis with voice selection
                        - Configurable processing parameters
                        - Downloadable audio output
                    """)
            with gr.Column():
                with gr.Accordion("How does it work?", open=True):
                    gr.Markdown("""
                        1. **Document Processing**
                           - Chunks document using token-based segmentation
                           - Maintains document structure and context
                        
                        2. **Content Processing**
                           - Transforms text using LLM optimization
                           - Generates optimized audio scripts
                        
                        3. **Audio Synthesis**
                           - Converts scripts to natural speech
                           - Multiple voice models available
                    """)

        # --- Input controls -----------------------------------------------
        # The row holds a single column, so both containers open together.
        with gr.Row(), gr.Column():
            api_key_box = gr.Textbox(
                label="OpenAI API Key",
                placeholder="sk-...",
                type="password",
            )
            pdf_upload = gr.File(
                label="Input Document (PDF)",
                file_types=[".pdf"],
            )

            with gr.Accordion("Synthesis Parameters", open=True):
                voice_radio = gr.Radio(
                    choices=voice_choices,
                    value="onyx",
                    label="Voice Model",
                    info="TTS voice model selection",
                )
                style_radio = gr.Radio(
                    choices=style_choices,
                    value="Technical",
                    label="Processing Style",
                    info="Content processing approach",
                )

            with gr.Accordion("Processing Parameters", open=False):
                chunk_slider = gr.Slider(
                    minimum=100,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="Chunk Size (tokens)",
                    info="Text segmentation size",
                )
                temperature_slider = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="LLM randomness factor",
                )
                max_tokens_slider = gr.Slider(
                    minimum=100,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="Max Tokens",
                    info="Maximum output token limit",
                )

            run_button = gr.Button("Process Document", variant="primary")
            status_box = gr.Textbox(label="Status")

        # --- Results ------------------------------------------------------
        with gr.Tabs():
            with gr.TabItem("Content Processing"):
                segments_table = gr.Dataframe(headers=result_headers, wrap=True)
            with gr.TabItem("Audio Output"):
                audio_player = gr.Audio(
                    label="Synthesized Audio",
                    type="filepath",
                    show_download_button=True,
                )

        # --- Notes and footer ---------------------------------------------
        gr.Markdown("""
            ### Technical Notes
            - Token limit affects processing speed and memory usage
            - Temperature values > 0.8 may introduce content variations
            - Audio synthesis has a 4096 token limit per segment
            
            ### Performance Considerations
            - Chunk size directly impacts processing time
            - Higher temperatures increase LLM compute time
            - Audio synthesis scales with script length
        """)

        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                </p>
            </div>
            """
        )

        # --- Event wiring -------------------------------------------------
        # Order must match the positional parameters of update_interface.
        click_inputs = [
            pdf_upload,
            api_key_box,
            voice_radio,
            style_radio,
            chunk_slider,
            temperature_slider,
            max_tokens_slider,
        ]
        click_outputs = [segments_table, audio_player, status_box]

        run_button.click(
            update_interface,
            inputs=click_inputs,
            outputs=click_outputs,
        )

    return demo