import gradio as gr

from .processor import process_document


def create_interface():
    """Assemble the document-to-audio Gradio interface.

    The layout consists of a title banner, two explanatory accordions,
    an input column (API key, PDF upload, synthesis and processing
    parameters, run button, status box, result tabs), a notes section
    and a footer.  Clicking the run button forwards the current input
    values to ``process_document`` and routes its result to the results
    table, the audio player and the status box.

    Returns:
        The constructed ``gr.Blocks`` application.  It is not launched
        here.
    """
    # Static page content, kept apart from the layout code below.
    # NOTE(review): the indented Markdown literals assume gr.Markdown
    # strips common leading whitespace -- confirm for the pinned Gradio
    # version.
    title_html = "\n\nDocument to Audio Synthesis\n\n"
    footer_html = "\n\n"

    features_md = """
        - PDF document processing and text extraction
        - Intelligent content transformation and summarization
        - High-quality audio synthesis with voice selection
        - Configurable processing parameters
        - Downloadable audio output
        """

    pipeline_md = """
        1. **Document Processing**
           - Chunks document using token-based segmentation
           - Maintains document structure and context
        2. **Content Processing**
           - Transforms text using LLM optimization
           - Generates optimized audio scripts
        3. **Audio Synthesis**
           - Converts scripts to natural speech
           - Multiple voice models available
        """

    notes_md = """
        ### Technical Notes
        - Token limit affects processing speed and memory usage
        - Temperature values > 0.8 may introduce content variations
        - Audio synthesis has a 4096 token limit per segment

        ### Performance Considerations
        - Chunk size directly impacts processing time
        - Higher temperatures increase LLM compute time
        - Audio synthesis scales with script length
        """

    voice_choices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    style_choices = ["Technical", "Narrative", "Instructional", "Descriptive"]

    with gr.Blocks(theme=gr.themes.Base()) as demo:
        gr.HTML(title_html)

        # Side-by-side explanatory panels.
        with gr.Row():
            with gr.Column(), gr.Accordion("What does it do?", open=True):
                gr.Markdown(features_md)
            with gr.Column(), gr.Accordion("How does it work?", open=True):
                gr.Markdown(pipeline_md)

        # Inputs, parameters, trigger and results.
        with gr.Row(), gr.Column():
            key_box = gr.Textbox(
                label="OpenAI API Key",
                placeholder="sk-...",
                type="password",
            )
            pdf_upload = gr.File(
                label="Input Document (PDF)",
                file_types=[".pdf"],
            )

            with gr.Accordion("Synthesis Parameters", open=True):
                voice_radio = gr.Radio(
                    choices=voice_choices,
                    value="onyx",
                    label="Voice Model",
                    info="TTS voice model selection",
                )
                style_radio = gr.Radio(
                    choices=style_choices,
                    value="Technical",
                    label="Processing Style",
                    info="Content processing approach",
                )

            with gr.Accordion("Processing Parameters", open=False):
                chunk_slider = gr.Slider(
                    minimum=100,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="Chunk Size (tokens)",
                    info="Text segmentation size",
                )
                temperature_slider = gr.Slider(
                    minimum=0,
                    maximum=1,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="LLM randomness factor",
                )
                token_slider = gr.Slider(
                    minimum=100,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="Max Tokens",
                    info="Maximum output token limit",
                )

            run_button = gr.Button("Process Document", variant="primary")
            status_box = gr.Textbox(label="Status")

            with gr.Tabs():
                with gr.TabItem("Content Processing"):
                    results_table = gr.Dataframe(
                        headers=["Segment", "Processed Content", "Audio Script"],
                        wrap=True,
                    )
                with gr.TabItem("Audio Output"):
                    audio_player = gr.Audio(
                        label="Synthesized Audio",
                        type="filepath",
                        show_download_button=True,
                    )

        gr.Markdown(notes_md)
        gr.HTML(footer_html)

        # The callback's name and parameter names are deliberately left
        # as they were: Gradio may expose them through the app's API.
        def update_interface(pdf_file, api_key, voice, style, chunk_size,
                             temperature, max_tokens):
            """Pass the submitted values straight through to process_document."""
            return process_document(
                pdf_file,
                api_key,
                voice,
                style,
                chunk_size,
                temperature,
                max_tokens,
            )

        # Input order must match update_interface's positional parameters;
        # output order must match the values process_document returns.
        click_inputs = [
            pdf_upload,
            key_box,
            voice_radio,
            style_radio,
            chunk_slider,
            temperature_slider,
            token_slider,
        ]
        click_outputs = [results_table, audio_player, status_box]
        run_button.click(
            update_interface,
            inputs=click_inputs,
            outputs=click_outputs,
        )

    return demo