File size: 6,073 Bytes
45c1057
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import gradio as gr
from .processor import process_document

def create_interface():
    """Build the Gradio Blocks UI for the document-to-audio demo.

    The layout is assembled top to bottom: a header, two explanatory
    accordions, the input column (API key, PDF upload, synthesis and
    processing parameters, the action button and a status box), the result
    tabs, the technical notes and a footer.  Clicking "Process Document"
    forwards the current input values to ``process_document`` and routes
    its results to the table, the audio player and the status box.

    Returns:
        The constructed ``gr.Blocks`` app.  It is not launched here; the
        caller decides how to serve it.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # --- Header -------------------------------------------------------
        gr.HTML(
            """
            <div style="margin-bottom: 1rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" 
                     alt="Pixeltable" style="max-width: 150px;" />
                <h1>Document to Audio Synthesis</h1>
            </div>
            """
        )

        # --- Explanatory accordions (side by side) -------------------------
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown("""
                        - PDF document processing and text extraction
                        - Intelligent content transformation and summarization
                        - High-quality audio synthesis with voice selection
                        - Configurable processing parameters
                        - Downloadable audio output
                    """)
            with gr.Column():
                with gr.Accordion("How does it work?", open=True):
                    gr.Markdown("""
                        1. **Document Processing**
                           - Chunks document using token-based segmentation
                           - Maintains document structure and context
                        
                        2. **Content Processing**
                           - Transforms text using LLM optimization
                           - Generates optimized audio scripts
                        
                        3. **Audio Synthesis**
                           - Converts scripts to natural speech
                           - Multiple voice models available
                    """)

        # --- Inputs ---------------------------------------------------------
        with gr.Row():
            with gr.Column():
                # type="password" masks the key in the browser.
                api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password"
                )
                file_input = gr.File(
                    label="Input Document (PDF)",
                    file_types=[".pdf"]
                )

                with gr.Accordion("Synthesis Parameters", open=True):
                    voice_select = gr.Radio(
                        choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
                        value="onyx",
                        label="Voice Model",
                        info="TTS voice model selection"
                    )
                    style_select = gr.Radio(
                        choices=["Technical", "Narrative", "Instructional", "Descriptive"],
                        value="Technical",
                        label="Processing Style",
                        info="Content processing approach"
                    )

                # Advanced knobs start collapsed (open=False).
                with gr.Accordion("Processing Parameters", open=False):
                    chunk_size = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="Chunk Size (tokens)",
                        info="Text segmentation size"
                    )
                    temperature = gr.Slider(
                        minimum=0, maximum=1, value=0.7, step=0.1,
                        label="Temperature",
                        info="LLM randomness factor"
                    )
                    max_tokens = gr.Slider(
                        minimum=100, maximum=1000, value=300, step=50,
                        label="Max Tokens",
                        info="Maximum output token limit"
                    )

                process_btn = gr.Button("Process Document", variant="primary")
                status_output = gr.Textbox(label="Status")

        # --- Outputs --------------------------------------------------------
        with gr.Tabs():
            with gr.TabItem("Content Processing"):
                output_table = gr.Dataframe(
                    headers=["Segment", "Processed Content", "Audio Script"],
                    wrap=True
                )
            with gr.TabItem("Audio Output"):
                audio_output = gr.Audio(
                    label="Synthesized Audio",
                    type="filepath",
                    show_download_button=True
                )

        # NOTE(review): assumes the TTS backend is OpenAI's speech endpoint
        # (the key label and voice names suggest so); its input limit is
        # 4096 characters, not tokens, so the note below says "character".
        gr.Markdown("""
            ### Technical Notes
            - Token limit affects processing speed and memory usage
            - Temperature values > 0.8 may introduce content variations
            - Audio synthesis has a 4096 character limit per segment
            
            ### Performance Considerations
            - Chunk size directly impacts processing time
            - Higher temperatures increase LLM compute time
            - Audio synthesis scales with script length
        """)

        # --- Footer ---------------------------------------------------------
        gr.HTML(
            """
            <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
                <p style="margin: 0; color: #666; font-size: 0.8em;">
                    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
                    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
                    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
                </p>
            </div>
            """
        )

        def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
            """Forward the current UI values to ``process_document``.

            The parameter order must stay in sync with the ``inputs`` list
            of the click handler below.  The return value is passed through
            unchanged; it is presumably a 3-item result matching the
            ``outputs`` list (table, audio, status) -- confirm against
            ``process_document``.
            """
            return process_document(
                pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
            )

        # Wire the button: component values are supplied to the callback in
        # the order listed in `inputs`, and its results fill `outputs`.
        process_btn.click(
            update_interface,
            inputs=[
                file_input, api_key, voice_select, style_select,
                chunk_size, temperature, max_tokens
            ],
            outputs=[output_table, audio_output, status_output]
        )

    return demo