PierreBrunelle committed on
Commit
45c1057
·
verified ·
1 Parent(s): 7024de8

Create interface.py

Browse files
Files changed (1) hide show
  1. interface.py +139 -0
interface.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from .processor import process_document
3
+
4
def create_interface():
    """Build and return the Gradio Blocks UI for the document-to-audio demo.

    Top to bottom the page holds: a logo/title header, two explanatory
    accordions, the input form (API key, PDF upload, synthesis and
    processing parameters), the "Process Document" button with a status
    box, result tabs (content table and audio player), technical notes,
    and a footer.

    Clicking the button sends the seven form values to
    ``process_document`` and routes its result to the table, the audio
    player and the status box, in that order.

    NOTE(review): this code reached review with its indentation flattened,
    so the nesting of the layout containers below is a reconstruction --
    confirm it against the running app.

    Returns:
        The assembled ``gr.Blocks`` instance; it is not launched here.
    """
    # --- Static page content -------------------------------------------
    header_html = """
    <div style="margin-bottom: 1rem;">
    <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png"
    alt="Pixeltable" style="max-width: 150px;" />
    <h1>Document to Audio Synthesis</h1>
    </div>
    """
    features_md = """
    - PDF document processing and text extraction
    - Intelligent content transformation and summarization
    - High-quality audio synthesis with voice selection
    - Configurable processing parameters
    - Downloadable audio output
    """
    pipeline_md = """
    1. **Document Processing**
    - Chunks document using token-based segmentation
    - Maintains document structure and context

    2. **Content Processing**
    - Transforms text using LLM optimization
    - Generates optimized audio scripts

    3. **Audio Synthesis**
    - Converts scripts to natural speech
    - Multiple voice models available
    """
    notes_md = """
    ### Technical Notes
    - Token limit affects processing speed and memory usage
    - Temperature values > 0.8 may introduce content variations
    - Audio synthesis has a 4096 token limit per segment

    ### Performance Considerations
    - Chunk size directly impacts processing time
    - Higher temperatures increase LLM compute time
    - Audio synthesis scales with script length
    """
    footer_html = """
    <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
    <p style="margin: 0; color: #666; font-size: 0.8em;">
    Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
    | <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
    | <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
    </p>
    </div>
    """

    # --- Form option values --------------------------------------------
    voice_names = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"]
    style_names = ["Technical", "Narrative", "Instructional", "Descriptive"]
    table_columns = ["Segment", "Processed Content", "Audio Script"]
    # The chunk-size and max-token sliders share the same range and default.
    token_range = {"minimum": 100, "maximum": 1000, "value": 300, "step": 50}

    with gr.Blocks(theme=gr.themes.Base()) as app:
        gr.HTML(header_html)

        # Explanatory panels, side by side.
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown(features_md)
            with gr.Column():
                with gr.Accordion("How does it work?", open=True):
                    gr.Markdown(pipeline_md)

        # Input form.
        with gr.Row():
            with gr.Column():
                key_box = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="sk-...",
                    type="password",
                )
                pdf_upload = gr.File(
                    label="Input Document (PDF)",
                    file_types=[".pdf"],
                )

                with gr.Accordion("Synthesis Parameters", open=True):
                    voice_radio = gr.Radio(
                        choices=voice_names,
                        value="onyx",
                        label="Voice Model",
                        info="TTS voice model selection",
                    )
                    style_radio = gr.Radio(
                        choices=style_names,
                        value="Technical",
                        label="Processing Style",
                        info="Content processing approach",
                    )

                with gr.Accordion("Processing Parameters", open=False):
                    chunk_slider = gr.Slider(
                        label="Chunk Size (tokens)",
                        info="Text segmentation size",
                        **token_range,
                    )
                    temp_slider = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                        info="LLM randomness factor",
                    )
                    token_slider = gr.Slider(
                        label="Max Tokens",
                        info="Maximum output token limit",
                        **token_range,
                    )

                run_button = gr.Button("Process Document", variant="primary")
                status_box = gr.Textbox(label="Status")

        # Result views.
        with gr.Tabs():
            with gr.TabItem("Content Processing"):
                segments_table = gr.Dataframe(
                    headers=table_columns,
                    wrap=True,
                )
            with gr.TabItem("Audio Output"):
                audio_player = gr.Audio(
                    label="Synthesized Audio",
                    type="filepath",
                    show_download_button=True,
                )

        gr.Markdown(notes_md)
        gr.HTML(footer_html)

        # Thin pass-through to process_document. The function name and
        # parameter names are kept exactly as they were because the UI
        # framework can surface them to API clients.
        def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
            form_values = (
                pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens,
            )
            return process_document(*form_values)

        # Input order must match update_interface's parameter order; the
        # callback's result is mapped onto the three outputs positionally.
        form_inputs = [
            pdf_upload, key_box, voice_radio, style_radio,
            chunk_slider, temp_slider, token_slider,
        ]
        result_outputs = [segments_table, audio_player, status_box]
        run_button.click(
            fn=update_interface,
            inputs=form_inputs,
            outputs=result_outputs,
        )

    return app