PierreBrunelle committed on
Commit
f0a88ca
·
verified ·
1 Parent(s): 58c6bad

Update src/interface.py

Browse files
Files changed (1) hide show
  1. src/interface.py +158 -77
src/interface.py CHANGED
@@ -1,6 +1,42 @@
1
  import gradio as gr
2
  from .processor import process_document
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  def create_interface():
5
  with gr.Blocks(theme=gr.themes.Base()) as demo:
6
  gr.HTML(
@@ -13,124 +49,169 @@ def create_interface():
13
  """
14
  )
15
 
 
16
  with gr.Row():
17
  with gr.Column():
18
  with gr.Accordion("🎯 What does it do?", open=True):
19
  gr.Markdown("""
20
- - 📄 PDF document processing and text extraction
21
- - 🧠 Intelligent content transformation and summarization
22
- - 🎧 High-quality audio synthesis with voice selection
23
- - ⚙️ Configurable processing parameters
24
- - ⬇️ Downloadable audio output
25
  """)
26
  with gr.Column():
27
  with gr.Accordion("⚡ How does it work?", open=True):
28
  gr.Markdown("""
29
- 1. 📑 **Document Processing**
30
- - 📊 Chunks document using token-based segmentation
31
- - 🔄 Maintains document structure and context
32
-
33
- 2. 🔍 **Content Processing**
34
- - 🤖 Transforms text using LLM optimization
35
- - 📝 Generates optimized audio scripts
36
-
37
- 3. 🎵 **Audio Synthesis**
38
- - 🗣️ Converts scripts to natural speech
39
- - 🎙️ Multiple voice models available
40
  """)
41
 
 
 
 
42
  with gr.Row():
43
- with gr.Column():
44
- api_key = gr.Textbox(
45
- label="🔑 OpenAI API Key",
46
- placeholder="sk-...",
47
- type="password"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  )
49
- file_input = gr.File(
50
- label="📁 Input Document (PDF)",
51
- file_types=[".pdf"]
52
  )
53
-
54
- with gr.Accordion("🎛️ Synthesis Parameters", open=True):
 
 
 
 
 
55
  voice_select = gr.Radio(
56
  choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
57
  value="onyx",
58
- label="🎙️ Voice Model",
59
- info="TTS voice model selection"
60
  )
61
  style_select = gr.Radio(
62
- choices=["Technical", "Narrative", "Instructional", "Descriptive"],
63
- value="Technical",
64
- label="💫 Processing Style",
65
- info="Content processing approach"
66
  )
67
 
68
- with gr.Accordion("⚙️ Processing Parameters", open=False):
69
- chunk_size = gr.Slider(
70
- minimum=100, maximum=1000, value=300, step=50,
71
- label="📏 Chunk Size (tokens)",
72
- info="Text segmentation size"
73
- )
74
- temperature = gr.Slider(
75
- minimum=0, maximum=1, value=0.7, step=0.1,
76
- label="🌡️ Temperature",
77
- info="LLM randomness factor"
78
- )
79
- max_tokens = gr.Slider(
80
- minimum=100, maximum=1000, value=300, step=50,
81
- label="📊 Max Tokens",
82
- info="Maximum output token limit"
83
- )
 
 
 
 
 
 
 
84
 
85
- process_btn = gr.Button("🚀 Process Document", variant="primary")
86
- status_output = gr.Textbox(label="📋 Status")
87
-
 
 
 
88
  with gr.Tabs():
89
- with gr.TabItem("📝 Content Processing"):
90
  output_table = gr.Dataframe(
91
- headers=["🔍 Segment", "📄 Processed Content", "🎭 Audio Script"],
92
  wrap=True
93
  )
94
- with gr.TabItem("🎧 Audio Output"):
95
- audio_output = gr.Audio(
96
- label="🔊 Synthesized Audio",
97
- type="filepath",
98
- show_download_button=True
99
- )
100
-
101
- gr.Markdown("""
102
- ### 📚 Technical Notes
103
- - Token limit affects processing speed and memory usage
104
- - 🎯 Temperature values > 0.8 may introduce content variations
105
- - 🔊 Audio synthesis has a 4096 token limit per segment
106
-
107
- ### ⚙️ Performance Considerations
108
- - 📊 Chunk size directly impacts processing time
109
- - 🔄 Higher temperatures increase LLM compute time
110
- - ⏱️ Audio synthesis scales with script length
111
- """)
112
 
 
113
  gr.HTML(
114
  """
115
  <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
116
  <p style="margin: 0; color: #666; font-size: 0.8em;">
117
  🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
118
- | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Documentation</a>
119
- | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">Hugging Face Space</a>
120
  </p>
121
  </div>
122
  """
123
  )
124
-
125
- def update_interface(pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  return process_document(
127
- pdf_file, api_key, voice, style, chunk_size, temperature, max_tokens
 
 
 
 
 
 
 
128
  )
129
 
130
  process_btn.click(
131
  update_interface,
132
  inputs=[
133
- file_input, api_key, voice_select, style_select,
134
  chunk_size, temperature, max_tokens
135
  ],
136
  outputs=[output_table, audio_output, status_output]
 
1
  import gradio as gr
2
  from .processor import process_document
3
 
4
# Presets for each output mode selectable in the interface, keyed by mode name.
# Every preset carries the same fields:
#   description    - short text displayed for the selected mode
#   styles         - style choices offered for the mode; the first entry is
#                    used as the initial selection
#   default_temp   - value the temperature slider is set to for the mode
#   default_chunks - value the chunk-size slider is set to for the mode
#   system_prompt  - prompt forwarded to process_document() for the mode
SYNTHESIS_MODES = {
    "narration": {
        "description": "Simple document narration with clear voice and natural pacing",
        "styles": ["Technical", "Narrative", "Instructional", "Descriptive"],
        "default_temp": 0.7,
        "default_chunks": 300,
        "system_prompt": """Convert this content into clear narration.
Format:
- Clear sentence structures
- Natural pauses (...)
- Term definitions when needed
- Proper transitions""",
    },
    "podcast": {
        "description": "Conversational style with engaging tone and dynamic pacing",
        "styles": ["Casual", "Interview", "Educational", "Commentary"],
        "default_temp": 0.8,
        "default_chunks": 400,
        "system_prompt": """Transform this content into engaging podcast-style speech.""",
    },
    "presentation": {
        "description": "Professional presentation style with clear structure",
        "styles": ["Business", "Academic", "Sales", "Training"],
        "default_temp": 0.6,
        "default_chunks": 250,
        "system_prompt": """Convert this content into a presentation format.""",
    },
    "storytelling": {
        "description": "Narrative style with emotional engagement",
        "styles": ["Dynamic", "Dramatic", "Calm", "Energetic"],
        "default_temp": 0.9,
        "default_chunks": 500,
        "system_prompt": """Transform this content into an engaging story.""",
    },
}
39
+
40
  def create_interface():
41
  with gr.Blocks(theme=gr.themes.Base()) as demo:
42
  gr.HTML(
 
49
  """
50
  )
51
 
52
+ # Overview Row
53
  with gr.Row():
54
  with gr.Column():
55
  with gr.Accordion("🎯 What does it do?", open=True):
56
  gr.Markdown("""
57
+ - 📄 Document processing - 🧠 Content transformation
58
+ - 🎧 Audio synthesis - ⚙️ Multiple output styles
 
 
 
59
  """)
60
  with gr.Column():
61
  with gr.Accordion("⚡ How does it work?", open=True):
62
  gr.Markdown("""
63
+ 1. 📑 **Processing:** Token-based segmentation
64
+ 2. 🔍 **Analysis:** LLM optimization & scripts
65
+ 3. 🎵 **Synthesis:** Multiple voice options
 
 
 
 
 
 
 
 
66
  """)
67
 
68
+ synthesis_mode = gr.State(SYNTHESIS_MODES["narration"])
69
+
70
+ # Main Input Row
71
  with gr.Row():
72
+ # Left Column - Core Inputs
73
+ with gr.Column(scale=1):
74
+ with gr.Row():
75
+ api_key = gr.Textbox(
76
+ label="🔑 OpenAI API Key",
77
+ placeholder="sk-...",
78
+ type="password",
79
+ scale=2
80
+ )
81
+ file_input = gr.File(
82
+ label="📁 Input PDF",
83
+ file_types=[".pdf"],
84
+ scale=1
85
+ )
86
+
87
+ # Right Column - Mode Selection
88
+ with gr.Column(scale=1):
89
+ mode_select = gr.Radio(
90
+ choices=list(SYNTHESIS_MODES.keys()),
91
+ value="narration",
92
+ label="🎭 Output Mode",
93
+ info="Select output type"
94
  )
95
+ mode_description = gr.Markdown(
96
+ SYNTHESIS_MODES["narration"]["description"],
97
+ elem_classes=["mode-description"]
98
  )
99
+
100
+ # Parameters Row
101
+ with gr.Row():
102
+ # Voice and Style Column
103
+ with gr.Column():
104
+ with gr.Box():
105
+ gr.Markdown("### 🎛️ Voice & Style")
106
  voice_select = gr.Radio(
107
  choices=["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
108
  value="onyx",
109
+ label="🎙️ Voice",
110
+ interactive=True
111
  )
112
  style_select = gr.Radio(
113
+ choices=SYNTHESIS_MODES["narration"]["styles"],
114
+ value=SYNTHESIS_MODES["narration"]["styles"][0],
115
+ label="💫 Style",
116
+ interactive=True
117
  )
118
 
119
+ # Processing Parameters Column
120
+ with gr.Column():
121
+ with gr.Box():
122
+ gr.Markdown("### ⚙️ Processing")
123
+ with gr.Row():
124
+ chunk_size = gr.Slider(
125
+ minimum=100, maximum=1000,
126
+ value=SYNTHESIS_MODES["narration"]["default_chunks"],
127
+ step=50,
128
+ label="📏 Chunk Size"
129
+ )
130
+ temperature = gr.Slider(
131
+ minimum=0, maximum=1,
132
+ value=SYNTHESIS_MODES["narration"]["default_temp"],
133
+ step=0.1,
134
+ label="🌡️ Temperature"
135
+ )
136
+ max_tokens = gr.Slider(
137
+ minimum=100, maximum=1000,
138
+ value=300,
139
+ step=50,
140
+ label="📊 Tokens"
141
+ )
142
 
143
+ # Process Button Row
144
+ with gr.Row():
145
+ process_btn = gr.Button("🚀 Generate Audio", variant="primary", scale=2)
146
+ status_output = gr.Textbox(label="📋 Status", scale=1)
147
+
148
+ # Output Tabs Row
149
  with gr.Tabs():
150
+ with gr.Tab("📝 Content"):
151
  output_table = gr.Dataframe(
152
+ headers=["🔍 Segment", "📄 Content", "🎭 Script"],
153
  wrap=True
154
  )
155
+ with gr.Tab("🎧 Audio"):
156
+ with gr.Row():
157
+ audio_output = gr.Audio(
158
+ label="🔊 Output",
159
+ type="filepath",
160
+ show_download_button=True
161
+ )
162
+ with gr.Column():
163
+ gr.Markdown("""
164
+ ### 📚 Quick Tips
165
+ - 🎯 Lower temperature = more consistent output
166
+ - 📏 Smaller chunks = more precise control
167
+ - 🎙️ Try different voices for best results
168
+ """)
 
 
 
 
169
 
170
+ # Footer
171
  gr.HTML(
172
  """
173
  <div style="text-align: center; margin-top: 1rem; padding-top: 1rem; border-top: 1px solid #ccc;">
174
  <p style="margin: 0; color: #666; font-size: 0.8em;">
175
  🚀 Powered by <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #F25022; text-decoration: none;">Pixeltable</a>
176
+ | 📚 <a href="https://docs.pixeltable.io" target="_blank" style="color: #666;">Docs</a>
177
+ | 🤗 <a href="https://huggingface.co/spaces/Pixeltable/document-to-audio-synthesis" target="_blank" style="color: #666;">HF Space</a>
178
  </p>
179
  </div>
180
  """
181
  )
182
+
183
def update_mode(mode_name):
    """Build the component updates for a newly selected output mode.

    Looks up ``mode_name`` in ``SYNTHESIS_MODES`` (an unknown name raises
    ``KeyError``) and returns a 4-tuple of:

    1. an update replacing the style choices with the mode's styles and
       selecting the first one,
    2. an update setting the mode's default chunk size,
    3. an update setting the mode's default temperature,
    4. the mode's description string.
    """
    preset = SYNTHESIS_MODES[mode_name]
    style_choices = preset["styles"]

    style_update = gr.update(choices=style_choices, value=style_choices[0])
    chunk_update = gr.update(value=preset["default_chunks"])
    temperature_update = gr.update(value=preset["default_temp"])

    return style_update, chunk_update, temperature_update, preset["description"]
191
+
192
+ mode_select.change(
193
+ update_mode,
194
+ inputs=[mode_select],
195
+ outputs=[style_select, chunk_size, temperature, mode_description]
196
+ )
197
+
198
def update_interface(pdf_file, api_key, mode_name, voice, style, chunk_size, temperature, max_tokens):
    """Run document processing with the system prompt of the selected mode.

    ``mode_name`` is resolved through ``SYNTHESIS_MODES`` (an unknown name
    raises ``KeyError``) to obtain its ``system_prompt``; every other
    argument is forwarded to ``process_document`` by keyword. The value
    returned by ``process_document`` is returned unchanged.
    """
    # Resolve the prompt first so a bad mode name fails before any processing.
    prompt = SYNTHESIS_MODES[mode_name]["system_prompt"]

    call_kwargs = {
        "pdf_file": pdf_file,
        "api_key": api_key,
        "voice_choice": voice,
        "style_choice": style,
        "chunk_size": chunk_size,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "system_prompt": prompt,
    }
    return process_document(**call_kwargs)
210
 
211
  process_btn.click(
212
  update_interface,
213
  inputs=[
214
+ file_input, api_key, mode_select, voice_select, style_select,
215
  chunk_size, temperature, max_tokens
216
  ],
217
  outputs=[output_table, audio_output, status_output]