Michael Hu committed on
Commit
0aa0b99
·
1 Parent(s): 4b33339

use Gradio

Browse files
Files changed (3) hide show
  1. app.py +265 -289
  2. pyproject.toml +1 -2
  3. requirements.txt +1 -2
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Main entry point for the Audio Translation Web Application
3
- Handles file upload, processing pipeline, and UI rendering using DDD architecture
4
  """
5
 
6
  import logging
@@ -14,9 +14,10 @@ logging.basicConfig(
14
  )
15
  logger = logging.getLogger(__name__)
16
 
17
- import streamlit as st
18
  import os
19
- from typing import Optional
 
20
 
21
  # Import application services and DTOs
22
  from src.application.services.audio_processing_service import AudioProcessingApplicationService
@@ -32,39 +33,43 @@ from src.infrastructure.config.container_setup import initialize_global_containe
32
  os.makedirs("temp/uploads", exist_ok=True)
33
  os.makedirs("temp/outputs", exist_ok=True)
34
 
35
- def configure_page():
36
- """Set up Streamlit page configuration"""
37
- logger.info("Configuring Streamlit page")
38
- st.set_page_config(
39
- page_title="Audio Translator",
40
- page_icon="🎧",
41
- layout="wide",
42
- initial_sidebar_state="expanded"
43
- )
44
- st.markdown("""
45
- <style>
46
- .reportview-container {margin-top: -2em;}
47
- #MainMenu {visibility: hidden;}
48
- .stDeployButton {display:none;}
49
- .stAlert {padding: 20px !important;}
50
- </style>
51
- """, unsafe_allow_html=True)
52
 
53
- def create_audio_upload_dto(uploaded_file) -> AudioUploadDto:
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
- Create AudioUploadDto from Streamlit uploaded file.
56
 
57
  Args:
58
- uploaded_file: Streamlit UploadedFile object
59
 
60
  Returns:
61
  AudioUploadDto: DTO containing upload information
62
  """
63
  try:
64
- content = uploaded_file.getbuffer().tobytes()
 
 
 
 
 
 
65
 
66
  # Determine content type based on file extension
67
- file_ext = os.path.splitext(uploaded_file.name.lower())[1]
68
  content_type_map = {
69
  '.wav': 'audio/wav',
70
  '.mp3': 'audio/mpeg',
@@ -75,7 +80,7 @@ def create_audio_upload_dto(uploaded_file) -> AudioUploadDto:
75
  content_type = content_type_map.get(file_ext, 'audio/wav')
76
 
77
  return AudioUploadDto(
78
- filename=uploaded_file.name,
79
  content=content,
80
  content_type=content_type,
81
  size=len(content)
@@ -84,33 +89,65 @@ def create_audio_upload_dto(uploaded_file) -> AudioUploadDto:
84
  logger.error(f"Failed to create AudioUploadDto: {e}")
85
  raise ValueError(f"Invalid audio file: {str(e)}")
86
 
87
- def handle_file_processing(
88
- audio_upload: AudioUploadDto,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  asr_model: str,
90
  target_language: str,
91
  voice: str,
92
  speed: float,
93
- source_language: Optional[str] = None
94
- ) -> ProcessingResultDto:
95
  """
96
  Execute the complete processing pipeline using application services.
97
 
98
  Args:
99
- audio_upload: Audio upload DTO
100
  asr_model: ASR model to use
101
  target_language: Target language for translation
102
  voice: Voice for TTS
103
  speed: Speech speed
104
- source_language: Source language (optional)
105
 
106
  Returns:
107
- ProcessingResultDto: Processing result
108
  """
109
- logger.info(f"Starting processing for: {audio_upload.filename} using {asr_model} model")
110
- progress_bar = st.progress(0)
111
- status_text = st.empty()
112
-
113
  try:
 
 
 
 
 
 
 
 
114
  # Get application service from container
115
  container = get_global_container()
116
  audio_service = container.resolve(AudioProcessingApplicationService)
@@ -125,274 +162,213 @@ def handle_file_processing(
125
  source_language=source_language
126
  )
127
 
128
- # Update progress and status
129
- status_text.markdown("πŸ” **Performing Speech Recognition...**")
130
- progress_bar.progress(10)
131
-
132
  # Process through application service
133
- with st.spinner("Processing audio pipeline..."):
134
- result = audio_service.process_audio_pipeline(request)
135
 
136
  if result.success:
137
- progress_bar.progress(100)
138
- status_text.success("✅ Processing Complete!")
139
  logger.info(f"Processing completed successfully in {result.processing_time:.2f}s")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
  else:
141
- status_text.error(f"❌ Processing Failed: {result.error_message}")
142
  logger.error(f"Processing failed: {result.error_message}")
143
-
144
- return result
145
 
146
  except Exception as e:
147
  logger.error(f"Processing failed: {str(e)}", exc_info=True)
148
- status_text.error(f"❌ Processing Failed: {str(e)}")
149
- st.exception(e)
150
-
151
- # Return error result
152
- return ProcessingResultDto.error_result(
153
- error_message=str(e),
154
- error_code='SYSTEM_ERROR'
155
- )
156
-
157
- def render_results(result: ProcessingResultDto):
158
- """
159
- Display processing results using ProcessingResultDto.
160
-
161
- Args:
162
- result: Processing result DTO
163
- """
164
- logger.info("Rendering results")
165
- st.divider()
166
-
167
- if not result.success:
168
- st.error(f"Processing failed: {result.error_message}")
169
- if result.error_code:
170
- st.code(f"Error Code: {result.error_code}")
171
- return
172
-
173
- col1, col2 = st.columns([2, 1])
174
-
175
- with col1:
176
- # Display original text if available
177
- if result.original_text:
178
- st.subheader("Recognition Results")
179
- st.code(result.original_text, language="text")
180
-
181
- # Display translated text if available
182
- if result.translated_text:
183
- st.subheader("Translation Results")
184
- st.code(result.translated_text, language="text")
185
-
186
- # Display processing metadata
187
- if result.metadata:
188
- with st.expander("Processing Details"):
189
- st.json(result.metadata)
190
-
191
- with col2:
192
- # Display audio output if available
193
- if result.has_audio_output and result.audio_path:
194
- st.subheader("Audio Output")
195
-
196
- # Check if file exists and is accessible
197
- if os.path.exists(result.audio_path):
198
- # Standard audio player
199
- st.audio(result.audio_path)
200
-
201
- # Download button
202
- try:
203
- with open(result.audio_path, "rb") as f:
204
- st.download_button(
205
- label="Download Audio",
206
- data=f,
207
- file_name="translated_audio.wav",
208
- mime="audio/wav"
209
- )
210
- except Exception as e:
211
- st.warning(f"Download not available: {str(e)}")
212
- else:
213
- st.warning("Audio file not found or not accessible")
214
-
215
- # Display processing time
216
- st.metric("Processing Time", f"{result.processing_time:.2f}s")
217
-
218
- def get_supported_configurations() -> dict:
219
- """
220
- Get supported configurations from application service.
221
-
222
- Returns:
223
- dict: Supported configurations
224
- """
225
- try:
226
- logger.info("Getting global container...")
227
- container = get_global_container()
228
- logger.info("Resolving AudioProcessingApplicationService...")
229
- audio_service = container.resolve(AudioProcessingApplicationService)
230
- logger.info("Getting supported configurations from service...")
231
- config = audio_service.get_supported_configurations()
232
- logger.info(f"Retrieved configurations: {config}")
233
- return config
234
- except Exception as e:
235
- logger.error(f"Failed to get configurations: {e}", exc_info=True)
236
- # Return fallback configurations
237
- return {
238
- 'asr_models': ['whisper-small', 'parakeet'],
239
- 'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
240
- 'languages': ['en', 'zh', 'es', 'fr', 'de'],
241
- 'audio_formats': ['wav', 'mp3'],
242
- 'max_file_size_mb': 100,
243
- 'speed_range': {'min': 0.5, 'max': 2.0}
244
  }
245
-
246
- def initialize_session_state():
247
- """Initialize session state variables"""
248
- if 'processing_result' not in st.session_state:
249
- st.session_state.processing_result = None
250
- if 'container_initialized' not in st.session_state:
251
- st.session_state.container_initialized = False
252
-
253
- def initialize_application():
254
- """Initialize the application with dependency injection container"""
255
- if not st.session_state.get('container_initialized', False):
256
- try:
257
- logger.info("Initializing application container")
258
- initialize_global_container()
259
- st.session_state.container_initialized = True
260
- logger.info("Application container initialized successfully")
261
- except Exception as e:
262
- logger.error(f"Failed to initialize application: {e}")
263
- st.error(f"Application initialization failed: {str(e)}")
264
- st.stop()
265
-
266
- def main():
267
- """Main application workflow"""
268
- logger.info("Starting application")
269
-
270
- try:
271
- # Configure page
272
- configure_page()
273
-
274
- # Initialize session state first
275
- initialize_session_state()
276
-
277
- # Initialize application
278
- initialize_application()
279
-
280
- st.title("🎧 High-Quality Audio Translation System")
281
- st.markdown("Upload English Audio → Get Chinese Speech Output")
282
-
283
- # Get supported configurations with error handling
284
- try:
285
- config = get_supported_configurations()
286
- logger.info("Successfully retrieved configurations")
287
- except Exception as e:
288
- logger.error(f"Failed to get configurations: {e}")
289
- st.error(f"Configuration error: {str(e)}")
290
- # Use fallback configuration
291
- config = {
292
- 'asr_models': ['parakeet', 'whisper-small'],
293
- 'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
294
- 'languages': ['en', 'zh', 'es', 'fr', 'de'],
295
- 'audio_formats': ['wav', 'mp3'],
296
- 'max_file_size_mb': 100,
297
- 'speed_range': {'min': 0.5, 'max': 2.0}
298
- }
299
-
300
- # Voice selection in sidebar
301
- st.sidebar.header("TTS Settings")
302
-
303
- # Map voice display names to internal IDs
304
- voice_options = {
305
- "Kokoro": "kokoro",
306
- "Dia": "dia",
307
- "CosyVoice2": "cosyvoice2",
308
- "Dummy (Test)": "dummy"
309
  }
310
-
311
- selected_voice_display = st.sidebar.selectbox(
312
- "Select Voice",
313
- list(voice_options.keys()),
314
- index=0
315
- )
316
- selected_voice = voice_options[selected_voice_display]
317
-
318
- speed = st.sidebar.slider(
319
- "Speech Speed",
320
- config['speed_range']['min'],
321
- config['speed_range']['max'],
322
- 1.0,
323
- 0.1
324
- )
325
-
326
- # Model selection
327
- asr_model = st.selectbox(
328
- "Select Speech Recognition Model",
329
- options=config['asr_models'],
330
- index=0,
331
- help="Choose the ASR model for speech recognition"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  )
333
-
334
- # Language selection
335
- language_options = {
336
- "Chinese (Mandarin)": "zh",
337
- "Spanish": "es",
338
- "French": "fr",
339
- "German": "de",
340
- "English": "en"
341
- }
342
-
343
- selected_language_display = st.selectbox(
344
- "Target Language",
345
- list(language_options.keys()),
346
- index=0,
347
- help="Select the target language for translation"
348
  )
349
- target_language = language_options[selected_language_display]
 
350
 
351
- # File upload
352
- uploaded_file = st.file_uploader(
353
- f"Select Audio File ({', '.join(config['audio_formats']).upper()})",
354
- type=config['audio_formats'],
355
- accept_multiple_files=False,
356
- help=f"Maximum file size: {config['max_file_size_mb']}MB"
 
 
 
 
 
 
 
 
 
 
357
  )
358
-
359
- if uploaded_file:
360
- logger.info(f"File uploaded: {uploaded_file.name}")
361
-
362
- try:
363
- # Create audio upload DTO
364
- audio_upload = create_audio_upload_dto(uploaded_file)
365
-
366
- # Display file information
367
- st.info(f"πŸ“ **File:** {audio_upload.filename} ({audio_upload.size / 1024:.1f} KB)")
368
-
369
- # Process button
370
- if st.button("🚀 Process Audio", type="primary"):
371
- # Process the audio
372
- result = handle_file_processing(
373
- audio_upload=audio_upload,
374
- asr_model=asr_model,
375
- target_language=target_language,
376
- voice=selected_voice,
377
- speed=speed,
378
- source_language="en" # Assume English source for now
379
- )
380
-
381
- # Store result in session state
382
- st.session_state.processing_result = result
383
-
384
- # Display results if available
385
- if st.session_state.processing_result:
386
- render_results(st.session_state.processing_result)
387
-
388
- except Exception as e:
389
- st.error(f"Error processing file: {str(e)}")
390
- logger.error(f"File processing error: {e}")
391
-
392
  except Exception as e:
393
- logger.error(f"Main application error: {str(e)}", exc_info=True)
394
- st.error(f"Application error: {str(e)}")
395
- st.exception(e)
396
 
397
  if __name__ == "__main__":
398
  main()
 
1
  """
2
  Main entry point for the Audio Translation Web Application
3
+ Handles file upload, processing pipeline, and UI rendering using DDD architecture with Gradio
4
  """
5
 
6
  import logging
 
14
  )
15
  logger = logging.getLogger(__name__)
16
 
17
+ import gradio as gr
18
  import os
19
+ import json
20
+ from typing import Optional, Tuple, Dict, Any
21
 
22
  # Import application services and DTOs
23
  from src.application.services.audio_processing_service import AudioProcessingApplicationService
 
33
  os.makedirs("temp/uploads", exist_ok=True)
34
  os.makedirs("temp/outputs", exist_ok=True)
35
 
36
+ # Global container initialization
37
+ container_initialized = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ def initialize_application():
40
+ """Initialize the application with dependency injection container"""
41
+ global container_initialized
42
+ if not container_initialized:
43
+ try:
44
+ logger.info("Initializing application container")
45
+ initialize_global_container()
46
+ container_initialized = True
47
+ logger.info("Application container initialized successfully")
48
+ except Exception as e:
49
+ logger.error(f"Failed to initialize application: {e}")
50
+ raise RuntimeError(f"Application initialization failed: {str(e)}")
51
+
52
+ def create_audio_upload_dto(audio_file_path: str) -> AudioUploadDto:
53
  """
54
+ Create AudioUploadDto from audio file path.
55
 
56
  Args:
57
+ audio_file_path: Path to the uploaded audio file
58
 
59
  Returns:
60
  AudioUploadDto: DTO containing upload information
61
  """
62
  try:
63
+ if not audio_file_path or not os.path.exists(audio_file_path):
64
+ raise ValueError("No audio file provided or file does not exist")
65
+
66
+ filename = os.path.basename(audio_file_path)
67
+
68
+ with open(audio_file_path, 'rb') as f:
69
+ content = f.read()
70
 
71
  # Determine content type based on file extension
72
+ file_ext = os.path.splitext(filename.lower())[1]
73
  content_type_map = {
74
  '.wav': 'audio/wav',
75
  '.mp3': 'audio/mpeg',
 
80
  content_type = content_type_map.get(file_ext, 'audio/wav')
81
 
82
  return AudioUploadDto(
83
+ filename=filename,
84
  content=content,
85
  content_type=content_type,
86
  size=len(content)
 
89
  logger.error(f"Failed to create AudioUploadDto: {e}")
90
  raise ValueError(f"Invalid audio file: {str(e)}")
91
 
92
+ def get_supported_configurations() -> dict:
93
+ """
94
+ Get supported configurations from application service.
95
+
96
+ Returns:
97
+ dict: Supported configurations
98
+ """
99
+ try:
100
+ logger.info("Getting global container...")
101
+ container = get_global_container()
102
+ logger.info("Resolving AudioProcessingApplicationService...")
103
+ audio_service = container.resolve(AudioProcessingApplicationService)
104
+ logger.info("Getting supported configurations from service...")
105
+ config = audio_service.get_supported_configurations()
106
+ logger.info(f"Retrieved configurations: {config}")
107
+ return config
108
+ except Exception as e:
109
+ logger.error(f"Failed to get configurations: {e}", exc_info=True)
110
+ # Return fallback configurations
111
+ return {
112
+ 'asr_models': ['whisper-small', 'parakeet'],
113
+ 'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
114
+ 'languages': ['en', 'zh', 'es', 'fr', 'de'],
115
+ 'audio_formats': ['wav', 'mp3'],
116
+ 'max_file_size_mb': 100,
117
+ 'speed_range': {'min': 0.5, 'max': 2.0}
118
+ }
119
+
120
+ def process_audio_pipeline(
121
+ audio_file,
122
  asr_model: str,
123
  target_language: str,
124
  voice: str,
125
  speed: float,
126
+ source_language: str = "en"
127
+ ) -> Tuple[str, str, str, str, str]:
128
  """
129
  Execute the complete processing pipeline using application services.
130
 
131
  Args:
132
+ audio_file: Gradio audio file input
133
  asr_model: ASR model to use
134
  target_language: Target language for translation
135
  voice: Voice for TTS
136
  speed: Speech speed
137
+ source_language: Source language
138
 
139
  Returns:
140
+ Tuple: (status_message, original_text, translated_text, audio_output_path, processing_details)
141
  """
 
 
 
 
142
  try:
143
+ if not audio_file:
144
+ return "❌ No audio file provided", "", "", None, ""
145
+
146
+ logger.info(f"Starting processing for: {audio_file} using {asr_model} model")
147
+
148
+ # Create audio upload DTO
149
+ audio_upload = create_audio_upload_dto(audio_file)
150
+
151
  # Get application service from container
152
  container = get_global_container()
153
  audio_service = container.resolve(AudioProcessingApplicationService)
 
162
  source_language=source_language
163
  )
164
 
 
 
 
 
165
  # Process through application service
166
+ result = audio_service.process_audio_pipeline(request)
 
167
 
168
  if result.success:
169
+ status_message = f"✅ Processing Complete! ({result.processing_time:.2f}s)"
 
170
  logger.info(f"Processing completed successfully in {result.processing_time:.2f}s")
171
+
172
+ # Prepare processing details
173
+ details = {
174
+ "processing_time": f"{result.processing_time:.2f}s",
175
+ "asr_model": asr_model,
176
+ "target_language": target_language,
177
+ "voice": voice,
178
+ "speed": speed
179
+ }
180
+ if result.metadata:
181
+ details.update(result.metadata)
182
+
183
+ processing_details = json.dumps(details, indent=2)
184
+
185
+ return (
186
+ status_message,
187
+ result.original_text or "",
188
+ result.translated_text or "",
189
+ result.audio_path if result.has_audio_output else None,
190
+ processing_details
191
+ )
192
  else:
193
+ error_msg = f"❌ Processing Failed: {result.error_message}"
194
  logger.error(f"Processing failed: {result.error_message}")
195
+ return error_msg, "", "", None, f"Error: {result.error_message}"
 
196
 
197
  except Exception as e:
198
  logger.error(f"Processing failed: {str(e)}", exc_info=True)
199
+ error_msg = f"❌ Processing Failed: {str(e)}"
200
+ return error_msg, "", "", None, f"System Error: {str(e)}"
201
+
202
+ def create_interface():
203
+ """Create and configure the Gradio interface"""
204
+
205
+ # Initialize application
206
+ initialize_application()
207
+
208
+ # Get supported configurations
209
+ config = get_supported_configurations()
210
+
211
+ # Voice options mapping
212
+ voice_options = ["kokoro", "dia", "cosyvoice2", "dummy"]
213
+
214
+ # Language options mapping
215
+ language_options = {
216
+ "Chinese (Mandarin)": "zh",
217
+ "Spanish": "es",
218
+ "French": "fr",
219
+ "German": "de",
220
+ "English": "en"
221
+ }
222
+
223
+ # Create the interface
224
+ with gr.Blocks(
225
+ title="🎧 High-Quality Audio Translation System",
226
+ theme=gr.themes.Soft(),
227
+ css="""
228
+ .gradio-container {
229
+ max-width: 1200px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  }
231
+ .audio-player {
232
+ width: 100%;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  }
234
+ """
235
+ ) as interface:
236
+
237
+ gr.Markdown("# 🎧 High-Quality Audio Translation System")
238
+ gr.Markdown("Upload English Audio → Get Chinese Speech Output")
239
+
240
+ with gr.Row():
241
+ with gr.Column(scale=2):
242
+ # Audio input
243
+ audio_input = gr.Audio(
244
+ label=f"Upload Audio File ({', '.join(config['audio_formats']).upper()})",
245
+ type="filepath",
246
+ format="wav"
247
+ )
248
+
249
+ # Model selection
250
+ asr_model = gr.Dropdown(
251
+ choices=config['asr_models'],
252
+ value=config['asr_models'][0] if config['asr_models'] else "parakeet",
253
+ label="Speech Recognition Model",
254
+ info="Choose the ASR model for speech recognition"
255
+ )
256
+
257
+ # Language selection
258
+ target_language = gr.Dropdown(
259
+ choices=list(language_options.keys()),
260
+ value="Chinese (Mandarin)",
261
+ label="Target Language",
262
+ info="Select the target language for translation"
263
+ )
264
+
265
+ with gr.Column(scale=1):
266
+ # TTS Settings
267
+ gr.Markdown("### TTS Settings")
268
+
269
+ voice = gr.Dropdown(
270
+ choices=voice_options,
271
+ value="kokoro",
272
+ label="Voice"
273
+ )
274
+
275
+ speed = gr.Slider(
276
+ minimum=config['speed_range']['min'],
277
+ maximum=config['speed_range']['max'],
278
+ value=1.0,
279
+ step=0.1,
280
+ label="Speech Speed"
281
+ )
282
+
283
+ # Process button
284
+ process_btn = gr.Button("🚀 Process Audio", variant="primary", size="lg")
285
+
286
+ # Status message
287
+ status_output = gr.Markdown(label="Status")
288
+
289
+ # Results section
290
+ with gr.Row():
291
+ with gr.Column(scale=2):
292
+ # Text outputs
293
+ original_text = gr.Textbox(
294
+ label="Recognition Results",
295
+ lines=4,
296
+ max_lines=8,
297
+ interactive=False
298
+ )
299
+
300
+ translated_text = gr.Textbox(
301
+ label="Translation Results",
302
+ lines=4,
303
+ max_lines=8,
304
+ interactive=False
305
+ )
306
+
307
+ # Processing details
308
+ with gr.Accordion("Processing Details", open=False):
309
+ processing_details = gr.Code(
310
+ label="Metadata",
311
+ language="json",
312
+ interactive=False
313
+ )
314
+
315
+ with gr.Column(scale=1):
316
+ # Audio output
317
+ audio_output = gr.Audio(
318
+ label="Audio Output",
319
+ interactive=False
320
+ )
321
+
322
+ # Wire up the processing function
323
+ def process_wrapper(audio_file, asr_model_val, target_lang_val, voice_val, speed_val):
324
+ # Map display language to code
325
+ target_lang_code = language_options.get(target_lang_val, "zh")
326
+
327
+ return process_audio_pipeline(
328
+ audio_file=audio_file,
329
+ asr_model=asr_model_val,
330
+ target_language=target_lang_code,
331
+ voice=voice_val,
332
+ speed=speed_val,
333
+ source_language="en"
334
+ )
335
+
336
+ process_btn.click(
337
+ fn=process_wrapper,
338
+ inputs=[audio_input, asr_model, target_language, voice, speed],
339
+ outputs=[status_output, original_text, translated_text, audio_output, processing_details]
340
  )
341
+
342
+ # Add examples if needed
343
+ gr.Examples(
344
+ examples=[],
345
+ inputs=[audio_input, asr_model, target_language, voice, speed],
346
+ label="Example Configurations"
 
 
 
 
 
 
 
 
 
347
  )
348
+
349
+ return interface
350
 
351
+ def main():
352
+ """Main application entry point"""
353
+ logger.info("Starting Gradio application")
354
+
355
+ try:
356
+ # Create interface
357
+ interface = create_interface()
358
+
359
+ # Launch the interface
360
+ interface.launch(
361
+ server_name="0.0.0.0",
362
+ server_port=7860,
363
+ share=False,
364
+ debug=False,
365
+ show_error=True,
366
+ quiet=False
367
  )
368
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
  except Exception as e:
370
+ logger.error(f"Failed to start application: {str(e)}", exc_info=True)
371
+ raise
 
372
 
373
  if __name__ == "__main__":
374
  main()
pyproject.toml CHANGED
@@ -25,8 +25,7 @@ dependencies = [
25
  "ordered-set>=4.1.0",
26
  "phonemizer-fork>=3.3.2",
27
  "nemo_toolkit[asr]",
28
- "faster-whisper>=1.1.1",
29
- "descript-audio-codec>=0.0.5"
30
  ]
31
 
32
  [project.optional-dependencies]
 
25
  "ordered-set>=4.1.0",
26
  "phonemizer-fork>=3.3.2",
27
  "nemo_toolkit[asr]",
28
+ "faster-whisper>=1.1.1"
 
29
  ]
30
 
31
  [project.optional-dependencies]
requirements.txt CHANGED
@@ -14,5 +14,4 @@ kokoro>=0.7.9
14
  ordered-set>=4.1.0
15
  phonemizer-fork>=3.3.2
16
  nemo_toolkit[asr]
17
- faster-whisper>=1.1.1
18
- descript-audio-codec>=0.0.5
 
14
  ordered-set>=4.1.0
15
  phonemizer-fork>=3.3.2
16
  nemo_toolkit[asr]
17
+ faster-whisper>=1.1.1