Michael Hu committed on
Commit
f7492cb
·
1 Parent(s): fafafc3

Refactor presentation layer to use application services

Browse files
Files changed (1) hide show
  1. app.py +279 -112
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  Main entry point for the Audio Translation Web Application
3
- Handles file upload, processing pipeline, and UI rendering
4
  """
5
 
6
  import logging
@@ -17,10 +17,17 @@ logger = logging.getLogger(__name__)
17
  import streamlit as st
18
  import os
19
  import time
20
- import subprocess
21
- from utils.stt import transcribe_audio
22
- from utils.translation import translate_text
23
- from utils.tts import get_tts_engine, generate_speech
 
 
 
 
 
 
 
24
 
25
  # Initialize environment configurations
26
  os.makedirs("temp/uploads", exist_ok=True)
@@ -44,162 +51,322 @@ def configure_page():
44
  </style>
45
  """, unsafe_allow_html=True)
46
 
47
- def handle_file_processing(upload_path, asr_model="whisper"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  """
49
- Execute the complete processing pipeline:
50
- 1. Speech-to-Text (STT)
51
- 2. Machine Translation
52
- 3. Text-to-Speech (TTS)
53
-
54
  Args:
55
- upload_path: Path to the uploaded audio file
56
- asr_model: ASR model to use (whisper or parakeet)
 
 
 
 
 
 
 
57
  """
58
- logger.info(f"Starting processing for: {upload_path} using {asr_model} model")
59
  progress_bar = st.progress(0)
60
  status_text = st.empty()
61
-
62
  try:
63
- # STT Phase
64
- logger.info("Beginning STT processing")
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  status_text.markdown("πŸ” **Performing Speech Recognition...**")
66
- with st.spinner(f"Initializing {asr_model.capitalize()} model..."):
67
- english_text = transcribe_audio(upload_path, model_name=asr_model)
68
- progress_bar.progress(30)
69
- logger.info(f"STT completed. Text length: {len(english_text)} characters")
70
-
71
- # Translation Phase
72
- logger.info("Beginning translation")
73
- status_text.markdown("🌐 **Translating Content...**")
74
- with st.spinner("Loading translation model..."):
75
- chinese_text = translate_text(english_text)
76
- progress_bar.progress(60)
77
- logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
78
-
79
- # TTS Phase
80
- logger.info("Beginning TTS generation")
81
- status_text.markdown("🎡 **Generating Chinese Speech...**")
82
-
83
- # Initialize TTS engine with appropriate language code for Chinese
84
- engine = get_tts_engine(lang_code='z') # 'z' for Mandarin Chinese
85
-
86
- # Generate speech and get the file path
87
- output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
88
- progress_bar.progress(100)
89
- logger.info(f"TTS completed. Output file: {output_path}")
90
-
91
- # Store the text for streaming playback
92
- st.session_state.current_text = chinese_text
93
-
94
- status_text.success("βœ… Processing Complete!")
95
- return english_text, chinese_text, output_path
96
-
97
  except Exception as e:
98
  logger.error(f"Processing failed: {str(e)}", exc_info=True)
99
  status_text.error(f"❌ Processing Failed: {str(e)}")
100
  st.exception(e)
101
- raise
102
 
103
- def render_results(english_text, chinese_text, output_path):
104
- """Display processing results in organized columns"""
 
 
 
 
 
 
 
 
 
 
 
105
  logger.info("Rendering results")
106
  st.divider()
107
-
 
 
 
 
 
 
108
  col1, col2 = st.columns([2, 1])
 
109
  with col1:
110
- st.subheader("Recognition Results")
111
- st.code(english_text, language="text")
112
-
113
- st.subheader("Translation Results")
114
- st.code(chinese_text, language="text")
 
 
 
 
 
 
 
 
 
115
 
116
  with col2:
117
- st.subheader("Audio Output")
118
- # Standard audio player for the full file
119
- st.audio(output_path)
120
-
121
- # Download button
122
- with open(output_path, "rb") as f:
123
- st.download_button(
124
- label="Download Audio",
125
- data=f,
126
- file_name="translated_audio.wav",
127
- mime="audio/wav"
128
- )
129
-
130
- # Streaming playback controls
131
- st.subheader("Streaming Playback")
132
- if st.button("Stream Audio"):
133
- engine = get_tts_engine(lang_code='z')
134
- streaming_placeholder = st.empty()
135
-
136
- # Stream the audio in chunks
137
- for sample_rate, audio_chunk in engine.generate_speech_stream(
138
- chinese_text,
139
- voice="zf_xiaobei"
140
- ):
141
- # Create a temporary file for each chunk
142
- temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
143
- import soundfile as sf
144
- sf.write(temp_chunk_path, audio_chunk, sample_rate)
145
-
146
- # Play the chunk
147
- with streaming_placeholder:
148
- st.audio(temp_chunk_path, sample_rate=sample_rate)
149
-
150
- # Clean up the temporary chunk file
151
- os.remove(temp_chunk_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  def initialize_session_state():
154
  """Initialize session state variables"""
155
- if 'current_text' not in st.session_state:
156
- st.session_state.current_text = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
 
158
  def main():
159
  """Main application workflow"""
160
  logger.info("Starting application")
 
 
 
 
 
161
  configure_page()
162
  initialize_session_state()
163
-
164
  st.title("🎧 High-Quality Audio Translation System")
165
  st.markdown("Upload English Audio β†’ Get Chinese Speech Output")
166
 
 
 
 
167
  # Voice selection in sidebar
168
  st.sidebar.header("TTS Settings")
 
 
169
  voice_options = {
170
- "Xiaobei (Female)": "zf_xiaobei",
171
- "Yunjian (Male)": "zm_yunjian",
 
 
172
  }
173
- selected_voice = st.sidebar.selectbox(
 
174
  "Select Voice",
175
  list(voice_options.keys()),
176
- format_func=lambda x: x
 
 
 
 
 
 
 
 
 
177
  )
178
- speed = st.sidebar.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
179
-
180
  # Model selection
181
  asr_model = st.selectbox(
182
  "Select Speech Recognition Model",
183
- options=["parakeet", "whisper"],
184
  index=0,
185
  help="Choose the ASR model for speech recognition"
186
  )
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  uploaded_file = st.file_uploader(
189
- "Select Audio File (MP3/WAV)",
190
- type=["mp3", "wav"],
191
- accept_multiple_files=False
 
192
  )
193
 
194
  if uploaded_file:
195
  logger.info(f"File uploaded: {uploaded_file.name}")
196
- upload_path = os.path.join("temp/uploads", uploaded_file.name)
197
- with open(upload_path, "wb") as f:
198
- f.write(uploaded_file.getbuffer())
199
-
200
- results = handle_file_processing(upload_path, asr_model=asr_model)
201
- if results:
202
- render_results(*results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  if __name__ == "__main__":
205
  main()
 
1
  """
2
  Main entry point for the Audio Translation Web Application
3
+ Handles file upload, processing pipeline, and UI rendering using DDD architecture
4
  """
5
 
6
  import logging
 
17
  import streamlit as st
18
  import os
19
  import time
20
+ from typing import Optional
21
+
22
+ # Import application services and DTOs
23
+ from src.application.services.audio_processing_service import AudioProcessingApplicationService
24
+ from src.application.services.configuration_service import ConfigurationApplicationService
25
+ from src.application.dtos.audio_upload_dto import AudioUploadDto
26
+ from src.application.dtos.processing_request_dto import ProcessingRequestDto
27
+ from src.application.dtos.processing_result_dto import ProcessingResultDto
28
+
29
+ # Import infrastructure setup
30
+ from src.infrastructure.config.container_setup import initialize_global_container, get_global_container
31
 
32
  # Initialize environment configurations
33
  os.makedirs("temp/uploads", exist_ok=True)
 
51
  </style>
52
  """, unsafe_allow_html=True)
53
 
54
def create_audio_upload_dto(uploaded_file) -> AudioUploadDto:
    """
    Create an AudioUploadDto from a Streamlit uploaded file.

    Args:
        uploaded_file: Streamlit UploadedFile object (must expose
            ``name`` and ``getbuffer()``).

    Returns:
        AudioUploadDto: DTO carrying the filename, raw bytes, MIME
        content type (inferred from the file extension), and size.

    Raises:
        ValueError: If the file content cannot be read or the DTO
            cannot be constructed; the original exception is chained.
    """
    # MIME types for the audio extensions this app may receive.
    # Hoisted out of the try block: building it cannot fail and it is
    # not part of the operation being guarded.
    content_type_map = {
        '.wav': 'audio/wav',
        '.mp3': 'audio/mpeg',
        '.m4a': 'audio/mp4',
        '.flac': 'audio/flac',
        '.ogg': 'audio/ogg'
    }
    try:
        content = uploaded_file.getbuffer().tobytes()

        # Determine content type based on file extension; default to WAV
        # when the extension is unrecognized.
        file_ext = os.path.splitext(uploaded_file.name.lower())[1]
        content_type = content_type_map.get(file_ext, 'audio/wav')

        return AudioUploadDto(
            filename=uploaded_file.name,
            content=content,
            content_type=content_type,
            size=len(content)
        )
    except Exception as e:
        logger.error(f"Failed to create AudioUploadDto: {e}")
        # Chain the original exception so the root cause stays visible
        # in tracebacks instead of being swallowed by the ValueError.
        raise ValueError(f"Invalid audio file: {str(e)}") from e
87
+
88
def handle_file_processing(
    audio_upload: AudioUploadDto,
    asr_model: str,
    target_language: str,
    voice: str,
    speed: float,
    source_language: Optional[str] = None
) -> ProcessingResultDto:
    """
    Execute the complete processing pipeline using application services.

    Resolves AudioProcessingApplicationService from the global DI
    container, builds a ProcessingRequestDto, and runs the audio
    pipeline (speech recognition and onward — per the status messages).
    Progress and status are reported through Streamlit widgets, and any
    failure is converted into an error ProcessingResultDto rather than
    propagated to the caller.

    Args:
        audio_upload: Audio upload DTO with the raw file content.
        asr_model: ASR model identifier to use for recognition.
        target_language: Target language code for translation.
        voice: Voice identifier for TTS synthesis.
        speed: Speech speed multiplier.
        source_language: Source language code, or None (service default).

    Returns:
        ProcessingResultDto: Success result, or an error result carrying
        ``error_message`` and ``error_code``.
    """
    logger.info(f"Starting processing for: {audio_upload.filename} using {asr_model} model")
    progress_bar = st.progress(0)
    status_text = st.empty()

    try:
        # Get application service from the global DI container.
        container = get_global_container()
        audio_service = container.resolve(AudioProcessingApplicationService)

        # Bundle all user selections into a single request DTO.
        request = ProcessingRequestDto(
            audio=audio_upload,
            asr_model=asr_model,
            target_language=target_language,
            voice=voice,
            speed=speed,
            source_language=source_language
        )

        # Update progress and status before handing off to the service.
        status_text.markdown("πŸ” **Performing Speech Recognition...**")
        progress_bar.progress(10)

        # Process through the application service.
        # NOTE(review): progress jumps 10 -> 100; the service exposes no
        # intermediate progress here — confirm that is acceptable UX.
        with st.spinner("Processing audio pipeline..."):
            result = audio_service.process_audio_pipeline(request)

        if result.success:
            progress_bar.progress(100)
            status_text.success("βœ… Processing Complete!")
            logger.info(f"Processing completed successfully in {result.processing_time:.2f}s")
        else:
            # Service-reported failure: surface it but still return the DTO.
            status_text.error(f"❌ Processing Failed: {result.error_message}")
            logger.error(f"Processing failed: {result.error_message}")

        return result

    except Exception as e:
        # Unexpected failure (container resolution, service crash, ...):
        # log with traceback, show it in the UI, and return an error DTO
        # so the caller never sees an exception from this function.
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        status_text.error(f"❌ Processing Failed: {str(e)}")
        st.exception(e)

        # Return error result
        return ProcessingResultDto.error_result(
            error_message=str(e),
            error_code='SYSTEM_ERROR'
        )
157
+
158
def render_results(result: ProcessingResultDto):
    """
    Render a processing result in the Streamlit UI.

    Failed results show the error message (plus code when present) and
    nothing else. Successful results are laid out in two columns: texts
    and metadata on the left, audio output and timing on the right.

    Args:
        result: Processing result DTO to display.
    """
    logger.info("Rendering results")
    st.divider()

    # Failure path: report and bail out early.
    if not result.success:
        st.error(f"Processing failed: {result.error_message}")
        if result.error_code:
            st.code(f"Error Code: {result.error_code}")
        return

    text_col, audio_col = st.columns([2, 1])

    with text_col:
        # Recognized source text, when the service produced one.
        if result.original_text:
            st.subheader("Recognition Results")
            st.code(result.original_text, language="text")

        # Translated text, when available.
        if result.translated_text:
            st.subheader("Translation Results")
            st.code(result.translated_text, language="text")

        # Any extra pipeline metadata goes into a collapsible panel.
        if result.metadata:
            with st.expander("Processing Details"):
                st.json(result.metadata)

    with audio_col:
        # Audio player + download, only when the result points at a file.
        if result.has_audio_output and result.audio_path:
            st.subheader("Audio Output")

            if os.path.exists(result.audio_path):
                st.audio(result.audio_path)

                # Offer the synthesized audio as a download; a read
                # failure degrades to a warning instead of crashing.
                try:
                    with open(result.audio_path, "rb") as audio_file:
                        st.download_button(
                            label="Download Audio",
                            data=audio_file,
                            file_name="translated_audio.wav",
                            mime="audio/wav"
                        )
                except Exception as exc:
                    st.warning(f"Download not available: {str(exc)}")
            else:
                st.warning("Audio file not found or not accessible")

        # Always show how long the pipeline took.
        st.metric("Processing Time", f"{result.processing_time:.2f}s")
218
+
219
def get_supported_configurations() -> dict:
    """
    Get supported configurations from the application service.

    Falls back to a hard-coded default configuration when the DI
    container or the audio service is unavailable, so the UI controls
    can still be rendered.

    Returns:
        dict: Supported configurations — ASR models, voices, languages,
        audio formats, max upload size (MB), and speed range.
    """
    # Static defaults served when the application service cannot be
    # reached; kept outside the handler so the shape is easy to audit.
    fallback = {
        'asr_models': ['whisper-small', 'parakeet'],
        'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
        'languages': ['en', 'zh', 'es', 'fr', 'de'],
        'audio_formats': ['wav', 'mp3'],
        'max_file_size_mb': 100,
        'speed_range': {'min': 0.5, 'max': 2.0}
    }
    try:
        container = get_global_container()
        audio_service = container.resolve(AudioProcessingApplicationService)
        return audio_service.get_supported_configurations()
    except Exception as e:
        # Deliberate best-effort: log (lazily) and serve defaults rather
        # than crash the UI during startup.
        logger.warning("Failed to get configurations: %s", e)
        return fallback
241
 
242
def initialize_session_state():
    """Initialize session state variables"""
    # Seed each key only on first run; reruns keep existing values.
    defaults = {
        'processing_result': None,
        'container_initialized': False,
    }
    for key, value in defaults.items():
        if key not in st.session_state:
            st.session_state[key] = value
248
+
249
def initialize_application():
    """Initialize the application with dependency injection container"""
    # Guard clause: the container is set up at most once per session.
    if st.session_state.container_initialized:
        return
    try:
        logger.info("Initializing application container")
        initialize_global_container()
        st.session_state.container_initialized = True
        logger.info("Application container initialized successfully")
    except Exception as e:
        # Without the container the app cannot function — report and halt.
        logger.error(f"Failed to initialize application: {e}")
        st.error(f"Application initialization failed: {str(e)}")
        st.stop()
261
 
262
def main():
    """
    Main application workflow.

    Renders the Streamlit UI: sidebar TTS settings, ASR model and
    target-language selectors, the file uploader, and the processing
    trigger. The processing result is stored in ``st.session_state`` so
    it survives Streamlit reruns and is re-rendered while the file stays
    uploaded.
    """
    logger.info("Starting application")

    # Initialize the DI container (no-op after the first run).
    initialize_application()

    # Configure page chrome and seed session-state defaults.
    configure_page()
    initialize_session_state()

    st.title("🎧 High-Quality Audio Translation System")
    st.markdown("Upload English Audio β†’ Get Chinese Speech Output")

    # Supported options (models, voices, formats, limits) from the
    # application service, with a built-in fallback.
    config = get_supported_configurations()

    # Voice selection in sidebar
    st.sidebar.header("TTS Settings")

    # Map voice display names to internal IDs.
    # NOTE(review): these are hard-coded even though config carries a
    # 'voices' list — confirm they should not be driven by config.
    voice_options = {
        "Kokoro": "kokoro",
        "Dia": "dia",
        "CosyVoice2": "cosyvoice2",
        "Dummy (Test)": "dummy"
    }

    selected_voice_display = st.sidebar.selectbox(
        "Select Voice",
        list(voice_options.keys()),
        index=0
    )
    selected_voice = voice_options[selected_voice_display]

    # Speed slider bounds come from the service configuration.
    speed = st.sidebar.slider(
        "Speech Speed",
        config['speed_range']['min'],
        config['speed_range']['max'],
        1.0,
        0.1
    )

    # Model selection
    asr_model = st.selectbox(
        "Select Speech Recognition Model",
        options=config['asr_models'],
        index=0,
        help="Choose the ASR model for speech recognition"
    )

    # Language selection: display name -> ISO-style code.
    language_options = {
        "Chinese (Mandarin)": "zh",
        "Spanish": "es",
        "French": "fr",
        "German": "de",
        "English": "en"
    }

    selected_language_display = st.selectbox(
        "Target Language",
        list(language_options.keys()),
        index=0,
        help="Select the target language for translation"
    )
    target_language = language_options[selected_language_display]

    # File upload, constrained by the supported formats and size limit.
    uploaded_file = st.file_uploader(
        f"Select Audio File ({', '.join(config['audio_formats']).upper()})",
        type=config['audio_formats'],
        accept_multiple_files=False,
        help=f"Maximum file size: {config['max_file_size_mb']}MB"
    )

    if uploaded_file:
        logger.info(f"File uploaded: {uploaded_file.name}")

        try:
            # Wrap the upload in a DTO (raises ValueError on bad files).
            audio_upload = create_audio_upload_dto(uploaded_file)

            # Show basic file information before processing.
            st.info(f"πŸ“ **File:** {audio_upload.filename} ({audio_upload.size / 1024:.1f} KB)")

            # Processing only starts on an explicit button press.
            if st.button("πŸš€ Process Audio", type="primary"):
                result = handle_file_processing(
                    audio_upload=audio_upload,
                    asr_model=asr_model,
                    target_language=target_language,
                    voice=selected_voice,
                    speed=speed,
                    source_language="en"  # Assume English source for now
                )

                # Persist across reruns so results stay visible.
                st.session_state.processing_result = result

            # Render the stored result (from this run or a prior one).
            if st.session_state.processing_result:
                render_results(st.session_state.processing_result)

        except Exception as e:
            st.error(f"Error processing file: {str(e)}")
            logger.error(f"File processing error: {e}")

if __name__ == "__main__":
    main()