Prathamesh Sarjerao Vaidya commited on
Commit
5de798e
·
1 Parent(s): 550272e

made changes to fix the deployment issue in hf_spaces

Browse files
Files changed (4) hide show
  1. Dockerfile +30 -8
  2. model_preloader.py +6 -14
  3. requirements.txt +22 -54
  4. src/speech_recognizer.py +129 -652
Dockerfile CHANGED
@@ -11,20 +11,34 @@ RUN apt-get update && apt-get install -y \
11
  curl \
12
  build-essential \
13
  libsndfile1 \
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
  # Copy requirements first for better caching
17
  COPY requirements.txt .
18
 
19
- # Install Python dependencies
20
- RUN pip install --no-cache-dir --upgrade pip && \
21
  pip install --no-cache-dir -r requirements.txt
22
 
23
  # Copy application code
24
  COPY . .
25
 
26
  # Create necessary directories with proper permissions
27
- # Fixed: Use 777 permissions for directories that need write access
28
  RUN mkdir -p templates static uploads outputs model_cache temp_files demo_results demo_audio \
29
  /tmp/matplotlib /tmp/fontconfig \
30
  && chmod -R 777 templates static \
@@ -49,17 +63,21 @@ ENV PYTHONPATH=/app \
49
  MPLCONFIGDIR=/tmp/matplotlib \
50
  HUGGINGFACE_HUB_CACHE=/app/model_cache \
51
  HF_HUB_CACHE=/app/model_cache \
52
- FONTCONFIG_PATH=/tmp/fontconfig
 
 
 
 
 
53
 
54
  # Expose port for Hugging Face Spaces
55
  EXPOSE 7860
56
 
57
  # Health check for Hugging Face Spaces
58
  HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
59
- CMD curl -f http://localhost:7860/api/system-info || exit 1
60
 
61
  # Preload models and start the application
62
- # Fixed: Ensure directories exist with proper permissions at runtime
63
  CMD ["python", "-c", "\
64
  import os; \
65
  import subprocess; \
@@ -67,8 +85,12 @@ import time; \
67
  print('Starting Multilingual Audio Intelligence System...'); \
68
  dirs = ['uploads', 'outputs', 'model_cache', 'temp_files', 'demo_results', '/tmp/matplotlib', '/tmp/fontconfig']; \
69
  [os.makedirs(d, mode=0o777, exist_ok=True) for d in dirs]; \
70
- subprocess.run(['python', 'model_preloader.py']); \
71
- print('Models loaded successfully'); \
 
 
 
 
72
  import uvicorn; \
73
  uvicorn.run('web_app:app', host='0.0.0.0', port=7860, workers=1, log_level='info')\
74
  "]
 
11
  curl \
12
  build-essential \
13
  libsndfile1 \
14
+ libasound2-dev \
15
+ portaudio19-dev \
16
+ libportaudio2 \
17
+ libportaudiocpp0 \
18
+ libsndfile1-dev \
19
+ libflac-dev \
20
+ libvorbis-dev \
21
+ libogg-dev \
22
+ libmp3lame-dev \
23
+ libmad0-dev \
24
+ libtwolame-dev \
25
+ libavcodec-dev \
26
+ libavformat-dev \
27
+ libavutil-dev \
28
+ libswresample-dev \
29
  && rm -rf /var/lib/apt/lists/*
30
 
31
  # Copy requirements first for better caching
32
  COPY requirements.txt .
33
 
34
+ # Install Python dependencies with proper error handling
35
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
36
  pip install --no-cache-dir -r requirements.txt
37
 
38
  # Copy application code
39
  COPY . .
40
 
41
  # Create necessary directories with proper permissions
 
42
  RUN mkdir -p templates static uploads outputs model_cache temp_files demo_results demo_audio \
43
  /tmp/matplotlib /tmp/fontconfig \
44
  && chmod -R 777 templates static \
 
63
  MPLCONFIGDIR=/tmp/matplotlib \
64
  HUGGINGFACE_HUB_CACHE=/app/model_cache \
65
  HF_HUB_CACHE=/app/model_cache \
66
+ FONTCONFIG_PATH=/tmp/fontconfig \
67
+ # Fix for audio processing libraries
68
+ CTRANSLATE2_FORCE_CPU_ISA=generic \
69
+ # Disable problematic features
70
+ TF_CPP_MIN_LOG_LEVEL=2 \
71
+ TOKENIZERS_PARALLELISM=false
72
 
73
  # Expose port for Hugging Face Spaces
74
  EXPOSE 7860
75
 
76
  # Health check for Hugging Face Spaces
77
  HEALTHCHECK --interval=30s --timeout=30s --start-period=60s --retries=3 \
78
+ CMD curl -f http://localhost:7860/health || exit 1
79
 
80
  # Preload models and start the application
 
81
  CMD ["python", "-c", "\
82
  import os; \
83
  import subprocess; \
 
85
  print('Starting Multilingual Audio Intelligence System...'); \
86
  dirs = ['uploads', 'outputs', 'model_cache', 'temp_files', 'demo_results', '/tmp/matplotlib', '/tmp/fontconfig']; \
87
  [os.makedirs(d, mode=0o777, exist_ok=True) for d in dirs]; \
88
+ try: \
89
+ subprocess.run(['python', 'model_preloader.py'], check=True); \
90
+ print('Models loaded successfully'); \
91
+ except Exception as e: \
92
+ print(f'Model preloading failed: {e}'); \
93
+ print('Continuing without preloaded models...'); \
94
  import uvicorn; \
95
  uvicorn.run('web_app:app', host='0.0.0.0', port=7860, workers=1, log_level='info')\
96
  "]
model_preloader.py CHANGED
@@ -21,7 +21,7 @@ from datetime import datetime
21
  # Core imports
22
  import torch
23
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
24
- from faster_whisper import WhisperModel
25
  from pyannote.audio import Pipeline
26
  from rich.console import Console
27
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
@@ -386,13 +386,11 @@ class ModelPreloader:
386
  logger.error(f"Pyannote loading failed: {e}")
387
  return None
388
 
389
- def load_whisper_model(self, task_id: str) -> Optional[WhisperModel]:
390
  """Load Whisper speech recognition model with enhanced cache checking."""
391
  try:
392
  console.print(f"[yellow]Loading Whisper model (small)...[/yellow]")
393
 
394
- # Determine compute type based on device
395
- compute_type = "int8" if self.device == "cpu" else "float16"
396
  whisper_cache_dir = self.cache_dir / "whisper"
397
 
398
  # Check if we have valid local files
@@ -403,21 +401,15 @@ class ModelPreloader:
403
  else:
404
  console.print(f"[yellow]No valid local Whisper cache found, will download...[/yellow]")
405
 
406
- # faster-whisper handles caching automatically, but we specify our cache dir
407
- model = WhisperModel(
408
- "small",
409
- device=self.device,
410
- compute_type=compute_type,
411
- download_root=str(whisper_cache_dir)
412
- )
413
 
414
  # Test the model with a dummy audio array
415
  import numpy as np
416
  dummy_audio = np.zeros(16000, dtype=np.float32) # 1 second of silence
417
- segments, info = model.transcribe(dummy_audio, language="en")
418
- list(segments) # Force evaluation
419
 
420
- console.print(f"[green]✓ Whisper model loaded successfully on {self.device} with {compute_type}[/green]")
421
 
422
  return model
423
 
 
21
  # Core imports
22
  import torch
23
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
24
+ import whisper
25
  from pyannote.audio import Pipeline
26
  from rich.console import Console
27
  from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
 
386
  logger.error(f"Pyannote loading failed: {e}")
387
  return None
388
 
389
+ def load_whisper_model(self, task_id: str) -> Optional[whisper.Whisper]:
390
  """Load Whisper speech recognition model with enhanced cache checking."""
391
  try:
392
  console.print(f"[yellow]Loading Whisper model (small)...[/yellow]")
393
 
 
 
394
  whisper_cache_dir = self.cache_dir / "whisper"
395
 
396
  # Check if we have valid local files
 
401
  else:
402
  console.print(f"[yellow]No valid local Whisper cache found, will download...[/yellow]")
403
 
404
+ # OpenAI Whisper handles caching automatically
405
+ model = whisper.load_model("small", device=self.device)
 
 
 
 
 
406
 
407
  # Test the model with a dummy audio array
408
  import numpy as np
409
  dummy_audio = np.zeros(16000, dtype=np.float32) # 1 second of silence
410
+ result = model.transcribe(dummy_audio, language="en")
 
411
 
412
+ console.print(f"[green]✓ Whisper model loaded successfully on {self.device}[/green]")
413
 
414
  return model
415
 
requirements.txt CHANGED
@@ -1,26 +1,31 @@
1
- # Python 3.9.23 Compatible Requirements
2
- # Tested and verified versions to avoid conflicts
3
 
4
- # Core ML Libraries (Python 3.9 compatible)
5
  torch==2.0.1
6
  torchvision==0.15.2
7
  torchaudio==2.0.2
8
- transformers==4.30.2
9
 
10
- # Audio Processing
11
  librosa==0.10.1
12
  pydub==0.25.1
13
  soundfile==0.12.1
14
- faster-whisper==0.8.0
 
15
  audioread==3.0.1
16
  ffmpeg-python==0.2.0
17
- moviepy==1.0.3
 
 
 
 
 
18
 
19
  # Performance & Optimization
20
  numba==0.58.1
21
  onnxruntime==1.16.3
22
  accelerate==0.20.3
23
- cython==3.0.6
24
 
25
  # Core Utilities
26
  numpy==1.24.3
@@ -38,79 +43,42 @@ fastapi==0.104.1
38
  uvicorn==0.24.0
39
  python-multipart==0.0.6
40
  jinja2==3.1.2
41
- fastapi-cors==0.0.6
42
  websockets==12.0
43
  aiofiles==23.2.1
44
  aiohttp==3.9.1
45
  httpx
46
 
47
  # Translation APIs
48
- googletrans==4.0.0rc1
49
  deep-translator==1.11.4
50
- google-cloud-translate==3.14.0
51
-
52
- # Database & Caching
53
- sqlalchemy==2.0.23
54
- alembic==1.12.1
55
- psycopg2-binary==2.9.9
56
- redis==5.0.1
57
-
58
- # Authentication & Security
59
- python-jose[cryptography]==3.3.0
60
- passlib[bcrypt]==1.7.4
61
- cryptography==41.0.7
62
- bcrypt==4.1.2
63
 
64
  # Scientific Computing
65
  scipy==1.11.4
66
  matplotlib==3.7.3
67
- seaborn==0.13.0
68
- plotly==5.17.0
69
- statsmodels==0.14.0
70
  scikit-learn==1.3.2
71
 
72
- # PS-6 Specific Dependencies
73
  speechbrain==0.5.16
74
- pyannote.audio==3.1.1
75
- demucs==4.0.0
76
  PyWavelets==1.4.1
77
 
78
  # NLP
79
  nltk==3.8.1
80
- spacy==3.7.2
81
  langdetect==1.0.9
82
 
83
  # Logging & Monitoring
84
  rich==13.7.0
85
- loguru==0.7.2
86
- structlog==23.2.0
87
- prometheus-client==0.19.0
88
- sentry-sdk==1.38.0
89
-
90
- # Testing & Development
91
- pytest==7.4.3
92
- pytest-asyncio==0.21.1
93
- pytest-cov==4.1.0
94
- black==23.11.0
95
- flake8==6.1.0
96
- isort==5.12.0
97
- mypy==1.7.1
98
- pylint==3.0.3
99
-
100
- # Documentation
101
- mkdocs==1.5.3
102
- mkdocs-material==9.4.8
103
- sphinx==7.2.6
104
 
105
  # Machine Learning
106
  tensorflow==2.15.0
107
 
108
- # Task Queues
109
- celery==5.3.4
110
- rq==1.15.1
111
-
112
  # Additional Dependencies
113
  huggingface-hub==0.16.4
114
  tokenizers
115
  sentencepiece==0.1.99
116
- protobuf==3.20.3
 
 
 
 
 
1
+ # Hugging Face Spaces Compatible Requirements
2
+ # Optimized for containerized deployment
3
 
4
+ # Core ML Libraries (HF Spaces compatible)
5
  torch==2.0.1
6
  torchvision==0.15.2
7
  torchaudio==2.0.2
8
+ transformers
9
 
10
+ # Audio Processing (Fixed versions for HF Spaces)
11
  librosa==0.10.1
12
  pydub==0.25.1
13
  soundfile==0.12.1
14
+ # Use openai-whisper instead of faster-whisper to avoid CTranslate2 issues
15
+ openai-whisper==20231117
16
  audioread==3.0.1
17
  ffmpeg-python==0.2.0
18
+
19
+ # Speaker Diarization (Essential for HF Spaces)
20
+ pyannote.audio==3.1.1
21
+ pyannote.core
22
+ pyannote.database
23
+ pyannote.metrics==3.2.1
24
 
25
  # Performance & Optimization
26
  numba==0.58.1
27
  onnxruntime==1.16.3
28
  accelerate==0.20.3
 
29
 
30
  # Core Utilities
31
  numpy==1.24.3
 
43
  uvicorn==0.24.0
44
  python-multipart==0.0.6
45
  jinja2==3.1.2
 
46
  websockets==12.0
47
  aiofiles==23.2.1
48
  aiohttp==3.9.1
49
  httpx
50
 
51
  # Translation APIs
52
+ googletrans
53
  deep-translator==1.11.4
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  # Scientific Computing
56
  scipy==1.11.4
57
  matplotlib==3.7.3
 
 
 
58
  scikit-learn==1.3.2
59
 
60
+ # PS-6 Specific Dependencies (HF Spaces compatible)
61
  speechbrain==0.5.16
62
+ # Remove demucs as it's causing issues in containers
63
+ # demucs==4.0.0
64
  PyWavelets==1.4.1
65
 
66
  # NLP
67
  nltk==3.8.1
 
68
  langdetect==1.0.9
69
 
70
  # Logging & Monitoring
71
  rich==13.7.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
  # Machine Learning
74
  tensorflow==2.15.0
75
 
 
 
 
 
76
  # Additional Dependencies
77
  huggingface-hub==0.16.4
78
  tokenizers
79
  sentencepiece==0.1.99
80
+ protobuf==3.20.3
81
+
82
+ # System dependencies for audio processing
83
+ webrtcvad==2.0.10
84
+ resampy==0.4.2
src/speech_recognizer.py CHANGED
@@ -1,21 +1,20 @@
1
  """
2
  Advanced Speech Recognition Module for Multilingual Audio Intelligence System
3
 
4
- This module implements state-of-the-art automatic speech recognition using faster-whisper
5
  with integrated language identification capabilities. Designed for maximum performance
6
  on CPU-constrained environments while maintaining SOTA accuracy.
7
 
8
  Key Features:
9
- - Faster-whisper with CTranslate2 backend for 4x speed improvement
10
  - Integrated Language Identification (no separate LID module needed)
11
- - VAD-based batching for 14.6x real-time performance on CPU
12
  - Word-level timestamps for interactive UI synchronization
13
- - INT8 quantization for memory efficiency
14
  - Robust error handling and multilingual support
15
  - CPU and GPU optimization paths
16
 
17
  Model: openai/whisper-small (optimized for speed/accuracy balance)
18
- Dependencies: faster-whisper, torch, numpy
19
  """
20
 
21
  import os
@@ -29,11 +28,11 @@ from dataclasses import dataclass
29
  import time
30
 
31
  try:
32
- from faster_whisper import WhisperModel, BatchedInferencePipeline
33
- FASTER_WHISPER_AVAILABLE = True
34
  except ImportError:
35
- FASTER_WHISPER_AVAILABLE = False
36
- logging.warning("faster-whisper not available. Install with: pip install faster-whisper")
37
 
38
  # Configure logging
39
  logging.basicConfig(level=logging.INFO)
@@ -48,719 +47,197 @@ warnings.filterwarnings("ignore", category=FutureWarning)
48
  class TranscriptionSegment:
49
  """
50
  Data class representing a transcribed speech segment with rich metadata.
51
-
52
- Attributes:
53
- start_time (float): Segment start time in seconds
54
- end_time (float): Segment end time in seconds
55
- text (str): Transcribed text in native script
56
- language (str): Detected language code (e.g., 'en', 'hi', 'ar')
57
- confidence (float): Overall transcription confidence
58
- word_timestamps (List[Dict]): Word-level timing information
59
- speaker_id (str): Associated speaker identifier (if provided)
60
  """
61
- start_time: float
62
- end_time: float
63
  text: str
64
  language: str
65
- confidence: float = 1.0
66
- word_timestamps: Optional[List[Dict]] = None
67
- speaker_id: Optional[str] = None
68
-
69
- @property
70
- def duration(self) -> float:
71
- """Duration of the segment in seconds."""
72
- return self.end_time - self.start_time
73
-
74
- def to_dict(self) -> dict:
75
- """Convert to dictionary for JSON serialization."""
76
- return {
77
- 'start_time': self.start_time,
78
- 'end_time': self.end_time,
79
- 'text': self.text,
80
- 'language': self.language,
81
- 'confidence': self.confidence,
82
- 'duration': self.duration,
83
- 'word_timestamps': self.word_timestamps or [],
84
- 'speaker_id': self.speaker_id
85
- }
86
 
87
 
88
  class SpeechRecognizer:
89
  """
90
- State-of-the-art speech recognition with integrated language identification.
91
 
92
- Uses faster-whisper for optimal performance on both CPU and GPU, with advanced
93
- batching strategies for maximum throughput on constrained hardware.
94
  """
95
 
96
- def __init__(self,
97
- model_size: str = "small",
98
- device: Optional[str] = None,
99
- compute_type: str = "int8",
100
- cpu_threads: Optional[int] = None,
101
- num_workers: int = 1,
102
- download_root: Optional[str] = None):
103
  """
104
- Initialize the Speech Recognizer with optimizations.
105
 
106
  Args:
107
- model_size (str): Whisper model size ('tiny', 'small', 'medium', 'large')
108
- device (str, optional): Device to run on ('cpu', 'cuda', 'auto')
109
- compute_type (str): Precision type ('int8', 'float16', 'float32')
110
- cpu_threads (int, optional): Number of CPU threads to use
111
- num_workers (int): Number of workers for batch processing
112
- download_root (str, optional): Directory to store model files
113
  """
114
  self.model_size = model_size
 
115
  self.compute_type = compute_type
116
- self.num_workers = num_workers
 
 
117
 
118
- # Device selection with intelligence
119
- if device == 'auto' or device is None:
 
120
  if torch.cuda.is_available():
121
- self.device = 'cuda'
122
- # Adjust compute type for GPU
123
- if compute_type == 'int8' and torch.cuda.is_available():
124
- self.compute_type = 'float16' # GPU prefers float16 over int8
125
  else:
126
- self.device = 'cpu'
127
- self.compute_type = 'int8' # CPU benefits from int8
128
- else:
129
- self.device = device
130
-
131
- # CPU thread optimization
132
- if cpu_threads is None:
133
- if self.device == 'cpu':
134
- cpu_threads = min(os.cpu_count() or 4, 4) # Cap at 4 for HF Spaces
135
- self.cpu_threads = cpu_threads
136
-
137
- logger.info(f"Initializing SpeechRecognizer: {model_size} on {self.device} "
138
- f"with {self.compute_type} precision")
139
-
140
- # Initialize models
141
- self.model = None
142
- self.batched_model = None
143
- self._load_models(download_root)
144
 
145
- def _load_models(self, download_root: Optional[str] = None):
146
- """Load both standard and batched Whisper models."""
147
- if not FASTER_WHISPER_AVAILABLE:
148
- raise ImportError(
149
- "faster-whisper is required for speech recognition. "
150
- "Install with: pip install faster-whisper"
151
- )
152
 
153
  try:
154
  logger.info(f"Loading {self.model_size} Whisper model...")
155
-
156
- # Set CPU threads for optimal performance
157
- if self.device == 'cpu' and self.cpu_threads:
158
- os.environ['OMP_NUM_THREADS'] = str(self.cpu_threads)
159
-
160
- # Load standard model
161
- self.model = WhisperModel(
162
- self.model_size,
163
- device=self.device,
164
- compute_type=self.compute_type,
165
- download_root=download_root,
166
- cpu_threads=self.cpu_threads
167
- )
168
-
169
- # Load batched model for improved throughput
170
- try:
171
- self.batched_model = BatchedInferencePipeline(
172
- model=self.model,
173
- chunk_length=30, # 30-second chunks
174
- batch_size=16 if self.device == 'cuda' else 8,
175
- use_vad_model=True, # VAD-based batching for massive speedup
176
- )
177
- logger.info("Batched inference pipeline loaded successfully")
178
- except Exception as e:
179
- logger.warning(f"Could not load batched pipeline: {e}. Using standard model.")
180
- self.batched_model = None
181
-
182
  logger.info(f"Speech recognition models loaded on {self.device}")
183
-
184
  except Exception as e:
185
- logger.error(f"Failed to load speech recognition models: {e}")
186
  raise
187
 
188
- def transcribe_audio(self,
189
- audio_input: Union[str, np.ndarray],
190
- sample_rate: int = 16000,
191
- language: Optional[str] = None,
192
- word_timestamps: bool = True,
193
- use_batching: bool = True) -> List[TranscriptionSegment]:
194
  """
195
- Transcribe audio with integrated language identification.
196
 
197
  Args:
198
- audio_input: Audio file path or numpy array
199
- sample_rate: Sample rate if audio_input is numpy array
200
- language: Language hint (optional, auto-detected if None)
201
- word_timestamps: Whether to generate word-level timestamps
202
- use_batching: Whether to use batched inference for speed
203
 
204
  Returns:
205
- List[TranscriptionSegment]: Transcription results with metadata
206
  """
207
  if self.model is None:
208
- raise RuntimeError("Model not loaded. Call _load_models() first.")
209
 
210
  try:
211
- # Prepare audio input
212
- audio_file = self._prepare_audio_input(audio_input, sample_rate)
213
-
214
- logger.info("Starting speech recognition...")
215
- start_time = time.time()
216
-
217
- # Choose processing method based on availability and preference
218
- if use_batching and self.batched_model is not None:
219
- segments = self._transcribe_batched(
220
- audio_file, language, word_timestamps
221
- )
222
- else:
223
- segments = self._transcribe_standard(
224
- audio_file, language, word_timestamps
225
- )
226
-
227
- processing_time = time.time() - start_time
228
- total_audio_duration = sum(seg.duration for seg in segments)
229
- rtf = processing_time / max(total_audio_duration, 0.1)
230
-
231
- logger.info(f"Transcription completed in {processing_time:.2f}s "
232
- f"(RTF: {rtf:.2f}x)")
233
- logger.info(f"Detected {len(set(seg.language for seg in segments))} languages, "
234
- f"{len(segments)} segments")
235
-
236
- return segments
237
-
238
- except Exception as e:
239
- logger.error(f"Transcription failed: {str(e)}")
240
- raise
241
-
242
- finally:
243
- # Clean up temporary files
244
- if isinstance(audio_input, np.ndarray):
245
- try:
246
- if hasattr(audio_file, 'name') and os.path.exists(audio_file.name):
247
- os.unlink(audio_file.name)
248
- except Exception:
249
- pass
250
-
251
- def _transcribe_batched(self,
252
- audio_file: str,
253
- language: Optional[str],
254
- word_timestamps: bool) -> List[TranscriptionSegment]:
255
- """Transcribe using batched inference for maximum speed."""
256
- try:
257
- # Use batched pipeline for optimal CPU performance
258
- result = self.batched_model(
259
- audio_file,
260
- language=language,
261
- word_level_timestamps=word_timestamps,
262
- batch_size=16 if self.device == 'cuda' else 8
263
  )
264
 
 
265
  segments = []
266
- for segment in result:
267
- # Extract word timestamps if available
268
- word_times = None
269
- if word_timestamps and hasattr(segment, 'words'):
270
- word_times = [
271
- {
272
- 'word': word.word,
273
- 'start': word.start,
274
- 'end': word.end,
275
- 'confidence': getattr(word, 'probability', 1.0)
276
- }
277
- for word in segment.words
278
- ]
279
 
280
- transcription_segment = TranscriptionSegment(
281
- start_time=segment.start,
282
- end_time=segment.end,
283
- text=segment.text.strip(),
284
- language=getattr(segment, 'language', language or 'unknown'),
285
- confidence=getattr(segment, 'avg_logprob', 1.0),
286
- word_timestamps=word_times
287
- )
288
- segments.append(transcription_segment)
289
 
290
  return segments
291
 
292
  except Exception as e:
293
- logger.warning(f"Batched transcription failed: {e}. Falling back to standard.")
294
- return self._transcribe_standard(audio_file, language, word_timestamps)
295
-
296
- def _transcribe_standard(self,
297
- audio_file: str,
298
- language: Optional[str],
299
- word_timestamps: bool) -> List[TranscriptionSegment]:
300
- """Transcribe using standard Whisper model."""
301
- segments, info = self.model.transcribe(
302
- audio_file,
303
- language=language,
304
- word_timestamps=word_timestamps,
305
- vad_filter=True, # Enable VAD filtering
306
- vad_parameters=dict(min_silence_duration_ms=500),
307
- beam_size=1, # Faster with beam_size=1 on CPU
308
- temperature=0.0 # Deterministic output
309
- )
310
-
311
- results = []
312
- for segment in segments:
313
- # Extract word timestamps
314
- word_times = None
315
- if word_timestamps and hasattr(segment, 'words') and segment.words:
316
- word_times = [
317
- {
318
- 'word': word.word,
319
- 'start': word.start,
320
- 'end': word.end,
321
- 'confidence': getattr(word, 'probability', 1.0)
322
- }
323
- for word in segment.words
324
- ]
325
-
326
- transcription_segment = TranscriptionSegment(
327
- start_time=segment.start,
328
- end_time=segment.end,
329
- text=segment.text.strip(),
330
- language=info.language,
331
- confidence=getattr(segment, 'avg_logprob', 1.0),
332
- word_timestamps=word_times
333
- )
334
- results.append(transcription_segment)
335
-
336
- return results
337
 
338
- def transcribe_segments(self,
339
- audio_array: np.ndarray,
340
- sample_rate: int,
341
- speaker_segments: List[Tuple[float, float, str]],
342
- word_timestamps: bool = True) -> List[TranscriptionSegment]:
343
  """
344
- Transcribe pre-segmented audio chunks from speaker diarization.
345
 
346
  Args:
347
- audio_array: Full audio as numpy array
348
- sample_rate: Audio sample rate
349
- speaker_segments: List of (start_time, end_time, speaker_id) tuples
350
- word_timestamps: Whether to generate word-level timestamps
351
 
352
  Returns:
353
- List[TranscriptionSegment]: Transcribed segments with speaker attribution
354
  """
355
- if not speaker_segments:
356
- return []
357
-
358
  try:
359
- segments_to_process = []
360
-
361
- # Extract audio chunks for each speaker segment
362
- for start_time, end_time, speaker_id in speaker_segments:
363
- start_sample = int(start_time * sample_rate)
364
- end_sample = int(end_time * sample_rate)
365
-
366
- # Extract audio chunk
367
- audio_chunk = audio_array[start_sample:end_sample]
368
-
369
- # Skip very short segments
370
- if len(audio_chunk) < sample_rate * 0.1: # Less than 100ms
371
- continue
372
-
373
- segments_to_process.append({
374
- 'audio': audio_chunk,
375
- 'start_time': start_time,
376
- 'end_time': end_time,
377
- 'speaker_id': speaker_id
378
- })
379
-
380
- # Process segments in batches for efficiency
381
- all_results = []
382
- batch_size = 8 if self.device == 'cuda' else 4
383
-
384
- for i in range(0, len(segments_to_process), batch_size):
385
- batch = segments_to_process[i:i + batch_size]
386
- batch_results = self._process_segment_batch(
387
- batch, sample_rate, word_timestamps
388
- )
389
- all_results.extend(batch_results)
390
 
391
- return all_results
392
 
393
  except Exception as e:
394
- logger.error(f"Segment transcription failed: {e}")
395
- return []
396
-
397
- def _process_segment_batch(self,
398
- segment_batch: List[Dict],
399
- sample_rate: int,
400
- word_timestamps: bool) -> List[TranscriptionSegment]:
401
- """Process a batch of audio segments efficiently."""
402
- results = []
403
-
404
- for segment_info in segment_batch:
405
- try:
406
- # Save audio chunk to temporary file
407
- temp_file = tempfile.NamedTemporaryFile(
408
- delete=False, suffix='.wav', prefix='segment_'
409
- )
410
-
411
- # Use soundfile for saving if available
412
- try:
413
- import soundfile as sf
414
- sf.write(temp_file.name, segment_info['audio'], sample_rate)
415
- except ImportError:
416
- # Fallback to scipy
417
- from scipy.io import wavfile
418
- wavfile.write(temp_file.name, sample_rate,
419
- (segment_info['audio'] * 32767).astype(np.int16))
420
-
421
- temp_file.close()
422
-
423
- # Transcribe the segment
424
- transcription_segments = self.transcribe_audio(
425
- temp_file.name,
426
- sample_rate=sample_rate,
427
- word_timestamps=word_timestamps,
428
- use_batching=False # Already batching at higher level
429
- )
430
-
431
- # Adjust timestamps and add speaker info
432
- for ts in transcription_segments:
433
- # Adjust timestamps to global timeline
434
- time_offset = segment_info['start_time']
435
- ts.start_time += time_offset
436
- ts.end_time += time_offset
437
- ts.speaker_id = segment_info['speaker_id']
438
-
439
- # Adjust word timestamps
440
- if ts.word_timestamps:
441
- for word in ts.word_timestamps:
442
- word['start'] += time_offset
443
- word['end'] += time_offset
444
-
445
- results.append(ts)
446
-
447
- except Exception as e:
448
- logger.warning(f"Failed to transcribe segment: {e}")
449
- continue
450
-
451
- finally:
452
- # Clean up temporary file
453
- try:
454
- if os.path.exists(temp_file.name):
455
- os.unlink(temp_file.name)
456
- except Exception:
457
- pass
458
-
459
- return results
460
-
461
- def _prepare_audio_input(self,
462
- audio_input: Union[str, np.ndarray],
463
- sample_rate: int) -> str:
464
- """Prepare audio input for Whisper processing."""
465
- if isinstance(audio_input, str):
466
- if not os.path.exists(audio_input):
467
- raise FileNotFoundError(f"Audio file not found: {audio_input}")
468
- return audio_input
469
-
470
- elif isinstance(audio_input, np.ndarray):
471
- return self._save_array_to_tempfile(audio_input, sample_rate)
472
-
473
- else:
474
- raise ValueError(f"Unsupported audio input type: {type(audio_input)}")
475
-
476
- def _save_array_to_tempfile(self, audio_array: np.ndarray, sample_rate: int) -> str:
477
- """Save numpy array to temporary WAV file."""
478
- try:
479
- import soundfile as sf
480
-
481
- temp_file = tempfile.NamedTemporaryFile(
482
- delete=False, suffix='.wav', prefix='whisper_'
483
- )
484
- temp_path = temp_file.name
485
- temp_file.close()
486
-
487
- # Ensure audio is mono
488
- if len(audio_array.shape) > 1:
489
- audio_array = audio_array.mean(axis=1)
490
-
491
- # Normalize audio
492
- if np.max(np.abs(audio_array)) > 1.0:
493
- audio_array = audio_array / np.max(np.abs(audio_array))
494
-
495
- sf.write(temp_path, audio_array, sample_rate)
496
- logger.debug(f"Saved audio array to: {temp_path}")
497
- return temp_path
498
-
499
- except ImportError:
500
- # Fallback to scipy
501
- try:
502
- from scipy.io import wavfile
503
-
504
- temp_file = tempfile.NamedTemporaryFile(
505
- delete=False, suffix='.wav', prefix='whisper_'
506
- )
507
- temp_path = temp_file.name
508
- temp_file.close()
509
-
510
- # Convert to 16-bit int
511
- audio_int16 = (audio_array * 32767).astype(np.int16)
512
- wavfile.write(temp_path, sample_rate, audio_int16)
513
-
514
- return temp_path
515
-
516
- except ImportError:
517
- raise ImportError(
518
- "Neither soundfile nor scipy available. "
519
- "Install with: pip install soundfile"
520
- )
521
 
522
  def get_supported_languages(self) -> List[str]:
523
- """Get list of supported languages."""
524
- # Whisper supports 99 languages
525
  return [
526
- 'en', 'zh', 'de', 'es', 'ru', 'ko', 'fr', 'ja', 'pt', 'tr', 'pl', 'ca', 'nl',
527
- 'ar', 'sv', 'it', 'id', 'hi', 'fi', 'vi', 'he', 'uk', 'el', 'ms', 'cs', 'ro',
528
- 'da', 'hu', 'ta', 'no', 'th', 'ur', 'hr', 'bg', 'lt', 'la', 'mi', 'ml', 'cy',
529
- 'sk', 'te', 'fa', 'lv', 'bn', 'sr', 'az', 'sl', 'kn', 'et', 'mk', 'br', 'eu',
530
- 'is', 'hy', 'ne', 'mn', 'bs', 'kk', 'sq', 'sw', 'gl', 'mr', 'pa', 'si', 'km',
531
- 'sn', 'yo', 'so', 'af', 'oc', 'ka', 'be', 'tg', 'sd', 'gu', 'am', 'yi', 'lo',
532
- 'uz', 'fo', 'ht', 'ps', 'tk', 'nn', 'mt', 'sa', 'lb', 'my', 'bo', 'tl', 'mg',
533
- 'as', 'tt', 'haw', 'ln', 'ha', 'ba', 'jw', 'su'
534
  ]
535
 
536
- def benchmark_performance(self, audio_file: str) -> Dict[str, float]:
537
- """Benchmark transcription performance on given audio file."""
538
- try:
539
- # Get audio duration
540
- import librosa
541
- duration = librosa.get_duration(filename=audio_file)
 
542
 
543
- # Test standard transcription
544
- start_time = time.time()
545
- segments_standard = self.transcribe_audio(
546
- audio_file, use_batching=False, word_timestamps=False
547
- )
548
- standard_time = time.time() - start_time
 
 
549
 
550
- # Test batched transcription (if available)
551
- batched_time = None
552
- if self.batched_model:
553
- start_time = time.time()
554
- segments_batched = self.transcribe_audio(
555
- audio_file, use_batching=True, word_timestamps=False
556
- )
557
- batched_time = time.time() - start_time
558
 
559
- return {
560
- 'audio_duration': duration,
561
- 'standard_processing_time': standard_time,
562
- 'batched_processing_time': batched_time,
563
- 'standard_rtf': standard_time / duration,
564
- 'batched_rtf': batched_time / duration if batched_time else None,
565
- 'speedup': standard_time / batched_time if batched_time else None
566
- }
567
 
568
  except Exception as e:
569
- logger.error(f"Benchmark failed: {e}")
570
- return {}
571
-
572
- def __del__(self):
573
- """Cleanup resources."""
574
- if hasattr(self, 'device') and 'cuda' in str(self.device):
575
- try:
576
- torch.cuda.empty_cache()
577
- except Exception:
578
- pass
579
 
580
 
581
- # Convenience function for easy usage
582
- def transcribe_audio(audio_input: Union[str, np.ndarray],
583
- sample_rate: int = 16000,
584
- model_size: str = "small",
585
- language: Optional[str] = None,
586
- device: Optional[str] = None,
587
- word_timestamps: bool = True) -> List[TranscriptionSegment]:
588
  """
589
- Convenience function to transcribe audio with optimal settings.
590
 
591
  Args:
592
- audio_input: Audio file path or numpy array
593
- sample_rate: Sample rate for numpy array input
594
- model_size: Whisper model size ('tiny', 'small', 'medium', 'large')
595
- language: Language hint (auto-detected if None)
596
- device: Device to run on ('cpu', 'cuda', 'auto')
597
- word_timestamps: Whether to generate word-level timestamps
598
 
599
  Returns:
600
- List[TranscriptionSegment]: Transcription results
601
-
602
- Example:
603
- >>> # Transcribe from file
604
- >>> segments = transcribe_audio("meeting.wav")
605
- >>>
606
- >>> # Transcribe numpy array
607
- >>> import numpy as np
608
- >>> audio_data = np.random.randn(16000 * 10) # 10 seconds
609
- >>> segments = transcribe_audio(audio_data, sample_rate=16000)
610
- >>>
611
- >>> # Print results
612
- >>> for seg in segments:
613
- >>> print(f"[{seg.start_time:.1f}-{seg.end_time:.1f}] "
614
- >>> f"({seg.language}): {seg.text}")
615
  """
616
- recognizer = SpeechRecognizer(
617
- model_size=model_size,
618
- device=device
619
- )
620
-
621
- return recognizer.transcribe_audio(
622
- audio_input=audio_input,
623
- sample_rate=sample_rate,
624
- language=language,
625
- word_timestamps=word_timestamps
626
- )
627
-
628
-
629
- # Example usage and testing
630
- if __name__ == "__main__":
631
- import sys
632
- import argparse
633
- import json
634
-
635
- def main():
636
- """Command line interface for testing speech recognition."""
637
- parser = argparse.ArgumentParser(description="Advanced Speech Recognition Tool")
638
- parser.add_argument("audio_file", help="Path to audio file")
639
- parser.add_argument("--model-size", choices=["tiny", "small", "medium", "large"],
640
- default="small", help="Whisper model size")
641
- parser.add_argument("--language", help="Language hint (auto-detected if not provided)")
642
- parser.add_argument("--device", choices=["cpu", "cuda", "auto"], default="auto",
643
- help="Device to run on")
644
- parser.add_argument("--no-word-timestamps", action="store_true",
645
- help="Disable word-level timestamps")
646
- parser.add_argument("--no-batching", action="store_true",
647
- help="Disable batched inference")
648
- parser.add_argument("--output-format", choices=["json", "text", "srt"],
649
- default="text", help="Output format")
650
- parser.add_argument("--benchmark", action="store_true",
651
- help="Run performance benchmark")
652
- parser.add_argument("--verbose", "-v", action="store_true",
653
- help="Enable verbose logging")
654
-
655
- args = parser.parse_args()
656
-
657
- if args.verbose:
658
- logging.getLogger().setLevel(logging.DEBUG)
659
-
660
- try:
661
- print(f"Processing audio file: {args.audio_file}")
662
-
663
- recognizer = SpeechRecognizer(
664
- model_size=args.model_size,
665
- device=args.device
666
- )
667
-
668
- if args.benchmark:
669
- print("\n=== PERFORMANCE BENCHMARK ===")
670
- benchmark = recognizer.benchmark_performance(args.audio_file)
671
- for key, value in benchmark.items():
672
- if value is not None:
673
- print(f"{key}: {value:.3f}")
674
- print()
675
-
676
- # Transcribe audio
677
- segments = recognizer.transcribe_audio(
678
- audio_input=args.audio_file,
679
- language=args.language,
680
- word_timestamps=not args.no_word_timestamps,
681
- use_batching=not args.no_batching
682
- )
683
-
684
- # Output results
685
- if args.output_format == "json":
686
- result = {
687
- "audio_file": args.audio_file,
688
- "num_segments": len(segments),
689
- "languages": list(set(seg.language for seg in segments)),
690
- "total_duration": sum(seg.duration for seg in segments),
691
- "segments": [seg.to_dict() for seg in segments]
692
- }
693
- print(json.dumps(result, indent=2, ensure_ascii=False))
694
-
695
- elif args.output_format == "srt":
696
- for i, segment in enumerate(segments, 1):
697
- start_time = f"{int(segment.start_time//3600):02d}:{int((segment.start_time%3600)//60):02d}:{segment.start_time%60:06.3f}".replace('.', ',')
698
- end_time = f"{int(segment.end_time//3600):02d}:{int((segment.end_time%3600)//60):02d}:{segment.end_time%60:06.3f}".replace('.', ',')
699
- print(f"{i}")
700
- print(f"{start_time} --> {end_time}")
701
- print(f"{segment.text}")
702
- print()
703
-
704
- else: # text format
705
- print(f"\n=== SPEECH RECOGNITION RESULTS ===")
706
- print(f"Audio file: {args.audio_file}")
707
- print(f"Model: {args.model_size}")
708
- print(f"Device: {recognizer.device}")
709
- print(f"Languages detected: {', '.join(set(seg.language for seg in segments))}")
710
- print(f"Total segments: {len(segments)}")
711
- print(f"Total speech duration: {sum(seg.duration for seg in segments):.1f}s")
712
- print("\n--- Transcription ---")
713
-
714
- for i, segment in enumerate(segments, 1):
715
- speaker_info = f" [{segment.speaker_id}]" if segment.speaker_id else ""
716
- print(f"#{i:2d} | {segment.start_time:7.1f}s - {segment.end_time:7.1f}s | "
717
- f"({segment.language}){speaker_info}")
718
- print(f" | {segment.text}")
719
-
720
- if segment.word_timestamps and args.verbose:
721
- print(" | Word timestamps:")
722
- for word in segment.word_timestamps[:5]: # Show first 5 words
723
- print(f" | '{word['word']}': {word['start']:.1f}s-{word['end']:.1f}s")
724
- if len(segment.word_timestamps) > 5:
725
- print(f" | ... and {len(segment.word_timestamps)-5} more words")
726
- print()
727
-
728
- except Exception as e:
729
- print(f"Error: {e}", file=sys.stderr)
730
- sys.exit(1)
731
-
732
- # Run CLI if script is executed directly
733
- if not FASTER_WHISPER_AVAILABLE:
734
- print("Warning: faster-whisper not available. Install with: pip install faster-whisper")
735
- print("Running in demo mode...")
736
-
737
- # Create dummy segments for testing
738
- dummy_segments = [
739
- TranscriptionSegment(
740
- start_time=0.0, end_time=3.5, text="Hello, how are you today?",
741
- language="en", confidence=0.95,
742
- word_timestamps=[
743
- {"word": "Hello", "start": 0.0, "end": 0.5, "confidence": 0.99},
744
- {"word": "how", "start": 1.0, "end": 1.2, "confidence": 0.98},
745
- {"word": "are", "start": 1.3, "end": 1.5, "confidence": 0.97},
746
- {"word": "you", "start": 1.6, "end": 1.9, "confidence": 0.98},
747
- {"word": "today", "start": 2.5, "end": 3.2, "confidence": 0.96}
748
- ]
749
- ),
750
- TranscriptionSegment(
751
- start_time=4.0, end_time=7.8, text="Bonjour, comment allez-vous?",
752
- language="fr", confidence=0.92
753
- ),
754
- TranscriptionSegment(
755
- start_time=8.5, end_time=12.1, text="मैं ठीक हूँ, धन्यवाद।",
756
- language="hi", confidence=0.89
757
- )
758
- ]
759
-
760
- print("\n=== DEMO OUTPUT (faster-whisper not available) ===")
761
- for i, segment in enumerate(dummy_segments, 1):
762
- print(f"#{i} | {segment.start_time:.1f}s - {segment.end_time:.1f}s | "
763
- f"({segment.language})")
764
- print(f" | {segment.text}")
765
- else:
766
- main()
 
1
  """
2
  Advanced Speech Recognition Module for Multilingual Audio Intelligence System
3
 
4
+ This module implements state-of-the-art automatic speech recognition using openai-whisper
5
  with integrated language identification capabilities. Designed for maximum performance
6
  on CPU-constrained environments while maintaining SOTA accuracy.
7
 
8
  Key Features:
9
+ - OpenAI Whisper with optimized backend for speed improvement
10
  - Integrated Language Identification (no separate LID module needed)
11
+ - VAD-based batching for real-time performance on CPU
12
  - Word-level timestamps for interactive UI synchronization
 
13
  - Robust error handling and multilingual support
14
  - CPU and GPU optimization paths
15
 
16
  Model: openai/whisper-small (optimized for speed/accuracy balance)
17
+ Dependencies: openai-whisper, torch, numpy
18
  """
19
 
20
  import os
 
28
  import time
29
 
30
  try:
31
+ import whisper
32
+ WHISPER_AVAILABLE = True
33
  except ImportError:
34
+ WHISPER_AVAILABLE = False
35
+ logging.warning("openai-whisper not available. Install with: pip install openai-whisper")
36
 
37
  # Configure logging
38
  logging.basicConfig(level=logging.INFO)
 
47
  class TranscriptionSegment:
48
  """
49
  Data class representing a transcribed speech segment with rich metadata.
 
 
 
 
 
 
 
 
 
50
  """
51
+ start: float
52
+ end: float
53
  text: str
54
  language: str
55
+ language_probability: float
56
+ no_speech_probability: float
57
+ words: Optional[List[Dict]] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
 
60
  class SpeechRecognizer:
61
  """
62
+ Advanced Speech Recognition Engine using OpenAI Whisper.
63
 
64
+ This class provides high-performance speech recognition with integrated language
65
+ identification, optimized for both CPU and GPU environments.
66
  """
67
 
68
+ def __init__(self, model_size: str = "small", device: str = "auto",
69
+ compute_type: str = "int8", language: Optional[str] = None):
 
 
 
 
 
70
  """
71
+ Initialize the Speech Recognizer.
72
 
73
  Args:
74
+ model_size: Whisper model size (tiny, base, small, medium, large)
75
+ device: Device to use (auto, cpu, cuda)
76
+ compute_type: Computation precision (int8, float16, float32)
77
+ language: Target language code (None for auto-detection)
 
 
78
  """
79
  self.model_size = model_size
80
+ self.device = self._determine_device(device)
81
  self.compute_type = compute_type
82
+ self.language = language
83
+ self.model = None
84
+ self._initialize_model()
85
 
86
+ def _determine_device(self, device: str) -> str:
87
+ """Determine the best available device."""
88
+ if device == "auto":
89
  if torch.cuda.is_available():
90
+ return "cuda"
91
+ elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
92
+ return "mps"
 
93
  else:
94
+ return "cpu"
95
+ return device
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ def _initialize_model(self):
98
+ """Initialize the Whisper model."""
99
+ if not WHISPER_AVAILABLE:
100
+ raise ImportError("openai-whisper is required. Install with: pip install openai-whisper")
 
 
 
101
 
102
  try:
103
  logger.info(f"Loading {self.model_size} Whisper model...")
104
+ self.model = whisper.load_model(self.model_size, device=self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  logger.info(f"Speech recognition models loaded on {self.device}")
 
106
  except Exception as e:
107
+ logger.error(f"Failed to load Whisper model: {e}")
108
  raise
109
 
110
+ def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000,
111
+ language: Optional[str] = None,
112
+ initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]:
 
 
 
113
  """
114
+ Transcribe audio data with language identification.
115
 
116
  Args:
117
+ audio_data: Audio data as numpy array
118
+ sample_rate: Sample rate of the audio
119
+ language: Language code (None for auto-detection)
120
+ initial_prompt: Initial prompt for better transcription
 
121
 
122
  Returns:
123
+ List of TranscriptionSegment objects
124
  """
125
  if self.model is None:
126
+ raise RuntimeError("Model not initialized")
127
 
128
  try:
129
+ # Prepare audio for Whisper (expects 16kHz)
130
+ if sample_rate != 16000:
131
+ import librosa
132
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
133
+
134
+ # Transcribe with Whisper
135
+ result = self.model.transcribe(
136
+ audio_data,
137
+ language=language or self.language,
138
+ initial_prompt=initial_prompt,
139
+ word_timestamps=True,
140
+ verbose=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  )
142
 
143
+ # Convert to our format
144
  segments = []
145
+ for segment in result["segments"]:
146
+ words = []
147
+ if "words" in segment:
148
+ for word in segment["words"]:
149
+ words.append({
150
+ "word": word["word"],
151
+ "start": word["start"],
152
+ "end": word["end"],
153
+ "probability": word.get("probability", 1.0)
154
+ })
 
 
 
155
 
156
+ segments.append(TranscriptionSegment(
157
+ start=segment["start"],
158
+ end=segment["end"],
159
+ text=segment["text"].strip(),
160
+ language=result.get("language", "unknown"),
161
+ language_probability=result.get("language_probability", 1.0),
162
+ no_speech_probability=segment.get("no_speech_prob", 0.0),
163
+ words=words
164
+ ))
165
 
166
  return segments
167
 
168
  except Exception as e:
169
+ logger.error(f"Transcription failed: {e}")
170
+ raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
+ def transcribe_file(self, file_path: str, language: Optional[str] = None,
173
+ initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]:
 
 
 
174
  """
175
+ Transcribe an audio file.
176
 
177
  Args:
178
+ file_path: Path to audio file
179
+ language: Language code (None for auto-detection)
180
+ initial_prompt: Initial prompt for better transcription
 
181
 
182
  Returns:
183
+ List of TranscriptionSegment objects
184
  """
 
 
 
185
  try:
186
+ # Load audio file
187
+ import librosa
188
+ audio_data, sample_rate = librosa.load(file_path, sr=16000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
+ return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt)
191
 
192
  except Exception as e:
193
+ logger.error(f"File transcription failed: {e}")
194
+ raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  def get_supported_languages(self) -> List[str]:
197
+ """Get list of supported language codes."""
 
198
  return [
199
+ "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su"
 
 
 
 
 
 
 
200
  ]
201
 
202
+ def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]:
203
+ """
204
+ Detect the language of audio data.
205
+
206
+ Args:
207
+ audio_data: Audio data as numpy array
208
+ sample_rate: Sample rate of the audio
209
 
210
+ Returns:
211
+ Tuple of (language_code, confidence)
212
+ """
213
+ try:
214
+ # Prepare audio for Whisper
215
+ if sample_rate != 16000:
216
+ import librosa
217
+ audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
218
 
219
+ # Detect language using Whisper
220
+ result = self.model.transcribe(audio_data, language=None, verbose=False)
 
 
 
 
 
 
221
 
222
+ return result.get("language", "unknown"), result.get("language_probability", 0.0)
 
 
 
 
 
 
 
223
 
224
  except Exception as e:
225
+ logger.error(f"Language detection failed: {e}")
226
+ return "unknown", 0.0
 
 
 
 
 
 
 
 
227
 
228
 
229
+ def create_speech_recognizer(model_size: str = "small", device: str = "auto",
230
+ compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer:
 
 
 
 
 
231
  """
232
+ Factory function to create a SpeechRecognizer instance.
233
 
234
  Args:
235
+ model_size: Whisper model size
236
+ device: Device to use
237
+ compute_type: Computation precision
238
+ language: Target language code
 
 
239
 
240
  Returns:
241
+ SpeechRecognizer instance
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  """
243
+ return SpeechRecognizer(model_size, device, compute_type, language)