Michael Hu committed on
Commit
22eccbb
·
1 Parent(s): b10a453

chore: remove unused dependencies and update model selection in Whisper provider

Browse files

- Remove unused dependencies from requirements.txt
- Update Whisper provider to use the correct model name when loading models

requirements.txt CHANGED
@@ -9,7 +9,5 @@ scipy>=1.11
9
  munch>=2.5
10
  accelerate>=1.2.0
11
  soundfile>=0.13.0
12
- ordered-set>=4.1.0
13
- phonemizer-fork>=3.3.2
14
  faster-whisper
15
  chatterbox-tts
 
9
  munch>=2.5
10
  accelerate>=1.2.0
11
  soundfile>=0.13.0
 
 
12
  faster-whisper
13
  chatterbox-tts
src/infrastructure/stt/whisper_provider.py CHANGED
@@ -46,10 +46,15 @@ class WhisperSTTProvider(STTProviderBase):
46
 
47
  Args:
48
  audio_path: Path to the preprocessed audio file
 
49
  Returns:
50
  str: The transcribed text
51
  """
52
  try:
 
 
 
 
53
  # Perform transcription
54
  segments, info = self.model.transcribe(
55
  str(audio_path),
@@ -73,27 +78,48 @@ class WhisperSTTProvider(STTProviderBase):
73
  except Exception as e:
74
  self._handle_provider_error(e, "transcription")
75
 
76
- def _load_model(self):
77
  """
78
- Load the Whisper model.
 
 
 
79
  """
80
  try:
81
  from faster_whisper import WhisperModel as FasterWhisperModel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
 
 
83
  logger.info(f"Using device: {self._device}, compute_type: {self._compute_type}")
84
 
85
  self.model = FasterWhisperModel(
86
- 'large-v3',
87
  device=self._device,
88
  compute_type=self._compute_type
89
  )
90
 
91
  except ImportError as e:
92
  raise SpeechRecognitionException(
93
- "faster-whisper not available. Please install with: pip install faster-whisper"
94
  ) from e
95
  except Exception as e:
96
- raise SpeechRecognitionException(f"Failed to load Whisper model 'large-v3'") from e
97
 
98
  def is_available(self) -> bool:
99
  """
@@ -137,4 +163,4 @@ class WhisperSTTProvider(STTProviderBase):
137
  Returns:
138
  str: Default model name
139
  """
140
- return "large-v3"
 
46
 
47
  Args:
48
  audio_path: Path to the preprocessed audio file
49
+ model: The model name to use
50
  Returns:
51
  str: The transcribed text
52
  """
53
  try:
54
+ # Lazy load model if not already loaded
55
+ if self.model is None:
56
+ self._load_model(model)
57
+
58
  # Perform transcription
59
  segments, info = self.model.transcribe(
60
  str(audio_path),
 
78
  except Exception as e:
79
  self._handle_provider_error(e, "transcription")
80
 
81
+ def _load_model(self, model_name: str):
82
  """
83
+ Load the Whisper model based on the requested model name.
84
+
85
+ Args:
86
+ model_name: The requested model name (e.g., "whisper-large")
87
  """
88
  try:
89
  from faster_whisper import WhisperModel as FasterWhisperModel
90
+
91
+ # Map requested model to actual faster-whisper model
92
+ model_mapping = {
93
+ "whisper-large": "large-v3",
94
+ "whisper-large-v1": "large-v1",
95
+ "whisper-large-v2": "large-v2",
96
+ "whisper-large-v3": "large-v3",
97
+ "whisper-medium": "medium",
98
+ "whisper-medium.en": "medium.en",
99
+ "whisper-small": "small",
100
+ "whisper-small.en": "small.en",
101
+ "whisper-base": "base",
102
+ "whisper-base.en": "base.en",
103
+ "whisper-tiny": "tiny",
104
+ "whisper-tiny.en": "tiny.en",
105
+ }
106
 
107
+ actual_model = model_mapping.get(model_name.lower(), "large-v3")
108
+ logger.info(f"Loading Whisper model: {actual_model} (requested: {model_name})")
109
  logger.info(f"Using device: {self._device}, compute_type: {self._compute_type}")
110
 
111
  self.model = FasterWhisperModel(
112
+ actual_model,
113
  device=self._device,
114
  compute_type=self._compute_type
115
  )
116
 
117
  except ImportError as e:
118
  raise SpeechRecognitionException(
119
+ "faster-whisper not available. Please install with: uv add faster-whisper"
120
  ) from e
121
  except Exception as e:
122
+ raise SpeechRecognitionException(f"Failed to load Whisper model '{actual_model}' (requested: {model_name})") from e
123
 
124
  def is_available(self) -> bool:
125
  """
 
163
  Returns:
164
  str: Default model name
165
  """
166
+ return "whisper-large"