Tijs Zwinkels committed on
Commit 3696fef · 1 Parent(s): 531418a

Use OpenAI API word-level timestamps

Files changed (1)
  1. whisper_online.py +10 -26
whisper_online.py CHANGED
@@ -176,30 +176,14 @@ class OpenaiApiASR(ASRBase):
 
     def ts_words(self, segments):
         o = []
-        for segment in segments:
-            # If VAD on, skip segments containing no speech.
-            # TODO: threshold can be set from outside
-            if self.use_vad and segment["no_speech_prob"] > 0.8:
-                continue
-
-            # Splitting the text into words and filtering out empty strings
-            words = [word.strip() for word in segment["text"].split() if word.strip()]
-
-            if not words:
-                continue
+        # If VAD on, skip segments containing no speech.
+        # TODO: threshold can be set from outside
+        # TODO: Make VAD work again with word-level timestamps
+        #if self.use_vad and segment["no_speech_prob"] > 0.8:
+        #    continue
 
-            # Assign start and end times for each word
-            # We only have timestamps per segment, so interpolating start and end-times
-
-
-            segment_duration = segment["end"] - segment["start"]
-            total_characters = sum(len(word) for word in words)
-            duration_per_character = segment_duration / total_characters
-            start_time = segment["start"]
-            for word in words:
-                end_time = start_time + duration_per_character * len(word)
-                o.append((start_time, end_time, word))
-                start_time = end_time
+        for word in segments:
+            o.append((word.get("start"), word.get("end"), word.get("word")))
 
         return o
 
@@ -220,7 +204,8 @@ class OpenaiApiASR(ASRBase):
             "model": self.modelname,
             "file": buffer,
             "response_format": self.response_format,
-            "temperature": self.temperature
+            "temperature": self.temperature,
+            "timestamp_granularities": ["word"]
         }
         if self.task != "translate" and self.language:
             params["language"] = self.language
@@ -233,11 +218,10 @@ class OpenaiApiASR(ASRBase):
         proc = self.client.audio.transcriptions
 
         # Process transcription/translation
-
         transcript = proc.create(**params)
         print(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds",file=self.logfile)
 
-        return transcript.segments
+        return transcript.words
 
     def use_vad(self):
         self.use_vad = True
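For reference, below is a minimal standalone sketch of the request this commit switches to. The model name "whisper-1" and the file path are illustrative assumptions, not taken from the diff; note that the OpenAI API only returns word-level timestamps when response_format is "verbose_json", which is why the class's response_format is left in the params dict alongside the new timestamp_granularities entry.

# Minimal sketch: request word-level timestamps from the OpenAI
# transcription endpoint (openai>=1.x Python client assumed).
# Model name and audio path are illustrative.
from openai import OpenAI

client = OpenAI()

with open("audio.wav", "rb") as f:
    transcript = client.audio.transcriptions.create(
        model="whisper-1",
        file=f,
        response_format="verbose_json",      # required for word timestamps
        timestamp_granularities=["word"],
    )

# transcript.words holds per-word entries with word/start/end fields;
# depending on the client version these may be dicts or typed objects,
# so normalize before indexing.
for w in transcript.words:
    entry = w if isinstance(w, dict) else w.model_dump()
    print(entry["start"], entry["end"], entry["word"])

Each resulting (start, end, word) triple is exactly the shape the new ts_words() emits, replacing the old scheme that interpolated per-word times from segment boundaries in proportion to character counts.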