qqwjq1981 committed on
Commit
fee63e8
·
verified ·
1 Parent(s): 37fa3bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -1
app.py CHANGED
@@ -62,6 +62,7 @@ client = OpenAI(
62
  api_key= os.environ.get("openAI_api_key"), # This is the default and can be omitted
63
  )
64
  hf_api_key = os.environ.get("hf_token")
 
65
 
66
  def silence(duration, fps=44100):
67
  """
@@ -274,6 +275,140 @@ def transcribe_video_with_speakers(video_path):
274
 
275
  return transcript_with_speakers, detected_language
276
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  # Function to get the appropriate translation model based on target language
278
  def get_translation_model(source_language, target_language):
279
  """
@@ -1153,7 +1288,7 @@ def upload_and_manage(file, target_language, process_mode):
1153
 
1154
  # Step 1: Transcribe audio from uploaded media file and get timestamps
1155
  logger.info("Transcribing audio...")
1156
- transcription_json, source_language = transcribe_video_with_speakers(file.name)
1157
  logger.info(f"Transcription completed. Detected source language: {source_language}")
1158
 
1159
  transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)
 
62
  api_key= os.environ.get("openAI_api_key"), # This is the default and can be omitted
63
  )
64
  hf_api_key = os.environ.get("hf_token")
65
+ ELEVENLABS_API_KEY = os.environ.get("elevenlabs_token")
66
 
67
  def silence(duration, fps=44100):
68
  """
 
275
 
276
  return transcript_with_speakers, detected_language
277
 
278
+
279
def transcribe_video_with_speakers_11labs(video_path, num_speakers=None):
    """
    Transcribe a video/audio file with the ElevenLabs Scribe API, including
    speaker diarization, and group word-level results into speaker segments.

    Args:
        video_path (str): Path to the video or audio file to transcribe.
        num_speakers (int, optional): Maximum number of speakers expected in
            the file; helps the API predict who speaks when. Defaults to None.

    Returns:
        tuple: ``(transcript_with_speakers, detected_language)`` where
            - transcript_with_speakers (list[dict]): segments with keys
              'start', 'end', 'text', 'speaker'. Empty on failure.
            - detected_language (str | None): language code reported by the
              API (e.g. "en"), or None if the request failed before a
              response was parsed.

    Errors are logged (and kept in a local ``error_message``) rather than
    raised, so callers always get the tuple above.
    """
    # Correct API endpoint as per ElevenLabs documentation.
    ELEVENLABS_SCRIBE_API_URL = "https://api.elevenlabs.io/v1/speech-to-text"
    # Silence gap (seconds) above which consecutive words from the same
    # speaker are split into separate segments.
    SEGMENT_GAP_THRESHOLD = 0.5

    transcript_with_speakers = []
    detected_language = None
    error_message = None
    audio_path = "temp_audio_for_scribe.wav"
    video = None

    try:
        # 1. Extract audio from the video into a temporary WAV file.
        logger.info(f"Extracting audio from video: {video_path}")
        video = VideoFileClip(video_path)
        if video.audio is None:
            # Fail explicitly instead of AttributeError on NoneType below.
            raise ValueError(f"No audio track found in: {video_path}")
        # pcm_s16le = 16-bit signed little-endian PCM; widely accepted, and
        # the API's default file_format handles it.
        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
        logger.info(f"Audio extracted to: {audio_path}")

        # 2. Prepare and send the API call (multipart upload + query params).
        headers = {
            "xi-api-key": ELEVENLABS_API_KEY,
        }
        # Parameters sent as multipart form data.
        data = {
            "model_id": "scribe_v1",  # required parameter per documentation
        }
        # Query parameters.
        params = {
            "diarize": "true",  # correct parameter name for diarization
        }
        if num_speakers is not None:
            params["num_speakers"] = str(num_speakers)  # API expects a string

        # Open the upload inside a context manager so the handle is always
        # closed before the temp file is removed in `finally` (the original
        # leaked this handle, which also breaks os.remove on Windows).
        with open(audio_path, "rb") as audio_file:
            files = {
                "file": (os.path.basename(audio_path), audio_file, "audio/wav")
            }
            logger.info(f"Sending audio to ElevenLabs Scribe API for transcription and diarization...")
            # Large uploads can be slow, but never hang forever.
            response = requests.post(
                ELEVENLABS_SCRIBE_API_URL,
                headers=headers,
                files=files,
                data=data,
                params=params,
                timeout=600,
            )
        response.raise_for_status()  # raise for HTTP 4xx/5xx

        scribe_result = response.json()
        logger.info("Transcription response received from ElevenLabs Scribe.")

        # 3. Parse the response: the API returns a flat 'words' list which we
        #    group into speaker-contiguous segments.
        if "words" in scribe_result and scribe_result["words"]:
            current_segment = None
            for word_data in scribe_result["words"]:
                # Skip spacing / audio-event entries; keep only real words.
                if word_data.get("type") != "word":
                    continue

                word_text = word_data.get("text", "").strip()
                word_start = float(word_data.get("start", 0))
                word_end = float(word_data.get("end", 0))
                speaker_id = word_data.get("speaker_id", "SPEAKER_UNKNOWN")

                # Start a new segment on the first word, a speaker change,
                # or a silence gap longer than the threshold.
                if (current_segment is None or
                        speaker_id != current_segment["speaker"] or
                        word_start - current_segment["end"] > SEGMENT_GAP_THRESHOLD):
                    if current_segment is not None:
                        transcript_with_speakers.append(current_segment)
                    current_segment = {
                        "start": word_start,
                        "end": word_end,
                        "text": word_text,
                        "speaker": speaker_id,
                    }
                else:
                    # Same speaker, small gap: extend the current segment.
                    current_segment["text"] += " " + word_text
                    current_segment["end"] = word_end

            # Flush the last open segment after the loop.
            if current_segment is not None:
                transcript_with_speakers.append(current_segment)

            logger.info(f"Successfully parsed {len(transcript_with_speakers)} segments from words.")
        else:
            logger.warning("No 'words' found in ElevenLabs Scribe API response or response is empty.")
            error_message = "ElevenLabs Scribe API response did not contain words for transcription."

        # 'language_code' is the documented field name for detected language.
        detected_language = scribe_result.get("language_code", "unknown")
        logger.info(f"Detected language: {detected_language}")

    except requests.exceptions.HTTPError as http_err:
        # `response` is guaranteed to exist here: only raise_for_status()
        # raises HTTPError, and it runs after the POST returned.
        error_message = f"HTTP error occurred: {http_err} - {response.text}"
        logger.error(error_message)
    except requests.exceptions.ConnectionError as conn_err:
        error_message = f"Connection error occurred: {conn_err}"
        logger.error(error_message)
    except requests.exceptions.Timeout as timeout_err:
        error_message = f"Timeout error occurred: {timeout_err}"
        logger.error(error_message)
    except requests.exceptions.RequestException as req_err:
        error_message = f"An unexpected request error occurred: {req_err}"
        logger.error(error_message)
    except Exception as e:
        error_message = f"An error occurred during transcription: {e}"
        logger.error(error_message)
    finally:
        # 4. Release resources even when extraction or the request failed:
        #    close the clip (original only closed it on the happy path) and
        #    remove the temporary audio file.
        if video is not None:
            video.close()
        if os.path.exists(audio_path):
            os.remove(audio_path)
            logger.info(f"Cleaned up temporary audio file: {audio_path}")

    return transcript_with_speakers, detected_language
411
+
412
  # Function to get the appropriate translation model based on target language
413
  def get_translation_model(source_language, target_language):
414
  """
 
1288
 
1289
  # Step 1: Transcribe audio from uploaded media file and get timestamps
1290
  logger.info("Transcribing audio...")
1291
+ transcription_json, source_language = transcribe_video_with_speakers_11labs(file.name)
1292
  logger.info(f"Transcription completed. Detected source language: {source_language}")
1293
 
1294
  transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)