qqwjq1981 committed
Commit b4434f0 · verified · 1 Parent(s): 3d10bbf

Update app.py

Files changed (1): app.py +171 -106

app.py CHANGED
@@ -32,19 +32,20 @@ from openai import OpenAI
 import traceback
 from TTS.api import TTS
 import torch
-from pydub import AudioSegment
 from pyannote.audio import Pipeline
 import wave
 import librosa
 import noisereduce as nr
-import soundfile as sf
 from paddleocr import PaddleOCR
 import cv2
 from rapidfuzz import fuzz
 from tqdm import tqdm
 import threading
 import requests
-
+import webrtcvad
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+import soundfile as sf
 
 logger = logging.getLogger(__name__)
 
@@ -64,6 +65,8 @@ client = OpenAI(
 )
 hf_api_key = os.environ.get("hf_token")
 ELEVENLABS_API_KEY = os.environ.get("elevenlabs_token")
+# Correct API endpoint for ElevenLabs Scribe
+ELEVENLABS_SCRIBE_API_URL = "https://api.elevenlabs.io/v1/speech-to-text"
 
 def silence(duration, fps=44100):
     """
@@ -276,139 +279,201 @@ def transcribe_video_with_speakers(video_path):
 
     return transcript_with_speakers, detected_language
 
-
-def transcribe_video_with_speakers_11labs(video_path, num_speakers=None):
+def segment_audio_from_video(video_path, frame_duration_ms=30):
     """
-    Transcribes video/audio using the ElevenLabs Scribe API, including speaker diarization.
+    Extracts audio from a video and segments it into speech chunks using WebRTC VAD.
+    Returns a list of dictionaries, each with 'start' and 'end' timestamps for speech segments.
 
     Args:
-        video_path (str): The path to the video or audio file to transcribe.
-        num_speakers (int, optional): The maximum amount of speakers talking in the uploaded file.
-                                      Can help with predicting who speaks when. Defaults to None.
+        video_path (str): The path to the input video file.
+        frame_duration_ms (int): The duration of a frame in milliseconds for VAD (10, 20, or 30).
+                                 Lower values are more precise but computationally intensive.
 
     Returns:
         tuple: A tuple containing:
-            - transcript_with_speakers (list): A list of dictionaries, where each dictionary
-              represents a transcribed segment with 'start', 'end', 'text', and 'speaker'.
-            - detected_language (str): The language detected by the API (e.g., "en", "es").
-            - error_message (str, optional): An error message if transcription fails.
+            - audio_path (str): Path to the extracted temporary audio file.
+            - speech_segments (list): A list of dictionaries, where each dictionary
+              represents a speech segment with 'start' and 'end' timestamps in seconds.
+            - error_message (str, optional): An error message if processing fails.
     """
-    # --- Configuration for ElevenLabs Scribe API ---
-    # IMPORTANT: Replace with your actual ElevenLabs API Key
-    # Correct API endpoint as per documentation
-    ELEVENLABS_SCRIBE_API_URL = "https://api.elevenlabs.io/v1/speech-to-text"
-
-    transcript_with_speakers = []
-    detected_language = None
+    audio_path = "temp_extracted_audio.wav"
+    speech_segments = []
     error_message = None
-    audio_path = "temp_audio_for_scribe.wav"
 
     try:
         # 1. Extract audio from video
         logger.info(f"Extracting audio from video: {video_path}")
         video = VideoFileClip(video_path)
-        # Use a common codec; pcm_s16le is typically 16-bit signed little-endian PCM
-        # The API's default 'other' file_format should handle this.
-        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
-        video.close() # Close the video clip to release file resources
+        # Ensure audio is saved in a format WebRTC VAD accepts (16-bit PCM, mono, 8/16/32 kHz).
+        # Resample to 16 kHz as a good balance; force mono via ffmpeg ("-ac 1"), since
+        # write_audiofile() has no channel argument of its own.
+        video.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000, nbytes=2,
+                                    ffmpeg_params=["-ac", "1"])
+        video.close()
         logger.info(f"Audio extracted to: {audio_path}")
 
-        # 2. Prepare for API call
+        # 2. Load audio for VAD
+        audio = AudioSegment.from_wav(audio_path)
+        sample_rate = audio.frame_rate
+
+        # WebRTC VAD operates on 16-bit mono audio at 8 kHz, 16 kHz, or 32 kHz.
+        # We already saved at 16 kHz, so we can proceed.
+        if sample_rate not in [8000, 16000, 32000]:
+            error_message = f"Unsupported sample rate for VAD: {sample_rate} Hz. Must be 8 kHz, 16 kHz, or 32 kHz."
+            logger.error(error_message)
+            return audio_path, [], error_message
+
+        vad = webrtcvad.Vad(3)  # Aggressiveness mode (0-3, 3 is most aggressive)
+        # Slice the audio into fixed-size frames (pydub slices are in milliseconds).
+        frames = []
+        offset = 0
+        while offset + frame_duration_ms <= len(audio):
+            frames.append(audio[offset:offset + frame_duration_ms])
+            offset += frame_duration_ms
+
+        logger.info(f"Running WebRTC VAD on {len(frames)} frames...")
+
+        current_segment_start = None
+        for i, frame in enumerate(frames):
+            is_speech = vad.is_speech(frame.raw_data, sample_rate)
+
+            frame_start_time = (i * frame_duration_ms) / 1000.0
+            frame_end_time = ((i + 1) * frame_duration_ms) / 1000.0
+
+            if is_speech:
+                if current_segment_start is None:
+                    current_segment_start = frame_start_time
+            else:
+                if current_segment_start is not None:
+                    speech_segments.append({"start": current_segment_start, "end": frame_end_time})
+                    current_segment_start = None
+
+        # Add the last segment if the audio ended with speech
+        if current_segment_start is not None:
+            speech_segments.append({"start": current_segment_start, "end": len(audio) / 1000.0})
+
+        logger.info(f"VAD completed. Found {len(speech_segments)} speech segments.")
+
+    except Exception as e:
+        error_message = f"An error occurred during audio segmentation: {e}"
+        logger.error(error_message)
+
+    return audio_path, speech_segments, error_message
+
+
+def transcribe_segments_with_scribe(full_audio_path, segments):
+    """
+    Transcribes pre-defined audio segments using the ElevenLabs Scribe API.
+    Diarization is explicitly turned off, since speaker labels are not needed here.
+
+    Args:
+        full_audio_path (str): The path to the full extracted audio file.
+        segments (list): A list of dictionaries, where each dictionary
+                         represents a segment with 'start' and 'end' timestamps in seconds.
+
+    Returns:
+        tuple: A tuple containing:
+            - transcribed_segments (list): A list of dictionaries, where each dictionary
+              represents a transcribed segment with 'start', 'end', and 'text'.
+            - detected_language (str): The language detected by the API (e.g., "en", "es").
+            - error_message (str, optional): An error message if transcription fails.
+    """
+    transcribed_segments = []
+    detected_language = "unknown"  # Default until the API reports a language
+    error_message = None
+
+    if not os.path.exists(full_audio_path):
+        return [], detected_language, f"Full audio file not found at {full_audio_path}"
+
+    try:
+        audio_clip = AudioFileClip(full_audio_path)
+
         headers = {
             "xi-api-key": ELEVENLABS_API_KEY,
         }
-
-        # Parameters sent as multipart form data
         data = {
-            "model_id": "scribe_v1", # Required parameter as per documentation
+            "model_id": "scribe_v1",
         }
-        # Query parameters
+        # Explicitly set diarize to false, as it's not needed.
         params = {
-            "diarize": "true", # Correct parameter name for diarization
+            "diarize": "false",
         }
-        if num_speakers is not None:
-            params["num_speakers"] = str(num_speakers) # Convert to string for API
-
-        files = {
-            "file": (os.path.basename(audio_path), open(audio_path, "rb"), "audio/wav") # Key changed to 'file'
-        }
-
-        logger.info(f"Sending audio to ElevenLabs Scribe API for transcription and diarization...")
-        response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data, params=params)
-        response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
-
-        scribe_result = response.json()
-        logger.info("Transcription response received from ElevenLabs Scribe.")
-        # logger.debug(f"ElevenLabs Scribe API Response: {json.dumps(scribe_result, indent=2)}")
-
-        # 3. Parse the API response to match the desired output format
-        # The API returns a 'words' list, we need to group them into segments
-        if "words" in scribe_result and scribe_result["words"]:
-            current_segment = None
-            for word_data in scribe_result["words"]:
-                # Only process actual words, skip spacing or other types if necessary
-                if word_data.get("type") != "word":
-                    continue
-
-                word_text = word_data.get("text", "").strip()
-                word_start = float(word_data.get("start", 0))
-                word_end = float(word_data.get("end", 0))
-                speaker_id = word_data.get("speaker_id", "SPEAKER_UNKNOWN")
-
-                # If starting a new segment or speaker changed or significant gap
-                if (current_segment is None or
-                    speaker_id != current_segment["speaker"] or
-                    word_start - current_segment["end"] > 0.5): # Adjust gap threshold as needed
-
-                    if current_segment is not None:
-                        transcript_with_speakers.append(current_segment)
-
-                    current_segment = {
-                        "start": word_start,
-                        "end": word_end,
-                        "text": word_text,
-                        "speaker": speaker_id
-                    }
-                else:
-                    # Continue current segment
-                    current_segment["text"] += " " + word_text
-                    current_segment["end"] = word_end
-
-            # Add the last segment after the loop
-            if current_segment is not None:
-                transcript_with_speakers.append(current_segment)
-
-            logger.info(f"Successfully parsed {len(transcript_with_speakers)} segments from words.")
-        else:
-            logger.warning("No 'words' found in ElevenLabs Scribe API response or response is empty.")
-            error_message = "ElevenLabs Scribe API response did not contain words for transcription."
-
-        # Attempt to get the detected language
-        detected_language = scribe_result.get("language_code", "unknown") # Use 'language_code' from docs
-        logger.info(f"Detected language: {detected_language}")
-
-    except requests.exceptions.HTTPError as http_err:
-        error_message = f"HTTP error occurred: {http_err} - {response.text}"
-        logger.error(error_message)
-    except requests.exceptions.ConnectionError as conn_err:
-        error_message = f"Connection error occurred: {conn_err}"
-        logger.error(error_message)
-    except requests.exceptions.Timeout as timeout_err:
-        error_message = f"Timeout error occurred: {timeout_err}"
-        logger.error(error_message)
-    except requests.exceptions.RequestException as req_err:
-        error_message = f"An unexpected request error occurred: {req_err}"
-        logger.error(error_message)
+        logger.info(f"Starting transcription of {len(segments)} segments with ElevenLabs Scribe...")
+
+        for i, segment in enumerate(segments):
+            segment_start = segment["start"]
+            segment_end = segment["end"]
+
+            # Ensure segment duration is positive
+            if segment_end <= segment_start:
+                logger.warning(f"Skipping segment {i} due to invalid duration: {segment_start:.2f}s -> {segment_end:.2f}s")
+                continue
+
+            temp_segment_audio_path = f"temp_segment_{i}.wav"
+            try:
+                # Subclip the audio and save it temporarily
+                sub_clip = audio_clip.subclip(segment_start, segment_end)
+                # Save as 16-bit PCM WAV for Scribe API compatibility
+                sub_clip.write_audiofile(temp_segment_audio_path, codec='pcm_s16le')
+
+                logger.info(f"Transcribing segment {i+1}/{len(segments)}: {segment_start:.2f}s - {segment_end:.2f}s")
+
+                with open(temp_segment_audio_path, "rb") as audio_file:
+                    files = {
+                        "file": (os.path.basename(temp_segment_audio_path), audio_file, "audio/wav")
+                    }
+                    response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data, params=params)
+                response.raise_for_status()
+                scribe_result = response.json()
+
+                segment_text = ""
+                if "text" in scribe_result:
+                    segment_text = scribe_result["text"].strip()
+                elif "words" in scribe_result and scribe_result["words"]:
+                    # Fallback if the 'text' field is not directly available: reconstruct from words
+                    segment_text = " ".join([w.get("text", "") for w in scribe_result["words"] if w.get("type") == "word"]).strip()
+
+                if segment_text:
+                    transcribed_segments.append({
+                        "start": segment_start,
+                        "end": segment_end,
+                        "text": segment_text
+                    })
+                else:
+                    logger.warning(f"No transcription text found for segment {i+1}.")
+
+                # Update detected language from the first successful transcription
+                if "language_code" in scribe_result and detected_language == "unknown":
+                    detected_language = scribe_result["language_code"]
+
+            except requests.exceptions.HTTPError as http_err:
+                error_message = f"HTTP error for segment {i+1}: {http_err} - {response.text}"
+                logger.error(error_message)
+                # Continue to the next segment even if one fails
+            except requests.exceptions.RequestException as req_err:
+                error_message = f"Request error for segment {i+1}: {req_err}"
+                logger.error(error_message)
+            except Exception as e:
+                error_message = f"Error processing segment {i+1}: {e}"
+                logger.error(error_message)
+            finally:
+                if os.path.exists(temp_segment_audio_path):
+                    os.remove(temp_segment_audio_path)
+
+        logger.info("All segments processed by ElevenLabs Scribe.")
+
     except Exception as e:
-        error_message = f"An error occurred during transcription: {e}"
+        error_message = f"An error occurred during overall transcription process: {e}"
         logger.error(error_message)
     finally:
-        # 4. Clean up temporary audio file
-        if os.path.exists(audio_path):
-            os.remove(audio_path)
-            logger.info(f"Cleaned up temporary audio file: {audio_path}")
+        if 'audio_clip' in locals() and audio_clip is not None:
+            audio_clip.close()
 
-    return transcript_with_speakers, detected_language
+    return transcribed_segments, detected_language, error_message
 
 
 # Function to get the appropriate translation model based on target language
 def get_translation_model(source_language, target_language):
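A sketch of how the two new helpers are meant to chain together, assuming the signatures and return shapes defined in the hunk above (and that moviepy's AudioFileClip is imported at module scope); demo.mp4 is a placeholder input. Note the frame arithmetic behind the VAD loop: at 16 kHz mono 16-bit PCM, a 30 ms pydub slice holds 480 samples (960 bytes), one of the three frame sizes webrtcvad accepts.

# Hypothetical end-to-end use of the helpers added in this commit.
audio_path, speech_segments, seg_err = segment_audio_from_video("demo.mp4", frame_duration_ms=30)
if seg_err is None and speech_segments:
    segments, language, err = transcribe_segments_with_scribe(audio_path, speech_segments)
    for seg in segments:
        print(f'[{seg["start"]:7.2f}s - {seg["end"]:7.2f}s] {seg["text"]}')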
@@ -1308,7 +1373,7 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 1: Transcribe audio from uploaded media file and get timestamps
     logger.info("Transcribing audio...")
-    transcription_json, source_language = transcribe_video_with_speakers(file.name)
+    audio_path, speech_segments, _ = segment_audio_from_video(file.name)
+    transcription_json, source_language, _ = transcribe_segments_with_scribe(audio_path, speech_segments)
     logger.info(f"Transcription completed. Detected source language: {source_language}")
 
     transcription_json_merged = transcription_json
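One behavioral note for downstream consumers of transcription_json: the old diarized path emitted segments carrying a speaker key, while the new VAD-plus-Scribe path (diarize set to false) emits only timing and text. The values below are illustrative, not actual API output.

# Illustrative segment shapes on the two paths.
old_segment = {"start": 0.0, "end": 2.1, "text": "hello there", "speaker": "speaker_0"}
new_segment = {"start": 0.0, "end": 2.1, "text": "hello there"}  # no "speaker" key on the new path
assert set(new_segment) == set(old_segment) - {"speaker"}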
 