Update app.py
app.py CHANGED
@@ -32,19 +32,20 @@ from openai import OpenAI
 import traceback
 from TTS.api import TTS
 import torch
-from pydub import AudioSegment
 from pyannote.audio import Pipeline
 import wave
 import librosa
 import noisereduce as nr
-import soundfile as sf
 from paddleocr import PaddleOCR
 import cv2
 from rapidfuzz import fuzz
 from tqdm import tqdm
 import threading
 import requests
-
+import webrtcvad
+from pydub import AudioSegment
+from pydub.silence import split_on_silence
+import soundfile as sf

 logger = logging.getLogger(__name__)

@@ -64,6 +65,8 @@ client = OpenAI(
 )
 hf_api_key = os.environ.get("hf_token")
 ELEVENLABS_API_KEY = os.environ.get("elevenlabs_token")
+# Correct API endpoint for ElevenLabs Scribe
+ELEVENLABS_SCRIBE_API_URL = "https://api.elevenlabs.io/v1/speech-to-text"

 def silence(duration, fps=44100):
     """
@@ -276,139 +279,201 @@ def transcribe_video_with_speakers(video_path):

     return transcript_with_speakers, detected_language

-
-def transcribe_video_with_speakers_11labs(video_path, num_speakers=None):
+def segment_audio_from_video(video_path, frame_duration_ms=30):
     """
-
+    Extracts audio from a video and segments it into speech chunks using WebRTC VAD.
+    Returns a list of dictionaries, each with 'start' and 'end' timestamps for speech segments.

     Args:
-        video_path (str): The path to the video
-
-
+        video_path (str): The path to the input video file.
+        frame_duration_ms (int): The duration of a frame in milliseconds for VAD (10, 20, or 30).
+                                 Lower values are more precise but computationally intensive.

     Returns:
         tuple: A tuple containing:
-
-
-
-            - error_message (str, optional): An error message if
+            - audio_path (str): Path to the extracted temporary audio file.
+            - speech_segments (list): A list of dictionaries, where each dictionary
+              represents a speech segment with 'start' and 'end' timestamps in seconds.
+            - error_message (str, optional): An error message if processing fails.
     """
-
-
-    # Correct API endpoint as per documentation
-    ELEVENLABS_SCRIBE_API_URL = "https://api.elevenlabs.io/v1/speech-to-text"
-
-    transcript_with_speakers = []
-    detected_language = None
+    audio_path = "temp_extracted_audio.wav"
+    speech_segments = []
     error_message = None
-    audio_path = "temp_audio_for_scribe.wav"

     try:
         # 1. Extract audio from video
         logger.info(f"Extracting audio from video: {video_path}")
         video = VideoFileClip(video_path)
-        #
-        #
-        video.audio.write_audiofile(audio_path, codec='pcm_s16le')
-        video.close()
+        # Ensure audio is saved in a compatible format for WebRTC VAD (16-bit, 1 channel, 8000/16000/32000 Hz)
+        # We will resample to 16kHz for VAD as it's a good balance.
+        video.audio.write_audiofile(audio_path, codec='pcm_s16le', fps=16000, nbytes=2, channels=1)
+        video.close()
         logger.info(f"Audio extracted to: {audio_path}")

-        # 2.
+        # 2. Load audio for VAD
+        audio = AudioSegment.from_wav(audio_path)
+        sample_rate = audio.frame_rate
+        audio_data = np.array(audio.get_array_of_samples())
+
+        # WebRTC VAD operates on 16-bit mono audio at 8kHz, 16kHz, or 32kHz.
+        # We already saved at 16kHz, so we can proceed.
+        if sample_rate not in [8000, 16000, 32000]:
+            error_message = f"Unsupported sample rate for VAD: {sample_rate} Hz. Must be 8kHz, 16kHz, or 32kHz."
+            logger.error(error_message)
+            return audio_path, [], error_message
+
+        vad = webrtcvad.Vad(3)  # Aggressiveness mode (0-3, 3 is most aggressive)
+        frames = []
+        offset = 0
+        while offset + frame_duration_ms <= len(audio):
+            frame_start = offset
+            frame_end = offset + frame_duration_ms
+            frame = audio[frame_start:frame_end]
+            frames.append(frame)
+            offset += frame_duration_ms
+
+        logger.info(f"Running WebRTC VAD on {len(frames)} frames...")
+
+        current_segment_start = None
+        for i, frame in enumerate(frames):
+            is_speech = vad.is_speech(frame.raw_data, sample_rate)
+
+            frame_start_time = (i * frame_duration_ms) / 1000.0
+            frame_end_time = ((i + 1) * frame_duration_ms) / 1000.0
+
+            if is_speech:
+                if current_segment_start is None:
+                    current_segment_start = frame_start_time
+            else:
+                if current_segment_start is not None:
+                    speech_segments.append({"start": current_segment_start, "end": frame_end_time})
+                    current_segment_start = None
+
+        # Add the last segment if it ended with speech
+        if current_segment_start is not None:
+            speech_segments.append({"start": current_segment_start, "end": len(audio) / 1000.0})
+
+        logger.info(f"VAD completed. Found {len(speech_segments)} speech segments.")
+
+    except Exception as e:
+        error_message = f"An error occurred during audio segmentation: {e}"
+        logger.error(error_message)
+
+    return audio_path, speech_segments, error_message
+
+def transcribe_segments_with_scribe(full_audio_path, segments):
+    """
+    Transcribes pre-defined audio segments using the ElevenLabs Scribe API.
+    Diarization is explicitly turned off as per requirements.
+
+    Args:
+        full_audio_path (str): The path to the full extracted audio file.
+        segments (list): A list of dictionaries, where each dictionary
+                         represents a segment with 'start' and 'end' timestamps in seconds.
+
+    Returns:
+        tuple: A tuple containing:
+            - transcribed_segments (list): A list of dictionaries, where each dictionary
+              represents a transcribed segment with 'start', 'end', and 'text'.
+            - detected_language (str): The language detected by the API (e.g., "en", "es").
+            - error_message (str, optional): An error message if transcription fails.
+    """
+    transcribed_segments = []
+    detected_language = "unknown"  # Default
+    error_message = None
+
+    if not os.path.exists(full_audio_path):
+        return [], detected_language, f"Full audio file not found at {full_audio_path}"
+
+    try:
+        audio_clip = AudioFileClip(full_audio_path)
+
         headers = {
             "xi-api-key": ELEVENLABS_API_KEY,
         }
-
-        # Parameters sent as multipart form data
         data = {
-            "model_id": "scribe_v1",
+            "model_id": "scribe_v1",
         }
-        #
+        # Explicitly set diarize to false, as it's not needed.
         params = {
-            "diarize": "
+            "diarize": "false",
         }
-        if num_speakers is not None:
-            params["num_speakers"] = str(num_speakers)  # Convert to string for API

-
-            "file": (os.path.basename(audio_path), open(audio_path, "rb"), "audio/wav")  # Key changed to 'file'
-        }
+        logger.info(f"Starting transcription of {len(segments)} segments with ElevenLabs Scribe...")

-
-
-
-
-        scribe_result = response.json()
-        logger.info("Transcription response received from ElevenLabs Scribe.")
-        # logger.debug(f"ElevenLabs Scribe API Response: {json.dumps(scribe_result, indent=2)}")
-
-        # 3. Parse the API response to match the desired output format
-        # The API returns a 'words' list, we need to group them into segments
-        if "words" in scribe_result and scribe_result["words"]:
-            current_segment = None
-            for word_data in scribe_result["words"]:
-                # Only process actual words, skip spacing or other types if necessary
-                if word_data.get("type") != "word":
-                    continue
-
-                word_text = word_data.get("text", "").strip()
-                word_start = float(word_data.get("start", 0))
-                word_end = float(word_data.get("end", 0))
-                speaker_id = word_data.get("speaker_id", "SPEAKER_UNKNOWN")
-
-                # If starting a new segment or speaker changed or significant gap
-                if (current_segment is None or
-                    speaker_id != current_segment["speaker"] or
-                    word_start - current_segment["end"] > 0.5):  # Adjust gap threshold as needed
-
-                    if current_segment is not None:
-                        transcript_with_speakers.append(current_segment)
-
-                    current_segment = {
-                        "start": word_start,
-                        "end": word_end,
-                        "text": word_text,
-                        "speaker": speaker_id
-                    }
-                else:
-                    # Continue current segment
-                    current_segment["text"] += " " + word_text
-                    current_segment["end"] = word_end
+        for i, segment in enumerate(segments):
+            segment_start = segment["start"]
+            segment_end = segment["end"]

-        #
-        if
-
-
-            logger.info(f"Successfully parsed {len(transcript_with_speakers)} segments from words.")
-        else:
-            logger.warning("No 'words' found in ElevenLabs Scribe API response or response is empty.")
-            error_message = "ElevenLabs Scribe API response did not contain words for transcription."
+            # Ensure segment duration is positive
+            if segment_end <= segment_start:
+                logger.warning(f"Skipping segment {i} due to invalid duration: {segment_start:.2f}s -> {segment_end:.2f}s")
+                continue

-
-
-
+            temp_segment_audio_path = f"temp_segment_{i}.wav"
+            try:
+                # Subclip the audio and save it temporarily
+                sub_clip = audio_clip.subclip(segment_start, segment_end)
+                # Save as 16-bit PCM WAV for Scribe API compatibility
+                sub_clip.write_audiofile(temp_segment_audio_path, codec='pcm_s16le')
+
+                logger.info(f"Transcribing segment {i+1}/{len(segments)}: {segment_start:.2f}s - {segment_end:.2f}s")
+
+                with open(temp_segment_audio_path, "rb") as audio_file:
+                    files = {
+                        "file": (os.path.basename(temp_segment_audio_path), audio_file, "audio/wav")
+                    }
+                    response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data, params=params)
+                    response.raise_for_status()
+                    scribe_result = response.json()
+
+                segment_text = ""
+                if "text" in scribe_result:
+                    segment_text = scribe_result["text"].strip()
+                elif "words" in scribe_result and scribe_result["words"]:
+                    # Fallback if 'text' field is not directly available, reconstruct from words
+                    segment_text = " ".join([w.get("text", "") for w in scribe_result["words"] if w.get("type") == "word"]).strip()
+
+                if segment_text:
+                    transcribed_segments.append({
+                        "start": segment_start,
+                        "end": segment_end,
+                        "text": segment_text
+                    })
+                else:
+                    logger.warning(f"No transcription text found for segment {i+1}.")
+
+                # Update detected language from the first successful transcription
+                if "language_code" in scribe_result and detected_language == "unknown":
+                    detected_language = scribe_result["language_code"]
+
+            except requests.exceptions.HTTPError as http_err:
+                error_message = f"HTTP error for segment {i+1}: {http_err} - {response.text}"
+                logger.error(error_message)
+                # Continue to next segment even if one fails
+            except requests.exceptions.RequestException as req_err:
+                error_message = f"Request error for segment {i+1}: {req_err}"
+                logger.error(error_message)
+                # Continue to next segment
+            except Exception as e:
+                error_message = f"Error processing segment {i+1}: {e}"
+                logger.error(error_message)
+                # Continue to next segment
+            finally:
+                if os.path.exists(temp_segment_audio_path):
+                    os.remove(temp_segment_audio_path)
+
+        logger.info("All segments processed by ElevenLabs Scribe.")

-    except requests.exceptions.HTTPError as http_err:
-        error_message = f"HTTP error occurred: {http_err} - {response.text}"
-        logger.error(error_message)
-    except requests.exceptions.ConnectionError as conn_err:
-        error_message = f"Connection error occurred: {conn_err}"
-        logger.error(error_message)
-    except requests.exceptions.Timeout as timeout_err:
-        error_message = f"Timeout error occurred: {timeout_err}"
-        logger.error(error_message)
-    except requests.exceptions.RequestException as req_err:
-        error_message = f"An unexpected request error occurred: {req_err}"
-        logger.error(error_message)
     except Exception as e:
-        error_message = f"An error occurred during transcription: {e}"
+        error_message = f"An error occurred during overall transcription process: {e}"
         logger.error(error_message)
     finally:
-
-
-
-
+        if 'audio_clip' in locals() and audio_clip is not None:
+            audio_clip.close()
+
+    return transcribed_segments, detected_language, error_message

-    return transcript_with_speakers, detected_language

 # Function to get the appropriate translation model based on target language
 def get_translation_model(source_language, target_language):
@@ -1308,7 +1373,7 @@ def upload_and_manage(file, target_language, process_mode):

     # Step 1: Transcribe audio from uploaded media file and get timestamps
     logger.info("Transcribing audio...")
-    transcription_json, source_language =
+    transcription_json, source_language = transcribe_segments_with_scribe(file.name)
     logger.info(f"Transcription completed. Detected source language: {source_language}")

     transcription_json_merged = transcription_json
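
Note: the call above passes only file.name, while transcribe_segments_with_scribe takes (full_audio_path, segments) and returns three values. A minimal sketch, assuming the intent is to chain the two new helpers (the wrapper name below is hypothetical and not part of this commit):

    def transcribe_with_vad_and_scribe(video_path):
        # Hypothetical glue (assumption): VAD segmentation first, then per-segment Scribe transcription.
        audio_path, speech_segments, seg_err = segment_audio_from_video(video_path)
        if seg_err:
            logger.warning(f"VAD segmentation reported an error: {seg_err}")
        segments, detected_language, err = transcribe_segments_with_scribe(audio_path, speech_segments)
        if err:
            logger.warning(f"Scribe transcription reported an error: {err}")
        if os.path.exists(audio_path):
            os.remove(audio_path)  # clean up the temporary extracted audio
        return segments, detected_language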