Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -313,6 +313,7 @@ def segment_audio_from_video(video_path):
|
|
313 |
]
|
314 |
|
315 |
return audio_path, transcript_with_speakers
|
|
|
316 |
def clean_transcribed_text(text: str) -> str:
|
317 |
"""
|
318 |
Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
|
@@ -390,6 +391,104 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
|
|
390 |
|
391 |
return transcribed_segments, detected_language, error_message
|
392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
# Function to get the appropriate translation model based on target language
|
394 |
def get_translation_model(source_language, target_language):
|
395 |
"""
|
|
|
313 |
]
|
314 |
|
315 |
return audio_path, transcript_with_speakers
|
316 |
+
|
317 |
def clean_transcribed_text(text: str) -> str:
|
318 |
"""
|
319 |
Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
|
|
|
391 |
|
392 |
return transcribed_segments, detected_language, error_message
|
393 |
|
394 |
+
from collections import Counter
|
395 |
+
|
396 |
+
def process_scribe_output(scribe_response, max_line_length=50):
    """
    Clean the Scribe API word stream and group it into subtitle-style lines.

    Audio-event tags (e.g. [背景音]) are dropped, consecutive spacing tokens
    are collapsed to a single space, and words are accumulated into lines that
    end when the line reaches ``max_line_length`` characters, the word ends
    with sentence-final punctuation (。 ? !), or the input is exhausted.

    Args:
        scribe_response (dict): Raw Scribe API response. Expected to contain a
            'words' list whose items have 'text', 'start', 'end', 'type' and
            optionally 'speaker_id' keys; a missing 'words' key yields [].
        max_line_length (int): Approximate maximum number of characters per
            line before a new line is started.

    Returns:
        list[dict]: One dict per line with keys 'original' (line text),
        'start' / 'end' (timestamps taken from the first/last word of the
        line), and 'speaker' (majority 'speaker_id' among the line's words,
        or None when no word carried one).
    """
    # Pass 1: drop audio-event tags and collapse runs of spacing tokens.
    cleaned_words = []
    for word_info in scribe_response.get('words', []):
        text = word_info['text']
        word_type = word_info['type']

        if word_type == 'audio_event':
            continue  # Remove audio event tags like [背景音]
        if word_type == 'spacing':
            # Skip a spacing token if the previous kept token already ends
            # with a space, so runs of spacing collapse to one space.
            if cleaned_words and cleaned_words[-1]['text'].endswith(' '):
                continue
            text = ' '

        cleaned_words.append({
            'text': text,
            'start': word_info['start'],
            'end': word_info['end'],
            'speaker_id': word_info.get('speaker_id', None),
        })

    # Pass 2: accumulate cleaned words into display lines.
    lines = []
    current_line_words = []
    current_line_start_time = None

    for i, word_info in enumerate(cleaned_words):
        if not current_line_words:
            current_line_start_time = word_info['start']

        current_line_words.append(word_info)
        current_line_text = "".join(w['text'] for w in current_line_words).strip()

        line_should_end = (
            len(current_line_text) >= max_line_length or
            i == len(cleaned_words) - 1 or
            word_info['text'].endswith(('。', '?', '!'))
        )

        if line_should_end:
            # Attribute the line to the speaker of the majority of its words;
            # spacing tokens carry speaker_id None and are ignored here.
            speaker_ids = [w['speaker_id'] for w in current_line_words
                           if w['speaker_id'] is not None]
            speaker = Counter(speaker_ids).most_common(1)[0][0] if speaker_ids else None

            lines.append({
                'original': current_line_text,
                'start': current_line_start_time,
                'end': word_info['end'],
                'speaker': speaker,
            })

            current_line_words = []
            current_line_start_time = None

    return lines
|
468 |
+
|
469 |
+
def transcribe_with_scribe(full_audio_path):
    """
    Send a complete audio file to the ElevenLabs Scribe API for transcription
    with speaker diarization enabled.

    Args:
        full_audio_path (str): Path to the audio file; uploaded with an
            "audio/wav" content type.

    Returns:
        dict: The parsed JSON response from the Scribe API on success.
        tuple: ([], "unknown", error_message) when the file does not exist.
            NOTE(review): the success and failure paths return different
            shapes; callers must handle both. Preserved as-is so existing
            callers keep working — confirm before unifying.

    Raises:
        requests.HTTPError: If the Scribe API responds with an error status
            (via raise_for_status).
    """
    detected_language = "unknown"

    # Guard clause: fail fast with a descriptive message instead of letting
    # open() raise FileNotFoundError.
    if not os.path.exists(full_audio_path):
        return [], detected_language, f"Full audio file not found at {full_audio_path}"

    headers = {"xi-api-key": ELEVENLABS_API_KEY}
    data = {
        "model_id": "scribe_v1",
        "diarize": "true",  # request per-word speaker_id in the response
    }

    logger.info("Starting transcription for full audio: %s", full_audio_path)

    with open(full_audio_path, "rb") as audio_file:
        files = {"file": (os.path.basename(full_audio_path), audio_file, "audio/wav")}
        response = requests.post(ELEVENLABS_SCRIBE_API_URL, headers=headers, files=files, data=data)
        response.raise_for_status()
        return response.json()
|
491 |
+
|
492 |
# Function to get the appropriate translation model based on target language
|
493 |
def get_translation_model(source_language, target_language):
|
494 |
"""
|