studio_V1_4_OCR_SOTA

Sleeping

qqwjq1981 committed on Jun 5

Commit

9870a55

verified ·

1 Parent(s): c70b968

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -313,13 +313,14 @@ def segment_audio_from_video(video_path):
     ]
     return audio_path, transcript_with_speakers
 def clean_transcribed_text(text: str) -> str:
-    """Remove repetitive symbols and artifacts from text."""
-    # Remove only-punctuation or repeated tokens
-    cleaned = re.sub(r"[_,.~`^•·。！？!?,，\.\/\\\-–—=+]+", " ", text)
-    cleaned = re.sub(r"\s+", " ", cleaned).strip()
-    return cleaned
 def transcribe_segments_with_scribe(full_audio_path, segments):
     transcribed_segments = []
@@ -388,7 +389,7 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
             audio_clip.close()
     return transcribed_segments, detected_language, error_message
 # Function to get the appropriate translation model based on target language
 def get_translation_model(source_language, target_language):
     """

     ]
     return audio_path, transcript_with_speakers
 def clean_transcribed_text(text: str) -> str:
+    """
+    Strip transcription noise from ``text`` and normalise its whitespace.
+
+    Three regex passes run in order:
+      1. Remove each span that starts at (, [ or { and ends at the nearest
+         following ), ] or } -- e.g. "(panting)" or "[booming sound]".
+         The opening and closing characters are not required to be the
+         same kind of bracket.
+      2. Replace every run of one or more of the listed punctuation and
+         symbol characters (this includes single periods, commas and
+         hyphens, not only repeated ones) with a single space.
+      3. Collapse whitespace runs to one space and strip both ends.
+
+    Returns the cleaned string, which may be empty if nothing else remains.
+    """
+    # Pass 1: non-greedy, so each tag is removed separately; "." does not
+    # match newlines here, so a tag split across lines is left untouched.
+    text = re.sub(r"[\(\[\{].*?[\)\]\}]", "", text)
+    # Pass 2: punctuation/symbol runs become a space, not an empty string.
+    text = re.sub(r"[_,.~`^•·。！？!?,，\.\/\\\-–—=+]+", " ", text)
+    # Pass 3: tidy up the gaps left by the two passes above.
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
 def transcribe_segments_with_scribe(full_audio_path, segments):
     transcribed_segments = []
             audio_clip.close()
     return transcribed_segments, detected_language, error_message
 # Function to get the appropriate translation model based on target language
 def get_translation_model(source_language, target_language):
     """