qqwjq1981 committed on
Commit
9870a55
·
verified ·
1 Parent(s): c70b968

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -7
app.py CHANGED
@@ -313,13 +313,14 @@ def segment_audio_from_video(video_path):
313
  ]
314
 
315
  return audio_path, transcript_with_speakers
316
-
317
  def clean_transcribed_text(text: str) -> str:
318
- """Remove repetitive symbols and artifacts from text."""
319
- # Remove only-punctuation or repeated tokens
320
- cleaned = re.sub(r"[_,.~`^•·。!?!?,,\.\/\\\-–—=+]+", " ", text)
321
- cleaned = re.sub(r"\s+", " ", cleaned).strip()
322
- return cleaned
 
 
323
 
324
  def transcribe_segments_with_scribe(full_audio_path, segments):
325
  transcribed_segments = []
@@ -388,7 +389,7 @@ def transcribe_segments_with_scribe(full_audio_path, segments):
388
  audio_clip.close()
389
 
390
  return transcribed_segments, detected_language, error_message
391
-
392
  # Function to get the appropriate translation model based on target language
393
  def get_translation_model(source_language, target_language):
394
  """
 
313
  ]
314
 
315
  return audio_path, transcript_with_speakers
 
316
  def clean_transcribed_text(text: str) -> str:
317
+ """
318
+ Remove noise tags like (panting), [booming sound], repeated symbols, and trim whitespace.
319
+ """
320
+ text = re.sub(r"[\(\[\{].*?[\)\]\}]", "", text)
321
+ text = re.sub(r"[_,.~`^•·。!?!?,,\.\/\\\-–—=+]+", " ", text)
322
+ text = re.sub(r"\s+", " ", text).strip()
323
+ return text
324
 
325
  def transcribe_segments_with_scribe(full_audio_path, segments):
326
  transcribed_segments = []
 
389
  audio_clip.close()
390
 
391
  return transcribed_segments, detected_language, error_message
392
+
393
  # Function to get the appropriate translation model based on target language
394
  def get_translation_model(source_language, target_language):
395
  """