studio_V1_4_OCR_SOTA

Sleeping

App Files Community

qqwjq1981 committed on Jun 3

Commit

281e1f6

verified ·

1 Parent(s): 3f4f0cf

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -5

app.py CHANGED Viewed

@@ -605,6 +605,9 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
     d = np.array(generated_durations)
     m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
     try:
         s = cp.Variable(N)
         objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
@@ -1043,7 +1046,6 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
     return i, txt_clip, audio_segment, actual_duration, error_message
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
     video = VideoFileClip(video_path)
@@ -1082,10 +1084,13 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
                 err = f"❌ Unexpected error in future result: {e}"
                 error_messages.append(err)
     results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _, _ in results if clip]
-    generated_durations = [dur for _, _, _, dur in results if dur > 0]
     # Align using optimization (modifies translated_json in-place)
     translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
@@ -1306,7 +1311,8 @@ def upload_and_manage(file, target_language, process_mode):
         transcription_json, source_language = transcribe_video_with_speakers_11labs(file.name)
         logger.info(f"Transcription completed. Detected source language: {source_language}")
-        transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name, source_language)
         # Step 2: Translate the transcription
         logger.info(f"Translating transcription from {source_language} to {target_language}...")
         translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)

     d = np.array(generated_durations)
     m = np.array([(seg['start'] + seg['end']) / 2 for seg in original_segments])
+    if N == 0 or len(generated_durations) == 0:
+        logger.warning("⚠️ Alignment skipped: empty segments or durations.")
+        return original_segments  # or raise an error, depending on your app logic
     try:
         s = cp.Variable(N)
         objective = cp.Minimize(cp.sum_squares(s + d / 2 - m))
     return i, txt_clip, audio_segment, actual_duration, error_message
 def add_transcript_voiceover(video_path, translated_json, output_path, process_mode, target_language="en", speaker_sample_paths=None, background_audio_path="background_segments.wav"):
     video = VideoFileClip(video_path)
                 err = f"❌ Unexpected error in future result: {e}"
                 error_messages.append(err)
+    # Sort and filter together
     results.sort(key=lambda x: x[0])
+    filtered = [(translated_json[i], txt, aud, dur) for i, txt, aud, dur in results if dur > 0]
+    translated_json = [entry for entry, _, _, _ in filtered]
+    generated_durations = [dur for _, _, _, dur in filtered]
     # Align using optimization (modifies translated_json in-place)
     translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
         transcription_json, source_language = transcribe_video_with_speakers_11labs(file.name)
         logger.info(f"Transcription completed. Detected source language: {source_language}")
+        transcription_json_merged = transcription_json
+        #post_edit_transcribed_segments(transcription_json, file.name, source_language)
         # Step 2: Translate the transcription
         logger.info(f"Translating transcription from {source_language} to {target_language}...")
         translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)