qqwjq1981 committed
Commit 306be63 · verified · 1 Parent(s): 6c15ec0

Update app.py

Files changed (1): app.py (+30, -10)

app.py CHANGED
@@ -281,15 +281,22 @@ def transcribe_video_with_speakers(video_path):
 
     return transcript_with_speakers, detected_language
 
-def segment_audio_from_video(video_path):
+def segment_audio_from_video(video_path, separate_background=True):
     # Extract audio from video
     video = VideoFileClip(video_path)
     audio_path = "audio.wav"
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    segment_result, speech_audio_path = segment_background_audio(audio_path)
-    print(f"Saved non-speech (background) audio to local")
+    segment_result = None
+    speech_audio_path = audio_path
+
+    if separate_background:
+        # Assuming segment_background_audio returns (segment_result, speech_audio_path)
+        segment_result, speech_audio_path = segment_background_audio(audio_path)
+        print("Saved non-speech (background) audio locally")
+    else:
+        logger.info("Background audio separation skipped (separate_background=False).")
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
     logger.info(f"Using device: {device}")
@@ -1333,7 +1340,8 @@ def calibrated_speed(text, desired_duration):
     slope = (1.7 - 1.0) / (25.2 - 14)
     return 1.0 + slope * (cps - 14)
 
-def upload_and_manage(file, target_language, process_mode):
+# Modified upload_and_manage function
+def upload_and_manage(file, target_language, process_mode, separate_background_audio):  # Added separate_background_audio
     if file is None:
         logger.info("No file uploaded. Please upload a video/audio file.")
         return None, [], None, "No file uploaded. Please upload a video/audio file."
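
Gradio binds click inputs to handler parameters by position, which is why the new parameter is appended last. A sketch of the mapping (component names are the ones wired up in the @@ -1475 hunk below):

```python
# Positional mapping from gr.Button.click inputs to handler parameters:
# inputs=[file_input, language_input, process_mode, separate_background_checkbox]
#             |              |              |                  |
#             v              v              v                  v
def upload_and_manage(file, target_language, process_mode, separate_background_audio):
    ...
```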
@@ -1343,7 +1351,7 @@ def upload_and_manage(file, target_language, process_mode):
     logger.info(f"Started processing file: {file.name}")
 
     # Define paths for audio and output files
-    audio_path = "audio.wav"
+    audio_path = "audio.wav"  # This will be the full extracted audio
     output_video_path = "output_video.mp4"
     voiceover_path = "voiceover.wav"
     translated_json_filepath = "translated_output.json"
@@ -1352,7 +1360,11 @@ def upload_and_manage(file, target_language, process_mode):
 
     # Step 1: Segment audio from the uploaded video/audio file
     logger.info("Segmenting audio...")
-    temp_audio_for_vad, background_audio_path, speech_segments = segment_audio_from_video(file.name)
+    # Pass the separate_background_audio boolean from the Gradio input
+    temp_audio_for_vad, background_audio_path, speech_segments = segment_audio_from_video(
+        file.name,
+        separate_background=separate_background_audio
+    )
     if not speech_segments:
         raise Exception("No speech segments detected in the audio.")
     logger.info(f"Audio segmentation completed. Found {len(speech_segments)} segments.")
@@ -1386,7 +1398,7 @@ def upload_and_manage(file, target_language, process_mode):
     with open(translated_json_filepath, "w", encoding="utf-8") as f:
         json.dump(translated_json, f, ensure_ascii=False, indent=4)
     logger.info(f"Translated JSON saved to {translated_json_filepath}")
-
+
     # Step 3: Add transcript to video based on timestamps
     logger.info("Adding translated transcript to video...")
     add_transcript_voiceover(file.name, translated_json, output_video_path, process_mode, target_language, background_audio_path = background_audio_path)
@@ -1430,7 +1442,15 @@ def build_interface():
            process_mode = gr.Radio(choices=[("Transcription Only", 1),
                                             ("Transcription with Premium Voice", 2),
                                             ("Transcription with Voice Clone", 3)],
-                                    label="Choose Processing Type", value=1)
+                                    label="Choose Processing Type", value=1)
+
+            # New Gradio Checkbox for background audio separation
+            separate_background_checkbox = gr.Checkbox(
+                label="Separate Background Audio (Recommended)",
+                value=True,  # Default to True
+                interactive=True
+            )
+
            submit_button = gr.Button("Post and Process")
        with gr.Column(scale=8):
            gr.Markdown("## Edit Translations")
@@ -1475,7 +1495,7 @@ def build_interface():
    )
    submit_button.click(
        upload_and_manage,
-        inputs=[file_input, language_input, process_mode],
+        inputs=[file_input, language_input, process_mode, separate_background_checkbox],  # Add checkbox as input
        outputs=[editable_table, processed_video_output, translated_json_download, elapsed_time_display]
    )
    # Connect submit button to save_feedback_db function
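
For context, a self-contained toy of the checkbox-to-handler pattern this commit adds (toy names, not from app.py):

```python
import gradio as gr

def process(text, do_separate):
    # do_separate receives the checkbox's boolean value positionally
    return f"separate_background={do_separate}: {text}"

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Input")
    separate = gr.Checkbox(label="Separate Background Audio", value=True)
    out = gr.Textbox(label="Result")
    gr.Button("Run").click(process, inputs=[text_in, separate], outputs=out)

demo.launch()
```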
@@ -1489,4 +1509,4 @@ def build_interface():
    tts_model = None
    # Launch the Gradio interface
    demo = build_interface()
-    demo.launch()
+    demo.launch()