Update app.py
app.py CHANGED

@@ -4,7 +4,7 @@ import gradio as gr
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from moviepy.editor import VideoFileClip
-import math
+import librosa  # Add librosa for audio processing
 
 def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -21,14 +21,13 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         max_new_tokens=128,
-        chunk_length_s=30,
+        chunk_length_s=10,  # Shorter chunk length to prevent overly long captions
         batch_size=2,
         return_timestamps=True,
         torch_dtype=torch_dtype,
         device=device,
     )
-
-    # Handle the video file input
+
     if video_file is None:
         yield "Error: No video file provided.", None
         return
@@ -42,35 +41,39 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
 
     audio = video.audio
     duration = video.duration
-    n_chunks = math.ceil(duration / 30)
     transcription_txt = ""
     transcription_srt = []
+
+    # Chunk the audio by detecting silent periods to improve transcription quality
+    audio_samples, sr = librosa.load(audio_path, sr=None)
+    intervals = librosa.effects.split(audio_samples, top_db=30)  # Adjust threshold if necessary
 
-    for i in range(n_chunks):
-        start = i * 30
-        end = min((i + 1) * 30, duration)
-        audio_chunk = audio.subclip(start, end)
+    for idx, (start_frame, end_frame) in enumerate(intervals):
+        start_time = start_frame / sr
+        end_time = end_frame / sr
 
-        temp_file_path = f"temp_audio_{i}.wav"
-        audio_chunk.write_audiofile(temp_file_path)
+        if end_time - start_time > 10:  # Enforce 10-second max duration for each caption
+            end_time = start_time + 10
+
+        temp_file_path = f"temp_audio_{idx}.wav"
+        librosa.output.write_wav(temp_file_path, audio_samples[start_frame:end_frame], sr)
+
         with open(temp_file_path, "rb") as temp_file:
             result = pipe(temp_file_path, generate_kwargs={"language": language})
             transcription_txt += result["text"]
             if transcribe_to_srt:
                 for chunk in result["chunks"]:
                     start_time, end_time = chunk["timestamp"]
-                    # Handle potential None values
                     if start_time is not None and end_time is not None:
                         transcription_srt.append({
-                            "start": start_time + i * 30,
-                            "end": end_time + i * 30,
+                            "start": start_time + idx * 10,
+                            "end": end_time + idx * 10,
                             "text": chunk["text"]
                         })
                     else:
-                        # Log or handle the case where timestamps are None
                         print(f"Warning: Invalid timestamp for chunk: {chunk}")
         os.remove(temp_file_path)
-        yield f"Progress: {int((i / n_chunks) * 100)}%", None
+        yield f"Progress: {int((idx / len(intervals)) * 100)}%", None
 
     output = ""
     srt_file_path = None
@@ -84,7 +87,6 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         output += srt_entry
         srt_content += srt_entry
 
-    # Save SRT content to a file
    srt_file_path = "transcription.srt"
     with open(srt_file_path, "w", encoding="utf-8") as srt_file:
         srt_file.write(srt_content)
@@ -114,4 +116,4 @@ iface = gr.Interface(
     description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
 )
 
-iface.launch()
+iface.launch()
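For context on the new chunking strategy: librosa.effects.split returns an array of [start, end] sample indices covering the non-silent regions of a signal, treating anything quieter than top_db decibels below the peak as silence. A minimal standalone sketch of how the intervals behave (the file path is a placeholder):

import librosa

# Load at the file's native sampling rate (sr=None), as the diff does.
# "example.wav" is a placeholder path for illustration.
y, sr = librosa.load("example.wav", sr=None)

# Sample-index pairs for the non-silent spans; shape is (n_intervals, 2).
# Raising top_db classifies less audio as silence (fewer, longer
# intervals); lowering it splits more aggressively.
intervals = librosa.effects.split(y, top_db=30)

for start, end in intervals:
    print(f"speech from {start / sr:.2f}s to {end / sr:.2f}s")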
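One portability note on the added write step: librosa.output.write_wav was deprecated in librosa 0.7 and removed in 0.8, so this line raises an AttributeError on current librosa releases. A minimal sketch of the equivalent write using the soundfile package (a librosa dependency, so it is normally already installed), reusing the diff's own variables:

import soundfile as sf

# Equivalent of the removed librosa.output.write_wav: writes the float
# sample slice to a WAV file at the original sampling rate.
sf.write(temp_file_path, audio_samples[start_frame:end_frame], sr)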
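A possible follow-up on the SRT timing: the pipeline's timestamps are relative to each temporary WAV, and the idx * 10 offset only matches the source video if every detected interval starts exactly at a 10-second boundary, which silence-based splitting does not guarantee. Relatedly, the 10-second cap adjusts end_time but the slice written to disk is still the full interval; capping the slice itself (e.g. end_frame = min(end_frame, start_frame + int(10 * sr))) would make the limit take effect. A sketch of offsetting by the interval's real start instead, under the assumption that start_frame / sr is the interval's position in the source audio:

# Offset chunk timestamps by where this interval actually begins in the
# source audio, rather than assuming 10-second spacing between chunks.
offset = start_frame / sr
for chunk in result["chunks"]:
    chunk_start, chunk_end = chunk["timestamp"]
    if chunk_start is not None and chunk_end is not None:
        transcription_srt.append({
            "start": chunk_start + offset,
            "end": chunk_end + offset,
            "text": chunk["text"],
        })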