Update app.py
app.py CHANGED

@@ -4,7 +4,7 @@ import gradio as gr
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from moviepy.editor import VideoFileClip
-import math
+import librosa  # Add librosa for audio processing
 
 def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -21,14 +21,13 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         tokenizer=processor.tokenizer,
         feature_extractor=processor.feature_extractor,
         max_new_tokens=128,
-        chunk_length_s=30,
+        chunk_length_s=10,  # Shorter chunk length to prevent overly long captions
         batch_size=2,
         return_timestamps=True,
         torch_dtype=torch_dtype,
         device=device,
     )
-
-    # Handle the video file input
+
     if video_file is None:
         yield "Error: No video file provided.", None
         return
@@ -42,35 +41,39 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
 
     audio = video.audio
     duration = video.duration
-    n_chunks = math.ceil(duration / 30)
     transcription_txt = ""
     transcription_srt = []
+
+    # Chunk the audio by detecting silent periods to improve transcription quality
+    audio_samples, sr = librosa.load(audio_path, sr=None)
+    intervals = librosa.effects.split(audio_samples, top_db=30)  # Adjust threshold if necessary
 
-    for i in range(n_chunks):
-        start = i * 30
-        end = min((i + 1) * 30, duration)
-        audio_chunk = audio.subclip(start, end)
+    for idx, (start_frame, end_frame) in enumerate(intervals):
+        start_time = start_frame / sr
+        end_time = end_frame / sr
 
-        temp_file_path = f"temp_audio_{i}.wav"
-        audio_chunk.write_audiofile(temp_file_path)
+        if end_time - start_time > 10:  # Enforce 10-second max duration for each caption
+            end_time = start_time + 10
+
+        temp_file_path = f"temp_audio_{idx}.wav"
+        librosa.output.write_wav(temp_file_path, audio_samples[start_frame:end_frame], sr)
+
         with open(temp_file_path, "rb") as temp_file:
             result = pipe(temp_file_path, generate_kwargs={"language": language})
             transcription_txt += result["text"]
             if transcribe_to_srt:
                 for chunk in result["chunks"]:
                     start_time, end_time = chunk["timestamp"]
-                    # Handle potential None values
                     if start_time is not None and end_time is not None:
                         transcription_srt.append({
-                            "start": start_time + i * 30,
-                            "end": end_time + i * 30,
+                            "start": start_time + idx * 10,
+                            "end": end_time + idx * 10,
                             "text": chunk["text"]
                         })
                     else:
-                        # Log or handle the case where timestamps are None
                         print(f"Warning: Invalid timestamp for chunk: {chunk}")
         os.remove(temp_file_path)
-        yield f"Progress: {int((i / n_chunks) * 100)}%", None
+        yield f"Progress: {int((idx / len(intervals)) * 100)}%", None
 
     output = ""
     srt_file_path = None
@@ -84,7 +87,6 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
         output += srt_entry
         srt_content += srt_entry
 
-    # Save SRT content to a file
    srt_file_path = "transcription.srt"
     with open(srt_file_path, "w", encoding="utf-8") as srt_file:
         srt_file.write(srt_content)
@@ -114,4 +116,4 @@ iface = gr.Interface(
     description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
 )
 
-iface.launch()
+iface.launch()
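For context on the new chunking strategy: librosa.effects.split returns an array of [start, end] sample indices covering the non-silent regions of a signal, treating anything quieter than top_db decibels below the peak as silence. A minimal standalone sketch of how the intervals behave (the file path is a placeholder):

import librosa

# Load at the file's native sampling rate (sr=None), as the diff does.
# "example.wav" is a placeholder path for illustration.
y, sr = librosa.load("example.wav", sr=None)

# Sample-index pairs for the non-silent spans; shape is (n_intervals, 2).
# Raising top_db classifies less audio as silence (fewer, longer
# intervals); lowering it splits more aggressively.
intervals = librosa.effects.split(y, top_db=30)

for start, end in intervals:
    print(f"speech from {start / sr:.2f}s to {end / sr:.2f}s")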
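One portability note on the added write step: librosa.output.write_wav was deprecated in librosa 0.7 and removed in 0.8, so this line raises an AttributeError on current librosa releases. A minimal sketch of the equivalent write using the soundfile package (a librosa dependency, so it is normally already installed), reusing the diff's own variables:

import soundfile as sf

# Equivalent of the removed librosa.output.write_wav: writes the float
# sample slice to a WAV file at the original sampling rate.
sf.write(temp_file_path, audio_samples[start_frame:end_frame], sr)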
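A possible follow-up on the SRT timing: the pipeline's timestamps are relative to each temporary WAV, and the idx * 10 offset only matches the source video if every detected interval starts exactly at a 10-second boundary, which silence-based splitting does not guarantee. Relatedly, the 10-second cap adjusts end_time but the slice written to disk is still the full interval; capping the slice itself (e.g. end_frame = min(end_frame, start_frame + int(10 * sr))) would make the limit take effect. A sketch of offsetting by the interval's real start instead, under the assumption that start_frame / sr is the interval's position in the source audio:

# Offset chunk timestamps by where this interval actually begins in the
# source audio, rather than assuming 10-second spacing between chunks.
offset = start_frame / sr
for chunk in result["chunks"]:
    chunk_start, chunk_end = chunk["timestamp"]
    if chunk_start is not None and chunk_end is not None:
        transcription_srt.append({
            "start": chunk_start + offset,
            "end": chunk_end + offset,
            "text": chunk["text"],
        })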