Update app.py
app.py CHANGED
@@ -3,8 +3,7 @@ import math
 import gradio as gr
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from moviepy.editor import VideoFileClip
-import librosa  # Add librosa for audio processing
+from moviepy.editor import VideoFileClip, concatenate_audioclips
 
 def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -41,23 +40,18 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
 
     audio = video.audio
     duration = video.duration
+    n_chunks = math.ceil(duration / 10)  # Split into 10-second chunks
     transcription_txt = ""
     transcription_srt = []
-
-    # Corrected this to use `video_path` for librosa's load function
-    audio_samples, sr = librosa.load(video_path, sr=None)
-    intervals = librosa.effects.split(audio_samples, top_db=30)  # Adjust threshold if necessary
 
-    for idx, (start_frame, end_frame) in enumerate(intervals):
-        start_time = start_frame / sr
-        end_time = end_frame / sr
-
-        if end_time - start_time > 10:  # Enforce 10-second max duration for each caption
-            end_time = start_time + 10
-
-        temp_file_path = f"temp_audio_{idx}.wav"
-        librosa.output.write_wav(temp_file_path, audio_samples[start_frame:end_frame], sr)
-
+    for i in range(n_chunks):
+        start = i * 10
+        end = min((i + 1) * 10, duration)
+        audio_chunk = audio.subclip(start, end)
+
+        temp_file_path = f"temp_audio_{i}.wav"
+        audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
 
         with open(temp_file_path, "rb") as temp_file:
             result = pipe(temp_file_path, generate_kwargs={"language": language})
             transcription_txt += result["text"]
@@ -66,14 +60,15 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
             start_time, end_time = chunk["timestamp"]
             if start_time is not None and end_time is not None:
                 transcription_srt.append({
-                    "start": start_time +
-                    "end": end_time +
+                    "start": start_time + i * 10,
+                    "end": end_time + i * 10,
                     "text": chunk["text"]
                 })
             else:
                 print(f"Warning: Invalid timestamp for chunk: {chunk}")
+
         os.remove(temp_file_path)
-        yield f"Progress: {int((
+        yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%", None
 
     output = ""
     srt_file_path = None
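
For readers following along, the sketch below reproduces the chunking flow this commit adopts as a standalone function. It is a minimal illustration under stated assumptions, not the app itself: transcribe_in_chunks, video_path, and the openai/whisper-small checkpoint are placeholders, and the app's Gradio wiring and SRT output are omitted.

import math
import os

import torch
from moviepy.editor import VideoFileClip
from transformers import pipeline


def transcribe_in_chunks(video_path, language="en", chunk_seconds=10):
    """Hypothetical helper mirroring the commit's chunked-transcription loop."""
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    pipe = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-small",  # placeholder checkpoint, not the app's model
        return_timestamps=True,
        device=device,
    )

    audio = VideoFileClip(video_path).audio
    n_chunks = math.ceil(audio.duration / chunk_seconds)
    segments = []
    for i in range(n_chunks):
        start = i * chunk_seconds
        end = min((i + 1) * chunk_seconds, audio.duration)
        temp_path = f"temp_audio_{i}.wav"
        # Write the slice as uncompressed PCM, as the commit does
        audio.subclip(start, end).write_audiofile(temp_path, codec="pcm_s16le")

        result = pipe(temp_path, generate_kwargs={"language": language})
        for chunk in result.get("chunks", []):
            s, e = chunk["timestamp"]
            if s is not None and e is not None:
                # Pipeline timestamps are relative to the slice; shift to absolute time
                segments.append({"start": s + start, "end": e + start, "text": chunk["text"]})
        os.remove(temp_path)
    return segments

The practical difference from the removed librosa path: moviepy's subclip slices the audio stream on demand instead of decoding the whole track into memory, and fixed 10-second windows make the progress percentage trivial to compute. The trade-off is that fixed cuts can land mid-word, which the old silence-based splitting (librosa.effects.split) avoided.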