reab5555 committed on
Commit
f22e5b0
·
verified ·
1 Parent(s): 991636c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -18
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
4
  import torch
5
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
6
  from moviepy.editor import VideoFileClip
7
- import spaces
8
 
9
  def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
10
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -21,14 +21,13 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
21
  tokenizer=processor.tokenizer,
22
  feature_extractor=processor.feature_extractor,
23
  max_new_tokens=128,
24
- chunk_length_s=30,
25
  batch_size=2,
26
  return_timestamps=True,
27
  torch_dtype=torch_dtype,
28
  device=device,
29
  )
30
-
31
- # Handle the video file input
32
  if video_file is None:
33
  yield "Error: No video file provided.", None
34
  return
@@ -42,35 +41,39 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
42
 
43
  audio = video.audio
44
  duration = video.duration
45
- n_chunks = math.ceil(duration / 30)
46
  transcription_txt = ""
47
  transcription_srt = []
 
 
 
 
48
 
49
- for i in range(n_chunks):
50
- start = i * 30
51
- end = min((i + 1) * 30, duration)
52
- audio_chunk = audio.subclip(start, end)
53
 
54
- temp_file_path = f"temp_audio_{i}.wav"
55
- audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
 
 
 
 
56
  with open(temp_file_path, "rb") as temp_file:
57
  result = pipe(temp_file_path, generate_kwargs={"language": language})
58
  transcription_txt += result["text"]
59
  if transcribe_to_srt:
60
  for chunk in result["chunks"]:
61
  start_time, end_time = chunk["timestamp"]
62
- # Handle potential None values
63
  if start_time is not None and end_time is not None:
64
  transcription_srt.append({
65
- "start": start_time + i * 30,
66
- "end": end_time + i * 30,
67
  "text": chunk["text"]
68
  })
69
  else:
70
- # Log or handle the case where timestamps are None
71
  print(f"Warning: Invalid timestamp for chunk: {chunk}")
72
  os.remove(temp_file_path)
73
- yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%", None
74
 
75
  output = ""
76
  srt_file_path = None
@@ -84,7 +87,6 @@ def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
84
  output += srt_entry
85
  srt_content += srt_entry
86
 
87
- # Save SRT content to a file
88
  srt_file_path = "transcription.srt"
89
  with open(srt_file_path, "w", encoding="utf-8") as srt_file:
90
  srt_file.write(srt_content)
@@ -114,4 +116,4 @@ iface = gr.Interface(
114
  description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
115
  )
116
 
117
- iface.launch()
 
4
  import torch
5
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
6
  from moviepy.editor import VideoFileClip
7
+ import librosa # Add librosa for audio processing
8
 
9
  def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
10
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
21
  tokenizer=processor.tokenizer,
22
  feature_extractor=processor.feature_extractor,
23
  max_new_tokens=128,
24
+ chunk_length_s=10, # Shorter chunk length to prevent overly long captions
25
  batch_size=2,
26
  return_timestamps=True,
27
  torch_dtype=torch_dtype,
28
  device=device,
29
  )
30
+
 
31
  if video_file is None:
32
  yield "Error: No video file provided.", None
33
  return
 
41
 
42
  audio = video.audio
43
  duration = video.duration
 
44
  transcription_txt = ""
45
  transcription_srt = []
46
+
47
+ # Chunk the audio by detecting silent periods to improve transcription quality
48
+ audio_samples, sr = librosa.load(audio_path, sr=None)
49
+ intervals = librosa.effects.split(audio_samples, top_db=30) # Adjust threshold if necessary
50
 
51
+ for idx, (start_frame, end_frame) in enumerate(intervals):
52
+ start_time = start_frame / sr
53
+ end_time = end_frame / sr
 
54
 
55
+ if end_time - start_time > 10: # Enforce 10-second max duration for each caption
56
+ end_time = start_time + 10
57
+
58
+ temp_file_path = f"temp_audio_{idx}.wav"
59
+ librosa.output.write_wav(temp_file_path, audio_samples[start_frame:end_frame], sr)
60
+
61
  with open(temp_file_path, "rb") as temp_file:
62
  result = pipe(temp_file_path, generate_kwargs={"language": language})
63
  transcription_txt += result["text"]
64
  if transcribe_to_srt:
65
  for chunk in result["chunks"]:
66
  start_time, end_time = chunk["timestamp"]
 
67
  if start_time is not None and end_time is not None:
68
  transcription_srt.append({
69
+ "start": start_time + idx * 10,
70
+ "end": end_time + idx * 10,
71
  "text": chunk["text"]
72
  })
73
  else:
 
74
  print(f"Warning: Invalid timestamp for chunk: {chunk}")
75
  os.remove(temp_file_path)
76
+ yield f"Progress: {int((idx / len(intervals)) * 100)}%", None
77
 
78
  output = ""
79
  srt_file_path = None
 
87
  output += srt_entry
88
  srt_content += srt_entry
89
 
 
90
  srt_file_path = "transcription.srt"
91
  with open(srt_file_path, "w", encoding="utf-8") as srt_file:
92
  srt_file.write(srt_content)
 
116
  description="Upload a video file to transcribe its audio using Whisper. You can download the SRT file if generated.",
117
  )
118
 
119
+ iface.launch()