reab5555 committed
Commit 03701cd · verified · Parent: e3225d2

Update app.py

Files changed (1): app.py (+12 -21)
app.py CHANGED
@@ -3,20 +3,17 @@ import math
 import gradio as gr
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-from moviepy.editor import AudioFileClip
+from moviepy.editor import VideoFileClip
 
-def transcribe(audio_file, transcribe_to_text, transcribe_to_srt, language):
+def transcribe(video_file, transcribe_to_text, transcribe_to_srt, language):
     device = "cuda:0" if torch.cuda.is_available() else "cpu"
     torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-
     model_id = "openai/whisper-large-v3"
     model = AutoModelForSpeechSeq2Seq.from_pretrained(
         model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
     )
     model.to(device)
-
     processor = AutoProcessor.from_pretrained(model_id)
-
     pipe = pipeline(
         "automatic-speech-recognition",
         model=model,
@@ -30,16 +27,16 @@ def transcribe(audio_file, transcribe_to_text, transcribe_to_srt, language):
         device=device,
         generate_kwargs={"language": language}
     )
-
+
     # Handle both file path (str) and file object
-    audio_path = audio_file if isinstance(audio_file, str) else audio_file.name
-    audio = AudioFileClip(audio_path)
-    duration = audio.duration
+    video_path = video_file if isinstance(video_file, str) else video_file.name
+    video = VideoFileClip(video_path)
+    audio = video.audio
+    duration = video.duration
     n_chunks = math.ceil(duration / 30)
-
     transcription_txt = ""
     transcription_srt = []
-
+
     for i in range(n_chunks):
         start = i * 30
         end = min((i + 1) * 30, duration)
@@ -47,11 +44,9 @@ def transcribe(audio_file, transcribe_to_text, transcribe_to_srt, language):
 
         temp_file_path = f"temp_audio_{i}.wav"
         audio_chunk.write_audiofile(temp_file_path, codec='pcm_s16le')
-
         with open(temp_file_path, "rb") as temp_file:
             result = pipe(temp_file_path)
             transcription_txt += result["text"]
-
         if transcribe_to_srt:
             for chunk in result["chunks"]:
                 start_time, end_time = chunk["timestamp"]
@@ -60,20 +55,16 @@ def transcribe(audio_file, transcribe_to_text, transcribe_to_srt, language):
                     "end": end_time + i * 30,
                     "text": chunk["text"]
                 })
-
         os.remove(temp_file_path)
-
         yield f"Progress: {int(((i + 1) / n_chunks) * 100)}%"
-
+
     output = ""
     if transcribe_to_text:
         output += "Text Transcription:\n" + transcription_txt + "\n\n"
-
     if transcribe_to_srt:
         output += "SRT Transcription:\n"
        for i, sub in enumerate(transcription_srt, 1):
            output += f"{i}\n{format_time(sub['start'])} --> {format_time(sub['end'])}\n{sub['text']}\n\n"
-
     yield output
 
 def format_time(seconds):
@@ -84,14 +75,14 @@ def format_time(seconds):
 iface = gr.Interface(
     fn=transcribe,
     inputs=[
-        gr.Audio(type="filepath"),
+        gr.Video(type="filepath"),
         gr.Checkbox(label="Transcribe to Text"),
         gr.Checkbox(label="Transcribe to SRT"),
         gr.Dropdown(choices=['en', 'he', 'it', 'fr', 'de', 'zh', 'ar'], label="Language")
     ],
     outputs="text",
-    title="WhisperCap Transcription",
-    description="Upload an audio file to transcribe it using Whisper.",
+    title="WhisperCap Video Transcription",
+    description="Upload a video file to transcribe its audio using Whisper.",
 )
 
 iface.launch()
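Note: the SRT branch reads result["chunks"], which the transformers ASR pipeline only returns when timestamps are requested, so the pipeline kwargs elided from the hunk context (old lines 23-29) presumably include return_timestamps=True. A minimal sketch of such a pipeline call, under that assumption:

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        chunk_length_s=30,        # assumed: matches the 30-second chunking in the loop
        return_timestamps=True,   # assumed: required for result["chunks"]
        torch_dtype=torch_dtype,
        device=device,
        generate_kwargs={"language": language}
    )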
 
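The loop writes audio_chunk to a temporary WAV file, but the line that creates audio_chunk (old line 46 / new line 43) falls between hunks and is not shown. Given the moviepy.editor import, it presumably extracts each 30-second window with subclip (the moviepy 1.x API; moviepy 2.x renames it subclipped). A self-contained sketch with a hypothetical input path:

    from moviepy.editor import VideoFileClip

    video = VideoFileClip("input.mp4")        # hypothetical input path
    audio = video.audio
    start, end = 0, min(30, video.duration)   # first 30-second window
    audio_chunk = audio.subclip(start, end)   # assumed: moviepy 1.x clip slicing
    audio_chunk.write_audiofile("temp_audio_0.wav", codec="pcm_s16le")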
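The body of format_time (old lines 80-83) also lies outside the hunk context. The SRT output line format_time(sub['start']) --> format_time(sub['end']) implies an HH:MM:SS,mmm timestamp; a hypothetical implementation:

    def format_time(seconds):
        # Hypothetical helper: SRT timestamps are HH:MM:SS,mmm, with a comma
        # (not a dot) before the milliseconds field.
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        millis = int((seconds - int(seconds)) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

Since transcribe is a generator, gr.Interface streams each yielded "Progress: N%" string to the text output before the final transcript replaces it.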