aka7774 committed
Commit 99f2b87 (verified) · 1 Parent(s): f91092b

Update app.py

Files changed (1): app.py (+10 -19)
app.py CHANGED
@@ -1,18 +1,9 @@
  import gradio as gr
- import whisper
- #from faster_whisper import WhisperModel
+ from faster_whisper import WhisperModel

- #model_size = 'aka7774/whisper-large-v3-ct2'
  model_size = 'large-v3'
- model = whisper.load_model(model_size, device="cpu")
- #_ = model.half()
- #_ = model.cuda()
+ model = whisper.load_model(model_size, device="auto", compute_type="float16")

- #for m in model.modules():
- #    if isinstance(m, whisper.model.LayerNorm):
- #        m.float()
-
- # model = WhisperModel(model_size, device="cuda", compute_type="float16")
  # or run on GPU with INT8
  # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
  # or run on CPU with INT8
@@ -22,26 +13,26 @@ def speech_to_text(audio_file, _model_size):
      global model_size, model
      if model_size != _model_size:
          model_size = _model_size
-         model = whisper.load_model(model_size)
+         model = whisper.load_model(model_size, device="auto", compute_type="float16")

      with torch.no_grad():
-         result = model.transcribe(
+         segments, info = model.transcribe(
              audio_file,
              verbose=True,
              language='japanese',
              beam_size=5,
-             #fp16=True,
-             without_timestamps=False
+             vad_filter=True,
+             without_timestamps=False,
          )
-         #segments, info = model.transcribe(audio_file, beam_size=5)

-     return result["text"]
-     #return "".join([segment.text for segment in segments])
+     text = ''
+     for segment in segments:
+         text += "{segment.start:.2f}\t{segment.end:.2f}\t{segment.text}\n"

  gr.Interface(
      fn=speech_to_text,
      inputs=[
          gr.Audio(source="upload", type="filepath"),
-         gr.Dropdown(value=model_size, choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3", "aka7774/whisper-large-v3-ct2"]),
+         gr.Dropdown(value=model_size, choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]),
      ],
      outputs="text").launch()
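
For reference, a minimal runnable sketch of the script this commit appears to be aiming for. It is an assumption-laden variant, not the committed code: it replaces the leftover whisper.load_model / torch.no_grad calls with faster_whisper's WhisperModel, drops the verbose argument (not part of faster-whisper's transcribe), uses the language code 'ja' in place of 'japanese', adds the missing f-string prefix on the timestamp line, and returns the accumulated text; the Gradio arguments are kept exactly as committed (Gradio 3.x style).

import gradio as gr
from faster_whisper import WhisperModel

model_size = 'large-v3'
# "auto" picks CUDA when available; for CPU-only, device="cpu", compute_type="int8" is the usual choice.
model = WhisperModel(model_size, device="auto", compute_type="float16")

def speech_to_text(audio_file, _model_size):
    global model_size, model
    if model_size != _model_size:
        model_size = _model_size
        model = WhisperModel(model_size, device="auto", compute_type="float16")

    # faster-whisper returns a lazy generator of segments plus transcription info;
    # no torch.no_grad() is needed because inference runs through CTranslate2.
    segments, info = model.transcribe(
        audio_file,
        language='ja',          # faster-whisper expects a language code, not 'japanese'
        beam_size=5,
        vad_filter=True,
        without_timestamps=False,
    )

    text = ''
    for segment in segments:
        # f-string so the start/end timestamps are actually interpolated
        text += f"{segment.start:.2f}\t{segment.end:.2f}\t{segment.text}\n"
    return text

gr.Interface(
    fn=speech_to_text,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(value=model_size, choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"]),
    ],
    outputs="text",
).launch()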