camparchimedes committed on
Commit 47661bd · verified · 1 Parent(s): 198c942

Update app.py

Files changed (1):
  1. app.py +23 -19
app.py CHANGED
@@ -3,8 +3,7 @@ import warnings
 import torch
 from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
 import soundfile as sf
-import huggingface_hub
-import os


 warnings.filterwarnings("ignore")
+from huggingface_hub import spaces
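
Note on the replaced imports: current huggingface_hub releases do not ship a spaces module, so "from huggingface_hub import spaces" raises ImportError at startup. On ZeroGPU Spaces the decorator comes from the standalone spaces package; a minimal sketch of that pattern, which is an assumption here (the commit only hints at it via the removed # @device.GPU comment):

import spaces  # standalone package preinstalled on ZeroGPU Spaces, not part of huggingface_hub

@spaces.GPU  # assumption: requests a GPU slice for the duration of each call
def transcribe_audio(audio_file):
    ...  # body as in the hunk below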
@@ -21,21 +20,27 @@ torch_dtype = torch.float32
 # move model to device
 model.to(device)

-# @device.GPU
+
 def transcribe_audio(audio_file):
-    audio_input, _ = sf.read(audio_file)
-    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt")
-    inputs = inputs.to(device)
-    with torch.no_grad():
-        output = model.generate(
-            inputs.input_features,
-            max_length=448,
-            num_beams=5,
-            task="transcribe",
-            language="no"
-        )
-    transcription = processor.batch_decode(output, skip_special_tokens=True)[0]
-    return transcription
+    audio_input, sample_rate = sf.read(audio_file)
+    chunk_size = 16000 * 28  # 28-second chunks (seems to work best)
+    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
+
+    transcription = ""
+    for chunk in chunks:
+        inputs = processor(chunk, sampling_rate=16000, return_tensors="pt")
+        inputs = inputs.to(device)
+        with torch.no_grad():
+            output = model.generate(
+                inputs.input_features,
+                max_length=2048,  # increase max_length for longer outputs
+                num_beams=10,
+                task="transcribe",
+                language="no"
+            )
+        transcription += processor.batch_decode(output, skip_special_tokens=True)[0] + " "
+
+    return transcription.strip()

 # HTML for banner image
 banner_html = """
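
The hunk above swaps single-pass decoding for fixed 28-second chunks, but sf.read() returns whatever sample rate the file was stored at, while processor(..., sampling_rate=16000) assumes 16 kHz. A self-contained sketch of the same chunked approach with an explicit resample step; the checkpoint name is taken from the new app title, and the resampling is an addition, not part of this commit:

import numpy as np
import soundfile as sf
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

MODEL_ID = "NbAiLab/whisper-norwegian-medium"  # assumed from the new app title
TARGET_SR = 16000  # Whisper models expect 16 kHz input

processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model.eval()

def transcribe_chunked(path, chunk_seconds=28):
    audio, sr = sf.read(path, dtype="float32")
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # downmix stereo to mono
    if sr != TARGET_SR:
        # naive linear-interpolation resample; added here, not in the commit
        n_target = int(len(audio) * TARGET_SR / sr)
        audio = np.interp(
            np.linspace(0.0, len(audio) - 1, n_target),
            np.arange(len(audio)),
            audio,
        ).astype(np.float32)
    chunk_size = TARGET_SR * chunk_seconds
    pieces = []
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start:start + chunk_size]
        inputs = processor(chunk, sampling_rate=TARGET_SR, return_tensors="pt")
        with torch.no_grad():
            ids = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder allows at most 448 target positions
                num_beams=10,
                task="transcribe",
                language="no",
            )
        pieces.append(processor.batch_decode(ids, skip_special_tokens=True)[0])
    return " ".join(pieces).strip()

The feature extractor pads every chunk to 30 seconds of log-mel features, so 28-second chunks decode independently; note that the max_length=2048 in the commit is effectively capped at the decoder's 448 positions.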
@@ -49,7 +54,7 @@ iface = gr.Blocks()

 with iface:
     gr.HTML(banner_html)
-    gr.Markdown("# Audio Transcription App\nUpload an audio file to get the transcription")
+    gr.Markdown("# Ola's AudioSwitch2Go 🔊🎧☕🧑🏼‍🏫@{NbAiLab/whisper-norwegian-medium}\nUpload an audio file (if .m4a, simply rename it to .mp3 before upload)")
     audio_input = gr.Audio(type="filepath")
     transcription_output = gr.Textbox()
     transcribe_button = gr.Button("Transcribe")
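
The new title tells users to rename .m4a files to .mp3 before upload; whether libsndfile will then decode them depends on the build. A hedged alternative that decodes the container directly; pydub and ffmpeg are assumptions, neither is used in the commit:

import numpy as np
from pydub import AudioSegment  # assumed dependency; needs ffmpeg on PATH

def load_as_float32(path, target_sr=16000):
    seg = AudioSegment.from_file(path)  # ffmpeg handles .m4a, .mp3, .wav, ...
    seg = seg.set_frame_rate(target_sr).set_channels(1)
    samples = np.array(seg.get_array_of_samples()).astype(np.float32)
    samples /= float(1 << (8 * seg.sample_width - 1))  # int PCM -> [-1.0, 1.0]
    return samples, target_sr

Since gr.Audio(type="filepath") hands the handler a path, swapping sf.read for a loader like this is a one-line change inside transcribe_audio.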
@@ -57,5 +62,4 @@ with iface:
     transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)

 # Launch the interface
-iface.launch(share=True, debug=True)
-
+iface.launch(share=True, debug=True)
 
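One design note on the hard 28-second cut: a word that straddles a chunk boundary is split across two windows and is often transcribed badly on both sides. A small variant with overlapping windows (an assumption, not in this commit) trades that for occasional duplicated words at the seams:

def chunk_with_overlap(audio, sr=16000, chunk_seconds=28, overlap_seconds=2):
    chunk = sr * chunk_seconds
    step = chunk - sr * overlap_seconds  # advance less than a full window
    return [audio[i:i + chunk] for i in range(0, len(audio), step)]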