camparchimedes committed on
Commit
1b9402b
·
verified ·
1 Parent(s): aea18b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -27,20 +27,22 @@ def transcribe_audio(audio_file):
27
  chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
28
 
29
  transcription = ""
30
- for chunk in chunks:
31
- inputs = processor(chunk, sampling_rate=16000, return_tensors="pt")
 
32
  inputs = inputs.to(device)
 
33
  with torch.no_grad():
34
  output = model.generate(
35
  inputs.input_features,
36
- max_length=1024, # Increase max_length@longer outputs
37
- num_beams=5,
38
  task="transcribe",
 
 
39
  language="no"
40
  )
41
- transcription += processor.batch_decode(output, skip_special_tokens=True)[0] + " "
42
-
43
- return transcription.strip()
44
 
45
  # HTML |banner image
46
  banner_html = """
@@ -54,12 +56,13 @@ iface = gr.Blocks()
54
 
55
  with iface:
56
  gr.HTML(banner_html)
57
- gr.Markdown("# Nvidia A100👋🏼👾🦾⚡☕🧑🏼‍🏫@{NbAiLab/whisper-norwegian-medium}\nUpload audio file (*needs to be in .mp3 format before upload*)")
58
  audio_input = gr.Audio(type="filepath")
 
59
  transcription_output = gr.Textbox()
60
  transcribe_button = gr.Button("Transcribe")
61
 
62
- transcribe_button.click(fn=transcribe_audio, inputs=audio_input, outputs=transcription_output)
63
 
64
  # Launch interface
65
  iface.launch(share=True, debug=True)
 
27
  chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
28
 
29
  transcription = ""
30
+ for i in range(0, len(chunks), batch_size):
31
+ batch_chunks = chunks[i:i + batch_size]
32
+ inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
33
  inputs = inputs.to(device)
34
+ attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
35
  with torch.no_grad():
36
  output = model.generate(
37
  inputs.input_features,
38
+ max_length=1024, # Increase max_length for longer outputs
39
+ num_beams=7,
40
  task="transcribe",
41
+ attention_mask=attention_mask,
42
+ forced_decoder_ids=None # forced_decoder_ids must not be set
43
  language="no"
44
  )
45
+ transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
 
 
46
 
47
  # HTML |banner image
48
  banner_html = """
 
56
 
57
  with iface:
58
  gr.HTML(banner_html)
59
+ gr.Markdown("# 𝐍𝐯𝐢𝐝𝐢𝐚 𝐀𝟏𝟎𝟎 👋🏼👾🦾⚡ @{NbAiLab/whisper-norwegian-medium}\nUpload audio file:☕")
60
  audio_input = gr.Audio(type="filepath")
61
+ batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, default=4, label="Batch Size")
62
  transcription_output = gr.Textbox()
63
  transcribe_button = gr.Button("Transcribe")
64
 
65
+ transcribe_button.click(fn=transcribe_audio, inputs=[audio_input, batch_size_input], outputs=transcription_output)
66
 
67
  # Launch interface
68
  iface.launch(share=True, debug=True)