camparchimedes committed on
Commit
b3d3679
·
verified ·
1 Parent(s): 7ec9f42

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -16
app.py CHANGED
@@ -38,30 +38,25 @@ def convert_to_wav(audio_file):
38
  device = "cuda" if torch.cuda.is_available() else "cpu"
39
 
40
  # Load Whisper model and tokenizer
41
- whisper_pipeline = pipeline("automatic-speech-recognition", model="NbAiLab/nb-whisper-large", device=device)
42
- summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base", torch_dtype=torch.float16).to(device)
43
- summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
44
 
45
- # Transcribe audio to text
46
  def transcribe_audio(audio_file):
47
  if audio_file.endswith(".m4a"):
48
  audio_file = convert_to_wav(audio_file)
49
-
50
  start_time = time.time()
51
 
52
- # Prepare input and attention mask
53
- inputs = whisper_pipeline.tokenizer(audio_file, return_tensors="pt", padding=True)
54
- inputs = {k: v.to(device) for k, v in inputs.items()}
55
 
56
- # Generate the transcription with attention_mask
57
- output = whisper_pipeline.model.generate(
58
- inputs['input_ids'],
59
- attention_mask=inputs['attention_mask']
60
- )
61
 
62
  # Decode the output
63
- text = whisper_pipeline.tokenizer.decode(output[0], skip_special_tokens=True)
64
-
65
  output_time = time.time() - start_time
66
  result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"
67
 
@@ -171,7 +166,7 @@ iface = gr.Blocks()
171
  with iface:
172
  gr.HTML("""
173
  <div style="text-align: center;">
174
- <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/blob/main/lol.webp" alt="" width="100%" height="auto">
175
  </div>
176
  """)
177
  gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
 
38
  device = "cuda" if torch.cuda.is_available() else "cpu"
39
 
40
  # Load Whisper model and tokenizer
41
+ whisper_processor = WhisperProcessor.from_pretrained("NbAiLab/nb-whisper-large")
42
+ whisper_model = torch.hub.load('huggingface/pytorch-transformers', 'model', "NbAiLab/nb-whisper-large").to(device)
 
43
 
 
44
  def transcribe_audio(audio_file):
45
  if audio_file.endswith(".m4a"):
46
  audio_file = convert_to_wav(audio_file)
47
+
48
  start_time = time.time()
49
 
50
+ # Load the audio file and process it with Whisper's processor
51
+ audio, sample_rate = whisper_processor.audio_to_array(audio_file)
52
+ input_features = whisper_processor(audio, sampling_rate=sample_rate, return_tensors="pt").input_features.to(device)
53
 
54
+ # Generate the transcription
55
+ output = whisper_model.generate(input_features=input_features)
 
 
 
56
 
57
  # Decode the output
58
+ text = whisper_processor.batch_decode(output, skip_special_tokens=True)[0]
59
+
60
  output_time = time.time() - start_time
61
  result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"
62
 
 
166
  with iface:
167
  gr.HTML("""
168
  <div style="text-align: center;">
169
+ <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/banner_trans.png" alt="" width="100%" height="auto">
170
  </div>
171
  """)
172
  gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")