MusIre committed
Commit 27f3b82 · 1 Parent(s): 0c01f75

Update app.py

Files changed (1)
  1. app.py +14 -19
app.py CHANGED
@@ -5,30 +5,25 @@ subprocess.run(["pip", "install", "gradio", "--upgrade"])
 subprocess.run(["pip", "install", "datasets"])
 subprocess.run(["pip", "install", "transformers"])
 subprocess.run(["pip", "install", "torch", "torchvision", "torchaudio", "-f", "https://download.pytorch.org/whl/torch_stable.html"])
-import gradio as gr
-import numpy as np
-import torch
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from datasets import load_dataset

-# Load model and processor
-processor = WhisperProcessor.from_pretrained("openai/whisper-large")
-model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
+# load model and processor
+processor = WhisperProcessor.from_pretrained("openai/whisper-small")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
 forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")

-# Custom preprocessing function
-def preprocess_audio(audio_data, sampling_rate=16_000):
-    sample_rate, raw_audio = audio_data
-    raw_speech = np.asarray(raw_audio, dtype=np.float32)
-    return {"input_values": raw_speech, "sampling_rate": sample_rate}
+# load dummy dataset and read audio files
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+sample = ds[0]["audio"]
+input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

-# Function to perform ASR on audio data
-def transcribe_audio(audio_data):
-    input_features = preprocess_audio(audio_data)
-    input_values = torch.tensor(input_features["input_values"]).unsqueeze(0)  # Add batch dimension
-    input_values = input_values.view(1, -1)  # Flatten the tensor to 2D
-    predicted_ids = model.generate(input_values)
-    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
-    return transcription[0]
+# generate token ids
+predicted_ids = model.generate(input_features)
+# decode token ids to text
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

 # Create Gradio interface
 audio_input = gr.Audio()
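
As committed, app.py has loose ends: gr.Audio() is still called even though import gradio as gr was removed, forced_decoder_ids is computed but never passed to model.generate, and the skip_special_tokens=False decode is immediately overwritten by the skip_special_tokens=True one. Below is a minimal sketch of how the new whisper-small pipeline could be wired back into a working Gradio app; the transcribe function, the dtype/resampling handling, and the gr.Interface wiring are assumptions for illustration, not part of this commit.

import gradio as gr
import numpy as np
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load model and processor (whisper-small, as of this commit)
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")

def transcribe(audio):
    # gr.Audio yields (sample_rate, numpy array); Whisper expects 16 kHz mono float
    sample_rate, data = audio
    if data.dtype.kind == "i":        # int PCM -> float in [-1, 1]
        data = data.astype(np.float32) / np.iinfo(data.dtype).max
    else:
        data = data.astype(np.float32)
    if data.ndim > 1:                 # stereo -> mono
        data = data.mean(axis=1)
    if sample_rate != 16000:          # resample to Whisper's expected rate
        data = torchaudio.functional.resample(torch.from_numpy(data), sample_rate, 16000).numpy()
        sample_rate = 16000
    input_features = processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features
    # Pass forced_decoder_ids so the Italian transcription prompt is actually used
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

demo = gr.Interface(fn=transcribe, inputs=gr.Audio(), outputs="text")
demo.launch()

Keeping only the skip_special_tokens=True decode matches the final line of the committed code; the earlier skip_special_tokens=False call is redundant and dropped in this sketch.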