MusIre committed on
Commit
6a6d2f9
·
1 Parent(s): 95dcc38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -4
app.py CHANGED
@@ -14,11 +14,27 @@ model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
14
  forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")
15
 
16
  # Custom preprocessing function
17
def preprocess_audio(audio_data, sampling_rate=16_000):
    """Run the Whisper processor on raw audio and return its output.

    Args:
        audio_data: Raw audio samples in a form the processor accepts
            (presumably a 1-D float array or list -- confirm against caller).
        sampling_rate: Sampling rate of ``audio_data`` in Hz. Defaults to
            16 kHz, the rate Whisper checkpoints are trained on.

    Returns:
        The processor output (PyTorch tensors), unchanged from what
        ``processor(...)`` produces.
    """
    # Forward the sampling rate explicitly: without it the feature extractor
    # cannot check that the audio matches the rate the model expects.
    return processor(
        audio_data,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Function to perform ASR on audio data
24
  def transcribe_audio(audio_data):
 
14
  forced_decoder_ids = processor.get_decoder_prompt_ids(language="italian", task="transcribe")
15
 
16
  # Custom preprocessing function
17
def preprocess_audio(audio_data, sampling_rate=16_000):
    """Convert raw audio samples into Whisper input features.

    The samples are cast to float32, cut or zero-padded to the feature
    extractor's fixed window length, and then passed through the processor.

    Args:
        audio_data: Audio samples convertible to a NumPy array. Assumed to be
            mono (1-D) -- TODO confirm against the caller.
        sampling_rate: Sampling rate of ``audio_data`` in Hz, forwarded to the
            processor. Defaults to 16 kHz.

    Returns:
        The ``input_features`` tensor from the processor output.
    """
    raw_speech = np.asarray(audio_data, dtype=np.float32)

    # WhisperFeatureExtractor exposes its window length (in samples) as
    # `n_samples`; it has no `max_len` attribute, so the previous lookup
    # raised AttributeError before any audio was processed.
    target_len = processor.feature_extractor.n_samples

    # Pad or truncate the audio data to the required length.
    if len(raw_speech) > target_len:
        raw_speech = raw_speech[:target_len]
    else:
        raw_speech = np.pad(raw_speech, (0, target_len - len(raw_speech)))

    # Process the audio data using the Whisper processor.
    processed_data = processor(
        raw_speech,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )

    return processed_data.input_features
38
 
39
  # Function to perform ASR on audio data
40
  def transcribe_audio(audio_data):