Update app.py
Browse files
app.py
CHANGED
|
@@ -28,7 +28,7 @@ class SpeechInferenceDataset(Dataset):
|
|
| 28 |
inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
|
| 29 |
return_tensors="pt",
|
| 30 |
sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
|
| 31 |
-
input_features = inputs.input_features
|
| 32 |
decoder_input_ids = torch.tensor([[1, 1]]) # Modify as per your model's requirements
|
| 33 |
return input_features, decoder_input_ids
|
| 34 |
|
|
@@ -58,6 +58,9 @@ class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
|
|
| 58 |
# Prepare data function
|
| 59 |
def prepare_data(audio_data, sampling_rate, model_checkpoint="openai/whisper-base"):
|
| 60 |
|
|
|
|
|
|
|
|
|
|
| 61 |
# Resample audio data to 16000 Hz
|
| 62 |
audio_data_resampled = librosa.resample(audio_data, orig_sr=sampling_rate, target_sr=16000)
|
| 63 |
|
|
@@ -68,12 +71,15 @@ def prepare_data(audio_data, sampling_rate, model_checkpoint="openai/whisper-bas
|
|
| 68 |
dataset = SpeechInferenceDataset([{"audio": {"array": audio_data_resampled, "sampling_rate": 16000}}],
|
| 69 |
text_processor=feature_extractor)
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
# Prediction function
|
| 75 |
def predict(audio_data, sampling_rate, config):
|
| 76 |
-
|
| 77 |
|
| 78 |
model = SpeechClassifier(config).to(device)
|
| 79 |
# Here we load the model from Hugging Face Hub
|
|
@@ -81,23 +87,43 @@ def predict(audio_data, sampling_rate, config):
|
|
| 81 |
|
| 82 |
model.eval()
|
| 83 |
with torch.no_grad():
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
| 86 |
return predicted_ids
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
# Gradio Interface functions
|
| 89 |
def gradio_file_interface(uploaded_file):
|
| 90 |
# Assuming the uploaded_file is a filepath (str)
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
prediction = predict(audio_data, 16000, config) # Assume 16kHz sample rate
|
| 94 |
label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
|
| 95 |
return label
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
def gradio_mic_interface(mic_input):
|
| 98 |
# mic_input is a tuple with sample_rate and data as entries
|
| 99 |
# (44100, array([ 0, 0, 0, ..., -153, -140, -120], dtype=int16))
|
| 100 |
-
prediction = predict(mic_input[1], mic_input[0], config)
|
| 101 |
label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
|
| 102 |
return label
|
| 103 |
|
|
@@ -119,5 +145,5 @@ with gr.Blocks() as demo:
|
|
| 119 |
outputs=gr.Textbox(label="Prediction")
|
| 120 |
)
|
| 121 |
|
| 122 |
-
# Launch the demo
|
| 123 |
-
demo.launch()
|
|
|
|
| 28 |
inputs = self.text_processor(self.audio_data[index]["audio"]["array"],
|
| 29 |
return_tensors="pt",
|
| 30 |
sampling_rate=self.audio_data[index]["audio"]["sampling_rate"])
|
| 31 |
+
input_features = inputs.input_features.squeeze(0)
|
| 32 |
decoder_input_ids = torch.tensor([[1, 1]]) # Modify as per your model's requirements
|
| 33 |
return input_features, decoder_input_ids
|
| 34 |
|
|
|
|
| 58 |
# Prepare data function
|
| 59 |
def prepare_data(audio_data, sampling_rate, model_checkpoint="openai/whisper-base"):
|
| 60 |
|
| 61 |
+
# Convert audio data to float32
|
| 62 |
+
audio_data = audio_data.astype(np.float32)
|
| 63 |
+
|
| 64 |
# Resample audio data to 16000 Hz
|
| 65 |
audio_data_resampled = librosa.resample(audio_data, orig_sr=sampling_rate, target_sr=16000)
|
| 66 |
|
|
|
|
| 71 |
dataset = SpeechInferenceDataset([{"audio": {"array": audio_data_resampled, "sampling_rate": 16000}}],
|
| 72 |
text_processor=feature_extractor)
|
| 73 |
|
| 74 |
+
dataloader = DataLoader(dataset, batch_size=1)
|
| 75 |
+
|
| 76 |
+
return dataloader
|
| 77 |
+
# return dataset
|
| 78 |
|
| 79 |
|
| 80 |
# Prediction function
|
| 81 |
def predict(audio_data, sampling_rate, config):
|
| 82 |
+
dataloader = prepare_data(audio_data, sampling_rate, config["encoder"])
|
| 83 |
|
| 84 |
model = SpeechClassifier(config).to(device)
|
| 85 |
# Here we load the model from Hugging Face Hub
|
|
|
|
| 87 |
|
| 88 |
model.eval()
|
| 89 |
with torch.no_grad():
|
| 90 |
+
for input_features, decoder_input_ids in dataloader:
|
| 91 |
+
input_features = input_features.to(device)
|
| 92 |
+
decoder_input_ids = decoder_input_ids.to(device)
|
| 93 |
+
logits = model(input_features, decoder_input_ids)
|
| 94 |
+
predicted_ids = int(torch.argmax(logits, dim=-1))
|
| 95 |
return predicted_ids
|
| 96 |
+
|
| 97 |
+
# input_features, decoder_input_ids = prepare_data(audio_data, sampling_rate, config["encoder"])
|
| 98 |
+
|
| 99 |
+
# model = SpeechClassifier(config).to(device)
|
| 100 |
+
# # Here we load the model from Hugging Face Hub
|
| 101 |
+
# model.load_state_dict(torch.hub.load_state_dict_from_url("https://huggingface.co/jcho02/whisper_cleft/resolve/main/pytorch_model.bin", map_location=device))
|
| 102 |
+
|
| 103 |
+
# model.eval()
|
| 104 |
+
# with torch.no_grad():
|
| 105 |
+
# logits = model(input_features, decoder_input_ids)
|
| 106 |
+
# predicted_ids = int(torch.argmax(logits, dim=-1))
|
| 107 |
+
# return predicted_ids
|
| 108 |
|
| 109 |
# Gradio Interface functions
|
| 110 |
def gradio_file_interface(uploaded_file):
|
| 111 |
# Assuming the uploaded_file is a filepath (str)
|
| 112 |
+
audio_data, sampling_rate = librosa.load(uploaded_file, sr=None)
|
| 113 |
+
prediction = predict(audio_data, sampling_rate, config)
|
|
|
|
| 114 |
label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
|
| 115 |
return label
|
| 116 |
|
| 117 |
+
# with open(uploaded_file, "rb") as f:
|
| 118 |
+
# audio_data = np.frombuffer(f.read(), np.int16)
|
| 119 |
+
# prediction = predict(audio_data, 16000, config) # Assume 16kHz sample rate
|
| 120 |
+
# label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
|
| 121 |
+
# return label
|
| 122 |
+
|
| 123 |
def gradio_mic_interface(mic_input):
|
| 124 |
# mic_input is a tuple with sample_rate and data as entries
|
| 125 |
# (44100, array([ 0, 0, 0, ..., -153, -140, -120], dtype=int16))
|
| 126 |
+
prediction = predict(mic_input[1].astype(np.float32), mic_input[0], config)
|
| 127 |
label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
|
| 128 |
return label
|
| 129 |
|
|
|
|
| 145 |
outputs=gr.Textbox(label="Prediction")
|
| 146 |
)
|
| 147 |
|
| 148 |
+
# Launch the demo with debugging enabled
|
| 149 |
+
demo.launch(debug=True)
|