# Web-page residue from the hosting site (not part of the program): jcho02 - "Update app.py" - commit 73b065a (verified) - 4 kB
import gradio as gr
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import WhisperModel, WhisperFeatureExtractor
import datasets
from datasets import load_dataset, DatasetDict, Audio
from huggingface_hub import PyTorchModelHubMixin
import numpy as np
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
_accelerator = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_accelerator)

# Model settings: "encoder" is the checkpoint name handed to
# ``from_pretrained``; "num_labels" is the width of the classifier output.
config = {
    "encoder": "openai/whisper-base",
    "num_labels": 2,
}
# Define data class
class SpeechInferenceDataset(Dataset):
    """Map-style dataset that converts audio records into model inputs.

    Args:
        audio_data: Indexable, sized collection of records. Each record is
            read as ``record["audio"]["array"]`` (the waveform) and
            ``record["audio"]["sampling_rate"]``.
        text_processor: Callable invoked as
            ``text_processor(waveform, return_tensors="pt", sampling_rate=...)``
            whose result exposes an ``input_features`` attribute.
            NOTE(review): despite the name this is presumably a Whisper
            feature extractor - confirm against the caller.
    """

    def __init__(self, audio_data, text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        """Return the number of audio records."""
        return len(self.audio_data)

    def __getitem__(self, index):
        """Return ``(input_features, decoder_input_ids)`` for one record."""
        # Fetch the record once: indexing the backing collection can be
        # costly (e.g. when audio is decoded on access), and the original
        # code paid that cost twice per item.
        audio = self.audio_data[index]["audio"]
        inputs = self.text_processor(
            audio["array"],
            return_tensors="pt",
            sampling_rate=audio["sampling_rate"],
        )
        # Fixed two-token decoder prompt. Modify as per your model's requirements.
        decoder_input_ids = torch.tensor([[1, 1]])
        return inputs.input_features, decoder_input_ids
# Define model class
class SpeechClassifier(nn.Module, PyTorchModelHubMixin):
    """Pretrained Whisper model followed by an MLP classification head.

    Args:
        config: Mapping with ``"encoder"`` (checkpoint name passed to
            ``WhisperModel.from_pretrained``) and ``"num_labels"`` (number
            of output logits).
    """

    def __init__(self, config):
        super().__init__()
        self.encoder = WhisperModel.from_pretrained(config["encoder"])

        # Head widths: hidden_size -> 4096 -> 2048 -> 1024 -> 512 -> num_labels,
        # with a ReLU after every linear layer except the last. The layers
        # are appended in this exact order so their positions inside the
        # Sequential (and therefore the state-dict keys) stay the same.
        widths = (self.encoder.config.hidden_size, 4096, 2048, 1024, 512)
        head = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            head.append(nn.Linear(fan_in, fan_out))
            head.append(nn.ReLU())
        head.append(nn.Linear(widths[-1], config["num_labels"]))
        self.classifier = nn.Sequential(*head)

    def forward(self, input_features, decoder_input_ids):
        """Return class logits for a batch of audio features.

        Both arguments are forwarded unchanged to the Whisper model; the
        hidden state at sequence position 0 of its ``last_hidden_state`` is
        fed to the classification head.
        NOTE(review): ``WhisperModel`` is the full encoder-decoder, so this
        is presumably the decoder's first output position - confirm.
        """
        whisper_out = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        first_position_state = whisper_out['last_hidden_state'][:, 0, :]
        return self.classifier(first_position_state)
# Prepare data function
# Feature extractors that have already been loaded, keyed by checkpoint name,
# so repeated predictions do not reload them.
_FEATURE_EXTRACTORS = {}


def prepare_data(audio_data, sampling_rate, model_checkpoint="openai/whisper-base"):
    """Turn a raw waveform into the tensors the classifier expects.

    Args:
        audio_data: Waveform passed straight to the Whisper feature extractor.
        sampling_rate: Sampling rate of ``audio_data``, forwarded to the
            feature extractor.
        model_checkpoint: Checkpoint name used to load the feature extractor.

    Returns:
        Tuple ``(input_features, decoder_input_ids)``, both moved to the
        module-level ``device``.
    """
    # Load the extractor only the first time a checkpoint is requested; the
    # original code called from_pretrained on every single invocation.
    feature_extractor = _FEATURE_EXTRACTORS.get(model_checkpoint)
    if feature_extractor is None:
        feature_extractor = WhisperFeatureExtractor.from_pretrained(model_checkpoint)
        _FEATURE_EXTRACTORS[model_checkpoint] = feature_extractor

    inputs = feature_extractor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
    input_features = inputs.input_features
    # Fixed two-token decoder prompt. Modify as per your model's requirements.
    decoder_input_ids = torch.tensor([[1, 1]])
    return input_features.to(device), decoder_input_ids.to(device)
# Prediction function
# Location of the fine-tuned classifier weights.
_MODEL_WEIGHTS_URL = "https://huggingface.co/jcho02/whisper_cleft/resolve/main/pytorch_model.bin"

# Ready-to-use (weights loaded, eval mode) classifiers keyed by the config
# values that determine the architecture.
_LOADED_MODELS = {}


def _get_model(config):
    """Return the eval-mode classifier for *config*, building it on first use."""
    key = (config["encoder"], config["num_labels"])
    model = _LOADED_MODELS.get(key)
    if model is None:
        model = SpeechClassifier(config).to(device)
        # NOTE(review): this fetches a serialized checkpoint from a remote
        # URL and deserializes it; that is only acceptable because the URL
        # is a fixed, trusted location - do not make it user-controlled.
        state_dict = torch.hub.load_state_dict_from_url(_MODEL_WEIGHTS_URL, map_location=device)
        model.load_state_dict(state_dict)
        model.eval()
        _LOADED_MODELS[key] = model
    return model


def predict(audio_data, sampling_rate, config):
    """Classify one waveform and return the predicted label index.

    Args:
        audio_data: Waveform handed to ``prepare_data``.
        sampling_rate: Sampling rate of ``audio_data``.
        config: Mapping with ``"encoder"`` and ``"num_labels"``.

    Returns:
        The arg-max class index as a Python ``int``.
    """
    input_features, decoder_input_ids = prepare_data(audio_data, sampling_rate, config["encoder"])
    # The model is built and its weights loaded once, then reused; the
    # original code rebuilt it and reloaded the checkpoint on every call.
    model = _get_model(config)
    with torch.no_grad():
        logits = model(input_features, decoder_input_ids)
    return int(torch.argmax(logits, dim=-1))
# Unified Gradio interface function
def _to_mono_float32(samples):
    """Convert Gradio audio samples to a 1-D float32 waveform."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Scale signed integer PCM (e.g. int16) into [-1.0, 1.0).
        full_scale = -float(np.iinfo(samples.dtype).min)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Gradio lays multi-channel audio out as (samples, channels);
        # average the channels to get a single mono track.
        samples = samples.mean(axis=1)
    return samples


def gradio_interface(audio_input):
    """Run the classifier on audio coming from the Gradio ``Audio`` component.

    Args:
        audio_input: ``None`` when nothing was recorded/uploaded, a
            ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or a path to a file of raw samples.

    Returns:
        A human-readable label for the text box.
    """
    if audio_input is None:
        # Nothing was recorded or uploaded; previously this crashed in open().
        return "No audio provided"

    if isinstance(audio_input, tuple):
        # gr.Audio(type="numpy") delivers (sample_rate, samples) - the rate
        # comes FIRST. The original code unpacked the pair the other way round.
        sample_rate, raw_samples = audio_input
        audio_data = _to_mono_float32(raw_samples)
        # NOTE(review): no resampling is done here; the Whisper feature
        # extractor is expected to reject rates other than the one it was
        # trained with - confirm, and resample upstream if needed.
    else:
        # NOTE(review): this branch is not reached with type="numpy". It
        # interprets the whole file as headerless 16-bit PCM, so container
        # headers (e.g. WAV) would be read as samples - confirm before use.
        with open(audio_input, "rb") as f:
            audio_data = np.frombuffer(f.read(), np.int16)
        sample_rate = 16000  # Assume 16kHz sample rate for uploaded files

    prediction = predict(audio_data, sample_rate, config)
    return "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
# Build the Gradio UI: one audio input wired to gradio_interface, with the
# returned label shown in a text box.
_audio_input = gr.Audio(type="numpy", label="Upload or Record Audio")
_prediction_output = gr.Textbox(label="Prediction")
demo = gr.Interface(
    fn=gradio_interface,
    inputs=_audio_input,
    outputs=_prediction_output,
)

# Start serving the demo with Gradio's debug flag enabled.
demo.launch(debug=True)