Transformers_whisper_cleft

Sleeping

App Files Files Community

jcho02 committed on Apr 2, 2024

Commit

2c19de2

verified ·

1 Parent(s): 6d9e5fd

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -10

app.py CHANGED Viewed

@@ -1,5 +1,3 @@
 # Import necessary libraries
 import gradio as gr
 import torch
@@ -10,6 +8,9 @@ import datasets
 from datasets import load_dataset, DatasetDict, Audio
 from huggingface_hub import PyTorchModelHubMixin
 # Define data class
 class SpeechInferenceDataset(Dataset):
     def __init__(self, audio_data, text_processor):
@@ -57,7 +58,6 @@ def prepare_data(audio_file_path, model_checkpoint="openai/whisper-base"):
     inference_dataset = SpeechInferenceDataset(inference_data, feature_extractor)
     inference_loader = DataLoader(inference_dataset, batch_size=1, shuffle=False)
     input_features, decoder_input_ids = next(iter(inference_loader))
-    # Replace 'device' with your device configuration (e.g., 'cuda' or 'cpu')
     input_features = input_features.squeeze(1).to(device)
     decoder_input_ids = decoder_input_ids.squeeze(1).to(device)
     return input_features, decoder_input_ids
@@ -68,6 +68,8 @@ def predict(audio_file_path, config={"encoder": "openai/whisper-base", "num_labe
     # Load the model from Hugging Face Hub
     model = SpeechClassifier(config)
     model.load_state_dict(torch.load(model.push_from_hub("jcho02/whisper_cleft")))
     model.eval()
@@ -76,16 +78,38 @@ def predict(audio_file_path, config={"encoder": "openai/whisper-base", "num_labe
         predicted_ids = int(torch.argmax(logits, dim=-1))
     return predicted_ids
-# Gradio Interface function
-def gradio_interface(uploaded_file):
     with open(uploaded_file.name, "wb") as f:
         f.write(uploaded_file.read())
     prediction = predict(uploaded_file.name)
     label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
     return label
-# Create and launch Gradio Interface with File upload input
-iface = gr.Interface(fn=gradio_interface,
-                     inputs=gr.inputs.File(label="Upload Audio File"),
-                     outputs="text")
-iface.launch()

 # Import necessary libraries
 import gradio as gr
 import torch
 from datasets import load_dataset, DatasetDict, Audio
 from huggingface_hub import PyTorchModelHubMixin
+# Ensure you have the device setup (cuda or cpu)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Define data class
 class SpeechInferenceDataset(Dataset):
     def __init__(self, audio_data, text_processor):
     inference_dataset = SpeechInferenceDataset(inference_data, feature_extractor)
     inference_loader = DataLoader(inference_dataset, batch_size=1, shuffle=False)
     input_features, decoder_input_ids = next(iter(inference_loader))
     input_features = input_features.squeeze(1).to(device)
     decoder_input_ids = decoder_input_ids.squeeze(1).to(device)
     return input_features, decoder_input_ids
     # Load the model from Hugging Face Hub
     model = SpeechClassifier(config)
+    model.to(device)
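+    # NOTE(review): the loading call below is a placeholder. PyTorchModelHubMixin documents from_pretrained() and push_to_hub(); `push_from_hub` is not one of its methods — confirm the intended way to fetch the weights.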
     model.load_state_dict(torch.load(model.push_from_hub("jcho02/whisper_cleft")))
     model.eval()
         predicted_ids = int(torch.argmax(logits, dim=-1))
     return predicted_ids
+# Gradio Interface function for uploaded files
+def gradio_file_interface(uploaded_file):
+    """Classify an uploaded audio file and return the result as display text.
+
+    Args:
+        uploaded_file: Path string, as delivered by an audio input declared
+            with ``type="filepath"``, or an object exposing the path as
+            ``.name``. ``None`` when nothing was uploaded.
+
+    Returns:
+        "Hypernasality Detected" when ``predict`` returns 1, otherwise
+        "No Hypernasality Detected"; a short prompt when no file was given.
+    """
+    if uploaded_file is None:
+        return "Please provide an audio file."
+    # Do not reopen the path in "wb" mode and write the file back onto itself:
+    # opening for writing truncates the audio before it can be read. Only the
+    # path is needed, so accept either a plain string or a `.name` carrier.
+    audio_path = getattr(uploaded_file, "name", uploaded_file)
+    prediction = predict(audio_path)
+    label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
+    return label
+# Gradio Interface function for microphone input
+def gradio_mic_interface(mic_input):
+    """Classify a microphone recording and return the result as display text.
+
+    Args:
+        mic_input: Path string, as delivered by an audio input declared with
+            ``type="filepath"``, or an object exposing the path as ``.name``.
+            ``None`` when nothing was recorded.
+
+    Returns:
+        "Hypernasality Detected" when ``predict`` returns 1, otherwise
+        "No Hypernasality Detected"; a short prompt when no audio was given.
+    """
+    if mic_input is None:
+        return "Please record some audio first."
+    # A "filepath" audio input hands over a plain string, which has no
+    # `.name` attribute; fall back to the value itself in that case.
+    audio_path = getattr(mic_input, "name", mic_input)
+    prediction = predict(audio_path)
+    label = "Hypernasality Detected" if prediction == 1 else "No Hypernasality Detected"
+    return label
+# Build each tab's Interface before opening the Blocks context, following the
+# documented TabbedInterface usage, so that the tabbed view is the only
+# layout created inside `demo`.
+# NOTE(review): `source=` is the Gradio 3.x keyword; Gradio 4+ expects
+# `sources=[...]` — confirm against the version pinned for this Space.
+mic_transcribe = gr.Interface(
+    fn=gradio_mic_interface,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs=gr.Textbox(label="Prediction"),
+)
+file_transcribe = gr.Interface(
+    fn=gradio_file_interface,
+    inputs=gr.Audio(source="upload", type="filepath"),
+    outputs=gr.Textbox(label="Prediction"),
+)
+# Initialize Blocks
+demo = gr.Blocks()
+# Place the two prebuilt interfaces into tabs inside the Blocks context
+with demo:
+    gr.TabbedInterface(
+        [mic_transcribe, file_transcribe],
+        ["Transcribe Microphone", "Transcribe Audio File"],
+    )
+# Launch the demo
+demo.launch(debug=True)