import gradio as gr
import torch
import numpy as np
from transformers import (
    AutoModel, AutoProcessor, AutoFeatureExtractor,
    AutoTokenizer, pipeline
)
import warnings

warnings.filterwarnings("ignore")
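
# Gradio tool for checking whether a Hugging Face model is a plausible candidate
# for speech-based depression detection: it loads the model and its audio
# processor, runs a dummy forward pass, and reports a heuristic suitability score.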


def test_single_model(model_name):
    """Test compatibility of a single model"""

    if not model_name.strip():
        return "Please enter a model name"

    result_text = f"🔍 Testing Model: {model_name}\n"
    result_text += "=" * 60 + "\n\n"

    try:
        result_text += "1️⃣ Loading Model...\n"
        model = AutoModel.from_pretrained(model_name)
        result_text += "   ✅ Model loaded successfully\n"
        result_text += f"   📋 Model type: {model.config.model_type}\n"
        result_text += f"   🏗️ Model class: {model.__class__.__name__}\n\n"

        result_text += "2️⃣ Checking Model Architecture...\n"
        if hasattr(model.config, 'hidden_size'):
            result_text += f"   🔢 Hidden size: {model.config.hidden_size}\n"
        if hasattr(model.config, 'num_hidden_layers'):
            result_text += f"   📚 Number of layers: {model.config.num_hidden_layers}\n"
        if hasattr(model.config, 'vocab_size'):
            result_text += f"   📖 Vocabulary size: {model.config.vocab_size}\n"
        result_text += "\n"

        # Try a full processor first, then fall back to a plain feature extractor
        result_text += "3️⃣ Loading Processor...\n"
        processor = None
        supports_audio = False

        try:
            processor = AutoProcessor.from_pretrained(model_name)
            result_text += f"   ✅ Processor loaded successfully: {processor.__class__.__name__}\n"
            supports_audio = True
        except Exception:
            try:
                processor = AutoFeatureExtractor.from_pretrained(model_name)
                result_text += f"   ✅ Feature extractor loaded successfully: {processor.__class__.__name__}\n"
                supports_audio = True
            except Exception:
                result_text += "   ❌ Cannot load audio processor\n"
                supports_audio = False
        result_text += "\n"

result_text += "4οΈβ£ Checking Input Requirements...\n" |
|
sampling_rate = 16000 |
|
if processor and supports_audio: |
|
if hasattr(processor, 'sampling_rate'): |
|
sampling_rate = processor.sampling_rate |
|
result_text += f" π΅ Sampling rate: {sampling_rate} Hz\n" |
|
if hasattr(processor, 'feature_size'): |
|
result_text += f" π Feature dimension: {processor.feature_size}\n" |
|
if hasattr(processor, 'return_attention_mask'): |
|
result_text += f" π Supports attention mask: {processor.return_attention_mask}\n" |
|
result_text += "\n" |
|
|
|
|
|
result_text += "5οΈβ£ Testing Inference...\n" |
|
if supports_audio: |
|
try: |
|
|
|
dummy_audio = np.random.randn(sampling_rate * 2).astype(np.float32) |
|
|
|
|
|
inputs = processor(dummy_audio, sampling_rate=sampling_rate, return_tensors="pt") |
|
|
|
|
|
with torch.no_grad(): |
|
outputs = model(**inputs) |
|
|
|
|
|
if hasattr(outputs, 'last_hidden_state'): |
|
shape = outputs.last_hidden_state.shape |
|
result_text += f" β
Inference successful! Hidden state shape: {shape}\n" |
|
elif hasattr(outputs, 'logits'): |
|
shape = outputs.logits.shape |
|
result_text += f" β
Inference successful! Logits shape: {shape}\n" |
|
else: |
|
result_text += f" β
Inference successful! Output type: {type(outputs)}\n" |
|
|
|
except Exception as e: |
|
result_text += f" β Inference failed: {str(e)}\n" |
|
else: |
|
result_text += " β οΈ Audio input not supported, skipping inference test\n" |
|
result_text += "\n" |
|
|
|
|
|
result_text += "6οΈβ£ Multilingual Support Check...\n" |
|
multilingual = False |
|
|
|
if hasattr(model.config, 'vocab_size') and model.config.vocab_size > 50000: |
|
result_text += f" β
Likely supports multiple languages (large vocabulary: {model.config.vocab_size})\n" |
|
multilingual = True |
|
elif any(keyword in model_name.lower() for keyword in ['xlsr', 'multilingual', 'cross-lingual']): |
|
result_text += " β
Supports multiple languages based on model name\n" |
|
multilingual = True |
|
else: |
|
result_text += " β Multilingual support unclear\n" |
|
result_text += "\n" |
|
|
|
|
|
result_text += "7οΈβ£ Depression Detection Suitability Assessment...\n" |
|
score = 0 |
|
max_score = 15 |
|
|
|
|
|
depression_keywords = ['depression', 'mental-health', 'psychological', 'mood', 'phq'] |
|
emotion_keywords = ['emotion', 'sentiment', 'affective', 'feeling'] |
|
|
|
if any(keyword in model_name.lower() for keyword in depression_keywords): |
|
score += 6 |
|
result_text += " π― Specifically for depression/mental health detection (+6 points)\n" |
|
elif any(keyword in model_name.lower() for keyword in emotion_keywords): |
|
score += 3 |
|
result_text += " π For emotion recognition, potentially applicable (+3 points)\n" |
|
|
|
|
|
if supports_audio: |
|
score += 2 |
|
result_text += " π΅ Supports audio input (+2 points)\n" |
|
else: |
|
result_text += " β Does not support audio input (0 points)\n" |
|
|
|
|
|
if multilingual: |
|
score += 2 |
|
result_text += " π Supports multiple languages (+2 points)\n" |
|
|
|
|
|
if model.config.model_type in ['wav2vec2', 'hubert', 'wavlm']: |
|
score += 2 |
|
result_text += " ποΈ Excellent speech representation learning architecture (+2 points)\n" |
|
elif model.config.model_type == 'whisper': |
|
score += 1 |
|
result_text += " β οΈ Whisper architecture needs modification for classification (+1 point)\n" |
|
|
|
|
|
if hasattr(model.config, 'num_labels'): |
|
if model.config.num_labels == 2: |
|
score += 1 |
|
result_text += f" β
Binary classification task configuration (likely depression detection) (+1 point)\n" |
|
else: |
|
score += 0.5 |
|
result_text += f" β οΈ Multi-class task ({model.config.num_labels} classes) (+0.5 points)\n" |
|
|
|
|
|
daic_keywords = ['daic', 'wizard-of-oz', 'depression-detection', 'clinical'] |
|
if any(keyword in model_name.lower() for keyword in daic_keywords): |
|
score += 2 |
|
result_text += " π Possibly trained on clinical depression datasets (+2 points)\n" |
|
|
|
result_text += f"\nπ― Depression Detection Suitability Score: {score}/{max_score}\n" |
|
|
|
|
|
result_text += "\n8οΈβ£ Usage Recommendations...\n" |
|
if score >= 12: |
|
result_text += " π Highly recommended! Specifically for depression detection, very suitable\n" |
|
elif score >= 8: |
|
result_text += " π Recommended, may need some fine-tuning\n" |
|
elif score >= 5: |
|
result_text += " β οΈ Use with caution, may need significant modification\n" |
|
else: |
|
result_text += " β Not recommended, suggest finding specialized depression detection models\n" |
|
|
|
|
|
result_text += "\n9οΈβ£ Further Inspection Suggestions...\n" |
|
result_text += " π Check model card for training data description\n" |
|
result_text += " π Check if DAIC-WOZ or other depression datasets are mentioned\n" |
|
result_text += " π Check papers or documentation for task description\n" |
|
result_text += " π§ͺ Test with small samples to see if model output matches depression detection expectations\n" |
|
|
|
return result_text |
|
|
|
except Exception as e: |
|
error_msg = f"β Model test failed: {str(e)}\n" |
|
error_msg += "\nPossible causes:\n" |
|
error_msg += "β’ Incorrect model name\n" |
|
error_msg += "β’ Model requires special permissions\n" |
|
error_msg += "β’ Network connection issues\n" |
|
error_msg += "β’ Model architecture incompatibility\n" |
|
return error_msg |
|
|
|
def test_recommended_models():
    """Test a list of recommended models"""

    recommended_models = [
        "facebook/wav2vec2-large-xlsr-53",
        "microsoft/wavlm-large",
        "harshit345/xlsr-wav2vec-speech-emotion-recognition",
        "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
        "speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
    ]

    result_text = "🚀 Batch Testing Recommended Models\n"
    result_text += "=" * 60 + "\n\n"

    results = []

    for i, model_name in enumerate(recommended_models, 1):
        result_text += f"🔍 Testing {i}/{len(recommended_models)}: {model_name}\n"
        result_text += "-" * 50 + "\n"

        try:
            model = AutoModel.from_pretrained(model_name)

            # Audio support: try a full processor first, then a feature extractor
            supports_audio = False
            try:
                processor = AutoProcessor.from_pretrained(model_name)
                supports_audio = True
            except Exception:
                try:
                    processor = AutoFeatureExtractor.from_pretrained(model_name)
                    supports_audio = True
                except Exception:
                    pass

            # Multilingual heuristics (same as in the single-model test)
            multilingual = False
            if hasattr(model.config, 'vocab_size') and model.config.vocab_size > 50000:
                multilingual = True
            elif any(keyword in model_name.lower() for keyword in ['xlsr', 'multilingual']):
                multilingual = True

            # Simplified score out of 8
            score = 0
            if supports_audio:
                score += 3
            if multilingual:
                score += 2
            if model.config.model_type in ['wav2vec2', 'hubert', 'wavlm']:
                score += 3

            results.append({
                'name': model_name,
                'score': score,
                'audio': supports_audio,
                'multilingual': multilingual,
                'type': model.config.model_type
            })

            result_text += f"✅ Loaded successfully | Audio: {'✅' if supports_audio else '❌'} | Multilingual: {'✅' if multilingual else '❌'} | Score: {score}/8\n\n"

        except Exception as e:
            result_text += f"❌ Loading failed: {str(e)}\n\n"

    results.sort(key=lambda x: x['score'], reverse=True)

    result_text += "🏆 Recommendation Rankings:\n"
    result_text += "=" * 40 + "\n"

    for i, model in enumerate(results, 1):
        result_text += f"{i}. {model['name']}\n"
        result_text += f"   Score: {model['score']}/8 | Type: {model['type']}\n\n"

    return result_text


with gr.Blocks(title="🎤 Depression Detection Model Compatibility Test") as app:
    gr.Markdown("""
    # 🎤 Depression Detection Model Compatibility Test Tool

    This tool helps you quickly test whether Hugging Face models are suitable for depression detection tasks.

    ## Features:
    - ✅ Check model loading compatibility
    - 🎵 Verify audio input support
    - 🌍 Assess multilingual capabilities
    - 📊 Suitability scoring (0-15 points)
    - 💡 Usage recommendations
    """)

with gr.Tab("Single Model Test"): |
|
with gr.Row(): |
|
model_input = gr.Textbox( |
|
placeholder="Enter model name, e.g.: facebook/wav2vec2-large-xlsr-53", |
|
label="π Model Name", |
|
value="ireneminhee/speech-to-depression" |
|
) |
|
test_btn = gr.Button("π Start Test", variant="primary") |
|
|
|
result_output = gr.Textbox( |
|
label="π Test Results", |
|
lines=25, |
|
max_lines=50 |
|
) |
|
|
|
test_btn.click( |
|
fn=test_single_model, |
|
inputs=[model_input], |
|
outputs=[result_output] |
|
) |
|
|
|
with gr.Tab("Recommended Models Batch Test"): |
|
gr.Markdown(""" |
|
### π Recommended Depression Detection Candidate Models |
|
|
|
These models perform well in speech emotion recognition and multilingual support: |
|
- `facebook/wav2vec2-large-xlsr-53` - Multilingual speech representation learning |
|
- `microsoft/wavlm-large` - Speech understanding specialized model |
|
- `harshit345/xlsr-wav2vec-speech-emotion-recognition` - Emotion recognition |
|
- `audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim` - Emotion dimension recognition |
|
- `speechbrain/emotion-recognition-wav2vec2-IEMOCAP` - Emotion classification |
|
""") |

        batch_test_btn = gr.Button("🚀 Batch Test Recommended Models", variant="primary")
        batch_result_output = gr.Textbox(
            label="📊 Batch Test Results",
            lines=20,
            max_lines=50
        )

        batch_test_btn.click(
            fn=test_recommended_models,
            inputs=[],
            outputs=[batch_result_output]
        )

with gr.Tab("Usage Instructions"): |
|
gr.Markdown(""" |
|
## π Usage Instructions |
|
|
|
### Scoring Criteria (Redesigned): |
|
- **Depression-specific model** (+6 points): Specifically for depression/mental health detection |
|
- **Emotion recognition model** (+3 points): For emotion recognition, potentially applicable |
|
- **Audio support** (+2 points): Whether the model can process audio input |
|
- **Multilingual support** (+2 points): Support for Chinese, English, German, Russian |
|
- **Architecture suitability** (+2 points): Whether model architecture is suitable for speech classification |
|
- **Classification configuration** (+1 point): Whether configured for classification tasks |
|
- **Clinical datasets** (+2 points): Whether trained on clinical depression datasets |
|
|
|
### Score Interpretation: |
|
- **12-15 points**: π Highly recommended, specialized depression detection model |
|
- **8-11 points**: π Recommended, may need fine-tuning |
|
- **5-7 points**: β οΈ Use with caution, needs modification |
|
- **0-4 points**: β Not recommended |
|
|
|
### Next Steps: |
|
1. Select the top 2-3 models with highest scores |
|
2. Conduct in-depth testing in Google Colab |
|
3. Fine-tune using DAIC-WOZ dataset |
|
4. Final evaluation with your multilingual data |
|
""") |


if __name__ == "__main__":
    app.launch()