"""
Model A Dataset A - XLM-RoBERTa Language Detection

This module implements the XLM-RoBERTa based language detection model
fine-tuned on Dataset A (standard multilingual language detection dataset).

Model Architecture: XLM-RoBERTa (Model A)
Training Dataset: Dataset A (standard multilingual)
Performance: 97.9% accuracy across 60+ languages
"""

import logging
from typing import Dict, List, Any

from .base_model import BaseLanguageModel
from .model_config import get_model_config, get_supported_languages, get_language_name

try:
    import torch
    from transformers import pipeline
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    logging.warning("Transformers/torch not available. Please install with: pip install transformers torch")


class ModelADatasetA(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset A.
    
    This model represents the XLM-RoBERTa architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 97.9% accuracy with
    robust cross-lingual performance across 60+ languages.
    
    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset A (standard multilingual)
    Base Model: xlm-roberta-base
    Accuracy: 97.9%
    Parameters: 278M
    """
    
    def __init__(self):
        """Initialize the Model A Dataset A language detector."""
        self.model_key = "model-a-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]
        
        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset A. "
                "Install with: pip install transformers torch"
            )
        
        # Initialize the model pipeline, using the GPU if one is available, otherwise the CPU
        try:
            device = 0 if torch.cuda.is_available() else -1
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset A: {str(e)}")
    
    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset A (XLM-RoBERTa).
        
        Args:
            text (str): Input text to analyze
            
        Returns:
            Dict with predictions, metadata, and model information
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")
        
        try:
            # Run the model prediction
            results = self.classifier(text)
            
            # Handle the format returned by the pipeline
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    # Nested list format: [[{'label': 'en', 'score': 0.99}, ...]]
                    prediction_list = results[0]
                else:
                    # Direct list format: [{'label': 'en', 'score': 0.99}, ...]
                    prediction_list = results
            else:
                raise ValueError("Unexpected pipeline output format")
            
            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]
            
            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }
            
        except Exception as e:
            logging.error(f"Model A Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}")
    
    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset A.
        
        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)
    
    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset A.
        
        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        model_info = {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
        
        return model_info
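

# Illustrative usage sketch, not part of the module's public API. It assumes the
# Hugging Face model referenced by get_model_config("model-a-dataset-a") is cached
# locally or downloadable, and that the module is run with
# `python -m <package>.model_a_dataset_a` so the relative imports resolve.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    detector = ModelADatasetA()
    sample_text = "Language identification is a common preprocessing step."

    result = detector.predict(sample_text)
    top = result["predictions"][0]
    print(f"Top prediction: {top['language_code']} (confidence {top['confidence']:.3f})")
    print(f"Model version: {result['model_version']}")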