File size: 9,278 Bytes
72f90b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
"""
Language Detection Backend

This module provides the main LanguageDetector class and ModelRegistry
for managing multiple language detection models organized by architecture and dataset.

Model Architecture:
- Model A: XLM-RoBERTa based architectures  
- Model B: BERT based architectures

Training Datasets:
- Dataset A: Standard multilingual language detection dataset
- Dataset B: Enhanced/specialized language detection dataset
"""

import logging
from typing import Dict, List, Any

from .models import (
    BaseLanguageModel,
    ModelADatasetA,
    ModelBDatasetA,
    ModelADatasetB, 
    ModelBDatasetB,
    get_all_model_configs,
    get_language_name,
    LANGUAGE_MAPPINGS
)


class ModelRegistry:
    """
    Registry for managing available language detection models.
    
    This class handles the registration and creation of language detection models
    organized by model architecture (A: XLM-RoBERTa, B: BERT) and training 
    dataset (A: standard, B: enhanced).
    """
    
    def __init__(self):
        """Initialize the model registry with available models."""
        # Get model configurations from centralized config
        self.model_configs = get_all_model_configs()
        
        # Map model keys to their implementation classes
        self.model_classes = {
            "model-a-dataset-a": ModelADatasetA,      # XLM-RoBERTa + Dataset A
            "model-b-dataset-a": ModelBDatasetA,      # BERT + Dataset A  
            "model-a-dataset-b": ModelADatasetB,      # XLM-RoBERTa + Dataset B
            "model-b-dataset-b": ModelBDatasetB,      # BERT + Dataset B
        }
        
        # Build models registry by combining configs with classes
        self.models = {}
        
        # Add the new organized models
        for model_key, config in self.model_configs.items():
            if model_key in self.model_classes:
                self.models[model_key] = {
                    "class": self.model_classes[model_key],
                    "display_name": config["display_name"],
                    "description": config["description"],
                    "status": config["status"]
                }
    
    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """
        Get all registered models.
        
        Returns:
            Dict containing all model information
        """
        return self.models.copy()
    
    def create_model(self, model_key: str) -> BaseLanguageModel:
        """
        Create an instance of the specified model.
        
        Args:
            model_key (str): Key of the model to create
            
        Returns:
            BaseLanguageModel: Instance of the requested model
            
        Raises:
            ValueError: If the model key is not found
        """
        if model_key not in self.models:
            available_keys = list(self.models.keys())
            raise ValueError(f"Unknown model: {model_key}. Available models: {available_keys}")
        
        model_class = self.models[model_key]["class"]
        return model_class()


class LanguageDetector:
    """
    Main language detection class that orchestrates model predictions.
    
    This class provides a unified interface for language detection using
    different model architectures and training datasets. It handles model 
    switching and provides consistent output formatting.
    """
    
    def __init__(self, model_key: str = "model-a-dataset-a"):
        """
        Initialize the language detector.
        
        Args:
            model_key (str): Key of the model to use from the registry
                - "model-a-dataset-a": XLM-RoBERTa + standard dataset
                - "model-b-dataset-a": BERT + standard dataset  
                - "model-a-dataset-b": XLM-RoBERTa + enhanced dataset
                - "model-b-dataset-b": BERT + enhanced dataset
        """
        self.registry = ModelRegistry()
        self.current_model_key = model_key
        self.model = self.registry.create_model(model_key)
        
        # Use centralized language mappings
        self.language_names = LANGUAGE_MAPPINGS
    
    def switch_model(self, model_key: str):
        """
        Switch to a different model.
        
        Args:
            model_key (str): Key of the new model to use
            
        Raises:
            Exception: If model switching fails
        """
        try:
            self.model = self.registry.create_model(model_key)
            self.current_model_key = model_key
            logging.info(f"Successfully switched to model: {model_key}")
        except Exception as e:
            logging.error(f"Failed to switch to model {model_key}: {e}")
            raise
    
    def get_current_model_info(self) -> Dict[str, Any]:
        """
        Get information about the currently selected model.
        
        Returns:
            Dict containing current model information
        """
        return self.model.get_model_info()
    
    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """
        Get all available models.
        
        Returns:
            Dict containing all available models
        """
        return self.registry.get_available_models()
    
    def detect_language(self, text: str) -> Dict[str, Any]:
        """
        Detect the language of the input text.
        
        Args:
            text (str): Input text to analyze
            
        Returns:
            Dict containing:
                - language: Main predicted language name
                - language_code: Main predicted language code
                - confidence: Confidence score for main prediction
                - top_predictions: List of top 5 predictions with details
                - metadata: Additional information about the prediction
                
        Raises:
            ValueError: If input text is empty
            RuntimeError: If model prediction fails
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")
        
        # Get predictions from the current model
        model_result = self.model.predict(text.strip())
        predictions = model_result['predictions']
        
        if not predictions:
            raise RuntimeError("Model returned no predictions")
        
        # Extract main prediction
        top_prediction = predictions[0]
        main_language_code = top_prediction['language_code']
        main_confidence = top_prediction['confidence']
        
        # Get human-readable language name using centralized function
        main_language_name = get_language_name(main_language_code)
        
        # Format top predictions (limit to 5)
        top_predictions = []
        for pred in predictions[:5]:
            lang_code = pred['language_code']
            lang_name = get_language_name(lang_code)
            top_predictions.append({
                'language': lang_name,
                'language_code': lang_code,
                'confidence': pred['confidence']
            })
        
        # Prepare metadata
        metadata = {
            'text_length': model_result.get('text_length', len(text)),
            'model_name': model_result.get('model_version', 'unknown'),
            'model_type': model_result.get('model_type', 'unknown'),
            'current_model_key': self.current_model_key,
            'model_info': self.get_current_model_info()
        }
        
        return {
            'language': main_language_name,
            'language_code': main_language_code,
            'confidence': main_confidence,
            'top_predictions': top_predictions,
            'metadata': metadata
        }
    
    def get_supported_languages(self) -> Dict[str, str]:
        """
        Get dictionary of supported language codes and names.
        
        Returns:
            Dict mapping language codes to language names
        """
        supported_codes = self.model.get_supported_languages()
        return {
            code: get_language_name(code)
            for code in supported_codes
        }


# Example usage and testing
if __name__ == "__main__":
    # Initialize detector with default model (Model A Dataset A)
    detector = LanguageDetector()
    
    # Test with sample texts
    test_texts = [
        "Hello, how are you today?",
        "Bonjour, comment allez-vous?", 
        "Hola, ¿cómo estás?",
        "Guten Tag, wie geht es Ihnen?"
    ]
    
    print("Language Detection Test - Model A Dataset A")
    print("=" * 60)
    
    for text in test_texts:
        try:
            result = detector.detect_language(text)
            print(f"Text: {text}")
            print(f"Detected: {result['language']} ({result['language_code']}) - {result['confidence']:.3f}")
            print("---")
        except Exception as e:
            print(f"Error detecting language for '{text}': {e}")
            print("---")
    
    # Show available models
    print("\nAvailable Models:")
    models = detector.get_available_models()
    for key, info in models.items():
        status = "✅" if info["status"] == "available" else "🚧"
        print(f"{status} {info['display_name']} ({key}): {info['description']}")