File size: 9,989 Bytes
72f90b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""
Centralized Model Configuration

This module contains the configuration for all language detection models
organized by the model architecture and training dataset combinations.

Model Architectures:
- Model A: XLM-RoBERTa based architectures
- Model B: BERT based architectures

Training Datasets:
- Dataset A: Standard multilingual language detection dataset
- Dataset B: Enhanced/specialized language detection dataset
"""

from typing import Dict, Any, List

# Model configurations keyed by "<architecture>-<dataset>" identifier.
# Each entry holds the Hugging Face repo id plus display/reporting metadata
# (accuracy, size, training details) consumed by the UI and model loader.
MODEL_CONFIGURATIONS = {
    # XLM-RoBERTa fine-tuned on Dataset A: general-purpose, broad coverage.
    "model-a-dataset-a": {
        "huggingface_model": "ZheYu03/xlm-r-langdetect-model",
        "display_name": "XLM-RoBERTa Model A Dataset A",
        "short_name": "Model A Dataset A",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset A",
        "accuracy": "97.9%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.",
        "training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset",
        "use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis",
        "strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage",
        "limitations": "Higher computational requirements, moderate inference speed",
        "status": "available"
    },

    # Multilingual BERT fine-tuned on Dataset A: faster/lighter alternative.
    # NOTE(review): the repo id says "xlm-r" but the metadata says BERT —
    # confirm the Hugging Face repo actually hosts the BERT variant.
    "model-b-dataset-a": {
        "huggingface_model": "SongJuNN/xlm-r-langdetect-model",
        "display_name": "BERT Model B Dataset A",
        "short_name": "Model B Dataset A",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset A",
        "accuracy": "96.17%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.",
        "training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency",
        "use_cases": "High-throughput language detection, real-time applications, resource-constrained environments",
        "strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing",
        "limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
        "status": "available"
    },

    # XLM-RoBERTa fine-tuned on Dataset B: high accuracy, 20 languages only.
    "model-a-dataset-b": {
        "huggingface_model": "zues0102/xlmr-papluca-model",
        "display_name": "XLM-RoBERTa Model A Dataset B",
        "short_name": "Model A Dataset B",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset B",
        "accuracy": "99.72%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "training_loss": 0.0176,
        "languages_supported": 20,
        "description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.",
        "training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176",
        "use_cases": "Research applications, high-precision language detection, critical accuracy requirements",
        "strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results",
        "limitations": "Higher computational requirements, limited to 20 languages",
        "status": "available"
    },

    # Multilingual BERT fine-tuned on Dataset B: highest reported accuracy.
    "model-b-dataset-b": {
        "huggingface_model": "zues0102/bert-base-multilingual-cased",
        "display_name": "BERT Model B Dataset B",
        "short_name": "Model B Dataset B",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset B",
        "accuracy": "99.85%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "training_loss": 0.0125,
        "languages_supported": 20,
        "description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.",
        "training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy",
        "use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification",
        "strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture",
        "limitations": "Limited to 20 languages, specialized for specific language set",
        "status": "available"
    }
}

# Map from language code (lowercase, mostly ISO 639-1, plus regional/script
# variants like 'zh-cn'/'zh-hant') to a human-readable English language name.
# Used by get_language_name() below; lookups are case-insensitive there.
LANGUAGE_MAPPINGS = {
    'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese', 
    'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali',
    'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech', 
    'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha',
    'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish',
    'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish', 
    'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic',
    'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew', 
    'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian',
    'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian', 
    'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh',
    'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish',
    'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao',
    'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian', 
    'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay',
    'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål',
    'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
    'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish',
    'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian', 
    'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala',
    'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian',
    'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 
    'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish',
    'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese',
    'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish',
    'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)',
    'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu'
}

# Per-model language support: model key -> list of language codes the model
# can detect. Keys mirror MODEL_CONFIGURATIONS; codes resolve to names via
# LANGUAGE_MAPPINGS. Dataset-A models cover the broad (~100 code) set;
# Dataset-B models are restricted to the same 20-language subset.
MODEL_LANGUAGE_SUPPORT = {
    # Broad coverage set for Dataset-A models.
    "model-a-dataset-a": [
        'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 
        'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu', 
        'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi', 
        'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 
        'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg', 
        'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn', 
        'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw', 
        'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 
        'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi', 
        'yo', 'zh', 'zu'
    ],

    # Identical broad set for the BERT Dataset-A model.
    "model-b-dataset-a": [
        'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 
        'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu', 
        'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi', 
        'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 
        'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg', 
        'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn', 
        'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw', 
        'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', 
        'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi', 
        'yo', 'zh', 'zu'
    ],

    # 20-language subset used by both Dataset-B models.
    "model-a-dataset-b": [
        'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
        'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
    ],

    "model-b-dataset-b": [
        'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
        'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
    ]
}

def get_model_config(model_key: str) -> Dict[str, Any]:
    """Return the configuration for a specific model.

    Args:
        model_key: Registry key such as ``"model-a-dataset-a"``.

    Returns:
        A shallow copy of the model's configuration dict, or an empty dict
        for an unknown key. A copy is returned so callers cannot mutate the
        module-level MODEL_CONFIGURATIONS registry — consistent with
        get_all_model_configs(), which also copies.
    """
    return dict(MODEL_CONFIGURATIONS.get(model_key, {}))

def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Return a shallow copy of the full model-configuration registry.

    Copying keeps callers from accidentally mutating the module-level
    MODEL_CONFIGURATIONS mapping.
    """
    return dict(MODEL_CONFIGURATIONS)

def get_supported_languages(model_key: str) -> List[str]:
    """Return the language codes supported by a specific model.

    Args:
        model_key: Registry key such as ``"model-a-dataset-b"``.

    Returns:
        A copy of the model's language-code list, or an empty list for an
        unknown key. A copy is returned so callers cannot mutate the
        module-level MODEL_LANGUAGE_SUPPORT constant — consistent with
        get_all_model_configs(), which also returns a copy.
    """
    return list(MODEL_LANGUAGE_SUPPORT.get(model_key, []))

def get_language_name(language_code: str) -> str:
    """Return the human-readable name for a language code.

    Lookup is case-insensitive; an unrecognized code yields
    ``"Unknown (<original code>)"``.
    """
    normalized = language_code.lower()
    if normalized in LANGUAGE_MAPPINGS:
        return LANGUAGE_MAPPINGS[normalized]
    return f"Unknown ({language_code})"