"""
Centralized Model Configuration
This module contains the configuration for all language detection models
organized by the model architecture and training dataset combinations.
Model Architecture:
- Model A: XLM-RoBERTa based architectures
- Model B: BERT based architectures
Training Datasets:
- Dataset A: Standard multilingual language detection dataset
- Dataset B: Enhanced/specialized language detection dataset
"""
from typing import Dict, Any, List
# Model configurations organized by architecture and dataset
MODEL_CONFIGURATIONS = {
"model-a-dataset-a": {
"huggingface_model": "ZheYu03/xlm-r-langdetect-model",
"display_name": "XLM-RoBERTa Model A Dataset A",
"short_name": "Model A Dataset A",
"architecture": "XLM-RoBERTa",
"base_model": "xlm-roberta-base",
"dataset": "Dataset A",
"accuracy": "97.9%",
"model_size": "278M parameters",
"training_epochs": 10,
"languages_supported": 100,
"description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.",
"training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset",
"use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis",
"strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage",
"limitations": "Higher computational requirements, moderate inference speed",
"status": "available"
},
"model-b-dataset-a": {
"huggingface_model": "SongJuNN/xlm-r-langdetect-model",
"display_name": "BERT Model B Dataset A",
"short_name": "Model B Dataset A",
"architecture": "BERT",
"base_model": "bert-base-multilingual-cased",
"dataset": "Dataset A",
"accuracy": "96.17%",
"model_size": "178M parameters",
"training_epochs": 10,
"languages_supported": 100,
"description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.",
"training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency",
"use_cases": "High-throughput language detection, real-time applications, resource-constrained environments",
"strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing",
"limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
"status": "available"
},
"model-a-dataset-b": {
"huggingface_model": "zues0102/xlmr-papluca-model",
"display_name": "XLM-RoBERTa Model A Dataset B",
"short_name": "Model A Dataset B",
"architecture": "XLM-RoBERTa",
"base_model": "xlm-roberta-base",
"dataset": "Dataset B",
"accuracy": "99.72%",
"model_size": "278M parameters",
"training_epochs": 10,
"training_loss": 0.0176,
"languages_supported": 20,
"description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.",
"training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176",
"use_cases": "Research applications, high-precision language detection, critical accuracy requirements",
"strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results",
"limitations": "Higher computational requirements, limited to 20 languages",
"status": "available"
},
"model-b-dataset-b": {
"huggingface_model": "zues0102/bert-base-multilingual-cased",
"display_name": "BERT Model B Dataset B",
"short_name": "Model B Dataset B",
"architecture": "BERT",
"base_model": "bert-base-multilingual-cased",
"dataset": "Dataset B",
"accuracy": "99.85%",
"model_size": "178M parameters",
"training_epochs": 10,
"training_loss": 0.0125,
"languages_supported": 20,
"description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.",
"training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy",
"use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification",
"strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture",
"limitations": "Limited to 20 languages, specialized for specific language set",
"status": "available"
}
}
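
# Loading sketch (illustrative): each entry's "huggingface_model" field is a
# Hugging Face Hub model id and can be passed straight to transformers. This
# assumes the `transformers` package is installed; `load_detection_pipeline`
# is a hypothetical convenience helper, not part of the original API.
def load_detection_pipeline(model_key: str):
    """Build a text-classification pipeline from a configuration entry."""
    from transformers import pipeline  # local import keeps the dependency optional

    config = MODEL_CONFIGURATIONS[model_key]  # raises KeyError for unknown keys
    return pipeline("text-classification", model=config["huggingface_model"])
# Usage: detector = load_detection_pipeline("model-a-dataset-a")
#        detector("Bonjour tout le monde")
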
# Language mappings - comprehensive set
LANGUAGE_MAPPINGS = {
    'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese',
    'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali',
    'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech',
    'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha',
    'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish',
    'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish',
    'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic',
    'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew',
    'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian',
    'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian',
    'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh',
    'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish',
    'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao',
    'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian',
    'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay',
    'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål',
    'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
    'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish',
    'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian',
    'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala',
    'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian',
    'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil',
    'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish',
    'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese',
    'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish',
    'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)',
    'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu'
}
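
# Note: both bare and script/region-tagged Chinese codes are included above,
# so lookups like LANGUAGE_MAPPINGS["zh"], LANGUAGE_MAPPINGS["zh-hans"], and
# LANGUAGE_MAPPINGS["zh-tw"] all resolve to a readable name.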

# Model-specific language support
MODEL_LANGUAGE_SUPPORT = {
    "model-a-dataset-a": [
        'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
        'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
        'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
        'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
        'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
        'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
        'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
        'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
        'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
        'yo', 'zh', 'zu'
    ],
    "model-b-dataset-a": [
        'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
        'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
        'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
        'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
        'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
        'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
        'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
        'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
        'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
        'yo', 'zh', 'zu'
    ],
    "model-a-dataset-b": [
        'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
        'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
    ],
    "model-b-dataset-b": [
        'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
        'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
    ]
}
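
# Convenience check (illustrative): `is_language_supported` is a hypothetical
# helper, not part of the original API; it only consults the table above.
def is_language_supported(model_key: str, language_code: str) -> bool:
    """Return True if the given model's support list contains the language code."""
    return language_code.lower() in MODEL_LANGUAGE_SUPPORT.get(model_key, [])
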
def get_model_config(model_key: str) -> Dict[str, Any]:
    """Get configuration for a specific model."""
    return MODEL_CONFIGURATIONS.get(model_key, {})


def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Get all model configurations."""
    return MODEL_CONFIGURATIONS.copy()


def get_supported_languages(model_key: str) -> List[str]:
    """Get supported languages for a specific model."""
    return MODEL_LANGUAGE_SUPPORT.get(model_key, [])


def get_language_name(language_code: str) -> str:
    """Get human-readable language name from code."""
    return LANGUAGE_MAPPINGS.get(language_code.lower(), f"Unknown ({language_code})")
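
# Quick smoke test (illustrative): runs only when the module is executed
# directly; the sample key and language code are arbitrary examples.
if __name__ == "__main__":
    sample_key = "model-b-dataset-b"
    config = get_model_config(sample_key)
    print(f"{config['display_name']}: {config['accuracy']} accuracy, "
          f"{len(get_supported_languages(sample_key))} languages")
    print(get_language_name("sw"))  # -> Swahili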