|
""" |
|
Centralized Model Configuration |
|
|
|
This module contains the configuration for all language detection models |
|
organized by the model architecture and training dataset combinations. |
|
|
|
Model Architecture: |
|
- Model A: XLM-RoBERTa based architectures |
|
- Model B: BERT based architectures |
|
|
|
Training Datasets: |
|
- Dataset A: Standard multilingual language detection dataset |
|
- Dataset B: Enhanced/specialized language detection dataset |
|
""" |
|
|
|
from typing import Dict, Any, List |
|
|
|
|
|
MODEL_CONFIGURATIONS = { |
|
"model-a-dataset-a": { |
|
"huggingface_model": "ZheYu03/xlm-r-langdetect-model", |
|
"display_name": "XLM-RoBERTa Model A Dataset A", |
|
"short_name": "Model A Dataset A", |
|
"architecture": "XLM-RoBERTa", |
|
"base_model": "xlm-roberta-base", |
|
"dataset": "Dataset A", |
|
"accuracy": "97.9%", |
|
"model_size": "278M parameters", |
|
"training_epochs": 10, |
|
"languages_supported": 100, |
|
"description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.", |
|
"training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset", |
|
"use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis", |
|
"strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage", |
|
"limitations": "Higher computational requirements, moderate inference speed", |
|
"status": "available" |
|
}, |
|
|
|
"model-b-dataset-a": { |
|
"huggingface_model": "SongJuNN/xlm-r-langdetect-model", |
|
"display_name": "BERT Model B Dataset A", |
|
"short_name": "Model B Dataset A", |
|
"architecture": "BERT", |
|
"base_model": "bert-base-multilingual-cased", |
|
"dataset": "Dataset A", |
|
"accuracy": "96.17%", |
|
"model_size": "178M parameters", |
|
"training_epochs": 10, |
|
"languages_supported": 100, |
|
"description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.", |
|
"training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency", |
|
"use_cases": "High-throughput language detection, real-time applications, resource-constrained environments", |
|
"strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing", |
|
"limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants", |
|
"status": "available" |
|
}, |
|
|
|
"model-a-dataset-b": { |
|
"huggingface_model": "zues0102/xlmr-papluca-model", |
|
"display_name": "XLM-RoBERTa Model A Dataset B", |
|
"short_name": "Model A Dataset B", |
|
"architecture": "XLM-RoBERTa", |
|
"base_model": "xlm-roberta-base", |
|
"dataset": "Dataset B", |
|
"accuracy": "99.72%", |
|
"model_size": "278M parameters", |
|
"training_epochs": 10, |
|
"training_loss": 0.0176, |
|
"languages_supported": 20, |
|
"description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.", |
|
"training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176", |
|
"use_cases": "Research applications, high-precision language detection, critical accuracy requirements", |
|
"strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results", |
|
"limitations": "Higher computational requirements, limited to 20 languages", |
|
"status": "available" |
|
}, |
|
|
|
"model-b-dataset-b": { |
|
"huggingface_model": "zues0102/bert-base-multilingual-cased", |
|
"display_name": "BERT Model B Dataset B", |
|
"short_name": "Model B Dataset B", |
|
"architecture": "BERT", |
|
"base_model": "bert-base-multilingual-cased", |
|
"dataset": "Dataset B", |
|
"accuracy": "99.85%", |
|
"model_size": "178M parameters", |
|
"training_epochs": 10, |
|
"training_loss": 0.0125, |
|
"languages_supported": 20, |
|
"description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.", |
|
"training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy", |
|
"use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification", |
|
"strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture", |
|
"limitations": "Limited to 20 languages, specialized for specific language set", |
|
"status": "available" |
|
} |
|
} |
|
|
|
|
|
LANGUAGE_MAPPINGS = { |
|
'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese', |
|
'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali', |
|
'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech', |
|
'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha', |
|
'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish', |
|
'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish', |
|
'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic', |
|
'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew', |
|
'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian', |
|
'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian', |
|
'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh', |
|
'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish', |
|
'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao', |
|
'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian', |
|
'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', |
|
'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål', |
|
'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian', |
|
'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish', |
|
'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian', |
|
'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala', |
|
'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian', |
|
'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', |
|
'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish', |
|
'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese', |
|
'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish', |
|
'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)', |
|
'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)', |
|
'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu' |
|
} |
|
|
|
|
|
MODEL_LANGUAGE_SUPPORT = { |
|
"model-a-dataset-a": [ |
|
'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', |
|
'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu', |
|
'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi', |
|
'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk', |
|
'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg', |
|
'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn', |
|
'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw', |
|
'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', |
|
'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi', |
|
'yo', 'zh', 'zu' |
|
], |
|
|
|
"model-b-dataset-a": [ |
|
'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', |
|
'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu', |
|
'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi', |
|
'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk', |
|
'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg', |
|
'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn', |
|
'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw', |
|
'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te', |
|
'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi', |
|
'yo', 'zh', 'zu' |
|
], |
|
|
|
"model-a-dataset-b": [ |
|
'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja', |
|
'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh' |
|
], |
|
|
|
"model-b-dataset-b": [ |
|
'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja', |
|
'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh' |
|
] |
|
} |
|
|
|
def get_model_config(model_key: str) -> Dict[str, Any]: |
|
"""Get configuration for a specific model.""" |
|
return MODEL_CONFIGURATIONS.get(model_key, {}) |
|
|
|
def get_all_model_configs() -> Dict[str, Dict[str, Any]]: |
|
"""Get all model configurations.""" |
|
return MODEL_CONFIGURATIONS.copy() |
|
|
|
def get_supported_languages(model_key: str) -> List[str]: |
|
"""Get supported languages for a specific model.""" |
|
return MODEL_LANGUAGE_SUPPORT.get(model_key, []) |
|
|
|
def get_language_name(language_code: str) -> str: |
|
"""Get human-readable language name from code.""" |
|
return LANGUAGE_MAPPINGS.get(language_code.lower(), f"Unknown ({language_code})") |