# yangding14 - first commit (72f90b1)
"""
Centralized Model Configuration
This module contains the configuration for all language detection models
organized by the model architecture and training dataset combinations.
Model Architecture:
- Model A: XLM-RoBERTa based architectures
- Model B: BERT based architectures
Training Datasets:
- Dataset A: Standard multilingual language detection dataset
- Dataset B: Enhanced/specialized language detection dataset
"""
from typing import Dict, Any, List
# Registry of the language detection models, one entry per
# architecture/dataset pairing. Keys follow the pattern
# "model-<architecture letter>-dataset-<dataset letter>".
#
# Every entry carries the same descriptive fields; the two Dataset B entries
# additionally record a "training_loss" value.
#
# NOTE(review): "languages_supported" is 100 for both Dataset A entries while
# MODEL_LANGUAGE_SUPPORT lists 102 codes for them - confirm which is intended.
MODEL_CONFIGURATIONS = {
    # --- XLM-RoBERTa, Dataset A -------------------------------------------
    "model-a-dataset-a": {
        "huggingface_model": "ZheYu03/xlm-r-langdetect-model",
        "display_name": "XLM-RoBERTa Model A Dataset A",
        "short_name": "Model A Dataset A",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset A",
        "accuracy": "97.9%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": (
            "High-performance XLM-RoBERTa based language detection model "
            "fine-tuned on standard multilingual dataset. Delivers reliable "
            "97.9% accuracy across 100+ languages with robust cross-lingual "
            "capabilities."
        ),
        "training_details": (
            "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs "
            "training on comprehensive multilingual language detection dataset"
        ),
        "use_cases": (
            "General-purpose language detection, multilingual content "
            "processing, cross-lingual text analysis"
        ),
        "strengths": (
            "Excellent multilingual performance, robust cross-lingual transfer, "
            "proven reliability, comprehensive language coverage"
        ),
        "limitations": "Higher computational requirements, moderate inference speed",
        "status": "available",
    },
    # --- BERT, Dataset A ----------------------------------------------------
    # NOTE(review): the repository id below mentions "xlm-r" although this
    # entry is described as a BERT model - confirm the id is correct.
    "model-b-dataset-a": {
        "huggingface_model": "SongJuNN/xlm-r-langdetect-model",
        "display_name": "BERT Model B Dataset A",
        "short_name": "Model B Dataset A",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset A",
        "accuracy": "96.17%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": (
            "Efficient BERT-based language detection model trained on standard "
            "multilingual dataset. Optimized for speed and broad language "
            "coverage with 96.17% accuracy across 100+ languages."
        ),
        "training_details": (
            "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), "
            "mixed precision training, optimized for efficiency"
        ),
        "use_cases": (
            "High-throughput language detection, real-time applications, "
            "resource-constrained environments"
        ),
        "strengths": (
            "Fast inference speed, lower memory usage, broad language support, "
            "efficient processing"
        ),
        "limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
        "status": "available",
    },
    # --- XLM-RoBERTa, Dataset B -------------------------------------------
    "model-a-dataset-b": {
        "huggingface_model": "zues0102/xlmr-papluca-model",
        "display_name": "XLM-RoBERTa Model A Dataset B",
        "short_name": "Model A Dataset B",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset B",
        "accuracy": "99.72%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "training_loss": 0.0176,
        "languages_supported": 20,
        "description": (
            "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced "
            "dataset. Achieves exceptional 99.72% accuracy with support for 20 "
            "carefully selected high-performance languages and state-of-the-art "
            "performance."
        ),
        "training_details": (
            "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with "
            "specialized training procedures, achieving loss of 0.0176"
        ),
        "use_cases": (
            "Research applications, high-precision language detection, "
            "critical accuracy requirements"
        ),
        "strengths": (
            "Exceptional accuracy (99.72%), focused language support, "
            "state-of-the-art results"
        ),
        "limitations": "Higher computational requirements, limited to 20 languages",
        "status": "available",
    },
    # --- BERT, Dataset B ----------------------------------------------------
    "model-b-dataset-b": {
        "huggingface_model": "zues0102/bert-base-multilingual-cased",
        "display_name": "BERT Model B Dataset B",
        "short_name": "Model B Dataset B",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset B",
        "accuracy": "99.85%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "training_loss": 0.0125,
        "languages_supported": 20,
        "description": (
            "State-of-the-art BERT model achieving highest accuracy (99.85%) "
            "through specialized training on enhanced dataset. Optimized for 20 "
            "carefully selected high-performance languages."
        ),
        "training_details": (
            "Precision-optimized BERT training on enhanced dataset achieving "
            "ultra-low loss of 0.0125, specialized for maximum accuracy"
        ),
        "use_cases": (
            "Maximum precision applications, research requiring highest "
            "accuracy, critical language identification"
        ),
        "strengths": (
            "Highest accuracy (99.85%), ultra-low training loss, optimized "
            "precision, efficient architecture"
        ),
        "limitations": "Limited to 20 languages, specialized for specific language set",
        "status": "available",
    },
}
# Lookup table from lowercase language code to a human-readable English name.
# Besides the two-letter codes it carries four regional/script variants of
# Chinese ("zh-cn", "zh-tw", "zh-hans", "zh-hant").
LANGUAGE_MAPPINGS = {
    "af": "Afrikaans",
    "am": "Amharic",
    "ar": "Arabic",
    "as": "Assamese",
    "az": "Azerbaijani",
    "be": "Belarusian",
    "bg": "Bulgarian",
    "bn": "Bengali",
    "br": "Breton",
    "bs": "Bosnian",
    "ca": "Catalan",
    "cs": "Czech",
    "cy": "Welsh",
    "da": "Danish",
    "de": "German",
    "dz": "Dzongkha",
    "el": "Greek",
    "en": "English",
    "eo": "Esperanto",
    "es": "Spanish",
    "et": "Estonian",
    "eu": "Basque",
    "fa": "Persian",
    "fi": "Finnish",
    "fr": "French",
    "fy": "Frisian",
    "ga": "Irish",
    "gd": "Scottish Gaelic",
    "gl": "Galician",
    "gu": "Gujarati",
    "ha": "Hausa",
    "he": "Hebrew",
    "hi": "Hindi",
    "hr": "Croatian",
    "ht": "Haitian Creole",
    "hu": "Hungarian",
    "hy": "Armenian",
    "id": "Indonesian",
    "is": "Icelandic",
    "it": "Italian",
    "ja": "Japanese",
    "jv": "Javanese",
    "ka": "Georgian",
    "kk": "Kazakh",
    "km": "Khmer",
    "kn": "Kannada",
    "ko": "Korean",
    "ku": "Kurdish",
    "ky": "Kyrgyz",
    "la": "Latin",
    "lb": "Luxembourgish",
    "lo": "Lao",
    "lt": "Lithuanian",
    "lv": "Latvian",
    "mg": "Malagasy",
    "mk": "Macedonian",
    "ml": "Malayalam",
    "mn": "Mongolian",
    "mr": "Marathi",
    "ms": "Malay",
    "mt": "Maltese",
    "my": "Myanmar (Burmese)",
    "nb": "Norwegian Bokmål",
    "ne": "Nepali",
    "nl": "Dutch",
    "nn": "Norwegian Nynorsk",
    "no": "Norwegian",
    "oc": "Occitan",
    "or": "Odia",
    "pa": "Punjabi",
    "pl": "Polish",
    "ps": "Pashto",
    "pt": "Portuguese",
    "qu": "Quechua",
    "ro": "Romanian",
    "ru": "Russian",
    "rw": "Kinyarwanda",
    "se": "Northern Sami",
    "si": "Sinhala",
    "sk": "Slovak",
    "sl": "Slovenian",
    "so": "Somali",
    "sq": "Albanian",
    "sr": "Serbian",
    "sv": "Swedish",
    "sw": "Swahili",
    "ta": "Tamil",
    "te": "Telugu",
    "th": "Thai",
    "tl": "Filipino",
    "tr": "Turkish",
    "ug": "Uyghur",
    "uk": "Ukrainian",
    "ur": "Urdu",
    "vi": "Vietnamese",
    "vo": "Volapük",
    "wa": "Walloon",
    "xh": "Xhosa",
    "yi": "Yiddish",
    "yo": "Yoruba",
    "zh": "Chinese",
    "zh-cn": "Chinese (Simplified)",
    "zh-tw": "Chinese (Traditional)",
    "zh-hans": "Chinese (Simplified)",
    "zh-hant": "Chinese (Traditional)",
    "zu": "Zulu",
}
# Language codes each model can detect, keyed by the same model keys as
# MODEL_CONFIGURATIONS.
#
# Both Dataset A models list the same codes, as do both Dataset B models, so
# each code set is written once as whitespace-separated text. Calling
# ``.split()`` per model gives every entry its own list object.
_DATASET_A_LANGUAGE_CODES = """
    af am ar as az be bg bn br bs ca
    cs cy da de dz el en eo es et eu
    fa fi fr fy ga gd gl gu ha he hi
    hr ht hu hy id is it ja jv ka kk
    km kn ko ku ky la lb lo lt lv mg
    mk ml mn mr ms mt my nb ne nl nn
    no oc or pa pl ps pt qu ro ru rw
    se si sk sl so sq sr sv sw ta te
    th tl tr ug uk ur vi vo wa xh yi
    yo zh zu
"""

_DATASET_B_LANGUAGE_CODES = """
    ar bg de el en es fr hi it ja
    nl pl pt ru sw th tr ur vi zh
"""

MODEL_LANGUAGE_SUPPORT = {
    "model-a-dataset-a": _DATASET_A_LANGUAGE_CODES.split(),
    "model-b-dataset-a": _DATASET_A_LANGUAGE_CODES.split(),
    "model-a-dataset-b": _DATASET_B_LANGUAGE_CODES.split(),
    "model-b-dataset-b": _DATASET_B_LANGUAGE_CODES.split(),
}
def get_model_config(model_key: str) -> Dict[str, Any]:
    """Return the configuration of a single model.

    Args:
        model_key: Key into ``MODEL_CONFIGURATIONS``, for example
            ``"model-a-dataset-a"``.

    Returns:
        A new dict with the model's settings, or an empty dict when
        ``model_key`` is not a registered model.
    """
    # Return a copy rather than the stored dict so that a caller editing the
    # result cannot silently change the module-level configuration table.
    # All stored values are immutable (str/int/float), so a shallow copy is
    # a complete one.
    return dict(MODEL_CONFIGURATIONS.get(model_key, {}))
def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Return the configurations of every registered model.

    Returns:
        A new dict mapping each model key to a new dict of that model's
        settings, in the same order as ``MODEL_CONFIGURATIONS``.
    """
    # A plain ``MODEL_CONFIGURATIONS.copy()`` is shallow: the outer dict would
    # be new but every per-model dict would still be the shared original.
    # Copy each entry as well so the result is fully independent. The stored
    # values are immutable (str/int/float), so one level of copying suffices.
    return {key: dict(settings) for key, settings in MODEL_CONFIGURATIONS.items()}
def get_supported_languages(model_key: str) -> List[str]:
    """Return the language codes supported by a model.

    Args:
        model_key: Key into ``MODEL_LANGUAGE_SUPPORT``, for example
            ``"model-a-dataset-b"``.

    Returns:
        A new list of language codes in their configured order, or an empty
        list when ``model_key`` is not a registered model.
    """
    # Copy the stored list so callers may sort, filter or extend the result
    # without altering the module-level table.
    return list(MODEL_LANGUAGE_SUPPORT.get(model_key, []))
def get_language_name(language_code: str) -> str:
    """Return the human-readable name for a language code.

    The lookup ignores case and surrounding whitespace, and treats ``_`` as
    ``-`` so that codes such as ``"zh_CN"`` resolve like ``"zh-cn"``.

    Args:
        language_code: Language code to look up in ``LANGUAGE_MAPPINGS``.

    Returns:
        The mapped language name, or ``"Unknown (<language_code>)"`` when the
        code is not in the table or is not a string (for example ``None``).
    """
    fallback = f"Unknown ({language_code})"

    # A missing or non-string code cannot be normalised; report it as unknown
    # instead of raising AttributeError from ``.lower()``.
    if not isinstance(language_code, str):
        return fallback

    # Table keys are lowercase and hyphen-separated.
    normalized = language_code.strip().lower().replace("_", "-")
    return LANGUAGE_MAPPINGS.get(normalized, fallback)