# Source: Hugging Face repository (user: yangding14), revision 72f90b1 — "first commit".
"""
Centralized Model Configuration
This module contains the configuration for all language detection models
organized by combinations of model architecture and training dataset.
Model Architecture:
- Model A: XLM-RoBERTa based architectures
- Model B: BERT based architectures
Training Datasets:
- Dataset A: Standard multilingual language detection dataset
- Dataset B: Enhanced/specialized language detection dataset
"""
from typing import Dict, Any, List
# Model configurations organized by architecture and dataset.
# Keys follow the pattern "<architecture-letter>-<dataset-letter>"; each value is
# a flat metadata dict consumed via get_model_config()/get_all_model_configs().
# All entries share a common schema (huggingface_model, display_name, accuracy,
# ...); the Dataset B entries additionally record "training_loss".
MODEL_CONFIGURATIONS = {
# Model A (XLM-RoBERTa) trained on Dataset A: broad coverage, 100+ languages.
"model-a-dataset-a": {
"huggingface_model": "ZheYu03/xlm-r-langdetect-model",
"display_name": "XLM-RoBERTa Model A Dataset A",
"short_name": "Model A Dataset A",
"architecture": "XLM-RoBERTa",
"base_model": "xlm-roberta-base",
"dataset": "Dataset A",
"accuracy": "97.9%",
"model_size": "278M parameters",
"training_epochs": 10,
# NOTE(review): MODEL_LANGUAGE_SUPPORT's Dataset A list has 102 codes,
# not 100 — confirm which figure is authoritative.
"languages_supported": 100,
"description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.",
"training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset",
"use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis",
"strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage",
"limitations": "Higher computational requirements, moderate inference speed",
"status": "available"
},
# Model B (BERT) trained on Dataset A: faster/smaller alternative to Model A.
"model-b-dataset-a": {
# NOTE(review): repo name says "xlm-r-langdetect-model" but the entry is
# declared as BERT ("bert-base-multilingual-cased") — verify the repo id.
"huggingface_model": "SongJuNN/xlm-r-langdetect-model",
"display_name": "BERT Model B Dataset A",
"short_name": "Model B Dataset A",
"architecture": "BERT",
"base_model": "bert-base-multilingual-cased",
"dataset": "Dataset A",
"accuracy": "96.17%",
"model_size": "178M parameters",
"training_epochs": 10,
"languages_supported": 100,
"description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.",
"training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency",
"use_cases": "High-throughput language detection, real-time applications, resource-constrained environments",
"strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing",
"limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
"status": "available"
},
# Model A (XLM-RoBERTa) trained on Dataset B: narrower (20 languages), higher accuracy.
"model-a-dataset-b": {
"huggingface_model": "zues0102/xlmr-papluca-model",
"display_name": "XLM-RoBERTa Model A Dataset B",
"short_name": "Model A Dataset B",
"architecture": "XLM-RoBERTa",
"base_model": "xlm-roberta-base",
"dataset": "Dataset B",
"accuracy": "99.72%",
"model_size": "278M parameters",
"training_epochs": 10,
# Final training loss; only Dataset B entries report this field.
"training_loss": 0.0176,
"languages_supported": 20,
"description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.",
"training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176",
"use_cases": "Research applications, high-precision language detection, critical accuracy requirements",
"strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results",
"limitations": "Higher computational requirements, limited to 20 languages",
"status": "available"
},
# Model B (BERT) trained on Dataset B: highest reported accuracy of the four.
"model-b-dataset-b": {
"huggingface_model": "zues0102/bert-base-multilingual-cased",
"display_name": "BERT Model B Dataset B",
"short_name": "Model B Dataset B",
"architecture": "BERT",
"base_model": "bert-base-multilingual-cased",
"dataset": "Dataset B",
"accuracy": "99.85%",
"model_size": "178M parameters",
"training_epochs": 10,
"training_loss": 0.0125,
"languages_supported": 20,
"description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.",
"training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy",
"use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification",
"strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture",
"limitations": "Limited to 20 languages, specialized for specific language set",
"status": "available"
}
}
# Language mappings - comprehensive set.
# Maps lowercase ISO 639-1 two-letter codes (plus Chinese script/region
# variants such as 'zh-cn'/'zh-hant') to human-readable English names.
# Consumed by get_language_name(), which lowercases its input before lookup.
LANGUAGE_MAPPINGS = {
'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese',
'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali',
'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech',
'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha',
'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish',
'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish',
'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic',
'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew',
'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian',
'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian',
'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh',
'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish',
'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao',
'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian',
'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay',
'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål',
'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish',
'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian',
'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala',
'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian',
'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil',
'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish',
'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese',
'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish',
'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)',
'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)',
'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu'
}
# Model-specific language support.
# Both models trained on the same dataset support the same languages, so each
# dataset's code list is defined once below and copied per model key (copies
# keep the per-key lists independently mutable, matching the previous layout).

# ISO 639-1 codes covered by the standard multilingual dataset (Dataset A).
# NOTE(review): this list has 102 entries, while MODEL_CONFIGURATIONS
# advertises "languages_supported": 100 for these models — confirm which
# figure is authoritative.
_DATASET_A_LANGUAGES: List[str] = [
    'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
    'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
    'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
    'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
    'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
    'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
    'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
    'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
    'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
    'yo', 'zh', 'zu'
]

# The 20 languages covered by the enhanced dataset (Dataset B).
_DATASET_B_LANGUAGES: List[str] = [
    'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
    'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
]

MODEL_LANGUAGE_SUPPORT = {
    "model-a-dataset-a": list(_DATASET_A_LANGUAGES),
    "model-b-dataset-a": list(_DATASET_A_LANGUAGES),
    "model-a-dataset-b": list(_DATASET_B_LANGUAGES),
    "model-b-dataset-b": list(_DATASET_B_LANGUAGES)
}
def get_model_config(model_key: str) -> Dict[str, Any]:
    """Return the configuration for a specific model.

    Args:
        model_key: Registry key such as ``"model-a-dataset-a"``.

    Returns:
        A shallow copy of the model's configuration dict, or an empty
        dict if the key is unknown. Previously the live registry entry
        was returned, letting callers mutate shared module state; the
        copy makes this consistent with get_all_model_configs().
    """
    config = MODEL_CONFIGURATIONS.get(model_key)
    # Copy defensively: MODEL_CONFIGURATIONS is module-level shared state.
    return dict(config) if config is not None else {}
def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Return a shallow copy of the entire model-configuration registry.

    The copy protects the module-level registry's top-level keys from
    caller mutation; the nested per-model dicts are still shared.
    """
    return dict(MODEL_CONFIGURATIONS)
def get_supported_languages(model_key: str) -> List[str]:
    """Return the language codes supported by the given model key.

    Unknown keys yield an empty list rather than raising.
    """
    try:
        return MODEL_LANGUAGE_SUPPORT[model_key]
    except KeyError:
        return []
def get_language_name(language_code: str) -> str:
    """Translate a language code into its human-readable English name.

    Lookup is case-insensitive; unrecognized codes are echoed back in an
    ``Unknown (<code>)`` marker instead of raising.
    """
    normalized = language_code.lower()
    if normalized in LANGUAGE_MAPPINGS:
        return LANGUAGE_MAPPINGS[normalized]
    return f"Unknown ({language_code})"