yangding14
committed on
Commit
·
72f90b1
1
Parent(s):
f5560c6
first commit
Browse files- app.py +260 -0
- backend/.DS_Store +0 -0
- backend/__init__.py +1 -0
- backend/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/__pycache__/language_detector.cpython-313.pyc +0 -0
- backend/language_detector.py +269 -0
- backend/models/__init__.py +34 -0
- backend/models/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/models/__pycache__/base_model.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_a_dataset_a.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_a_dataset_b.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_b_dataset_a.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_b_dataset_b.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_config.cpython-313.pyc +0 -0
- backend/models/__pycache__/placeholder_model_1.cpython-313.pyc +0 -0
- backend/models/__pycache__/placeholder_model_2.cpython-313.pyc +0 -0
- backend/models/__pycache__/placeholder_model_3.cpython-313.pyc +0 -0
- backend/models/__pycache__/songjun.cpython-313.pyc +0 -0
- backend/models/__pycache__/xlm_roberta_detector.cpython-313.pyc +0 -0
- backend/models/base_model.py +73 -0
- backend/models/model_a_dataset_a.py +147 -0
- backend/models/model_a_dataset_b.py +149 -0
- backend/models/model_b_dataset_a.py +147 -0
- backend/models/model_b_dataset_b.py +149 -0
- backend/models/model_config.py +184 -0
- combined_app.py +414 -0
- demo_page.py +423 -0
- requirements.txt +64 -0
- run.py +140 -0
- test_demo_functionality.py +135 -0
- test_model_a_dataset_a.py +153 -0
- test_model_b_dataset_b.py +231 -0
app.py
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from backend.language_detector import LanguageDetector
|
3 |
+
|
4 |
+
def main():
    """Build and return the Gradio Blocks UI for the language detection app.

    Creates a LanguageDetector (default backend: Model A Dataset A), lays out
    the model-selection and analysis sections, and wires the event handlers.

    Returns:
        gr.Blocks: The assembled (not yet launched) Gradio app.
    """
    # Initialize the language detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    # Create Gradio interface
    with gr.Blocks(title="Language Detection App", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🌍 Language Detection App")
        gr.Markdown("Select a model and enter text below to detect its language with confidence scores.")

        # Model Selection Section with visual styling
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.1), transparent); border-radius: 8px 8px 0 0;'>🤖 Model Selection</div>"
            )

            # Get available models; unavailable ones are still listed but
            # labelled "(Coming Soon)" so users can see the full roadmap.
            available_models = detector.get_available_models()
            model_choices = []
            model_info_map = {}

            for key, info in available_models.items():
                if info["status"] == "available":
                    model_choices.append((info["display_name"], key))
                else:
                    model_choices.append((f"{info['display_name']} (Coming Soon)", key))
                model_info_map[key] = info
            # NOTE(review): model_info_map is populated but never read in this
            # function — candidate for removal or future use.

            model_selector = gr.Dropdown(
                choices=model_choices,
                value="model-a-dataset-a",  # Default to Model A Dataset A
                label="Choose Language Detection Model",
                interactive=True
            )

            # Model Information Display (markdown rendered from model metadata)
            model_info_display = gr.Markdown(
                value=_format_model_info(detector.get_current_model_info()),
                label="Model Information"
            )

        # Add visual separator
        gr.Markdown(
            "<div style='margin: 24px 0; border-top: 3px solid rgba(99, 102, 241, 0.2); background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.05), transparent); height: 2px;'></div>"
        )

        # Analysis Section
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.1), transparent); border-radius: 8px 8px 0 0;'>🔍 Language Analysis</div>"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Input section
                    text_input = gr.Textbox(
                        label="Text to Analyze",
                        placeholder="Enter text here to detect its language...",
                        lines=5,
                        max_lines=10
                    )

                    detect_btn = gr.Button("🔍 Detect Language", variant="primary", size="lg")

                    # Example texts covering a spread of scripts and languages
                    gr.Examples(
                        examples=[
                            ["Hello, how are you today?"],
                            ["Bonjour, comment allez-vous?"],
                            ["Hola, ¿cómo estás?"],
                            ["Guten Tag, wie geht es Ihnen?"],
                            ["こんにちは、元気ですか?"],
                            ["Привет, как дела?"],
                            ["Ciao, come stai?"],
                            ["Olá, como você está?"],
                            ["你好,你好吗?"],
                            ["안녕하세요, 어떻게 지내세요?"]
                        ],
                        inputs=text_input,
                        label="Try these examples:"
                    )

                with gr.Column(scale=2):
                    # Output section
                    with gr.Group():
                        gr.Markdown(
                            "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 12px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.1), transparent); border-radius: 8px 8px 0 0;'>📊 Detection Results</div>"
                        )

                        detected_language = gr.Textbox(
                            label="Detected Language",
                            interactive=False
                        )

                        confidence_score = gr.Number(
                            label="Confidence Score",
                            interactive=False,
                            precision=4
                        )

                        language_code = gr.Textbox(
                            label="Language Code (ISO 639-1)",
                            interactive=False
                        )

                        # Top predictions table
                        top_predictions = gr.Dataframe(
                            headers=["Language", "Code", "Confidence"],
                            label="Top 5 Predictions",
                            interactive=False,
                            wrap=True
                        )

        # Status/Info section (hidden until the first analysis runs)
        with gr.Row():
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False
            )

        # Event handlers
        def detect_language_wrapper(text, selected_model):
            """Run detection for the UI.

            Returns a 5-tuple matching the outputs wiring:
            (detected_language, confidence_score, language_code,
             top_predictions rows, status_text update).
            """
            if not text.strip():
                return (
                    "No text provided",
                    0.0,
                    "",
                    [],
                    gr.update(value="Please enter some text to analyze.", visible=True)
                )

            try:
                # Switch model if needed
                if detector.current_model_key != selected_model:
                    try:
                        detector.switch_model(selected_model)
                    except NotImplementedError:
                        # Placeholder models raise NotImplementedError
                        return (
                            "Model unavailable",
                            0.0,
                            "",
                            [],
                            gr.update(value="This model is not yet implemented. Please select an available model.", visible=True)
                        )
                    except Exception as e:
                        return (
                            "Model error",
                            0.0,
                            "",
                            [],
                            gr.update(value=f"Error loading model: {str(e)}", visible=True)
                        )

                result = detector.detect_language(text)

                # Extract main prediction
                main_lang = result['language']
                main_confidence = result['confidence']
                main_code = result['language_code']

                # Format top predictions for table
                predictions_table = [
                    [pred['language'], pred['language_code'], f"{pred['confidence']:.4f}"]
                    for pred in result['top_predictions']
                ]

                model_info = result.get('metadata', {}).get('model_info', {})
                model_name = model_info.get('name', 'Unknown Model')

                return (
                    main_lang,
                    main_confidence,
                    main_code,
                    predictions_table,
                    gr.update(value=f"✅ Analysis Complete\n\nInput Text: {text[:100]}{'...' if len(text) > 100 else ''}\n\nDetected Language: {main_lang} ({main_code})\nConfidence: {main_confidence:.2%}\n\nModel: {model_name}", visible=True)
                )

            except Exception as e:
                return (
                    "Error occurred",
                    0.0,
                    "",
                    [],
                    gr.update(value=f"Error: {str(e)}", visible=True)
                )

        def update_model_info(selected_model):
            """Update model information display when model selection changes."""
            try:
                if detector.current_model_key != selected_model:
                    detector.switch_model(selected_model)
                model_info = detector.get_current_model_info()
                return _format_model_info(model_info)
            except NotImplementedError:
                return "**This model is not yet implemented.** Please select an available model."
            except Exception as e:
                return f"**Error loading model information:** {str(e)}"

        # Connect the button to the detection function
        detect_btn.click(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Also trigger on Enter key in text input
        text_input.submit(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Update model info when selection changes
        model_selector.change(
            fn=update_model_info,
            inputs=[model_selector],
            outputs=[model_info_display]
        )

    return app
|
224 |
+
|
225 |
+
|
226 |
+
def _format_model_info(model_info):
|
227 |
+
"""Format model information for display."""
|
228 |
+
if not model_info:
|
229 |
+
return "No model information available."
|
230 |
+
|
231 |
+
formatted_info = f"""
|
232 |
+
**{model_info.get('name', 'Unknown Model')}**
|
233 |
+
|
234 |
+
{model_info.get('description', 'No description available.')}
|
235 |
+
|
236 |
+
**📊 Performance:**
|
237 |
+
- Accuracy: {model_info.get('accuracy', 'N/A')}
|
238 |
+
- Model Size: {model_info.get('model_size', 'N/A')}
|
239 |
+
|
240 |
+
**🏗️ Architecture:**
|
241 |
+
- Model Architecture: {model_info.get('architecture', 'N/A')}
|
242 |
+
- Base Model: {model_info.get('base_model', 'N/A')}
|
243 |
+
- Training Dataset: {model_info.get('dataset', 'N/A')}
|
244 |
+
|
245 |
+
**🌐 Languages:** {model_info.get('languages_supported', 'N/A')}
|
246 |
+
|
247 |
+
**⚙️ Training Details:** {model_info.get('training_details', 'N/A')}
|
248 |
+
|
249 |
+
**💡 Use Cases:** {model_info.get('use_cases', 'N/A')}
|
250 |
+
|
251 |
+
**✅ Strengths:** {model_info.get('strengths', 'N/A')}
|
252 |
+
|
253 |
+
**⚠️ Limitations:** {model_info.get('limitations', 'N/A')}
|
254 |
+
"""
|
255 |
+
return formatted_info
|
256 |
+
|
257 |
+
|
258 |
+
if __name__ == "__main__":
    # Build the Blocks UI and start the Gradio server with default settings.
    demo = main()
    demo.launch()
|
backend/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
backend/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Backend package for language detection models
|
backend/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (173 Bytes). View file
|
|
backend/__pycache__/language_detector.cpython-313.pyc
ADDED
Binary file (10.1 kB). View file
|
|
backend/language_detector.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Language Detection Backend
|
3 |
+
|
4 |
+
This module provides the main LanguageDetector class and ModelRegistry
|
5 |
+
for managing multiple language detection models organized by architecture and dataset.
|
6 |
+
|
7 |
+
Model Architecture:
|
8 |
+
- Model A: XLM-RoBERTa based architectures
|
9 |
+
- Model B: BERT based architectures
|
10 |
+
|
11 |
+
Training Datasets:
|
12 |
+
- Dataset A: Standard multilingual language detection dataset
|
13 |
+
- Dataset B: Enhanced/specialized language detection dataset
|
14 |
+
"""
|
15 |
+
|
16 |
+
import logging
|
17 |
+
from typing import Dict, List, Any
|
18 |
+
|
19 |
+
from .models import (
|
20 |
+
BaseLanguageModel,
|
21 |
+
ModelADatasetA,
|
22 |
+
ModelBDatasetA,
|
23 |
+
ModelADatasetB,
|
24 |
+
ModelBDatasetB,
|
25 |
+
get_all_model_configs,
|
26 |
+
get_language_name,
|
27 |
+
LANGUAGE_MAPPINGS
|
28 |
+
)
|
29 |
+
|
30 |
+
|
31 |
+
class ModelRegistry:
    """
    Catalog of the language detection models this backend can build.

    Combines the centralized model configurations with their concrete
    implementation classes, keyed by architecture (A: XLM-RoBERTa,
    B: BERT) and training dataset (A: standard, B: enhanced).
    """

    def __init__(self):
        """Populate the registry from the centralized configuration."""
        self.model_configs = get_all_model_configs()

        # Implementation class for each registry key.
        self.model_classes = {
            "model-a-dataset-a": ModelADatasetA,  # XLM-RoBERTa + Dataset A
            "model-b-dataset-a": ModelBDatasetA,  # BERT + Dataset A
            "model-a-dataset-b": ModelADatasetB,  # XLM-RoBERTa + Dataset B
            "model-b-dataset-b": ModelBDatasetB,  # BERT + Dataset B
        }

        # Merge config metadata with the implementation class; configs
        # without a matching class are silently skipped.
        self.models = {
            key: {
                "class": self.model_classes[key],
                "display_name": cfg["display_name"],
                "description": cfg["description"],
                "status": cfg["status"],
            }
            for key, cfg in self.model_configs.items()
            if key in self.model_classes
        }

    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """
        Return a shallow copy of every registered model entry.

        Returns:
            Dict mapping model key to its class/metadata record.
        """
        return self.models.copy()

    def create_model(self, model_key: str) -> BaseLanguageModel:
        """
        Instantiate the model registered under *model_key*.

        Args:
            model_key (str): Registry key of the model to create.

        Returns:
            BaseLanguageModel: A fresh instance of the requested model.

        Raises:
            ValueError: If *model_key* is not registered.
        """
        if model_key not in self.models:
            raise ValueError(
                f"Unknown model: {model_key}. "
                f"Available models: {list(self.models.keys())}"
            )
        return self.models[model_key]["class"]()
|
94 |
+
|
95 |
+
|
96 |
+
class LanguageDetector:
    """
    High-level facade for language detection.

    Holds a ModelRegistry plus one active model instance and exposes a
    single detect_language() entry point with uniform output formatting,
    along with helpers for switching models and listing what is available.
    """

    def __init__(self, model_key: str = "model-a-dataset-a"):
        """
        Create a detector backed by the requested model.

        Args:
            model_key (str): Registry key of the model to load:
                - "model-a-dataset-a": XLM-RoBERTa + standard dataset
                - "model-b-dataset-a": BERT + standard dataset
                - "model-a-dataset-b": XLM-RoBERTa + enhanced dataset
                - "model-b-dataset-b": BERT + enhanced dataset
        """
        self.registry = ModelRegistry()
        self.current_model_key = model_key
        self.model = self.registry.create_model(model_key)

        # Shared ISO-code -> display-name mapping used across the backend.
        self.language_names = LANGUAGE_MAPPINGS

    def switch_model(self, model_key: str):
        """
        Replace the active model with a different registry entry.

        Args:
            model_key (str): Registry key of the new model.

        Raises:
            Exception: Propagated when building the new model fails; the
                previously active model remains in place in that case.
        """
        try:
            self.model = self.registry.create_model(model_key)
            self.current_model_key = model_key
            logging.info(f"Successfully switched to model: {model_key}")
        except Exception as e:
            logging.error(f"Failed to switch to model {model_key}: {e}")
            raise

    def get_current_model_info(self) -> Dict[str, Any]:
        """
        Describe the active model.

        Returns:
            Dict of metadata reported by the model itself.
        """
        return self.model.get_model_info()

    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """
        List every model known to the registry.

        Returns:
            Dict keyed by model key with display name, description, status.
        """
        return self.registry.get_available_models()

    def detect_language(self, text: str) -> Dict[str, Any]:
        """
        Identify the language of *text* using the active model.

        Args:
            text (str): Input text to analyze.

        Returns:
            Dict containing:
                - language: main predicted language name
                - language_code: main predicted language code
                - confidence: confidence of the main prediction
                - top_predictions: up to 5 ranked predictions
                - metadata: extra context about the run

        Raises:
            ValueError: If *text* is empty or whitespace-only.
            RuntimeError: If the model yields no predictions.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        model_result = self.model.predict(text.strip())
        predictions = model_result['predictions']

        if not predictions:
            raise RuntimeError("Model returned no predictions")

        # The model ranks its output; the first entry is the winner.
        best = predictions[0]
        best_code = best['language_code']

        # Attach human-readable names via the centralized helper.
        top_predictions = [
            {
                'language': get_language_name(pred['language_code']),
                'language_code': pred['language_code'],
                'confidence': pred['confidence']
            }
            for pred in predictions[:5]
        ]

        metadata = {
            'text_length': model_result.get('text_length', len(text)),
            'model_name': model_result.get('model_version', 'unknown'),
            'model_type': model_result.get('model_type', 'unknown'),
            'current_model_key': self.current_model_key,
            'model_info': self.get_current_model_info()
        }

        return {
            'language': get_language_name(best_code),
            'language_code': best_code,
            'confidence': best['confidence'],
            'top_predictions': top_predictions,
            'metadata': metadata
        }

    def get_supported_languages(self) -> Dict[str, str]:
        """
        Map each supported language code to its display name.

        Returns:
            Dict of ISO 639-1 code -> language name.
        """
        return {
            code: get_language_name(code)
            for code in self.model.get_supported_languages()
        }
|
236 |
+
|
237 |
+
|
238 |
+
# Example usage and testing
|
239 |
+
if __name__ == "__main__":
    # Smoke-test the default detector (Model A Dataset A) on a few samples.
    detector = LanguageDetector()

    samples = [
        "Hello, how are you today?",
        "Bonjour, comment allez-vous?",
        "Hola, ¿cómo estás?",
        "Guten Tag, wie geht es Ihnen?"
    ]

    print("Language Detection Test - Model A Dataset A")
    print("=" * 60)

    for sample in samples:
        try:
            outcome = detector.detect_language(sample)
            print(f"Text: {sample}")
            print(f"Detected: {outcome['language']} ({outcome['language_code']}) - {outcome['confidence']:.3f}")
        except Exception as e:
            print(f"Error detecting language for '{sample}': {e}")
        print("---")

    # List everything the registry knows about, implemented or not.
    print("\nAvailable Models:")
    for key, info in detector.get_available_models().items():
        marker = "✅" if info["status"] == "available" else "🚧"
        print(f"{marker} {info['display_name']} ({key}): {info['description']}")
|
backend/models/__init__.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Language Detection Models Module
|
3 |
+
|
4 |
+
This module provides the base model interface and all available language detection models.
|
5 |
+
Models are organized by architecture (A: XLM-RoBERTa, B: BERT) and training dataset (A: standard, B: enhanced).
|
6 |
+
"""
|
7 |
+
|
8 |
+
from .base_model import BaseLanguageModel
|
9 |
+
from .model_config import (
|
10 |
+
get_model_config,
|
11 |
+
get_all_model_configs,
|
12 |
+
get_supported_languages,
|
13 |
+
get_language_name,
|
14 |
+
LANGUAGE_MAPPINGS
|
15 |
+
)
|
16 |
+
|
17 |
+
# Import all model implementations
|
18 |
+
from .model_a_dataset_a import ModelADatasetA
|
19 |
+
from .model_b_dataset_a import ModelBDatasetA
|
20 |
+
from .model_a_dataset_b import ModelADatasetB
|
21 |
+
from .model_b_dataset_b import ModelBDatasetB
|
22 |
+
|
23 |
+
__all__ = [
|
24 |
+
'BaseLanguageModel',
|
25 |
+
'ModelADatasetA',
|
26 |
+
'ModelBDatasetA',
|
27 |
+
'ModelADatasetB',
|
28 |
+
'ModelBDatasetB',
|
29 |
+
'get_model_config',
|
30 |
+
'get_all_model_configs',
|
31 |
+
'get_supported_languages',
|
32 |
+
'get_language_name',
|
33 |
+
'LANGUAGE_MAPPINGS'
|
34 |
+
]
|
backend/models/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (984 Bytes). View file
|
|
backend/models/__pycache__/base_model.cpython-313.pyc
ADDED
Binary file (2.39 kB). View file
|
|
backend/models/__pycache__/model_a_dataset_a.cpython-313.pyc
ADDED
Binary file (6.57 kB). View file
|
|
backend/models/__pycache__/model_a_dataset_b.cpython-313.pyc
ADDED
Binary file (6.71 kB). View file
|
|
backend/models/__pycache__/model_b_dataset_a.cpython-313.pyc
ADDED
Binary file (6.56 kB). View file
|
|
backend/models/__pycache__/model_b_dataset_b.cpython-313.pyc
ADDED
Binary file (6.92 kB). View file
|
|
backend/models/__pycache__/model_config.cpython-313.pyc
ADDED
Binary file (10.4 kB). View file
|
|
backend/models/__pycache__/placeholder_model_1.cpython-313.pyc
ADDED
Binary file (6.9 kB). View file
|
|
backend/models/__pycache__/placeholder_model_2.cpython-313.pyc
ADDED
Binary file (7 kB). View file
|
|
backend/models/__pycache__/placeholder_model_3.cpython-313.pyc
ADDED
Binary file (6.83 kB). View file
|
|
backend/models/__pycache__/songjun.cpython-313.pyc
ADDED
Binary file (6.89 kB). View file
|
|
backend/models/__pycache__/xlm_roberta_detector.cpython-313.pyc
ADDED
Binary file (6.3 kB). View file
|
|
backend/models/base_model.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Base Language Model Abstract Class
|
3 |
+
|
4 |
+
This module defines the interface that all language detection models must implement.
|
5 |
+
"""
|
6 |
+
|
7 |
+
from abc import ABC, abstractmethod
|
8 |
+
from typing import Dict, List, Any
|
9 |
+
|
10 |
+
|
11 |
+
class BaseLanguageModel(ABC):
    """
    Common interface for every language detection backend.

    Concrete detectors subclass this and implement predict(),
    get_supported_languages(), and get_model_info().
    """

    @abstractmethod
    def predict(self, text: str) -> Dict[str, Any]:
        """
        Run language identification on *text*.

        Args:
            text (str): Input text to analyze

        Returns:
            Dict shaped as:
            {
                'predictions': [
                    {'language_code': str, 'confidence': float},
                    ...
                ],
                'text_length': int,
                'model_version': str,
                'model_type': str
            }
        """

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """
        List the languages this model can identify.

        Returns:
            List of ISO 639-1 language codes
        """

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """
        Describe this model for display purposes.

        Returns:
            Dict shaped as:
            {
                'name': str,
                'description': str,
                'accuracy': str,
                'model_size': str,
                'languages_supported': str,
                'training_details': str,
                'use_cases': str,
                'strengths': str,
                'limitations': str
            }
        """
|
backend/models/model_a_dataset_a.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model A Dataset A - XLM-RoBERTa Language Detection
|
3 |
+
|
4 |
+
This module implements the XLM-RoBERTa based language detection model
|
5 |
+
fine-tuned on Dataset A (standard multilingual language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: XLM-RoBERTa (Model A)
|
8 |
+
Training Dataset: Dataset A (standard multilingual)
|
9 |
+
Performance: 97.9% accuracy across 60+ languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
# Optional dependency guard: transformers is only needed when a model is
# actually constructed, so a missing install is downgraded to a warning here
# and re-raised as an ImportError inside ModelADatasetA.__init__.
try:
    from transformers import pipeline
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelADatasetA(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset A.

    This model represents the XLM-RoBERTa architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 97.9% accuracy with
    robust cross-lingual performance across 100+ languages.

    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset A (standard multilingual)
    Base Model: xlm-roberta-base
    Accuracy: 97.9%
    Parameters: 278M
    """

    def __init__(self):
        """Initialize the Model A Dataset A language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-a-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset A. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset A: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset A (XLM-RoBERTa).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model A Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset A.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset A.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_a_dataset_b.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model A Dataset B - XLM-RoBERTa Language Detection
|
3 |
+
|
4 |
+
This module implements the XLM-RoBERTa based language detection model
|
5 |
+
fine-tuned on Dataset B (enhanced/specialized language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: XLM-RoBERTa (Model A)
|
8 |
+
Training Dataset: Dataset B (enhanced/specialized)
|
9 |
+
Performance: 99.72% accuracy across 100+ languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
try:
|
19 |
+
from transformers import pipeline
|
20 |
+
HF_AVAILABLE = True
|
21 |
+
except ImportError:
|
22 |
+
HF_AVAILABLE = False
|
23 |
+
logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelADatasetB(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset B.

    This model represents the XLM-RoBERTa architecture fine-tuned on an enhanced
    language detection dataset, achieving exceptional 99.72% accuracy with
    state-of-the-art performance on 20 carefully selected languages.

    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset B (enhanced/specialized)
    Base Model: xlm-roberta-base
    Accuracy: 99.72%
    Parameters: 278M
    Training Loss: 0.0176
    """

    def __init__(self):
        """Initialize the Model A Dataset B language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-a-dataset-b"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset B. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset B: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset B (XLM-RoBERTa enhanced).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model A Dataset B prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset B.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset B.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_b_dataset_a.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model B Dataset A - BERT Language Detection
|
3 |
+
|
4 |
+
This module implements the BERT based language detection model
|
5 |
+
fine-tuned on Dataset A (standard multilingual language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: BERT (Model B)
|
8 |
+
Training Dataset: Dataset A (standard multilingual)
|
9 |
+
Performance: 96.17% accuracy across 100+ languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
try:
|
19 |
+
from transformers import pipeline
|
20 |
+
HF_AVAILABLE = True
|
21 |
+
except ImportError:
|
22 |
+
HF_AVAILABLE = False
|
23 |
+
logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelBDatasetA(BaseLanguageModel):
    """
    BERT based language detection model (Model B) trained on Dataset A.

    This model represents the BERT architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 96.17% accuracy with
    optimized efficiency and broad language coverage across 100+ languages.

    Architecture: BERT (Model B)
    Dataset: Dataset A (standard multilingual)
    Base Model: bert-base-multilingual-cased
    Accuracy: 96.17%
    Parameters: 178M
    """

    def __init__(self):
        """Initialize the Model B Dataset A language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-b-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model B Dataset A. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model B Dataset A: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model B Dataset A (BERT).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model B Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model B Dataset A.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model B Dataset A.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_b_dataset_b.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model B Dataset B - BERT Language Detection
|
3 |
+
|
4 |
+
This module implements the BERT based language detection model
|
5 |
+
fine-tuned on Dataset B (enhanced/specialized language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: BERT (Model B)
|
8 |
+
Training Dataset: Dataset B (enhanced/specialized)
|
9 |
+
Performance: 99.85% accuracy across 20 carefully selected languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
try:
|
19 |
+
from transformers import pipeline
|
20 |
+
HF_AVAILABLE = True
|
21 |
+
except ImportError:
|
22 |
+
HF_AVAILABLE = False
|
23 |
+
logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelBDatasetB(BaseLanguageModel):
    """
    BERT based language detection model (Model B) trained on Dataset B.

    This model represents the BERT architecture fine-tuned on an enhanced
    language detection dataset, achieving the highest accuracy (99.85%) with
    precision-optimized performance on 20 carefully selected languages.

    Architecture: BERT (Model B)
    Dataset: Dataset B (enhanced/specialized)
    Base Model: bert-base-multilingual-cased
    Accuracy: 99.85%
    Parameters: 178M
    Training Loss: 0.0125
    """

    def __init__(self):
        """Initialize the Model B Dataset B language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-b-dataset-b"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model B Dataset B. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model B Dataset B: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model B Dataset B (BERT enhanced).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model B Dataset B prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model B Dataset B.

        Returns:
            List of ISO 639-1 language codes supported by the model (20 languages)
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model B Dataset B.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']} carefully selected languages: Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Italian, Japanese, Dutch, Polish, Portuguese, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese",
            "training_details": self.config["training_details"],
            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_config.py
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Centralized Model Configuration
|
3 |
+
|
4 |
+
This module contains the configuration for all language detection models
|
5 |
+
organized by the model architecture and training dataset combinations.
|
6 |
+
|
7 |
+
Model Architecture:
|
8 |
+
- Model A: XLM-RoBERTa based architectures
|
9 |
+
- Model B: BERT based architectures
|
10 |
+
|
11 |
+
Training Datasets:
|
12 |
+
- Dataset A: Standard multilingual language detection dataset
|
13 |
+
- Dataset B: Enhanced/specialized language detection dataset
|
14 |
+
"""
|
15 |
+
|
16 |
+
from typing import Dict, Any, List
|
17 |
+
|
18 |
+
# Model configurations organized by architecture and dataset.
# Keys follow the pattern "model-<arch>-dataset-<set>" and are the model_key
# values used by the detector classes in this package; each value feeds
# get_model_config() and the classes' get_model_info() methods.
MODEL_CONFIGURATIONS = {
    # XLM-RoBERTa fine-tuned on the standard multilingual dataset.
    "model-a-dataset-a": {
        "huggingface_model": "ZheYu03/xlm-r-langdetect-model",
        "display_name": "XLM-RoBERTa Model A Dataset A",
        "short_name": "Model A Dataset A",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset A",
        "accuracy": "97.9%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.",
        "training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset",
        "use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis",
        "strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage",
        "limitations": "Higher computational requirements, moderate inference speed",
        "status": "available"
    },

    # BERT fine-tuned on the standard multilingual dataset.
    # NOTE(review): architecture/base_model say BERT but the repo id contains
    # "xlm-r" — confirm the Hugging Face model actually matches this metadata.
    "model-b-dataset-a": {
        "huggingface_model": "SongJuNN/xlm-r-langdetect-model",
        "display_name": "BERT Model B Dataset A",
        "short_name": "Model B Dataset A",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset A",
        "accuracy": "96.17%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.",
        "training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency",
        "use_cases": "High-throughput language detection, real-time applications, resource-constrained environments",
        "strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing",
        "limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
        "status": "available"
    },

    # XLM-RoBERTa fine-tuned on the enhanced dataset (20 languages).
    "model-a-dataset-b": {
        "huggingface_model": "zues0102/xlmr-papluca-model",
        "display_name": "XLM-RoBERTa Model A Dataset B",
        "short_name": "Model A Dataset B",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset B",
        "accuracy": "99.72%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "training_loss": 0.0176,  # final training loss reported for this run
        "languages_supported": 20,
        "description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.",
        "training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176",
        "use_cases": "Research applications, high-precision language detection, critical accuracy requirements",
        "strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results",
        "limitations": "Higher computational requirements, limited to 20 languages",
        "status": "available"
    },

    # BERT fine-tuned on the enhanced dataset (20 languages, highest accuracy).
    "model-b-dataset-b": {
        "huggingface_model": "zues0102/bert-base-multilingual-cased",
        "display_name": "BERT Model B Dataset B",
        "short_name": "Model B Dataset B",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset B",
        "accuracy": "99.85%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "training_loss": 0.0125,  # final training loss reported for this run
        "languages_supported": 20,
        "description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.",
        "training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy",
        "use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification",
        "strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture",
        "limitations": "Limited to 20 languages, specialized for specific language set",
        "status": "available"
    }
}
|
98 |
+
|
99 |
+
# Language mappings - comprehensive set.
# Maps lowercase ISO 639-1 codes (plus a few region/script variants such as
# 'zh-cn'/'zh-hant') to human-readable English names; consumed by
# get_language_name().
LANGUAGE_MAPPINGS = {
    'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese',
    'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali',
    'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech',
    'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha',
    'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish',
    'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish',
    'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic',
    'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew',
    'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian',
    'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian',
    'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh',
    'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish',
    'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao',
    'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian',
    'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay',
    'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål',
    'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
    'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish',
    'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian',
    'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala',
    'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian',
    'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil',
    'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish',
    'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese',
    'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish',
    # Chinese appears both as the bare 'zh' code and as region/script variants.
    'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)',
    'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu'
}
|
130 |
+
|
131 |
+
# Model-specific language support.
# The two Dataset A models share one language list and the two Dataset B
# models share another; the lists are defined once here instead of being
# duplicated verbatim per model key.

# Languages covered by models trained on Dataset A (standard multilingual).
_DATASET_A_LANGUAGES = [
    'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
    'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
    'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
    'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
    'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
    'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
    'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
    'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
    'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
    'yo', 'zh', 'zu'
]

# The 20 high-performance languages targeted by the Dataset B models.
_DATASET_B_LANGUAGES = [
    'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
    'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
]

# Each model key gets its own list copy so callers mutating one entry cannot
# accidentally affect another model (matches the original per-key literals).
MODEL_LANGUAGE_SUPPORT = {
    "model-a-dataset-a": list(_DATASET_A_LANGUAGES),
    "model-b-dataset-a": list(_DATASET_A_LANGUAGES),
    "model-a-dataset-b": list(_DATASET_B_LANGUAGES),
    "model-b-dataset-b": list(_DATASET_B_LANGUAGES)
}
|
169 |
+
|
170 |
+
def get_model_config(model_key: str) -> Dict[str, Any]:
    """Return the configuration dict for *model_key*, or {} if unregistered."""
    try:
        return MODEL_CONFIGURATIONS[model_key]
    except KeyError:
        return {}
|
173 |
+
|
174 |
+
def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Return a shallow copy of every registered model configuration."""
    # dict(...) produces the same shallow copy as .copy(); callers may
    # add/remove keys without touching the module-level registry.
    return dict(MODEL_CONFIGURATIONS)
|
177 |
+
|
178 |
+
def get_supported_languages(model_key: str) -> List[str]:
    """Return the ISO 639-1 codes supported by *model_key* ([] if unknown)."""
    try:
        return MODEL_LANGUAGE_SUPPORT[model_key]
    except KeyError:
        return []
|
181 |
+
|
182 |
+
def get_language_name(language_code: str) -> str:
    """Return the human-readable name for *language_code*.

    Lookup is case-insensitive; unknown codes yield "Unknown (<code>)"
    with the caller's original casing preserved.
    """
    normalized = language_code.lower()
    if normalized in LANGUAGE_MAPPINGS:
        return LANGUAGE_MAPPINGS[normalized]
    return f"Unknown ({language_code})"
|
combined_app.py
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from backend.language_detector import LanguageDetector
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
import time
|
6 |
+
|
7 |
+
# Import demo samples from demo_page
|
8 |
+
from demo_page import DEMO_SAMPLES, initialize_models, detect_with_all_models, create_results_dataframe, run_demo_tests
|
9 |
+
|
10 |
+
def create_single_model_interface():
    """Create the original single model interface.

    Builds a gr.Column containing a model selector, a text input with
    examples, and the detection-result widgets, then wires the button
    click, textbox submit and dropdown change events to one shared
    LanguageDetector instance. Returns the top-level gr.Column so the
    caller can embed it (e.g. inside a tab).
    """
    # Initialize the language detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    with gr.Column() as single_interface:
        gr.Markdown("# 🌍 Language Detection App")
        gr.Markdown("Select a model and enter text below to detect its language with confidence scores.")

        # Model Selection Section with visual styling
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.1), transparent); border-radius: 8px 8px 0 0;'>🤖 Model Selection</div>"
            )

            # Get available models
            available_models = detector.get_available_models()
            model_choices = []
            # Metadata keyed by model id; populated below but not read
            # elsewhere in this function.
            model_info_map = {}

            for key, info in available_models.items():
                if info["status"] == "available":
                    model_choices.append((info["display_name"], key))
                else:
                    # Unimplemented models stay selectable but are labeled.
                    model_choices.append((f"{info['display_name']} (Coming Soon)", key))
                model_info_map[key] = info

            model_selector = gr.Dropdown(
                choices=model_choices,
                value="model-a-dataset-a",  # Default to Model A Dataset A
                label="Choose Language Detection Model",
                interactive=True
            )

            # Model Information Display
            model_info_display = gr.Markdown(
                value=_format_model_info(detector.get_current_model_info()),
                label="Model Information"
            )

        # Add visual separator
        gr.Markdown(
            "<div style='margin: 24px 0; border-top: 3px solid rgba(99, 102, 241, 0.2); background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.05), transparent); height: 2px;'></div>"
        )

        # Analysis Section
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.1), transparent); border-radius: 8px 8px 0 0;'>🔍 Language Analysis</div>"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Input section
                    text_input = gr.Textbox(
                        label="Text to Analyze",
                        placeholder="Enter text here to detect its language...",
                        lines=5,
                        max_lines=10
                    )

                    detect_btn = gr.Button("🔍 Detect Language", variant="primary", size="lg")

                    # Example texts
                    gr.Examples(
                        examples=[
                            ["Hello, how are you today?"],
                            ["Bonjour, comment allez-vous?"],
                            ["Hola, ¿cómo estás?"],
                            ["Guten Tag, wie geht es Ihnen?"],
                            ["こんにちは、元気ですか?"],
                            ["Привет, как дела?"],
                            ["Ciao, come stai?"],
                            ["Olá, como você está?"],
                            ["你好,你好吗?"],
                            ["안녕하세요, 어떻게 지내세요?"]
                        ],
                        inputs=text_input,
                        label="Try these examples:"
                    )

                with gr.Column(scale=2):
                    # Output section
                    with gr.Group():
                        gr.Markdown(
                            "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 12px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.1), transparent); border-radius: 8px 8px 0 0;'>📊 Detection Results</div>"
                        )

                        detected_language = gr.Textbox(
                            label="Detected Language",
                            interactive=False
                        )

                        confidence_score = gr.Number(
                            label="Confidence Score",
                            interactive=False,
                            precision=4
                        )

                        language_code = gr.Textbox(
                            label="Language Code (ISO 639-1)",
                            interactive=False
                        )

                        # Top predictions table
                        top_predictions = gr.Dataframe(
                            headers=["Language", "Code", "Confidence"],
                            label="Top 5 Predictions",
                            interactive=False,
                            wrap=True
                        )

        # Status/Info section
        with gr.Row():
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False  # revealed only when there is something to report
            )

        # Event handlers
        def detect_language_wrapper(text, selected_model):
            # Returns a 5-tuple matching the wired outputs:
            # (detected_language, confidence_score, language_code,
            #  top_predictions rows, status_text gr.update).
            if not text.strip():
                return (
                    "No text provided",
                    0.0,
                    "",
                    [],
                    gr.update(value="Please enter some text to analyze.", visible=True)
                )

            try:
                # Switch model if needed
                if detector.current_model_key != selected_model:
                    try:
                        detector.switch_model(selected_model)
                    except NotImplementedError:
                        # Placeholder models signal themselves this way.
                        return (
                            "Model unavailable",
                            0.0,
                            "",
                            [],
                            gr.update(value="This model is not yet implemented. Please select an available model.", visible=True)
                        )
                    except Exception as e:
                        return (
                            "Model error",
                            0.0,
                            "",
                            [],
                            gr.update(value=f"Error loading model: {str(e)}", visible=True)
                        )

                result = detector.detect_language(text)

                # Extract main prediction
                main_lang = result['language']
                main_confidence = result['confidence']
                main_code = result['language_code']

                # Format top predictions for table
                predictions_table = [
                    [pred['language'], pred['language_code'], f"{pred['confidence']:.4f}"]
                    for pred in result['top_predictions']
                ]

                # Metadata is optional in the result dict; fall back gracefully.
                model_info = result.get('metadata', {}).get('model_info', {})
                model_name = model_info.get('name', 'Unknown Model')

                return (
                    main_lang,
                    main_confidence,
                    main_code,
                    predictions_table,
                    gr.update(value=f"✅ Analysis Complete\n\nInput Text: {text[:100]}{'...' if len(text) > 100 else ''}\n\nDetected Language: {main_lang} ({main_code})\nConfidence: {main_confidence:.2%}\n\nModel: {model_name}", visible=True)
                )

            except Exception as e:
                # Broad catch so a detector failure surfaces in the UI
                # instead of crashing the Gradio event loop.
                return (
                    "Error occurred",
                    0.0,
                    "",
                    [],
                    gr.update(value=f"Error: {str(e)}", visible=True)
                )

        def update_model_info(selected_model):
            """Update model information display when model selection changes."""
            try:
                # Eagerly switch so the info panel reflects the chosen model.
                if detector.current_model_key != selected_model:
                    detector.switch_model(selected_model)
                model_info = detector.get_current_model_info()
                return _format_model_info(model_info)
            except NotImplementedError:
                return "**This model is not yet implemented.** Please select an available model."
            except Exception as e:
                return f"**Error loading model information:** {str(e)}"

        # Connect the button to the detection function
        detect_btn.click(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Also trigger on Enter key in text input
        text_input.submit(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Update model info when selection changes
        model_selector.change(
            fn=update_model_info,
            inputs=[model_selector],
            outputs=[model_info_display]
        )

    return single_interface
|
230 |
+
|
231 |
+
def create_demo_comparison_interface():
    """Create the demo comparison interface.

    Builds a gr.Column that lets the user select sample categories
    and/or paste custom texts, then runs every initialized model over
    each text via run_demo_tests and shows the comparison table.
    Returns the top-level gr.Column for embedding in a tab.
    """

    # Initialize models
    models = initialize_models()

    with gr.Column() as demo_interface:
        gr.Markdown("# 🚀 Language Detection Demo - Model Comparison")
        gr.Markdown("Compare all four language detection models simultaneously across various difficulty categories.")

        # Model Status Section
        with gr.Group():
            gr.Markdown("## 🤖 Model Status")
            # One status line per model; initialization errors are shown verbatim.
            model_status_text = ""
            for model_key, model_info in models.items():
                status_icon = "✅" if model_info["status"] == "Ready" else "❌"
                model_status_text += f"{status_icon} **{model_info['name']}**: {model_info['status']}\n\n"
            gr.Markdown(model_status_text)

        # Category Selection Section
        with gr.Group():
            gr.Markdown("## 📊 Test Categories")
            gr.Markdown("Select categories to test different aspects of language detection difficulty:")

            category_checkboxes = gr.CheckboxGroup(
                choices=list(DEMO_SAMPLES.keys()),
                label="Select Test Categories",
                value=["Easy/Obvious", "Short Text"],  # Default selection
                interactive=True
            )

        # Custom Text Input Section
        with gr.Group():
            gr.Markdown("## ✏️ Custom Text Input")
            gr.Markdown("Enter your own texts to test (one per line):")

            custom_text_input = gr.Textbox(
                label="Custom Texts",
                placeholder="Enter custom texts here, one per line...\nExample:\nHello world\nBonjour le monde\n你好世界",
                lines=5,
                max_lines=10
            )

        # Control Buttons
        with gr.Row():
            run_demo_btn = gr.Button("🔍 Run Demo Tests", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear Results", variant="secondary")

        # Sample Preview Section (moved up, condensed)
        with gr.Group():
            gr.Markdown("## 📚 Category Explanations")
            gr.Markdown("Understanding what each test category evaluates:")

            category_explanations = """
**Easy/Obvious:** Clear, unambiguous sentences in their native language. Tests basic language detection capability.

**Short Text:** Single words or very short phrases. Tests model performance with minimal context.

**False Friends:** Words that look similar across languages but have different meanings. Tests ability to distinguish between closely related languages.

**Mixed Scripts:** Text containing multiple languages, numbers, symbols, or scripts. Tests handling of multilingual content.

**Proper Nouns:** Names of people, places, or entities that exist across multiple languages. Tests context-dependent detection.

**Common Words:** International words with similar spelling across languages (hotel, taxi, etc.). Tests disambiguation of universal terms.

**Technical Terms:** Specialized vocabulary that may be borrowed or translated across languages. Tests domain-specific detection.

**Code-switching:** Text that switches between languages mid-sentence. Tests handling of bilingual communication patterns.

**Transliterated Text:** Non-Latin scripts written in Latin characters. Tests recognition of transliteration vs. native language.

**Ambiguous Script:** Words that could belong to multiple languages with identical spelling. Tests the model's decision-making under uncertainty.
"""

            gr.Markdown(category_explanations)

        # Results Section (moved to bottom)
        with gr.Group():
            gr.Markdown("## 📈 Results")

            summary_output = gr.Textbox(
                label="Summary",
                interactive=False,
                visible=False  # hidden until a run produces output
            )

            results_dataframe = gr.Dataframe(
                label="Model Comparison Results (A-A: Model A Dataset A, B-A: Model B Dataset A, A-B: Model A Dataset B, B-B: Model B Dataset B)",
                wrap=True,
                interactive=False,
                visible=False
            )

        # Event Handlers
        def run_tests(selected_cats, custom_texts):
            # Delegates to run_demo_tests; df is None when there was
            # nothing to analyze, in which case the table stays hidden.
            summary, df = run_demo_tests(selected_cats, custom_texts, models)

            if df is not None:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(value=df, visible=True)
                )
            else:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(visible=False)
                )

        def clear_results():
            # Hide and empty both result widgets.
            return (
                gr.update(value="", visible=False),
                gr.update(value=None, visible=False)
            )

        # Connect event handlers
        run_demo_btn.click(
            fn=run_tests,
            inputs=[category_checkboxes, custom_text_input],
            outputs=[summary_output, results_dataframe]
        )

        clear_btn.click(
            fn=clear_results,
            outputs=[summary_output, results_dataframe]
        )

    return demo_interface
|
359 |
+
|
360 |
+
def _format_model_info(model_info):
|
361 |
+
"""Format model information for display."""
|
362 |
+
if not model_info:
|
363 |
+
return "No model information available."
|
364 |
+
|
365 |
+
formatted_info = f"""
|
366 |
+
**{model_info.get('name', 'Unknown Model')}**
|
367 |
+
|
368 |
+
{model_info.get('description', 'No description available.')}
|
369 |
+
|
370 |
+
**📊 Performance:**
|
371 |
+
- Accuracy: {model_info.get('accuracy', 'N/A')}
|
372 |
+
- Model Size: {model_info.get('model_size', 'N/A')}
|
373 |
+
|
374 |
+
**🏗️ Architecture:**
|
375 |
+
- Model Architecture: {model_info.get('architecture', 'N/A')}
|
376 |
+
- Base Model: {model_info.get('base_model', 'N/A')}
|
377 |
+
- Training Dataset: {model_info.get('dataset', 'N/A')}
|
378 |
+
|
379 |
+
**🌐 Languages:** {model_info.get('languages_supported', 'N/A')}
|
380 |
+
|
381 |
+
**⚙️ Training Details:** {model_info.get('training_details', 'N/A')}
|
382 |
+
|
383 |
+
**💡 Use Cases:** {model_info.get('use_cases', 'N/A')}
|
384 |
+
|
385 |
+
**✅ Strengths:** {model_info.get('strengths', 'N/A')}
|
386 |
+
|
387 |
+
**⚠️ Limitations:** {model_info.get('limitations', 'N/A')}
|
388 |
+
"""
|
389 |
+
return formatted_info
|
390 |
+
|
391 |
+
def main():
    """Assemble the tabbed Gradio application.

    Tab 1 hosts the single-model detector; tab 2 hosts the four-model
    comparison demo. Returns the (not yet launched) Blocks app.
    """
    with gr.Blocks(title="Language Detection App Suite", theme=gr.themes.Soft()) as suite:
        gr.Markdown("# 🌍 Language Detection App Suite")
        gr.Markdown("Choose between single model testing or comprehensive model comparison.")

        with gr.Tabs():
            with gr.TabItem("🔍 Single Model Detection"):
                # Components register themselves on the Blocks context;
                # the returned column does not need to be kept.
                create_single_model_interface()

            with gr.TabItem("🚀 Model Comparison Demo"):
                create_demo_comparison_interface()

    return suite
|
406 |
+
|
407 |
+
if __name__ == "__main__":
    # Build the tabbed app and serve it.
    app = main()
    app.launch(
        server_name="localhost",  # NOTE(review): binds loopback only; a container/Space deployment would need "0.0.0.0" — confirm target
        server_port=7860,  # Gradio's conventional default port
        share=False,  # no public Gradio share tunnel
        debug=True  # verbose errors; consider disabling for production
    )
|
demo_page.py
ADDED
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from backend.language_detector import LanguageDetector
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
import time
|
6 |
+
|
7 |
+
# Sample text database organized by difficulty categories
#
# Each category maps to a list of (text, expected_language, description)
# tuples. `expected_language` is an ISO 639-1 code when a single correct
# answer exists; the sentinel values "ambiguous", "mix" and
# "transliteration" mark samples with no single ground truth (downstream
# scoring in create_results_dataframe skips the ✅/🚫 mark for those).
DEMO_SAMPLES = {
    "Easy/Obvious": [
        ("Hello, how are you doing today?", "en", "Clear English sentence"),
        ("Bonjour, comment allez-vous aujourd'hui?", "fr", "Clear French sentence"),
        ("Hola, ¿cómo estás hoy?", "es", "Clear Spanish sentence"),
        ("Guten Tag, wie geht es Ihnen heute?", "de", "Clear German sentence"),
        ("こんにちは、今日はどうですか?", "ja", "Clear Japanese sentence"),
        ("Привет, как дела сегодня?", "ru", "Clear Russian sentence"),
        ("Ciao, come stai oggi?", "it", "Clear Italian sentence"),
        ("Olá, como você está hoje?", "pt", "Clear Portuguese sentence"),
        ("你好,你今天怎么样?", "zh", "Clear Chinese sentence"),
        ("안녕하세요, 오늘 어떻게 지내세요?", "ko", "Clear Korean sentence"),
    ],

    "Short Text": [
        ("Hi", "en", "Very short greeting"),
        ("Oui", "fr", "Single word French"),
        ("Sí", "es", "Single word Spanish"),
        ("Ja", "de", "Single word German"),
        ("はい", "ja", "Single word Japanese"),
        ("Да", "ru", "Single word Russian"),
        ("Sì", "it", "Single word Italian"),
        ("Sim", "pt", "Single word Portuguese"),
        ("是", "zh", "Single character Chinese"),
        ("네", "ko", "Single word Korean"),
    ],

    "False Friends": [
        ("actual", "en", "English word, but means 'current' in Spanish"),
        ("embarazada", "es", "Spanish for pregnant, not embarrassed"),
        ("gift", "en", "English word, but means 'poison' in German"),
        ("preservativo", "es", "Spanish for condom, not preservative"),
        ("sensible", "en", "English word, but means 'sensitive' in Spanish"),
        ("sympathique", "fr", "French for nice, not sympathetic"),
        ("biblioteca", "es", "Spanish for library, not Bible place"),
        ("realizzare", "it", "Italian for to achieve, not realize"),
        ("parents", "en", "English word, but means 'relatives' in French"),
        ("attualmente", "it", "Italian for currently, not actually"),
    ],

    "Mixed Scripts": [
        ("Hello123世界", "mix", "Mixed English, numbers, Chinese"),
        ("Café #1 في العالم", "mix", "Mixed French, numbers, Arabic"),
        ("2023年は良い年です", "ja", "Japanese with numbers"),
        ("Prix: €50,000", "fr", "French with currency and numbers"),
        ("iPhone 15 Pro Max", "en", "Product name with numbers"),
        ("COVID-19 パンデミック", "mix", "Mixed English acronym and Japanese"),
        ("Wi-Fi пароль: 123456", "mix", "Mixed English tech term and Russian"),
        ("GPS координаты", "mix", "Mixed English acronym and Russian"),
        ("URL: https://example.com", "en", "Web address"),
        ("HTML <div>content</div>", "en", "Code with markup"),
    ],

    "Proper Nouns": [
        ("Paris", "ambiguous", "City name - French or English context?"),
        ("Berlin", "ambiguous", "City name - German or English context?"),
        ("Madrid", "ambiguous", "City name - Spanish or English context?"),
        ("Tokyo", "ambiguous", "City name - Japanese or English context?"),
        ("Maria", "ambiguous", "Common name in many languages"),
        ("Alexander", "ambiguous", "Name used in many languages"),
        ("David", "ambiguous", "Biblical name used worldwide"),
        ("Anna", "ambiguous", "Name common across languages"),
        ("Michael", "ambiguous", "International name"),
        ("Sofia", "ambiguous", "Name and city, multiple languages"),
    ],

    "Common Words": [
        ("hotel", "ambiguous", "Same spelling in many languages"),
        ("restaurant", "ambiguous", "French origin, used worldwide"),
        ("taxi", "ambiguous", "International word"),
        ("pizza", "ambiguous", "Italian origin, used worldwide"),
        ("chocolate", "ambiguous", "Similar in many languages"),
        ("hospital", "ambiguous", "Medical term used internationally"),
        ("radio", "ambiguous", "Technology term used worldwide"),
        ("metro", "ambiguous", "Transportation term"),
        ("cafe", "ambiguous", "French origin, international use"),
        ("photo", "ambiguous", "Greek origin, used worldwide"),
    ],

    # NOTE(review): "algoritmo" appears three times below with expected
    # es/it/pt — identical spellings, so at most one of those rows can
    # ever score as correct; consider marking them "ambiguous".
    "Technical Terms": [
        ("algorithm", "en", "Technical English term"),
        ("algorithme", "fr", "Technical French term"),
        ("algoritmo", "es", "Technical Spanish term"),
        ("Algorithmus", "de", "Technical German term"),
        ("アルゴリズム", "ja", "Technical Japanese term"),
        ("алгоритм", "ru", "Technical Russian term"),
        ("algoritmo", "it", "Technical Italian term"),
        ("algoritmo", "pt", "Technical Portuguese term"),
        ("算法", "zh", "Technical Chinese term"),
        ("알고리즘", "ko", "Technical Korean term"),
    ],

    "Code-switching": [
        ("I love sushi とても美味しい", "mix", "English-Japanese code switching"),
        ("C'est très nice aujourd'hui", "mix", "French-English code switching"),
        ("Me gusta this song mucho", "mix", "Spanish-English code switching"),
        ("Das ist very interessant", "mix", "German-English code switching"),
        ("Это really хорошо", "mix", "Russian-English code switching"),
        ("È molto beautiful oggi", "mix", "Italian-English code switching"),
        ("Está muito good today", "mix", "Portuguese-English code switching"),
        ("这个 is very 好", "mix", "Chinese-English code switching"),
        ("이것은 really 좋다", "mix", "Korean-English code switching"),
        ("Merci beaucoup for everything", "mix", "French-English code switching"),
    ],

    "Transliterated Text": [
        ("Konnichiwa", "transliteration", "Japanese こんにちは in Latin script"),
        ("Spasibo", "transliteration", "Russian спасибо in Latin script"),
        ("Arigato", "transliteration", "Japanese ありがとう in Latin script"),
        ("Privyet", "transliteration", "Russian привет in Latin script"),
        ("Sayonara", "transliteration", "Japanese さようなら in Latin script"),
        ("Dosvedanya", "transliteration", "Russian до свидания in Latin script"),
        ("Nihao", "transliteration", "Chinese 你好 in Latin script"),
        ("Annyeonghaseyo", "transliteration", "Korean 안녕하세요 in Latin script"),
        ("Zdravstvuyte", "transliteration", "Russian здравствуйте in Latin script"),
        ("Ohayo gozaimasu", "transliteration", "Japanese おはようございます in Latin script"),
    ],

    "Ambiguous Script": [
        ("casa", "ambiguous", "House in Spanish/Italian/Portuguese"),
        ("rose", "ambiguous", "Flower in English or pink in French"),
        ("more", "ambiguous", "English word or Italian 'deaths'"),
        ("come", "ambiguous", "English verb or Italian 'how/like'"),
        ("no", "ambiguous", "English word or Spanish 'no'"),
        ("si", "ambiguous", "Spanish 'if' or Italian 'yes'"),
        ("la", "ambiguous", "English 'la' or French/Spanish/Italian article"),
        ("me", "ambiguous", "English pronoun or Spanish 'me'"),
        ("le", "ambiguous", "French article or Italian article"),
        ("son", "ambiguous", "English word or Spanish 'they are'"),
    ]
}
|
139 |
+
|
140 |
+
def initialize_models():
    """Instantiate all four detector variants for side-by-side comparison.

    Returns a dict keyed by model id; each value holds the constructed
    detector (or None when construction failed), a display name, and a
    human-readable status string.
    """
    registry = {}
    variants = [
        ("model-a-dataset-a", "Model A Dataset A"),
        ("model-b-dataset-a", "Model B Dataset A"),
        ("model-a-dataset-b", "Model A Dataset B"),
        ("model-b-dataset-b", "Model B Dataset B")
    ]

    for key, display_name in variants:
        entry = {"name": display_name}
        try:
            entry["detector"] = LanguageDetector(model_key=key)
            entry["status"] = "Ready"
        except Exception as e:
            # A failed model is recorded, not raised, so the rest of the
            # comparison UI still works.
            entry["detector"] = None
            entry["status"] = f"Error: {str(e)}"
        registry[key] = entry

    return registry
|
165 |
+
|
166 |
+
def detect_with_all_models(text: str, models: Dict) -> Dict[str, Any]:
    """Run *text* through every model in *models*.

    Returns {model_key: {"language", "confidence", "language_code",
    "status"}}. Failures never propagate: an uninitialized detector
    reports its stored status, and a detection exception is folded into
    an "Error" row.
    """
    outcomes = {}

    for key, info in models.items():
        detector = info["detector"]

        if detector is None:
            # Model never loaded — surface its initialization status.
            outcomes[key] = {
                "language": "Error",
                "confidence": 0.0,
                "language_code": "error",
                "status": info["status"],
            }
            continue

        try:
            prediction = detector.detect_language(text)
        except Exception as e:
            outcomes[key] = {
                "language": "Error",
                "confidence": 0.0,
                "language_code": "error",
                "status": f"Error: {str(e)}",
            }
        else:
            outcomes[key] = {
                "language": prediction["language"],
                "confidence": prediction["confidence"],
                "language_code": prediction["language_code"],
                "status": "Success",
            }

    return outcomes
|
196 |
+
|
197 |
+
def create_results_dataframe(texts: List[str], all_results: List[Dict], expected_langs: List[str] = None, categories: List[str] = None) -> pd.DataFrame:
    """Build the comparison table shown in the demo UI.

    One row per input text with Text/Expected/Category columns plus one
    column per model ("A-A", "B-A", "A-B", "B-B"). Each model cell shows
    "<code> (<confidence>)", prefixed with ✅/🚫 only when the expected
    language is a single concrete code.
    """
    # Short column headers for the four known model keys; anything else
    # falls back to a truncated key.
    short_names = {
        "model-a-dataset-a": "A-A",
        "model-b-dataset-a": "B-A",
        "model-a-dataset-b": "A-B",
        "model-b-dataset-b": "B-B",
    }
    # Sentinel "expected" values with no single correct answer — unscored.
    unscored = ('ambiguous', 'mix', 'transliteration', 'unknown', 'N/A')

    rows = []
    for i, (text, results) in enumerate(zip(texts, all_results)):
        expected = expected_langs[i] if expected_langs else None
        row = {
            "Text": text[:40] + "..." if len(text) > 40 else text,  # keep cells compact
            "Expected": expected_langs[i] if expected_langs else "N/A",
            "Category": categories[i] if categories else "Custom",
        }

        for model_key, result in results.items():
            column = short_names.get(model_key, model_key[:6])
            predicted = result['language_code']
            cell = f"{predicted} ({result['confidence']:.3f})"
            if expected and expected not in unscored:
                mark = "✅" if predicted == expected else "🚫"
                cell = f"{mark} {cell}"
            row[column] = cell

        rows.append(row)

    return pd.DataFrame(rows)
|
245 |
+
|
246 |
+
def run_demo_tests(selected_categories: List[str], custom_texts: str, models: Dict):
    """Run tests on selected categories and custom texts.

    Returns a (summary, DataFrame) pair; the DataFrame is None when there
    is nothing to analyze.
    """
    if not selected_categories and not custom_texts.strip():
        return "Please select at least one category or enter custom text.", None

    all_texts: List[str] = []
    expected_langs: List[str] = []
    categories: List[str] = []

    # Gather the predefined samples for every selected category.
    for category in selected_categories:
        samples = DEMO_SAMPLES.get(category)
        if samples is None:
            continue
        for text, expected, description in samples:
            all_texts.append(text)
            expected_langs.append(expected)
            categories.append(category)

    # Append user-supplied texts, one per non-blank line.
    stripped_input = custom_texts.strip()
    if stripped_input:
        for raw_line in stripped_input.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            all_texts.append(line)
            expected_langs.append("unknown")
            categories.append("Custom")

    if not all_texts:
        return "No texts to analyze.", None

    # Query every model for every collected text.
    all_results = [detect_with_all_models(text, models) for text in all_texts]

    df = create_results_dataframe(all_texts, all_results, expected_langs, categories)
    summary = f"Analyzed {len(all_texts)} texts across {len(set(categories))} categories."
    return summary, df
287 |
+
def create_demo_interface():
    """Create the demo interface.

    Builds a Gradio Blocks app that compares all four language-detection
    models: model status, category checkboxes, a custom-text box, and a
    results table that is revealed after a run. Returns the gr.Blocks object.
    """

    # Initialize models once at build time; each entry carries a display
    # name and a "status" string used below.
    models = initialize_models()

    with gr.Blocks(title="Language Detection Demo - Model Comparison", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 Language Detection Demo - Model Comparison")
        gr.Markdown("Compare all four language detection models simultaneously across various difficulty categories.")

        # Model Status Section: static markdown rendered from the load results.
        with gr.Group():
            gr.Markdown("## 🤖 Model Status")
            model_status_text = ""
            for model_key, model_info in models.items():
                status_icon = "✅" if model_info["status"] == "Ready" else "❌"
                model_status_text += f"{status_icon} **{model_info['name']}**: {model_info['status']}\n\n"
            gr.Markdown(model_status_text)

        # Category Selection Section
        with gr.Group():
            gr.Markdown("## 📊 Test Categories")
            gr.Markdown("Select categories to test different aspects of language detection difficulty:")

            category_checkboxes = gr.CheckboxGroup(
                choices=list(DEMO_SAMPLES.keys()),
                label="Select Test Categories",
                value=["Easy/Obvious", "Short Text"],  # Default selection
                interactive=True
            )

        # Custom Text Input Section
        with gr.Group():
            gr.Markdown("## ✏️ Custom Text Input")
            gr.Markdown("Enter your own texts to test (one per line):")

            custom_text_input = gr.Textbox(
                label="Custom Texts",
                placeholder="Enter custom texts here, one per line...\nExample:\nHello world\nBonjour le monde\n你好世界",
                lines=5,
                max_lines=10
            )

        # Control Buttons
        with gr.Row():
            run_demo_btn = gr.Button("🔍 Run Demo Tests", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear Results", variant="secondary")

        # Sample Preview Section (moved up, condensed)
        with gr.Group():
            gr.Markdown("## 📚 Category Explanations")
            gr.Markdown("Understanding what each test category evaluates:")

            category_explanations = """
**Easy/Obvious:** Clear, unambiguous sentences in their native language. Tests basic language detection capability.

**Short Text:** Single words or very short phrases. Tests model performance with minimal context.

**False Friends:** Words that look similar across languages but have different meanings. Tests ability to distinguish between closely related languages.

**Mixed Scripts:** Text containing multiple languages, numbers, symbols, or scripts. Tests handling of multilingual content.

**Proper Nouns:** Names of people, places, or entities that exist across multiple languages. Tests context-dependent detection.

**Common Words:** International words with similar spelling across languages (hotel, taxi, etc.). Tests disambiguation of universal terms.

**Technical Terms:** Specialized vocabulary that may be borrowed or translated across languages. Tests domain-specific detection.

**Code-switching:** Text that switches between languages mid-sentence. Tests handling of bilingual communication patterns.

**Transliterated Text:** Non-Latin scripts written in Latin characters. Tests recognition of transliteration vs. native language.

**Ambiguous Script:** Words that could belong to multiple languages with identical spelling. Tests the model's decision-making under uncertainty.
"""

            gr.Markdown(category_explanations)

        # Results Section (moved to bottom); both outputs start hidden and
        # are toggled visible by the event handlers below.
        with gr.Group():
            gr.Markdown("## 📈 Results")

            summary_output = gr.Textbox(
                label="Summary",
                interactive=False,
                visible=False
            )

            results_dataframe = gr.Dataframe(
                label="Model Comparison Results (A-A: Model A Dataset A, B-A: Model B Dataset A, A-B: Model A Dataset B, B-B: Model B Dataset B)",
                wrap=True,
                interactive=False,
                visible=False
            )

        # Event Handlers
        def run_tests(selected_cats, custom_texts):
            # Delegate to run_demo_tests; reveal the table only when data exists.
            summary, df = run_demo_tests(selected_cats, custom_texts, models)

            if df is not None:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(value=df, visible=True)
                )
            else:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(visible=False)
                )

        def clear_results():
            # Hide and empty both result widgets.
            return (
                gr.update(value="", visible=False),
                gr.update(value=None, visible=False)
            )

        # Connect event handlers
        run_demo_btn.click(
            fn=run_tests,
            inputs=[category_checkboxes, custom_text_input],
            outputs=[summary_output, results_dataframe]
        )

        clear_btn.click(
            fn=clear_results,
            outputs=[summary_output, results_dataframe]
        )

    return demo
416 |
+
if __name__ == "__main__":
    # Build the Gradio app and serve it locally; no public share link,
    # debug=True enables verbose Gradio error output.
    demo = create_demo_interface()
    demo.launch(
        server_name="localhost",
        server_port=7860,
        share=False,
        debug=True
    )
|
requirements.txt
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==24.1.0
|
2 |
+
annotated-types==0.7.0
|
3 |
+
anyio==4.9.0
|
4 |
+
audioop-lts==0.2.1
|
5 |
+
certifi==2025.4.26
|
6 |
+
charset-normalizer==3.4.2
|
7 |
+
click==8.1.8
|
8 |
+
fastapi==0.115.12
|
9 |
+
ffmpy==0.5.0
|
10 |
+
filelock==3.18.0
|
11 |
+
fsspec==2025.5.1
|
12 |
+
gradio==5.31.0
|
13 |
+
gradio_client==1.10.1
|
14 |
+
groovy==0.1.2
|
15 |
+
h11==0.16.0
|
16 |
+
hf-xet==1.1.2
|
17 |
+
httpcore==1.0.9
|
18 |
+
httpx==0.28.1
|
19 |
+
huggingface-hub==0.32.0
|
20 |
+
idna==3.10
|
21 |
+
Jinja2==3.1.6
|
22 |
+
markdown-it-py==3.0.0
|
23 |
+
MarkupSafe==3.0.2
|
24 |
+
mdurl==0.1.2
|
25 |
+
mpmath==1.3.0
|
26 |
+
networkx==3.4.2
|
27 |
+
numpy==2.2.6
|
28 |
+
orjson==3.10.18
|
29 |
+
packaging==25.0
|
30 |
+
pandas==2.2.3
|
31 |
+
pillow==11.2.1
|
32 |
+
pydantic==2.11.5
|
33 |
+
pydantic_core==2.33.2
|
34 |
+
pydub==0.25.1
|
35 |
+
Pygments==2.19.1
|
36 |
+
python-dateutil==2.9.0.post0
|
37 |
+
python-multipart==0.0.20
|
38 |
+
pytz==2025.2
|
39 |
+
PyYAML==6.0.2
|
40 |
+
regex==2024.11.6
|
41 |
+
requests==2.32.3
|
42 |
+
rich==14.0.0
|
43 |
+
ruff==0.11.11
|
44 |
+
safehttpx==0.1.6
|
45 |
+
safetensors==0.5.3
|
46 |
+
semantic-version==2.10.0
|
47 |
+
setuptools==80.8.0
|
48 |
+
shellingham==1.5.4
|
49 |
+
six==1.17.0
|
50 |
+
sniffio==1.3.1
|
51 |
+
starlette==0.46.2
|
52 |
+
sympy==1.14.0
|
53 |
+
tokenizers==0.21.1
|
54 |
+
tomlkit==0.13.2
|
55 |
+
torch==2.7.0
|
56 |
+
tqdm==4.67.1
|
57 |
+
transformers==4.52.3
|
58 |
+
typer==0.15.4
|
59 |
+
typing-inspection==0.4.1
|
60 |
+
typing_extensions==4.13.2
|
61 |
+
tzdata==2025.2
|
62 |
+
urllib3==2.4.0
|
63 |
+
uvicorn==0.34.2
|
64 |
+
websockets==15.0.1
|
run.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Simple startup script for the Language Detection App
|
4 |
+
|
5 |
+
This script provides an easy way to run the app with different configurations.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import sys
|
9 |
+
import os
|
10 |
+
import argparse
|
11 |
+
from pathlib import Path
|
12 |
+
|
13 |
+
def check_dependencies():
    """Check if required dependencies are installed.

    Returns True when gradio is importable, otherwise prints an install
    hint and returns False.
    """
    try:
        import gradio  # noqa: F401 -- only importability matters here
        print("✅ Gradio is available")
    except ImportError:
        print("❌ Gradio not found. Install with: pip install -r requirements.txt")
        return False

    return True
24 |
+
def run_tests():
    """Run the test suite.

    Launches test_app.py in a child process using the same interpreter
    that started this script.
    """
    print("🧪 Running tests...")
    # subprocess.run with an argument list avoids shell interpretation and
    # sys.executable guarantees the same Python as the current process
    # (os.system("python ...") could pick up a different interpreter).
    import subprocess
    subprocess.run([sys.executable, "test_app.py"])
29 |
+
def run_app(model_type="placeholder", host="0.0.0.0", port=7860, share=False):
    """Run the main application.

    Args:
        model_type: Detector backend ("placeholder", "huggingface", "custom").
        host: Interface to bind the Gradio server to.
        port: TCP port for the server.
        share: Whether Gradio creates a public share link.

    Returns:
        Process-style exit code: 0 on clean shutdown, 1 on error.
    """

    if not check_dependencies():
        return 1

    # Set environment variables for configuration
    # NOTE(review): assumes app.main() reads these variables — confirm
    # against app.py, which in this commit defines create_demo_interface()
    # rather than main().
    os.environ["MODEL_TYPE"] = model_type
    os.environ["HOST"] = host
    os.environ["PORT"] = str(port)
    os.environ["SHARE"] = str(share).lower()

    print(f"🚀 Starting Language Detection App...")
    print(f"📊 Model: {model_type}")
    print(f"🌐 Host: {host}:{port}")
    print(f"🔗 Share: {share}")
    print("-" * 50)

    # Import and run the app
    try:
        from app import main
        app = main()
        app.launch(
            server_name=host,
            server_port=port,
            share=share,
            debug=True
        )
    except KeyboardInterrupt:
        # Ctrl-C is a normal shutdown, not an error.
        print("\n👋 App stopped by user")
    except Exception as e:
        print(f"❌ Error running app: {e}")
        return 1

    return 0
|
65 |
+
def main():
    """Main entry point.

    Parses command-line options, optionally runs the test suite, validates
    the model choice, and starts the app. Returns an exit code.
    """
    parser = argparse.ArgumentParser(
        description="Language Detection App Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run.py                    # Run with default settings
  python run.py --test             # Run tests only
  python run.py --model huggingface # Use Hugging Face model (if available)
  python run.py --port 8080        # Run on port 8080
  python run.py --share            # Create public link
"""
    )

    parser.add_argument(
        "--test",
        action="store_true",
        help="Run tests instead of starting the app"
    )

    parser.add_argument(
        "--model",
        choices=["placeholder", "huggingface", "custom"],
        default="placeholder",
        help="Model type to use (default: placeholder)"
    )

    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)"
    )

    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port to bind to (default: 7860)"
    )

    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public link via Gradio"
    )

    args = parser.parse_args()

    print("🌍 Language Detection App Runner")
    print("=" * 40)

    # --test short-circuits: run the suite and exit successfully.
    if args.test:
        run_tests()
        return 0

    # Validate model choice: the huggingface backend needs transformers;
    # fall back to the placeholder model when it is missing.
    if args.model == "huggingface":
        try:
            import transformers
            print("✅ Transformers available for Hugging Face model")
        except ImportError:
            print("⚠️ Transformers not available. Install with:")
            print("   pip install transformers torch")
            print("   Falling back to placeholder model...")
            args.model = "placeholder"

    return run_app(
        model_type=args.model,
        host=args.host,
        port=args.port,
        share=args.share
    )
|
139 |
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
test_demo_functionality.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for the demo functionality
|
4 |
+
"""
|
5 |
+
|
6 |
+
from demo_page import initialize_models, detect_with_all_models, create_results_dataframe, run_demo_tests, DEMO_SAMPLES
|
7 |
+
|
8 |
+
def test_model_initialization():
    """Test that all models can be initialized."""
    print("🔄 Testing model initialization...")
    models = initialize_models()

    print(f"✅ Initialized {len(models)} models:")
    # Keys are unused for display, so iterate the info dicts directly.
    for info in models.values():
        icon = "✅" if info["status"] == "Ready" else "❌"
        print(f"  {icon} {info['name']}: {info['status']}")

    return models
|
20 |
+
def test_single_detection():
    """Test detection with a single text across all models."""
    print("\n🔄 Testing single text detection...")

    models = initialize_models()
    test_text = "Hello, how are you today?"
    results = detect_with_all_models(test_text, models)

    print(f"Text: '{test_text}'")
    print("Results:")
    for model_key in results:
        outcome = results[model_key]
        print(f"  {model_key}: {outcome['language_code']} ({outcome['confidence']:.3f}) - {outcome['status']}")

    return results
|
36 |
+
def test_category_samples():
    """Test a few samples from each category.

    Runs the first sample of every DEMO_SAMPLES category through all
    models and prints per-model agreement with the expected label.
    """
    print("\n🔄 Testing category samples...")

    models = initialize_models()

    for category, samples in DEMO_SAMPLES.items():
        print(f"\n📊 Category: {category}")
        # Test first sample from each category
        text, expected, description = samples[0]
        results = detect_with_all_models(text, models)

        print(f"  Text: '{text}' (Expected: {expected})")
        print(f"  Description: {description}")
        for model_key, result in results.items():
            # Ambiguous/mixed/transliteration categories are always counted
            # as a match since any answer is acceptable for them.
            match_icon = "✅" if result['language_code'] == expected or expected in ['ambiguous', 'mix', 'transliteration'] else "❌"
            print(f"    {model_key}: {result['language_code']} ({result['confidence']:.3f}) {match_icon}")
54 |
+
def test_dataframe_creation():
    """Test DataFrame creation with sample data.

    Detects three short texts with all models and checks that
    create_results_dataframe builds a displayable table.
    """
    print("\n🔄 Testing DataFrame creation...")

    models = initialize_models()

    # Test with a few samples
    test_texts = [
        "Hello world",
        "Bonjour le monde",
        "Hola mundo"
    ]
    expected_langs = ["en", "fr", "es"]
    categories = ["Custom", "Custom", "Custom"]

    all_results = []
    for text in test_texts:
        results = detect_with_all_models(text, models)
        all_results.append(results)

    df = create_results_dataframe(test_texts, all_results, expected_langs, categories)

    print("DataFrame shape:", df.shape)
    print("Columns:", list(df.columns))
    print("\nFirst few rows:")
    print(df.head())

    return df
|
83 |
+
def test_demo_workflow():
    """Test the complete demo workflow.

    Exercises run_demo_tests end-to-end with two preset categories plus
    three custom lines and prints the resulting summary/table.
    """
    print("\n🔄 Testing complete demo workflow...")

    models = initialize_models()

    # Test with selected categories and custom text
    selected_categories = ["Easy/Obvious", "Short Text"]
    custom_texts = "Hello world\nBonjour\n你好"

    summary, df = run_demo_tests(selected_categories, custom_texts, models)

    print(f"Summary: {summary}")
    if df is not None:
        print(f"Results DataFrame shape: {df.shape}")
        print("Sample results:")
        print(df.head())
    else:
        print("❌ No DataFrame returned")

    return summary, df
|
105 |
+
def main():
    """Run all tests.

    Executes the five demo-functionality checks in order and prints a
    traceback (without re-raising) when any of them fails.
    """
    print("🚀 Starting demo functionality tests...\n")

    try:
        # Test 1: Model initialization
        models = test_model_initialization()

        # Test 2: Single detection
        single_results = test_single_detection()

        # Test 3: Category samples
        test_category_samples()

        # Test 4: DataFrame creation
        df = test_dataframe_creation()

        # Test 5: Complete workflow
        summary, demo_df = test_demo_workflow()

        print("\n✅ All tests completed successfully!")
        print(f"📊 Total categories available: {len(DEMO_SAMPLES)}")
        print(f"📝 Total sample texts: {sum(len(samples) for samples in DEMO_SAMPLES.values())}")

    except Exception as e:
        # Intentionally swallows the failure after reporting it; the script
        # always exits 0. NOTE(review): consider re-raising or sys.exit(1)
        # if CI should treat failures as fatal.
        print(f"\n❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
134 |
+
if __name__ == "__main__":
    # Script entry point: run the full demo-functionality test sequence.
    main()
|
test_model_a_dataset_a.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for Model A Dataset A - XLM-RoBERTa + Standard Dataset
|
4 |
+
|
5 |
+
This script tests the XLM-RoBERTa based language detection model
|
6 |
+
trained on the standard multilingual dataset to ensure it works correctly.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import sys
|
10 |
+
import os
|
11 |
+
|
12 |
+
# Add the project root to the Python path
|
13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
|
14 |
+
|
15 |
+
from backend.language_detector import LanguageDetector
|
16 |
+
|
17 |
+
|
18 |
+
def test_model_a_dataset_a():
    """Test the Model A Dataset A implementation.

    Detects 12 sample sentences, prints per-sample results and overall
    accuracy, then dumps the detector's model-info dict.
    Returns True on success, False if any unexpected exception occurred.
    """
    print("🧪 Testing Model A Dataset A - XLM-RoBERTa + Standard Dataset")
    print("=" * 75)

    try:
        # Initialize detector with Model A Dataset A
        detector = LanguageDetector(model_key="model-a-dataset-a")
        print("✅ Successfully initialized Model A Dataset A")

        # Test texts in different languages: (text, expected ISO 639-1 code)
        test_texts = [
            ("Hello, how are you today?", "en"),
            ("Bonjour, comment allez-vous?", "fr"),
            ("Hola, ¿cómo estás?", "es"),
            ("Guten Tag, wie geht es Ihnen?", "de"),
            ("こんにちは、元気ですか?", "ja"),
            ("Привет, как дела?", "ru"),
            ("Ciao, come stai?", "it"),
            ("Olá, como você está?", "pt"),
            ("你好,你好吗?", "zh"),
            ("안녕하세요, 어떻게 지내세요?", "ko"),
            ("مرحبا، كيف حالك؟", "ar"),
            ("नमस्ते, आप कैसे हैं?", "hi")
        ]

        print("\n🔍 Running language detection tests:")
        print("-" * 75)

        correct_predictions = 0
        total_predictions = len(test_texts)

        for text, expected_lang in test_texts:
            try:
                result = detector.detect_language(text)
                predicted_lang = result['language_code']
                confidence = result['confidence']
                language_name = result['language']

                # Check if prediction is correct (allow some flexibility for Chinese variants)
                is_correct = (predicted_lang == expected_lang or
                              (expected_lang == "zh" and predicted_lang in ["zh-hans", "zh-hant", "zh-cn", "zh-tw"]))
                if is_correct:
                    correct_predictions += 1
                    status = "✅"
                else:
                    status = "❌"

                print(f"{status} Text: {text[:40]}{'...' if len(text) > 40 else ''}")
                print(f"   Expected: {expected_lang} | Predicted: {predicted_lang} ({language_name})")
                print(f"   Confidence: {confidence:.4f}")
                print()

            except Exception as e:
                # Per-sample failures are reported but do not abort the run.
                print(f"❌ Error testing '{text[:30]}...': {str(e)}")
                print()

        # Calculate accuracy
        accuracy = (correct_predictions / total_predictions) * 100
        print(f"📊 Test Results: {correct_predictions}/{total_predictions} correct")
        print(f"📈 Accuracy: {accuracy:.1f}%")

        # Test model info
        print("\n📋 Model Information:")
        print("-" * 75)
        model_info = detector.get_current_model_info()
        for key, value in model_info.items():
            print(f"{key.title().replace('_', ' ')}: {value}")

        print("🎉 Model A Dataset A test completed successfully!")

    except Exception as e:
        print(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True
|
97 |
+
|
98 |
+
def test_model_architecture():
    """Test the model architecture information.

    Compares selected keys of the detector's model-info dict against the
    values expected for Model A / Dataset A. Returns True when every key
    matches, False otherwise.
    """
    print("\n🏗️ Testing Model Architecture Information")
    print("=" * 75)

    try:
        detector = LanguageDetector(model_key="model-a-dataset-a")
        model_info = detector.get_current_model_info()

        # Verify key architecture information
        # NOTE(review): values mirror backend/models/model_config.py — keep in sync.
        expected_info = {
            "architecture": "XLM-RoBERTa",
            "dataset": "Dataset A",
            "accuracy": "97.9%",
            "model_size": "278M parameters"
        }

        print("🔍 Verifying model architecture information:")
        print("-" * 50)

        all_correct = True
        for key, expected_value in expected_info.items():
            actual_value = model_info.get(key, "Not found")
            if actual_value == expected_value:
                print(f"✅ {key}: {actual_value}")
            else:
                print(f"❌ {key}: Expected '{expected_value}', got '{actual_value}'")
                all_correct = False

        if all_correct:
            print("\n🎉 All architecture information verified successfully!")
        else:
            print("\n⚠️ Some architecture information mismatches found.")

        return all_correct

    except Exception as e:
        print(f"❌ Architecture test failed: {str(e)}")
        return False
+
|
138 |
+
|
139 |
+
if __name__ == "__main__":
    print("🚀 Starting Model A Dataset A Tests\n")

    # Run tests; each returns True/False rather than raising.
    test1_passed = test_model_a_dataset_a()
    test2_passed = test_model_architecture()

    # Final results: non-zero exit code signals failure to CI.
    print("\n" + "=" * 75)
    if test1_passed and test2_passed:
        print("🎉 All tests passed! Model A Dataset A is ready to use.")
        print("⚖️ This model offers balanced performance with robust cross-lingual capabilities!")
    else:
        print("❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
|
test_model_b_dataset_b.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for Model B Dataset B - BERT + Enhanced Dataset
|
4 |
+
|
5 |
+
This script tests the BERT based language detection model
|
6 |
+
trained on the enhanced dataset, achieving the highest accuracy (99.85%).
|
7 |
+
"""
|
8 |
+
|
9 |
+
import sys
|
10 |
+
import os
|
11 |
+
|
12 |
+
# Add the project root to the Python path
|
13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
|
14 |
+
|
15 |
+
from backend.language_detector import LanguageDetector
|
16 |
+
|
17 |
+
|
18 |
+
def test_model_b_dataset_b():
    """Test the Model B Dataset B implementation.

    Detects one sample sentence per supported language (20 total), prints
    per-sample results and overall accuracy, then dumps the detector's
    model-info dict. Returns True on success, False on unexpected error.
    """
    print("🧪 Testing Model B Dataset B - BERT + Enhanced Dataset")
    print("=" * 75)

    try:
        # Initialize detector with Model B Dataset B (highest accuracy)
        detector = LanguageDetector(model_key="model-b-dataset-b")
        print("✅ Successfully initialized Model B Dataset B")

        # Test texts in the 20 supported languages: (text, expected code)
        test_texts = [
            ("Hello, how are you today?", "en"),       # English
            ("Bonjour, comment allez-vous?", "fr"),    # French
            ("Hola, ¿cómo estás?", "es"),              # Spanish
            ("Guten Tag, wie geht es Ihnen?", "de"),   # German
            ("Ciao, come stai?", "it"),                # Italian
            ("Olá, como você está?", "pt"),            # Portuguese
            ("Привет, как дела?", "ru"),               # Russian
            ("こんにちは、元気ですか?", "ja"),          # Japanese
            ("你好,你好吗?", "zh"),                    # Chinese
            ("مرحبا، كيف حالك؟", "ar"),                # Arabic
            ("नमस्ते, आप कैसे हैं?", "hi"),              # Hindi
            ("Hallo, hoe gaat het met je?", "nl"),     # Dutch
            ("Γεια σας, πώς είστε;", "el"),            # Greek
            ("Здравейте, как сте?", "bg"),             # Bulgarian
            ("Witaj, jak się masz?", "pl"),            # Polish
            ("สวัสดี คุณเป็นอย่างไรบ้าง?", "th"),        # Thai
            ("Merhaba, nasılsınız?", "tr"),            # Turkish
            ("آپ کیسے ہیں؟", "ur"),                    # Urdu
            ("Xin chào, bạn khỏe không?", "vi"),       # Vietnamese
            # NOTE(review): "unajehje" looks misspelled (cf. Swahili
            # "unaendeleaje") — this sample may fail for spelling, not model,
            # reasons. Confirm before changing.
            ("Habari, unajehje?", "sw")                # Swahili
        ]

        print("\n🔍 Running language detection tests on 20 supported languages:")
        print("-" * 75)

        correct_predictions = 0
        total_predictions = len(test_texts)

        for text, expected_lang in test_texts:
            try:
                result = detector.detect_language(text)
                predicted_lang = result['language_code']
                confidence = result['confidence']
                language_name = result['language']

                # Check if prediction is correct
                is_correct = predicted_lang == expected_lang
                if is_correct:
                    correct_predictions += 1
                    status = "✅"
                else:
                    status = "❌"

                print(f"{status} Text: {text[:40]}{'...' if len(text) > 40 else ''}")
                print(f"   Expected: {expected_lang} | Predicted: {predicted_lang} ({language_name})")
                print(f"   Confidence: {confidence:.4f}")
                print()

            except Exception as e:
                # Per-sample failures are reported but do not abort the run.
                print(f"❌ Error testing '{text[:30]}...': {str(e)}")
                print()

        # Calculate accuracy
        accuracy = (correct_predictions / total_predictions) * 100
        print(f"📊 Test Results: {correct_predictions}/{total_predictions} correct")
        print(f"📈 Accuracy: {accuracy:.1f}%")

        # Test model info
        print("\n📋 Model Information:")
        print("-" * 75)
        model_info = detector.get_current_model_info()
        for key, value in model_info.items():
            print(f"{key.title().replace('_', ' ')}: {value}")

        print("🎉 Model B Dataset B test completed successfully!")

    except Exception as e:
        print(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True
|
104 |
+
|
105 |
+
def test_all_models_comprehensive():
    """Run the same multilingual sentences through every model/dataset pairing.

    Prints a per-model detection trace followed by a summary of average
    confidences. Returns True when the whole comparison completed, False
    if a top-level error aborted it (per-model failures are recorded but
    do not abort the run).
    """
    print("\n🔄 Comprehensive All-Model Combinations Comparison")
    print("=" * 75)

    # (registry key, display name, architecture description, claimed accuracy)
    combos = [
        ("model-a-dataset-a", "Model A Dataset A", "XLM-RoBERTa + Standard", "97.9%"),
        ("model-b-dataset-a", "Model B Dataset A", "BERT + Standard", "96.17%"),
        ("model-a-dataset-b", "Model A Dataset B", "XLM-RoBERTa + Enhanced", "99.72%"),
        ("model-b-dataset-b", "Model B Dataset B", "BERT + Enhanced", "99.85%")
    ]

    samples = [
        "Hello, this is a test in English.",
        "Bonjour, ceci est un test en français.",
        "Hola, esto es una prueba en español.",
        "Guten Tag, das ist ein Test auf Deutsch."
    ]

    print("🧪 Testing with multiple sentences across all model combinations:")
    print("-" * 75)

    try:
        summary = {}

        for key, name, description, claimed in combos:
            print(f"\n🤖 Testing {name} ({description}) - Claimed: {claimed}")
            print("-" * 60)

            try:
                det = LanguageDetector(model_key=key)
                rows = []

                for sample in samples:
                    outcome = det.detect_language(sample)
                    rows.append({
                        'text': sample[:30] + '...' if len(sample) > 30 else sample,
                        'language': outcome['language'],
                        'code': outcome['language_code'],
                        'confidence': outcome['confidence']
                    })

                    print(f"  Text: {sample[:30]}{'...' if len(sample) > 30 else ''}")
                    print(f"  → {outcome['language']} ({outcome['language_code']}) - {outcome['confidence']:.4f}")

                summary[name] = rows
                print(f"✅ {name} completed successfully")

            except Exception as e:
                # Record the failure string so the summary section can report it.
                print(f"❌ {name}: {str(e)}")
                summary[name] = f"Error: {str(e)}"

        print(f"\n📊 All Model Combinations Testing Summary:")
        print("-" * 75)
        for name, rows in summary.items():
            if isinstance(rows, str):
                # A stored string means that model errored out above.
                print(f"❌ {name}: {rows}")
            else:
                mean_conf = sum(row['confidence'] for row in rows) / len(rows)
                print(f"✅ {name}: Avg Confidence: {mean_conf:.4f}")

        print("🎉 Comprehensive model comparison completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Comprehensive test failed: {str(e)}")
        return False
|
172 |
+
|
173 |
+
|
174 |
+
def test_model_architecture():
    """Verify Model B Dataset B reports the expected architecture metadata.

    Compares the detector's `get_current_model_info()` output against the
    documented values (architecture, dataset, accuracy, model size) and
    returns True only when every field matches; False on mismatch or error.
    """
    print("\n🏗️ Testing Model B Dataset B Architecture Information")
    print("=" * 75)

    try:
        det = LanguageDetector(model_key="model-b-dataset-b")
        reported = det.get_current_model_info()

        # Ground truth the model card advertises for this combination.
        expected_info = {
            "architecture": "BERT",
            "dataset": "Dataset B",
            "accuracy": "99.85%",
            "model_size": "178M parameters"
        }

        print("🔍 Verifying model architecture information:")
        print("-" * 50)

        all_correct = True
        for field, wanted in expected_info.items():
            got = reported.get(field, "Not found")
            matched = got == wanted
            if matched:
                print(f"✅ {field}: {got}")
            else:
                print(f"❌ {field}: Expected '{wanted}', got '{got}'")
                all_correct = False

        if all_correct:
            print("\n🎉 All architecture information verified successfully!")
        else:
            print("\n⚠️ Some architecture information mismatches found.")

        return all_correct

    except Exception as e:
        print(f"❌ Architecture test failed: {str(e)}")
        return False
|
213 |
+
|
214 |
+
|
215 |
+
if __name__ == "__main__":
    print("🚀 Starting Model B Dataset B Tests\n")

    # Run the three suites in order; list literals evaluate left-to-right,
    # so the execution order matches the original sequential calls.
    outcomes = [
        test_model_b_dataset_b(),
        test_all_models_comprehensive(),
        test_model_architecture(),
    ]

    # Final results
    print("\n" + "=" * 75)
    if all(outcomes):
        print("🎉 All tests passed! Model B Dataset B is ready to use.")
        print("🏆 This model offers the highest accuracy (99.85%) of all available models!")
        print("📝 Note: Optimized for 20 carefully selected languages for maximum precision.")
    else:
        # Non-zero exit so CI treats any suite failure as a failed run.
        print("❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
|