yangding14 commited on
Commit
72f90b1
·
1 Parent(s): f5560c6

first commit

Browse files
app.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from backend.language_detector import LanguageDetector
3
+
4
def main():
    """Build and return the Gradio Blocks app for language detection.

    Creates a single shared LanguageDetector (default model: Model A /
    Dataset A), lays out the model-selection and analysis sections, and
    wires the button / textbox / dropdown events to it. The returned
    gr.Blocks object is launched by the caller.
    """
    # Initialize the language detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    # Create Gradio interface
    with gr.Blocks(title="Language Detection App", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🌍 Language Detection App")
        gr.Markdown("Select a model and enter text below to detect its language with confidence scores.")

        # Model Selection Section with visual styling
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.1), transparent); border-radius: 8px 8px 0 0;'>🤖 Model Selection</div>"
            )

            # Get available models
            available_models = detector.get_available_models()
            model_choices = []
            model_info_map = {}

            # Dropdown entries use (label, value) pairs; models that are
            # registered but not implemented are labelled "(Coming Soon)".
            for key, info in available_models.items():
                if info["status"] == "available":
                    model_choices.append((info["display_name"], key))
                else:
                    model_choices.append((f"{info['display_name']} (Coming Soon)", key))
                model_info_map[key] = info

            model_selector = gr.Dropdown(
                choices=model_choices,
                value="model-a-dataset-a",  # Default to Model A Dataset A
                label="Choose Language Detection Model",
                interactive=True
            )

            # Model Information Display
            model_info_display = gr.Markdown(
                value=_format_model_info(detector.get_current_model_info()),
                label="Model Information"
            )

        # Add visual separator
        gr.Markdown(
            "<div style='margin: 24px 0; border-top: 3px solid rgba(99, 102, 241, 0.2); background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.05), transparent); height: 2px;'></div>"
        )

        # Analysis Section
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.1), transparent); border-radius: 8px 8px 0 0;'>🔍 Language Analysis</div>"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Input section
                    text_input = gr.Textbox(
                        label="Text to Analyze",
                        placeholder="Enter text here to detect its language...",
                        lines=5,
                        max_lines=10
                    )

                    detect_btn = gr.Button("🔍 Detect Language", variant="primary", size="lg")

                    # Example texts
                    gr.Examples(
                        examples=[
                            ["Hello, how are you today?"],
                            ["Bonjour, comment allez-vous?"],
                            ["Hola, ¿cómo estás?"],
                            ["Guten Tag, wie geht es Ihnen?"],
                            ["こんにちは、元気ですか?"],
                            ["Привет, как дела?"],
                            ["Ciao, come stai?"],
                            ["Olá, como você está?"],
                            ["你好,你好吗?"],
                            ["안녕하세요, 어떻게 지내세요?"]
                        ],
                        inputs=text_input,
                        label="Try these examples:"
                    )

                with gr.Column(scale=2):
                    # Output section
                    with gr.Group():
                        gr.Markdown(
                            "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 12px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.1), transparent); border-radius: 8px 8px 0 0;'>📊 Detection Results</div>"
                        )

                        detected_language = gr.Textbox(
                            label="Detected Language",
                            interactive=False
                        )

                        confidence_score = gr.Number(
                            label="Confidence Score",
                            interactive=False,
                            precision=4
                        )

                        language_code = gr.Textbox(
                            label="Language Code (ISO 639-1)",
                            interactive=False
                        )

                        # Top predictions table
                        top_predictions = gr.Dataframe(
                            headers=["Language", "Code", "Confidence"],
                            label="Top 5 Predictions",
                            interactive=False,
                            wrap=True
                        )

        # Status/Info section: hidden until a handler makes it visible
        # via gr.update(visible=True).
        with gr.Row():
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False
            )

        # Event handlers
        def detect_language_wrapper(text, selected_model):
            # Returns a 5-tuple matching the outputs list used below:
            # (detected_language, confidence_score, language_code,
            #  top_predictions, status_text update).
            if not text.strip():
                return (
                    "No text provided",
                    0.0,
                    "",
                    [],
                    gr.update(value="Please enter some text to analyze.", visible=True)
                )

            try:
                # Switch model if needed
                if detector.current_model_key != selected_model:
                    try:
                        detector.switch_model(selected_model)
                    except NotImplementedError:
                        return (
                            "Model unavailable",
                            0.0,
                            "",
                            [],
                            gr.update(value="This model is not yet implemented. Please select an available model.", visible=True)
                        )
                    except Exception as e:
                        return (
                            "Model error",
                            0.0,
                            "",
                            [],
                            gr.update(value=f"Error loading model: {str(e)}", visible=True)
                        )

                result = detector.detect_language(text)

                # Extract main prediction
                main_lang = result['language']
                main_confidence = result['confidence']
                main_code = result['language_code']

                # Format top predictions for table
                predictions_table = [
                    [pred['language'], pred['language_code'], f"{pred['confidence']:.4f}"]
                    for pred in result['top_predictions']
                ]

                model_info = result.get('metadata', {}).get('model_info', {})
                model_name = model_info.get('name', 'Unknown Model')

                return (
                    main_lang,
                    main_confidence,
                    main_code,
                    predictions_table,
                    gr.update(value=f"✅ Analysis Complete\n\nInput Text: {text[:100]}{'...' if len(text) > 100 else ''}\n\nDetected Language: {main_lang} ({main_code})\nConfidence: {main_confidence:.2%}\n\nModel: {model_name}", visible=True)
                )

            except Exception as e:
                # Catch-all keeps the UI responsive on any backend failure.
                return (
                    "Error occurred",
                    0.0,
                    "",
                    [],
                    gr.update(value=f"Error: {str(e)}", visible=True)
                )

        def update_model_info(selected_model):
            """Update model information display when model selection changes."""
            # NOTE(review): this eagerly loads the newly selected model as a
            # side effect of changing the dropdown — confirm that is intended.
            try:
                if detector.current_model_key != selected_model:
                    detector.switch_model(selected_model)
                model_info = detector.get_current_model_info()
                return _format_model_info(model_info)
            except NotImplementedError:
                return "**This model is not yet implemented.** Please select an available model."
            except Exception as e:
                return f"**Error loading model information:** {str(e)}"

        # Connect the button to the detection function
        detect_btn.click(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Also trigger on Enter key in text input
        text_input.submit(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Update model info when selection changes
        model_selector.change(
            fn=update_model_info,
            inputs=[model_selector],
            outputs=[model_info_display]
        )

    return app
224
+
225
+
226
+ def _format_model_info(model_info):
227
+ """Format model information for display."""
228
+ if not model_info:
229
+ return "No model information available."
230
+
231
+ formatted_info = f"""
232
+ **{model_info.get('name', 'Unknown Model')}**
233
+
234
+ {model_info.get('description', 'No description available.')}
235
+
236
+ **📊 Performance:**
237
+ - Accuracy: {model_info.get('accuracy', 'N/A')}
238
+ - Model Size: {model_info.get('model_size', 'N/A')}
239
+
240
+ **🏗️ Architecture:**
241
+ - Model Architecture: {model_info.get('architecture', 'N/A')}
242
+ - Base Model: {model_info.get('base_model', 'N/A')}
243
+ - Training Dataset: {model_info.get('dataset', 'N/A')}
244
+
245
+ **🌐 Languages:** {model_info.get('languages_supported', 'N/A')}
246
+
247
+ **⚙️ Training Details:** {model_info.get('training_details', 'N/A')}
248
+
249
+ **💡 Use Cases:** {model_info.get('use_cases', 'N/A')}
250
+
251
+ **✅ Strengths:** {model_info.get('strengths', 'N/A')}
252
+
253
+ **⚠️ Limitations:** {model_info.get('limitations', 'N/A')}
254
+ """
255
+ return formatted_info
256
+
257
+
258
# Script entry point: build the Gradio app and serve it.
if __name__ == "__main__":
    main().launch()
backend/.DS_Store ADDED
Binary file (6.15 kB). View file
 
backend/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Backend package for language detection models
backend/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (173 Bytes). View file
 
backend/__pycache__/language_detector.cpython-313.pyc ADDED
Binary file (10.1 kB). View file
 
backend/language_detector.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Language Detection Backend
3
+
4
+ This module provides the main LanguageDetector class and ModelRegistry
5
+ for managing multiple language detection models organized by architecture and dataset.
6
+
7
+ Model Architecture:
8
+ - Model A: XLM-RoBERTa based architectures
9
+ - Model B: BERT based architectures
10
+
11
+ Training Datasets:
12
+ - Dataset A: Standard multilingual language detection dataset
13
+ - Dataset B: Enhanced/specialized language detection dataset
14
+ """
15
+
16
+ import logging
17
+ from typing import Dict, List, Any
18
+
19
+ from .models import (
20
+ BaseLanguageModel,
21
+ ModelADatasetA,
22
+ ModelBDatasetA,
23
+ ModelADatasetB,
24
+ ModelBDatasetB,
25
+ get_all_model_configs,
26
+ get_language_name,
27
+ LANGUAGE_MAPPINGS
28
+ )
29
+
30
+
31
class ModelRegistry:
    """Registry of the language detection models this app can serve.

    Keys follow "model-<arch>-dataset-<ds>": architecture A is
    XLM-RoBERTa, architecture B is BERT; dataset A is the standard
    corpus and dataset B the enhanced one. Each registered entry pairs
    an implementation class with display metadata from the central
    config module.
    """

    def __init__(self):
        """Join the centralized model configs with their implementation classes."""
        self.model_configs = get_all_model_configs()

        # Implementation class per registry key.
        self.model_classes = {
            "model-a-dataset-a": ModelADatasetA,  # XLM-RoBERTa + Dataset A
            "model-b-dataset-a": ModelBDatasetA,  # BERT + Dataset A
            "model-a-dataset-b": ModelADatasetB,  # XLM-RoBERTa + Dataset B
            "model-b-dataset-b": ModelBDatasetB,  # BERT + Dataset B
        }

        # Configs without a matching implementation class are skipped;
        # iteration follows config order so the UI listing is stable.
        self.models = {
            key: {
                "class": self.model_classes[key],
                "display_name": config["display_name"],
                "description": config["description"],
                "status": config["status"],
            }
            for key, config in self.model_configs.items()
            if key in self.model_classes
        }

    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """Return a shallow copy of every registered model entry."""
        return dict(self.models)

    def create_model(self, model_key: str) -> BaseLanguageModel:
        """Instantiate the model registered under *model_key*.

        Args:
            model_key: Registry key of the model to build.

        Returns:
            A fresh instance of the registered implementation class.

        Raises:
            ValueError: If *model_key* is not registered.
        """
        if model_key not in self.models:
            known = list(self.models.keys())
            raise ValueError(f"Unknown model: {model_key}. Available models: {known}")
        return self.models[model_key]["class"]()
94
+
95
+
96
class LanguageDetector:
    """Facade over the model registry: one entry point for detection.

    Owns a ModelRegistry, keeps exactly one active model at a time, and
    normalizes that model's raw predictions into a consistent result
    dict. The active model can be swapped at runtime via switch_model().
    """

    def __init__(self, model_key: str = "model-a-dataset-a"):
        """Create a detector backed by the model registered under *model_key*.

        Keys follow "model-<arch>-dataset-<ds>":
            - "model-a-dataset-a": XLM-RoBERTa + standard dataset
            - "model-b-dataset-a": BERT + standard dataset
            - "model-a-dataset-b": XLM-RoBERTa + enhanced dataset
            - "model-b-dataset-b": BERT + enhanced dataset
        """
        self.registry = ModelRegistry()
        self.current_model_key = model_key
        self.model = self.registry.create_model(model_key)
        # Centralized ISO-code -> name table shared by every model.
        self.language_names = LANGUAGE_MAPPINGS

    def switch_model(self, model_key: str):
        """Replace the active model with the one registered under *model_key*.

        Raises:
            Exception: Whatever the registry raises if creation fails; the
                previously active model stays in place in that case.
        """
        try:
            self.model = self.registry.create_model(model_key)
            self.current_model_key = model_key
            logging.info(f"Successfully switched to model: {model_key}")
        except Exception as e:
            logging.error(f"Failed to switch to model {model_key}: {e}")
            raise

    def get_current_model_info(self) -> Dict[str, Any]:
        """Return the metadata dict of the currently active model."""
        return self.model.get_model_info()

    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """Return every model known to the underlying registry."""
        return self.registry.get_available_models()

    def detect_language(self, text: str) -> Dict[str, Any]:
        """Detect the language of *text* using the active model.

        Args:
            text: Input text to analyze (leading/trailing whitespace is
                stripped before prediction).

        Returns:
            Dict with keys:
                - language: top predicted language name
                - language_code: top predicted ISO 639-1 code
                - confidence: confidence of the top prediction
                - top_predictions: up to 5 ranked predictions
                - metadata: model/run details for the UI

        Raises:
            ValueError: If *text* is empty or whitespace-only.
            RuntimeError: If the model produces no predictions.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        raw = self.model.predict(text.strip())
        ranked = raw['predictions']
        if not ranked:
            raise RuntimeError("Model returned no predictions")

        # Models return predictions best-first; the head is the answer.
        best = ranked[0]

        top_predictions = [
            {
                'language': get_language_name(entry['language_code']),
                'language_code': entry['language_code'],
                'confidence': entry['confidence'],
            }
            for entry in ranked[:5]
        ]

        metadata = {
            'text_length': raw.get('text_length', len(text)),
            'model_name': raw.get('model_version', 'unknown'),
            'model_type': raw.get('model_type', 'unknown'),
            'current_model_key': self.current_model_key,
            'model_info': self.get_current_model_info(),
        }

        return {
            'language': get_language_name(best['language_code']),
            'language_code': best['language_code'],
            'confidence': best['confidence'],
            'top_predictions': top_predictions,
            'metadata': metadata,
        }

    def get_supported_languages(self) -> Dict[str, str]:
        """Map each supported ISO 639-1 code to its human-readable name."""
        return {
            code: get_language_name(code)
            for code in self.model.get_supported_languages()
        }
236
+
237
+
238
# Example usage and testing: exercised only when this module is run directly.
if __name__ == "__main__":
    # Initialize detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    # A few short samples in different languages.
    test_texts = [
        "Hello, how are you today?",
        "Bonjour, comment allez-vous?",
        "Hola, ¿cómo estás?",
        "Guten Tag, wie geht es Ihnen?",
    ]

    print("Language Detection Test - Model A Dataset A")
    print("=" * 60)

    for text in test_texts:
        try:
            result = detector.detect_language(text)
        except Exception as e:
            print(f"Error detecting language for '{text}': {e}")
        else:
            print(f"Text: {text}")
            print(f"Detected: {result['language']} ({result['language_code']}) - {result['confidence']:.3f}")
        print("---")

    # Show available models
    print("\nAvailable Models:")
    for key, info in detector.get_available_models().items():
        status = "✅" if info["status"] == "available" else "🚧"
        print(f"{status} {info['display_name']} ({key}): {info['description']}")
backend/models/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Language Detection Models Module
3
+
4
+ This module provides the base model interface and all available language detection models.
5
+ Models are organized by architecture (A: XLM-RoBERTa, B: BERT) and training dataset (A: standard, B: enhanced).
6
+ """
7
+
8
+ from .base_model import BaseLanguageModel
9
+ from .model_config import (
10
+ get_model_config,
11
+ get_all_model_configs,
12
+ get_supported_languages,
13
+ get_language_name,
14
+ LANGUAGE_MAPPINGS
15
+ )
16
+
17
+ # Import all model implementations
18
+ from .model_a_dataset_a import ModelADatasetA
19
+ from .model_b_dataset_a import ModelBDatasetA
20
+ from .model_a_dataset_b import ModelADatasetB
21
+ from .model_b_dataset_b import ModelBDatasetB
22
+
23
+ __all__ = [
24
+ 'BaseLanguageModel',
25
+ 'ModelADatasetA',
26
+ 'ModelBDatasetA',
27
+ 'ModelADatasetB',
28
+ 'ModelBDatasetB',
29
+ 'get_model_config',
30
+ 'get_all_model_configs',
31
+ 'get_supported_languages',
32
+ 'get_language_name',
33
+ 'LANGUAGE_MAPPINGS'
34
+ ]
backend/models/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (984 Bytes). View file
 
backend/models/__pycache__/base_model.cpython-313.pyc ADDED
Binary file (2.39 kB). View file
 
backend/models/__pycache__/model_a_dataset_a.cpython-313.pyc ADDED
Binary file (6.57 kB). View file
 
backend/models/__pycache__/model_a_dataset_b.cpython-313.pyc ADDED
Binary file (6.71 kB). View file
 
backend/models/__pycache__/model_b_dataset_a.cpython-313.pyc ADDED
Binary file (6.56 kB). View file
 
backend/models/__pycache__/model_b_dataset_b.cpython-313.pyc ADDED
Binary file (6.92 kB). View file
 
backend/models/__pycache__/model_config.cpython-313.pyc ADDED
Binary file (10.4 kB). View file
 
backend/models/__pycache__/placeholder_model_1.cpython-313.pyc ADDED
Binary file (6.9 kB). View file
 
backend/models/__pycache__/placeholder_model_2.cpython-313.pyc ADDED
Binary file (7 kB). View file
 
backend/models/__pycache__/placeholder_model_3.cpython-313.pyc ADDED
Binary file (6.83 kB). View file
 
backend/models/__pycache__/songjun.cpython-313.pyc ADDED
Binary file (6.89 kB). View file
 
backend/models/__pycache__/xlm_roberta_detector.cpython-313.pyc ADDED
Binary file (6.3 kB). View file
 
backend/models/base_model.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Language Model Abstract Class
3
+
4
+ This module defines the interface that all language detection models must implement.
5
+ """
6
+
7
+ from abc import ABC, abstractmethod
8
+ from typing import Dict, List, Any
9
+
10
+
11
class BaseLanguageModel(ABC):
    """Contract every language detection model in this app implements.

    Concrete models (Model A/B × Dataset A/B) subclass this and provide
    prediction, the supported-language list, and descriptive metadata.
    """

    @abstractmethod
    def predict(self, text: str) -> Dict[str, Any]:
        """Classify *text* and return ranked language predictions.

        Args:
            text: Input string to analyze.

        Returns:
            Dict of the shape::

                {
                    'predictions': [
                        {'language_code': str, 'confidence': float},
                        ...
                    ],
                    'text_length': int,
                    'model_version': str,
                    'model_type': str
                }
        """

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """Return the ISO 639-1 codes this model can recognize."""

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """Return descriptive metadata about this model.

        Returns:
            Dict with string values for the keys: name, description,
            accuracy, model_size, languages_supported, training_details,
            use_cases, strengths, limitations.
        """
backend/models/model_a_dataset_a.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model A Dataset A - XLM-RoBERTa Language Detection
3
+
4
+ This module implements the XLM-RoBERTa based language detection model
5
+ fine-tuned on Dataset A (standard multilingual language detection dataset).
6
+
7
+ Model Architecture: XLM-RoBERTa (Model A)
8
+ Training Dataset: Dataset A (standard multilingual)
9
+ Performance: 97.9% accuracy across 60+ languages
10
+ """
11
+
12
+ import logging
13
+ from typing import Dict, List, Any
14
+
15
+ from .base_model import BaseLanguageModel
16
+ from .model_config import get_model_config, get_supported_languages, get_language_name
17
+
18
+ try:
19
+ from transformers import pipeline
20
+ HF_AVAILABLE = True
21
+ except ImportError:
22
+ HF_AVAILABLE = False
23
+ logging.warning("Transformers library not available. Please install with: pip install transformers torch")
24
+
25
+
26
class ModelADatasetA(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset A.

    This model represents the XLM-RoBERTa architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 97.9% accuracy with
    robust cross-lingual performance across 60+ languages.

    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset A (standard multilingual)
    Base Model: xlm-roberta-base
    Accuracy: 97.9%
    Parameters: 278M
    """

    def __init__(self):
        """Initialize the Model A Dataset A language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-a-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset A. "
                "Install with: pip install transformers torch"
            )

        # BUGFIX: the device was hard-coded to GPU 0 (device=0), which makes
        # pipeline construction raise on CPU-only hosts (e.g. CPU Spaces).
        # Use GPU 0 only when CUDA is actually available, else CPU (-1).
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch importable here -> run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset A: {str(e)}")

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset A (XLM-RoBERTa).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with predictions (sorted best-first), text metadata, and
            model identification fields.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # Handle the format returned by the pipeline
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    # Nested list format: [[{'label': 'en', 'score': 0.99}, ...]]
                    prediction_list = results[0]
                else:
                    # Direct list format: [{'label': 'en', 'score': 0.99}, ...]
                    prediction_list = results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model A Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}")

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset A.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset A.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        model_info = {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }

        return model_info
backend/models/model_a_dataset_b.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model A Dataset B - XLM-RoBERTa Language Detection
3
+
4
+ This module implements the XLM-RoBERTa based language detection model
5
+ fine-tuned on Dataset B (enhanced/specialized language detection dataset).
6
+
7
+ Model Architecture: XLM-RoBERTa (Model A)
8
+ Training Dataset: Dataset B (enhanced/specialized)
9
+ Performance: 99.72% accuracy across 100+ languages
10
+ """
11
+
12
+ import logging
13
+ from typing import Dict, List, Any
14
+
15
+ from .base_model import BaseLanguageModel
16
+ from .model_config import get_model_config, get_supported_languages, get_language_name
17
+
18
+ try:
19
+ from transformers import pipeline
20
+ HF_AVAILABLE = True
21
+ except ImportError:
22
+ HF_AVAILABLE = False
23
+ logging.warning("Transformers library not available. Please install with: pip install transformers torch")
24
+
25
+
26
class ModelADatasetB(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset B.

    This model represents the XLM-RoBERTa architecture fine-tuned on an enhanced
    language detection dataset, achieving exceptional 99.72% accuracy with
    state-of-the-art performance across 100+ languages.

    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset B (enhanced/specialized)
    Base Model: xlm-roberta-base
    Accuracy: 99.72%
    Parameters: 278M
    Training Loss: 0.0176
    """

    def __init__(self):
        """Initialize the Model A Dataset B language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-a-dataset-b"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset B. "
                "Install with: pip install transformers torch"
            )

        # BUGFIX: the device was hard-coded to GPU 0 (device=0), which makes
        # pipeline construction raise on CPU-only hosts (e.g. CPU Spaces).
        # Use GPU 0 only when CUDA is actually available, else CPU (-1).
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch importable here -> run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset B: {str(e)}")

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset B (XLM-RoBERTa enhanced).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with predictions (sorted best-first), text metadata, and
            model identification fields.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # Handle the format returned by the pipeline
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    # Nested list format: [[{'label': 'en', 'score': 0.99}, ...]]
                    prediction_list = results[0]
                else:
                    # Direct list format: [{'label': 'en', 'score': 0.99}, ...]
                    prediction_list = results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model A Dataset B prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}")

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset B.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset B.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        model_info = {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }

        return model_info
backend/models/model_b_dataset_a.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model B Dataset A - BERT Language Detection
3
+
4
+ This module implements the BERT based language detection model
5
+ fine-tuned on Dataset A (standard multilingual language detection dataset).
6
+
7
+ Model Architecture: BERT (Model B)
8
+ Training Dataset: Dataset A (standard multilingual)
9
+ Performance: 96.17% accuracy across 100+ languages
10
+ """
11
+
12
+ import logging
13
+ from typing import Dict, List, Any
14
+
15
+ from .base_model import BaseLanguageModel
16
+ from .model_config import get_model_config, get_supported_languages, get_language_name
17
+
18
+ try:
19
+ from transformers import pipeline
20
+ HF_AVAILABLE = True
21
+ except ImportError:
22
+ HF_AVAILABLE = False
23
+ logging.warning("Transformers library not available. Please install with: pip install transformers torch")
24
+
25
+
26
class ModelBDatasetA(BaseLanguageModel):
    """
    BERT based language detection model (Model B) trained on Dataset A.

    This model represents the BERT architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 96.17% accuracy with
    optimized efficiency and broad language coverage across 100+ languages.

    Architecture: BERT (Model B)
    Dataset: Dataset A (standard multilingual)
    Base Model: bert-base-multilingual-cased
    Accuracy: 96.17%
    Parameters: 178M
    """

    def __init__(self):
        """Initialize the Model B Dataset A language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline fails to load.
        """
        self.model_key = "model-b-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model B Dataset A. "
                "Install with: pip install transformers torch"
            )

        # Pick the inference device at runtime instead of hard-coding GPU 0:
        # the previous `device=0` made pipeline() raise on CPU-only hosts.
        # Fall back to CPU (device=-1) whenever CUDA is unavailable.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model B Dataset A: {str(e)}")

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model B Dataset A (BERT).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with predictions (sorted by confidence, descending),
            the input length, and model identifiers.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline may return either a flat list of score dicts or a
            # list nested one level deep (one inner list per input text).
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    # Nested list format: [[{'label': 'en', 'score': 0.99}, ...]]
                    prediction_list = results[0]
                else:
                    # Direct list format: [{'label': 'en', 'score': 0.99}, ...]
                    prediction_list = results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            # NOTE: the ValueError raised above is also funneled through here
            # and re-raised as RuntimeError, matching the sibling models.
            logging.error(f"Model B Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}")

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model B Dataset A.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model B Dataset A.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        model_info = {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }

        return model_info
backend/models/model_b_dataset_b.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model B Dataset B - BERT Language Detection
3
+
4
+ This module implements the BERT based language detection model
5
+ fine-tuned on Dataset B (enhanced/specialized language detection dataset).
6
+
7
+ Model Architecture: BERT (Model B)
8
+ Training Dataset: Dataset B (enhanced/specialized)
9
+ Performance: 99.85% accuracy across 20 carefully selected languages
10
+ """
11
+
12
+ import logging
13
+ from typing import Dict, List, Any
14
+
15
+ from .base_model import BaseLanguageModel
16
+ from .model_config import get_model_config, get_supported_languages, get_language_name
17
+
18
+ try:
19
+ from transformers import pipeline
20
+ HF_AVAILABLE = True
21
+ except ImportError:
22
+ HF_AVAILABLE = False
23
+ logging.warning("Transformers library not available. Please install with: pip install transformers torch")
24
+
25
+
26
class ModelBDatasetB(BaseLanguageModel):
    """
    BERT based language detection model (Model B) trained on Dataset B.

    This model represents the BERT architecture fine-tuned on an enhanced
    language detection dataset, achieving the highest accuracy (99.85%) with
    precision-optimized performance on 20 carefully selected languages.

    Architecture: BERT (Model B)
    Dataset: Dataset B (enhanced/specialized)
    Base Model: bert-base-multilingual-cased
    Accuracy: 99.85%
    Parameters: 178M
    Training Loss: 0.0125
    """

    def __init__(self):
        """Initialize the Model B Dataset B language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline fails to load.
        """
        self.model_key = "model-b-dataset-b"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model B Dataset B. "
                "Install with: pip install transformers torch"
            )

        # Pick the inference device at runtime instead of hard-coding GPU 0:
        # the previous `device=0` made pipeline() raise on CPU-only hosts.
        # Fall back to CPU (device=-1) whenever CUDA is unavailable.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model B Dataset B: {str(e)}")

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model B Dataset B (BERT enhanced).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with predictions (sorted by confidence, descending),
            the input length, and model identifiers.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline may return either a flat list of score dicts or a
            # list nested one level deep (one inner list per input text).
            if isinstance(results, list) and len(results) > 0:
                if isinstance(results[0], list):
                    # Nested list format: [[{'label': 'en', 'score': 0.99}, ...]]
                    prediction_list = results[0]
                else:
                    # Direct list format: [{'label': 'en', 'score': 0.99}, ...]
                    prediction_list = results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            # NOTE: the ValueError raised above is also funneled through here
            # and re-raised as RuntimeError, matching the sibling models.
            logging.error(f"Model B Dataset B prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}")

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model B Dataset B.

        Returns:
            List of ISO 639-1 language codes supported by the model (20 languages)
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model B Dataset B.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        model_info = {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']} carefully selected languages: Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Italian, Japanese, Dutch, Polish, Portuguese, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese",
            "training_details": self.config["training_details"],
            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }

        return model_info
backend/models/model_config.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized Model Configuration
3
+
4
+ This module contains the configuration for all language detection models
5
+ organized by the model architecture and training dataset combinations.
6
+
7
+ Model Architecture:
8
+ - Model A: XLM-RoBERTa based architectures
9
+ - Model B: BERT based architectures
10
+
11
+ Training Datasets:
12
+ - Dataset A: Standard multilingual language detection dataset
13
+ - Dataset B: Enhanced/specialized language detection dataset
14
+ """
15
+
16
+ from typing import Dict, Any, List
17
+
18
+ # Model configurations organized by architecture and dataset
19
# Registry of the four architecture/dataset combinations. Keys are the
# model identifiers used throughout the app; values are display/metadata
# records consumed by the model classes and the UI.
MODEL_CONFIGURATIONS = {
    "model-a-dataset-a": {
        "huggingface_model": "ZheYu03/xlm-r-langdetect-model",
        "display_name": "XLM-RoBERTa Model A Dataset A",
        "short_name": "Model A Dataset A",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset A",
        "accuracy": "97.9%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.",
        "training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset",
        "use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis",
        "strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage",
        "limitations": "Higher computational requirements, moderate inference speed",
        "status": "available"
    },

    "model-b-dataset-a": {
        "huggingface_model": "SongJuNN/xlm-r-langdetect-model",
        "display_name": "BERT Model B Dataset A",
        "short_name": "Model B Dataset A",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset A",
        "accuracy": "96.17%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.",
        "training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency",
        "use_cases": "High-throughput language detection, real-time applications, resource-constrained environments",
        "strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing",
        "limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
        "status": "available"
    },

    "model-a-dataset-b": {
        "huggingface_model": "zues0102/xlmr-papluca-model",
        "display_name": "XLM-RoBERTa Model A Dataset B",
        "short_name": "Model A Dataset B",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset B",
        "accuracy": "99.72%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "training_loss": 0.0176,
        "languages_supported": 20,
        "description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.",
        "training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176",
        "use_cases": "Research applications, high-precision language detection, critical accuracy requirements",
        "strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results",
        "limitations": "Higher computational requirements, limited to 20 languages",
        "status": "available"
    },

    "model-b-dataset-b": {
        "huggingface_model": "zues0102/bert-base-multilingual-cased",
        "display_name": "BERT Model B Dataset B",
        "short_name": "Model B Dataset B",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset B",
        "accuracy": "99.85%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "training_loss": 0.0125,
        "languages_supported": 20,
        "description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.",
        "training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy",
        "use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification",
        "strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture",
        "limitations": "Limited to 20 languages, specialized for specific language set",
        "status": "available"
    }
}

# ISO 639-1 (plus a few regional variants) -> English display name.
LANGUAGE_MAPPINGS = {
    'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese',
    'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali',
    'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech',
    'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha',
    'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish',
    'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish',
    'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic',
    'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew',
    'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian',
    'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian',
    'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh',
    'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish',
    'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao',
    'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian',
    'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay',
    'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål',
    'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
    'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish',
    'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian',
    'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala',
    'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian',
    'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil',
    'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish',
    'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese',
    'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish',
    'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)',
    'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu'
}

# Per-model supported language codes. The two Dataset A models share the
# same broad set; the two Dataset B models share the same focused 20.
MODEL_LANGUAGE_SUPPORT = {
    "model-a-dataset-a": [
        'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
        'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
        'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
        'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
        'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
        'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
        'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
        'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
        'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
        'yo', 'zh', 'zu'
    ],

    "model-b-dataset-a": [
        'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
        'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
        'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
        'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
        'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
        'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
        'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
        'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
        'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
        'yo', 'zh', 'zu'
    ],

    "model-a-dataset-b": [
        'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
        'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
    ],

    "model-b-dataset-b": [
        'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
        'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
    ]
}

def get_model_config(model_key: str) -> Dict[str, Any]:
    """Look up the configuration record for *model_key* ({} if unknown)."""
    return MODEL_CONFIGURATIONS.get(model_key, {})

def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Return a shallow copy of every model configuration record."""
    return dict(MODEL_CONFIGURATIONS)

def get_supported_languages(model_key: str) -> List[str]:
    """Return the language-code list for *model_key* ([] if unknown)."""
    return MODEL_LANGUAGE_SUPPORT.get(model_key, [])

def get_language_name(language_code: str) -> str:
    """Map an ISO 639-1 code to its English name, case-insensitively."""
    normalized = language_code.lower()
    return LANGUAGE_MAPPINGS.get(normalized, f"Unknown ({language_code})")
combined_app.py ADDED
@@ -0,0 +1,414 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from backend.language_detector import LanguageDetector
4
+ from typing import List, Dict, Any
5
+ import time
6
+
7
+ # Import demo samples from demo_page
8
+ from demo_page import DEMO_SAMPLES, initialize_models, detect_with_all_models, create_results_dataframe, run_demo_tests
9
+
10
def create_single_model_interface():
    """Build the single-model Gradio interface (dropdown + text analysis).

    Constructs a ``gr.Column`` containing a model-selection dropdown, a
    text input with examples, and result widgets, wiring click/submit/change
    events to closures over a shared ``LanguageDetector`` instance.

    Returns:
        The ``gr.Column`` holding the assembled interface, for embedding
        in a larger Blocks layout.
    """
    # Initialize the language detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    with gr.Column() as single_interface:
        gr.Markdown("# 🌍 Language Detection App")
        gr.Markdown("Select a model and enter text below to detect its language with confidence scores.")

        # Model Selection Section with visual styling
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.1), transparent); border-radius: 8px 8px 0 0;'>🤖 Model Selection</div>"
            )

            # Get available models; unavailable ones stay selectable but are
            # labeled "(Coming Soon)" and rejected later via NotImplementedError.
            available_models = detector.get_available_models()
            model_choices = []
            model_info_map = {}

            for key, info in available_models.items():
                if info["status"] == "available":
                    model_choices.append((info["display_name"], key))
                else:
                    model_choices.append((f"{info['display_name']} (Coming Soon)", key))
                model_info_map[key] = info

            model_selector = gr.Dropdown(
                choices=model_choices,
                value="model-a-dataset-a",  # Default to Model A Dataset A
                label="Choose Language Detection Model",
                interactive=True
            )

            # Model Information Display
            # NOTE(review): _format_model_info is defined elsewhere in this
            # file; it renders the detector's model-info dict as Markdown.
            model_info_display = gr.Markdown(
                value=_format_model_info(detector.get_current_model_info()),
                label="Model Information"
            )

        # Add visual separator
        gr.Markdown(
            "<div style='margin: 24px 0; border-top: 3px solid rgba(99, 102, 241, 0.2); background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.05), transparent); height: 2px;'></div>"
        )

        # Analysis Section
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.1), transparent); border-radius: 8px 8px 0 0;'>🔍 Language Analysis</div>"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Input section
                    text_input = gr.Textbox(
                        label="Text to Analyze",
                        placeholder="Enter text here to detect its language...",
                        lines=5,
                        max_lines=10
                    )

                    detect_btn = gr.Button("🔍 Detect Language", variant="primary", size="lg")

                    # Example texts (one-element lists: one input component)
                    gr.Examples(
                        examples=[
                            ["Hello, how are you today?"],
                            ["Bonjour, comment allez-vous?"],
                            ["Hola, ¿cómo estás?"],
                            ["Guten Tag, wie geht es Ihnen?"],
                            ["こんにちは、元気ですか?"],
                            ["Привет, как дела?"],
                            ["Ciao, come stai?"],
                            ["Olá, como você está?"],
                            ["你好,你好吗?"],
                            ["안녕하세요, 어떻게 지내세요?"]
                        ],
                        inputs=text_input,
                        label="Try these examples:"
                    )

                with gr.Column(scale=2):
                    # Output section
                    with gr.Group():
                        gr.Markdown(
                            "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 12px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.1), transparent); border-radius: 8px 8px 0 0;'>📊 Detection Results</div>"
                        )

                        detected_language = gr.Textbox(
                            label="Detected Language",
                            interactive=False
                        )

                        confidence_score = gr.Number(
                            label="Confidence Score",
                            interactive=False,
                            precision=4
                        )

                        language_code = gr.Textbox(
                            label="Language Code (ISO 639-1)",
                            interactive=False
                        )

                        # Top predictions table
                        top_predictions = gr.Dataframe(
                            headers=["Language", "Code", "Confidence"],
                            label="Top 5 Predictions",
                            interactive=False,
                            wrap=True
                        )

        # Status/Info section (hidden until the first analysis completes)
        with gr.Row():
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False
            )

        # Event handlers
        def detect_language_wrapper(text, selected_model):
            # Returns a 5-tuple matching the outputs wired below:
            # (detected_language, confidence_score, language_code,
            #  top_predictions rows, status_text update).
            if not text.strip():
                return (
                    "No text provided",
                    0.0,
                    "",
                    [],
                    gr.update(value="Please enter some text to analyze.", visible=True)
                )

            try:
                # Switch model if needed (lazy: only when selection changed)
                if detector.current_model_key != selected_model:
                    try:
                        detector.switch_model(selected_model)
                    except NotImplementedError:
                        # "(Coming Soon)" entries raise this from switch_model
                        return (
                            "Model unavailable",
                            0.0,
                            "",
                            [],
                            gr.update(value="This model is not yet implemented. Please select an available model.", visible=True)
                        )
                    except Exception as e:
                        return (
                            "Model error",
                            0.0,
                            "",
                            [],
                            gr.update(value=f"Error loading model: {str(e)}", visible=True)
                        )

                result = detector.detect_language(text)

                # Extract main prediction
                main_lang = result['language']
                main_confidence = result['confidence']
                main_code = result['language_code']

                # Format top predictions for table
                predictions_table = [
                    [pred['language'], pred['language_code'], f"{pred['confidence']:.4f}"]
                    for pred in result['top_predictions']
                ]

                model_info = result.get('metadata', {}).get('model_info', {})
                model_name = model_info.get('name', 'Unknown Model')

                return (
                    main_lang,
                    main_confidence,
                    main_code,
                    predictions_table,
                    gr.update(value=f"✅ Analysis Complete\n\nInput Text: {text[:100]}{'...' if len(text) > 100 else ''}\n\nDetected Language: {main_lang} ({main_code})\nConfidence: {main_confidence:.2%}\n\nModel: {model_name}", visible=True)
                )

            except Exception as e:
                # Catch-all so the UI shows the failure instead of erroring out
                return (
                    "Error occurred",
                    0.0,
                    "",
                    [],
                    gr.update(value=f"Error: {str(e)}", visible=True)
                )

        def update_model_info(selected_model):
            """Update model information display when model selection changes."""
            try:
                if detector.current_model_key != selected_model:
                    detector.switch_model(selected_model)
                model_info = detector.get_current_model_info()
                return _format_model_info(model_info)
            except NotImplementedError:
                return "**This model is not yet implemented.** Please select an available model."
            except Exception as e:
                return f"**Error loading model information:** {str(e)}"

        # Connect the button to the detection function
        detect_btn.click(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Also trigger on Enter key in text input
        text_input.submit(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Update model info when selection changes
        model_selector.change(
            fn=update_model_info,
            inputs=[model_selector],
            outputs=[model_info_display]
        )

    return single_interface
230
+
231
+ def create_demo_comparison_interface():
232
+ """Create the demo comparison interface."""
233
+
234
+ # Initialize models
235
+ models = initialize_models()
236
+
237
+ with gr.Column() as demo_interface:
238
+ gr.Markdown("# 🚀 Language Detection Demo - Model Comparison")
239
+ gr.Markdown("Compare all four language detection models simultaneously across various difficulty categories.")
240
+
241
+ # Model Status Section
242
+ with gr.Group():
243
+ gr.Markdown("## 🤖 Model Status")
244
+ model_status_text = ""
245
+ for model_key, model_info in models.items():
246
+ status_icon = "✅" if model_info["status"] == "Ready" else "❌"
247
+ model_status_text += f"{status_icon} **{model_info['name']}**: {model_info['status']}\n\n"
248
+ gr.Markdown(model_status_text)
249
+
250
+ # Category Selection Section
251
+ with gr.Group():
252
+ gr.Markdown("## 📊 Test Categories")
253
+ gr.Markdown("Select categories to test different aspects of language detection difficulty:")
254
+
255
+ category_checkboxes = gr.CheckboxGroup(
256
+ choices=list(DEMO_SAMPLES.keys()),
257
+ label="Select Test Categories",
258
+ value=["Easy/Obvious", "Short Text"], # Default selection
259
+ interactive=True
260
+ )
261
+
262
+ # Custom Text Input Section
263
+ with gr.Group():
264
+ gr.Markdown("## ✏️ Custom Text Input")
265
+ gr.Markdown("Enter your own texts to test (one per line):")
266
+
267
+ custom_text_input = gr.Textbox(
268
+ label="Custom Texts",
269
+ placeholder="Enter custom texts here, one per line...\nExample:\nHello world\nBonjour le monde\n你好世界",
270
+ lines=5,
271
+ max_lines=10
272
+ )
273
+
274
+ # Control Buttons
275
+ with gr.Row():
276
+ run_demo_btn = gr.Button("🔍 Run Demo Tests", variant="primary", size="lg")
277
+ clear_btn = gr.Button("🗑️ Clear Results", variant="secondary")
278
+
279
+ # Sample Preview Section (moved up, condensed)
280
+ with gr.Group():
281
+ gr.Markdown("## 📚 Category Explanations")
282
+ gr.Markdown("Understanding what each test category evaluates:")
283
+
284
+ category_explanations = """
285
+ **Easy/Obvious:** Clear, unambiguous sentences in their native language. Tests basic language detection capability.
286
+
287
+ **Short Text:** Single words or very short phrases. Tests model performance with minimal context.
288
+
289
+ **False Friends:** Words that look similar across languages but have different meanings. Tests ability to distinguish between closely related languages.
290
+
291
+ **Mixed Scripts:** Text containing multiple languages, numbers, symbols, or scripts. Tests handling of multilingual content.
292
+
293
+ **Proper Nouns:** Names of people, places, or entities that exist across multiple languages. Tests context-dependent detection.
294
+
295
+ **Common Words:** International words with similar spelling across languages (hotel, taxi, etc.). Tests disambiguation of universal terms.
296
+
297
+ **Technical Terms:** Specialized vocabulary that may be borrowed or translated across languages. Tests domain-specific detection.
298
+
299
+ **Code-switching:** Text that switches between languages mid-sentence. Tests handling of bilingual communication patterns.
300
+
301
+ **Transliterated Text:** Non-Latin scripts written in Latin characters. Tests recognition of transliteration vs. native language.
302
+
303
+ **Ambiguous Script:** Words that could belong to multiple languages with identical spelling. Tests the model's decision-making under uncertainty.
304
+ """
305
+
306
+ gr.Markdown(category_explanations)
307
+
308
+ # Results Section (moved to bottom)
309
+ with gr.Group():
310
+ gr.Markdown("## 📈 Results")
311
+
312
+ summary_output = gr.Textbox(
313
+ label="Summary",
314
+ interactive=False,
315
+ visible=False
316
+ )
317
+
318
+ results_dataframe = gr.Dataframe(
319
+ label="Model Comparison Results (A-A: Model A Dataset A, B-A: Model B Dataset A, A-B: Model A Dataset B, B-B: Model B Dataset B)",
320
+ wrap=True,
321
+ interactive=False,
322
+ visible=False
323
+ )
324
+
325
+ # Event Handlers
326
+ def run_tests(selected_cats, custom_texts):
327
+ summary, df = run_demo_tests(selected_cats, custom_texts, models)
328
+
329
+ if df is not None:
330
+ return (
331
+ gr.update(value=summary, visible=True),
332
+ gr.update(value=df, visible=True)
333
+ )
334
+ else:
335
+ return (
336
+ gr.update(value=summary, visible=True),
337
+ gr.update(visible=False)
338
+ )
339
+
340
+ def clear_results():
341
+ return (
342
+ gr.update(value="", visible=False),
343
+ gr.update(value=None, visible=False)
344
+ )
345
+
346
+ # Connect event handlers
347
+ run_demo_btn.click(
348
+ fn=run_tests,
349
+ inputs=[category_checkboxes, custom_text_input],
350
+ outputs=[summary_output, results_dataframe]
351
+ )
352
+
353
+ clear_btn.click(
354
+ fn=clear_results,
355
+ outputs=[summary_output, results_dataframe]
356
+ )
357
+
358
+ return demo_interface
359
+
360
+ def _format_model_info(model_info):
361
+ """Format model information for display."""
362
+ if not model_info:
363
+ return "No model information available."
364
+
365
+ formatted_info = f"""
366
+ **{model_info.get('name', 'Unknown Model')}**
367
+
368
+ {model_info.get('description', 'No description available.')}
369
+
370
+ **📊 Performance:**
371
+ - Accuracy: {model_info.get('accuracy', 'N/A')}
372
+ - Model Size: {model_info.get('model_size', 'N/A')}
373
+
374
+ **🏗️ Architecture:**
375
+ - Model Architecture: {model_info.get('architecture', 'N/A')}
376
+ - Base Model: {model_info.get('base_model', 'N/A')}
377
+ - Training Dataset: {model_info.get('dataset', 'N/A')}
378
+
379
+ **🌐 Languages:** {model_info.get('languages_supported', 'N/A')}
380
+
381
+ **⚙️ Training Details:** {model_info.get('training_details', 'N/A')}
382
+
383
+ **💡 Use Cases:** {model_info.get('use_cases', 'N/A')}
384
+
385
+ **✅ Strengths:** {model_info.get('strengths', 'N/A')}
386
+
387
+ **⚠️ Limitations:** {model_info.get('limitations', 'N/A')}
388
+ """
389
+ return formatted_info
390
+
391
def main():
    """Create the main application with tabbed interface.

    Returns the top-level gr.Blocks app that combines the single-model
    detection page and the four-way model comparison demo as two tabs.
    """

    with gr.Blocks(title="Language Detection App Suite", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🌍 Language Detection App Suite")
        gr.Markdown("Choose between single model testing or comprehensive model comparison.")

        with gr.Tabs():
            with gr.TabItem("🔍 Single Model Detection"):
                # One-model-at-a-time UI; builder defined elsewhere in this file.
                single_model_interface = create_single_model_interface()

            with gr.TabItem("🚀 Model Comparison Demo"):
                # Side-by-side comparison of all four model/dataset variants.
                demo_comparison_interface = create_demo_comparison_interface()

    return app
406
+
407
if __name__ == "__main__":
    # Development entry point: build the tabbed suite and serve it locally.
    # For configurable host/port/model, use run.py instead.
    app = main()
    app.launch(
        server_name="localhost",  # loopback only
        server_port=7860,
        share=False,
        debug=True
    )
demo_page.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from backend.language_detector import LanguageDetector
4
+ from typing import List, Dict, Any
5
+ import time
6
+
7
# Sample text database organized by difficulty categories.
# Each sample is a (text, expected_language_code, description) tuple. The
# expected code is an ISO-639-1 code, or one of the sentinel labels
# "ambiguous" / "mix" / "transliteration" for cases with no single correct
# answer (create_results_dataframe suppresses the ✅/🚫 verdict for those).
DEMO_SAMPLES = {
    "Easy/Obvious": [
        ("Hello, how are you doing today?", "en", "Clear English sentence"),
        ("Bonjour, comment allez-vous aujourd'hui?", "fr", "Clear French sentence"),
        ("Hola, ¿cómo estás hoy?", "es", "Clear Spanish sentence"),
        ("Guten Tag, wie geht es Ihnen heute?", "de", "Clear German sentence"),
        ("こんにちは、今日はどうですか?", "ja", "Clear Japanese sentence"),
        ("Привет, как дела сегодня?", "ru", "Clear Russian sentence"),
        ("Ciao, come stai oggi?", "it", "Clear Italian sentence"),
        ("Olá, como você está hoje?", "pt", "Clear Portuguese sentence"),
        ("你好,你今天怎么样?", "zh", "Clear Chinese sentence"),
        ("안녕하세요, 오늘 어떻게 지내세요?", "ko", "Clear Korean sentence"),
    ],

    "Short Text": [
        ("Hi", "en", "Very short greeting"),
        ("Oui", "fr", "Single word French"),
        ("Sí", "es", "Single word Spanish"),
        ("Ja", "de", "Single word German"),
        ("はい", "ja", "Single word Japanese"),
        ("Да", "ru", "Single word Russian"),
        ("Sì", "it", "Single word Italian"),
        ("Sim", "pt", "Single word Portuguese"),
        ("是", "zh", "Single character Chinese"),
        ("네", "ko", "Single word Korean"),
    ],

    "False Friends": [
        ("actual", "en", "English word, but means 'current' in Spanish"),
        ("embarazada", "es", "Spanish for pregnant, not embarrassed"),
        ("gift", "en", "English word, but means 'poison' in German"),
        ("preservativo", "es", "Spanish for condom, not preservative"),
        ("sensible", "en", "English word, but means 'sensitive' in Spanish"),
        ("sympathique", "fr", "French for nice, not sympathetic"),
        ("biblioteca", "es", "Spanish for library, not Bible place"),
        ("realizzare", "it", "Italian for to achieve, not realize"),
        ("parents", "en", "English word, but means 'relatives' in French"),
        ("attualmente", "it", "Italian for currently, not actually"),
    ],

    "Mixed Scripts": [
        ("Hello123世界", "mix", "Mixed English, numbers, Chinese"),
        ("Café #1 في العالم", "mix", "Mixed French, numbers, Arabic"),
        ("2023年は良い年です", "ja", "Japanese with numbers"),
        ("Prix: €50,000", "fr", "French with currency and numbers"),
        ("iPhone 15 Pro Max", "en", "Product name with numbers"),
        ("COVID-19 パンデミック", "mix", "Mixed English acronym and Japanese"),
        ("Wi-Fi пароль: 123456", "mix", "Mixed English tech term and Russian"),
        ("GPS координаты", "mix", "Mixed English acronym and Russian"),
        ("URL: https://example.com", "en", "Web address"),
        ("HTML <div>content</div>", "en", "Code with markup"),
    ],

    "Proper Nouns": [
        ("Paris", "ambiguous", "City name - French or English context?"),
        ("Berlin", "ambiguous", "City name - German or English context?"),
        ("Madrid", "ambiguous", "City name - Spanish or English context?"),
        ("Tokyo", "ambiguous", "City name - Japanese or English context?"),
        ("Maria", "ambiguous", "Common name in many languages"),
        ("Alexander", "ambiguous", "Name used in many languages"),
        ("David", "ambiguous", "Biblical name used worldwide"),
        ("Anna", "ambiguous", "Name common across languages"),
        ("Michael", "ambiguous", "International name"),
        ("Sofia", "ambiguous", "Name and city, multiple languages"),
    ],

    "Common Words": [
        ("hotel", "ambiguous", "Same spelling in many languages"),
        ("restaurant", "ambiguous", "French origin, used worldwide"),
        ("taxi", "ambiguous", "International word"),
        ("pizza", "ambiguous", "Italian origin, used worldwide"),
        ("chocolate", "ambiguous", "Similar in many languages"),
        ("hospital", "ambiguous", "Medical term used internationally"),
        ("radio", "ambiguous", "Technology term used worldwide"),
        ("metro", "ambiguous", "Transportation term"),
        ("cafe", "ambiguous", "French origin, international use"),
        ("photo", "ambiguous", "Greek origin, used worldwide"),
    ],

    "Technical Terms": [
        ("algorithm", "en", "Technical English term"),
        ("algorithme", "fr", "Technical French term"),
        ("algoritmo", "es", "Technical Spanish term"),
        ("Algorithmus", "de", "Technical German term"),
        ("アルゴリズム", "ja", "Technical Japanese term"),
        ("алгоритм", "ru", "Technical Russian term"),
        ("algoritmo", "it", "Technical Italian term"),
        ("algoritmo", "pt", "Technical Portuguese term"),
        ("算法", "zh", "Technical Chinese term"),
        ("알고리즘", "ko", "Technical Korean term"),
    ],

    "Code-switching": [
        ("I love sushi とても美味しい", "mix", "English-Japanese code switching"),
        ("C'est très nice aujourd'hui", "mix", "French-English code switching"),
        ("Me gusta this song mucho", "mix", "Spanish-English code switching"),
        ("Das ist very interessant", "mix", "German-English code switching"),
        ("Это really хорошо", "mix", "Russian-English code switching"),
        ("È molto beautiful oggi", "mix", "Italian-English code switching"),
        ("Está muito good today", "mix", "Portuguese-English code switching"),
        ("这个 is very 好", "mix", "Chinese-English code switching"),
        ("이것은 really 좋다", "mix", "Korean-English code switching"),
        ("Merci beaucoup for everything", "mix", "French-English code switching"),
    ],

    "Transliterated Text": [
        ("Konnichiwa", "transliteration", "Japanese こんにちは in Latin script"),
        ("Spasibo", "transliteration", "Russian спасибо in Latin script"),
        ("Arigato", "transliteration", "Japanese ありがとう in Latin script"),
        ("Privyet", "transliteration", "Russian привет in Latin script"),
        ("Sayonara", "transliteration", "Japanese さようなら in Latin script"),
        ("Dosvedanya", "transliteration", "Russian до свидания in Latin script"),
        ("Nihao", "transliteration", "Chinese 你好 in Latin script"),
        ("Annyeonghaseyo", "transliteration", "Korean 안녕하세요 in Latin script"),
        ("Zdravstvuyte", "transliteration", "Russian здравствуйте in Latin script"),
        ("Ohayo gozaimasu", "transliteration", "Japanese おはようございます in Latin script"),
    ],

    "Ambiguous Script": [
        ("casa", "ambiguous", "House in Spanish/Italian/Portuguese"),
        ("rose", "ambiguous", "Flower in English or pink in French"),
        ("more", "ambiguous", "English word or Italian 'deaths'"),
        ("come", "ambiguous", "English verb or Italian 'how/like'"),
        ("no", "ambiguous", "English word or Spanish 'no'"),
        ("si", "ambiguous", "Spanish 'if' or Italian 'yes'"),
        ("la", "ambiguous", "English 'la' or French/Spanish/Italian article"),
        ("me", "ambiguous", "English pronoun or Spanish 'me'"),
        ("le", "ambiguous", "French article or Italian article"),
        ("son", "ambiguous", "English word or Spanish 'they are'"),
    ]
}
139
+
140
def initialize_models():
    """Instantiate the four model/dataset detector variants used by the demo.

    Returns a dict keyed by model key; each value holds the detector
    instance (None when construction failed), a display name, and a
    human-readable status string.
    """
    configs = {
        "model-a-dataset-a": "Model A Dataset A",
        "model-b-dataset-a": "Model B Dataset A",
        "model-a-dataset-b": "Model A Dataset B",
        "model-b-dataset-b": "Model B Dataset B",
    }

    def _load(key, name):
        # A failed load is recorded rather than raised so the UI can
        # still render and show the error in the status section.
        try:
            return {
                "detector": LanguageDetector(model_key=key),
                "name": name,
                "status": "Ready"
            }
        except Exception as exc:
            return {
                "detector": None,
                "name": name,
                "status": f"Error: {str(exc)}"
            }

    return {key: _load(key, name) for key, name in configs.items()}
165
+
166
def detect_with_all_models(text: str, models: Dict) -> Dict[str, Any]:
    """Run *text* through every model and collect per-model outcomes.

    Models that failed to load (detector is None) and models that raise
    during inference both yield an "Error" placeholder entry instead of
    propagating the exception, so one broken model never blocks the rest.
    """
    error_template = {"language": "Error", "confidence": 0.0, "language_code": "error"}

    outcomes = {}
    for key, info in models.items():
        detector = info["detector"]
        if detector is None:
            # Load-time failure: surface the stored status message.
            outcomes[key] = dict(error_template, status=info["status"])
            continue
        try:
            detected = detector.detect_language(text)
        except Exception as exc:
            outcomes[key] = dict(error_template, status=f"Error: {str(exc)}")
        else:
            outcomes[key] = {
                "language": detected["language"],
                "confidence": detected["confidence"],
                "language_code": detected["language_code"],
                "status": "Success",
            }

    return outcomes
196
+
197
def create_results_dataframe(texts: List[str], all_results: List[Dict], expected_langs: List[str] = None, categories: List[str] = None) -> pd.DataFrame:
    """Build the model-comparison table shown in the demo UI.

    Args:
        texts: Input texts, parallel to ``all_results``.
        all_results: One dict per text mapping model key -> detection result
            (as produced by ``detect_with_all_models``).
        expected_langs: Optional expected language code per text. Labels with
            no single right answer ("ambiguous", "mix", "transliteration",
            "unknown", "N/A") suppress the correctness marker.
        categories: Optional category label per text; defaults to "Custom".

    Returns:
        DataFrame with Text/Expected/Category columns plus one column per
        model formatted as "<emoji> <lang_code> (<confidence>)".
    """
    # Compact column headers for the four known model keys; any other key
    # falls back to its first six characters.
    column_names = {
        "model-a-dataset-a": "A-A",
        "model-b-dataset-a": "B-A",
        "model-a-dataset-b": "A-B",
        "model-b-dataset-b": "B-B",
    }
    # Expected labels that have no single correct answer -> no ✅/🚫 verdict.
    non_scorable = {"ambiguous", "mix", "transliteration", "unknown", "N/A"}

    data = []
    for i, (text, results) in enumerate(zip(texts, all_results)):
        row = {
            # Truncate long inputs so the table stays readable.
            "Text": text[:40] + "..." if len(text) > 40 else text,
            "Expected": expected_langs[i] if expected_langs else "N/A",
            "Category": categories[i] if categories else "Custom",
        }

        expected_lang = expected_langs[i] if expected_langs else None

        for model_key, result in results.items():
            col_name = column_names.get(model_key, model_key[:6])
            predicted_lang = result['language_code']

            if expected_lang and expected_lang not in non_scorable:
                # A specific language was expected: mark exact (mis)matches.
                emoji = "✅" if predicted_lang == expected_lang else "🚫"
                row[col_name] = f"{emoji} {predicted_lang} ({result['confidence']:.3f})"
            else:
                # Ambiguous/mixed/unknown cases are shown without a verdict.
                row[col_name] = f"{predicted_lang} ({result['confidence']:.3f})"

        data.append(row)

    return pd.DataFrame(data)
245
+
246
def run_demo_tests(selected_categories: List[str], custom_texts: str, models: Dict):
    """Gather sample and custom texts, run every model, and tabulate results.

    Returns a (summary_message, dataframe) pair; the dataframe is None when
    there was nothing to analyze.
    """
    if not selected_categories and not custom_texts.strip():
        return "Please select at least one category or enter custom text.", None

    all_texts = []
    expected_langs = []
    categories = []

    # Built-in samples for each requested category (unknown names ignored).
    for category in selected_categories:
        for text, expected, _description in DEMO_SAMPLES.get(category, []):
            all_texts.append(text)
            expected_langs.append(expected)
            categories.append(category)

    # User-supplied texts: one per non-blank line, expected language unknown.
    for line in custom_texts.strip().split('\n'):
        line = line.strip()
        if line:
            all_texts.append(line)
            expected_langs.append("unknown")
            categories.append("Custom")

    if not all_texts:
        return "No texts to analyze.", None

    # Detection is run text-by-text across all four models.
    all_results = [detect_with_all_models(text, models) for text in all_texts]

    df = create_results_dataframe(all_texts, all_results, expected_langs, categories)
    summary = f"Analyzed {len(all_texts)} texts across {len(set(categories))} categories."
    return summary, df
286
+
287
def create_demo_interface():
    """Create the standalone demo page: a Gradio Blocks app that compares
    all four detector variants over category samples and custom texts.

    Returns the gr.Blocks instance (launched by the __main__ guard below).
    """

    # Initialize all four detectors up front so their load status can be
    # displayed immediately in the status section.
    models = initialize_models()

    with gr.Blocks(title="Language Detection Demo - Model Comparison", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 Language Detection Demo - Model Comparison")
        gr.Markdown("Compare all four language detection models simultaneously across various difficulty categories.")

        # Model Status Section: one ✅/❌ line per model, rendered once at build time.
        with gr.Group():
            gr.Markdown("## 🤖 Model Status")
            model_status_text = ""
            for model_key, model_info in models.items():
                status_icon = "✅" if model_info["status"] == "Ready" else "❌"
                model_status_text += f"{status_icon} **{model_info['name']}**: {model_info['status']}\n\n"
            gr.Markdown(model_status_text)

        # Category Selection Section
        with gr.Group():
            gr.Markdown("## 📊 Test Categories")
            gr.Markdown("Select categories to test different aspects of language detection difficulty:")

            category_checkboxes = gr.CheckboxGroup(
                choices=list(DEMO_SAMPLES.keys()),
                label="Select Test Categories",
                value=["Easy/Obvious", "Short Text"],  # Default selection
                interactive=True
            )

        # Custom Text Input Section
        with gr.Group():
            gr.Markdown("## ✏️ Custom Text Input")
            gr.Markdown("Enter your own texts to test (one per line):")

            custom_text_input = gr.Textbox(
                label="Custom Texts",
                placeholder="Enter custom texts here, one per line...\nExample:\nHello world\nBonjour le monde\n你好世界",
                lines=5,
                max_lines=10
            )

        # Control Buttons
        with gr.Row():
            run_demo_btn = gr.Button("🔍 Run Demo Tests", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear Results", variant="secondary")

        # Sample Preview Section (moved up, condensed)
        with gr.Group():
            gr.Markdown("## 📚 Category Explanations")
            gr.Markdown("Understanding what each test category evaluates:")

            category_explanations = """
            **Easy/Obvious:** Clear, unambiguous sentences in their native language. Tests basic language detection capability.

            **Short Text:** Single words or very short phrases. Tests model performance with minimal context.

            **False Friends:** Words that look similar across languages but have different meanings. Tests ability to distinguish between closely related languages.

            **Mixed Scripts:** Text containing multiple languages, numbers, symbols, or scripts. Tests handling of multilingual content.

            **Proper Nouns:** Names of people, places, or entities that exist across multiple languages. Tests context-dependent detection.

            **Common Words:** International words with similar spelling across languages (hotel, taxi, etc.). Tests disambiguation of universal terms.

            **Technical Terms:** Specialized vocabulary that may be borrowed or translated across languages. Tests domain-specific detection.

            **Code-switching:** Text that switches between languages mid-sentence. Tests handling of bilingual communication patterns.

            **Transliterated Text:** Non-Latin scripts written in Latin characters. Tests recognition of transliteration vs. native language.

            **Ambiguous Script:** Words that could belong to multiple languages with identical spelling. Tests the model's decision-making under uncertainty.
            """

            gr.Markdown(category_explanations)

        # Results Section (moved to bottom); both outputs start hidden and
        # are revealed by the run handler below.
        with gr.Group():
            gr.Markdown("## 📈 Results")

            summary_output = gr.Textbox(
                label="Summary",
                interactive=False,
                visible=False
            )

            results_dataframe = gr.Dataframe(
                label="Model Comparison Results (A-A: Model A Dataset A, B-A: Model B Dataset A, A-B: Model A Dataset B, B-B: Model B Dataset B)",
                wrap=True,
                interactive=False,
                visible=False
            )

        # Event Handlers
        def run_tests(selected_cats, custom_texts):
            """Run the selected tests; show the table only when results exist."""
            summary, df = run_demo_tests(selected_cats, custom_texts, models)

            if df is not None:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(value=df, visible=True)
                )
            else:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(visible=False)
                )

        def clear_results():
            """Hide and empty both result widgets."""
            return (
                gr.update(value="", visible=False),
                gr.update(value=None, visible=False)
            )

        # Connect event handlers
        run_demo_btn.click(
            fn=run_tests,
            inputs=[category_checkboxes, custom_text_input],
            outputs=[summary_output, results_dataframe]
        )

        clear_btn.click(
            fn=clear_results,
            outputs=[summary_output, results_dataframe]
        )

    return demo
415
+
416
if __name__ == "__main__":
    # Launch the comparison demo as a standalone app (development entry point).
    demo = create_demo_interface()
    demo.launch(
        server_name="localhost",  # loopback only
        server_port=7860,
        share=False,
        debug=True
    )
requirements.txt ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ annotated-types==0.7.0
3
+ anyio==4.9.0
4
+ audioop-lts==0.2.1
5
+ certifi==2025.4.26
6
+ charset-normalizer==3.4.2
7
+ click==8.1.8
8
+ fastapi==0.115.12
9
+ ffmpy==0.5.0
10
+ filelock==3.18.0
11
+ fsspec==2025.5.1
12
+ gradio==5.31.0
13
+ gradio_client==1.10.1
14
+ groovy==0.1.2
15
+ h11==0.16.0
16
+ hf-xet==1.1.2
17
+ httpcore==1.0.9
18
+ httpx==0.28.1
19
+ huggingface-hub==0.32.0
20
+ idna==3.10
21
+ Jinja2==3.1.6
22
+ markdown-it-py==3.0.0
23
+ MarkupSafe==3.0.2
24
+ mdurl==0.1.2
25
+ mpmath==1.3.0
26
+ networkx==3.4.2
27
+ numpy==2.2.6
28
+ orjson==3.10.18
29
+ packaging==25.0
30
+ pandas==2.2.3
31
+ pillow==11.2.1
32
+ pydantic==2.11.5
33
+ pydantic_core==2.33.2
34
+ pydub==0.25.1
35
+ Pygments==2.19.1
36
+ python-dateutil==2.9.0.post0
37
+ python-multipart==0.0.20
38
+ pytz==2025.2
39
+ PyYAML==6.0.2
40
+ regex==2024.11.6
41
+ requests==2.32.3
42
+ rich==14.0.0
43
+ ruff==0.11.11
44
+ safehttpx==0.1.6
45
+ safetensors==0.5.3
46
+ semantic-version==2.10.0
47
+ setuptools==80.8.0
48
+ shellingham==1.5.4
49
+ six==1.17.0
50
+ sniffio==1.3.1
51
+ starlette==0.46.2
52
+ sympy==1.14.0
53
+ tokenizers==0.21.1
54
+ tomlkit==0.13.2
55
+ torch==2.7.0
56
+ tqdm==4.67.1
57
+ transformers==4.52.3
58
+ typer==0.15.4
59
+ typing-inspection==0.4.1
60
+ typing_extensions==4.13.2
61
+ tzdata==2025.2
62
+ urllib3==2.4.0
63
+ uvicorn==0.34.2
64
+ websockets==15.0.1
run.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Simple startup script for the Language Detection App
4
+
5
+ This script provides an easy way to run the app with different configurations.
6
+ """
7
+
8
+ import sys
9
+ import os
10
+ import argparse
11
+ from pathlib import Path
12
+
13
def check_dependencies():
    """Report whether the app's core dependency (Gradio) is importable.

    Prints a human-readable status line and returns True when Gradio can
    be imported, False otherwise.
    """
    try:
        import gradio  # noqa: F401 -- imported only to probe availability
    except ImportError:
        print("❌ Gradio not found. Install with: pip install -r requirements.txt")
        return False

    print("✅ Gradio is available")
    return True
23
+
24
def run_tests():
    """Run the project test suite (test_app.py) in a child process.

    Uses sys.executable so the tests run under the same interpreter as
    this script (os.system("python ...") could pick up a different
    interpreter from PATH, and went through a shell unnecessarily).
    The child's exit status is intentionally not checked, matching the
    previous fire-and-forget behavior.
    """
    import subprocess  # local import: only needed for this helper

    print("🧪 Running tests...")
    subprocess.run([sys.executable, "test_app.py"], check=False)
28
+
29
def run_app(model_type="placeholder", host="0.0.0.0", port=7860, share=False):
    """Configure the environment and launch the Gradio application.

    Returns a process exit code: 0 on clean shutdown (including Ctrl-C),
    1 when dependencies are missing or the app fails to start.
    """

    if not check_dependencies():
        return 1

    # Expose the chosen configuration to the app via environment variables.
    os.environ.update({
        "MODEL_TYPE": model_type,
        "HOST": host,
        "PORT": str(port),
        "SHARE": str(share).lower(),
    })

    print("🚀 Starting Language Detection App...")
    print(f"📊 Model: {model_type}")
    print(f"🌐 Host: {host}:{port}")
    print(f"🔗 Share: {share}")
    print("-" * 50)

    # Import lazily so dependency problems surface here, not at module load.
    try:
        from app import main
        main().launch(
            server_name=host,
            server_port=port,
            share=share,
            debug=True
        )
    except KeyboardInterrupt:
        print("\n👋 App stopped by user")
    except Exception as e:
        print(f"❌ Error running app: {e}")
        return 1

    return 0
64
+
65
def main():
    """Parse CLI arguments, then either run the test suite or start the app.

    Returns an exit code suitable for sys.exit().
    """
    cli = argparse.ArgumentParser(
        description="Language Detection App Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run.py                    # Run with default settings
  python run.py --test             # Run tests only
  python run.py --model huggingface  # Use Hugging Face model (if available)
  python run.py --port 8080        # Run on port 8080
  python run.py --share            # Create public link
        """
    )

    cli.add_argument("--test", action="store_true",
                     help="Run tests instead of starting the app")
    cli.add_argument("--model", choices=["placeholder", "huggingface", "custom"],
                     default="placeholder",
                     help="Model type to use (default: placeholder)")
    cli.add_argument("--host", default="0.0.0.0",
                     help="Host to bind to (default: 0.0.0.0)")
    cli.add_argument("--port", type=int, default=7860,
                     help="Port to bind to (default: 7860)")
    cli.add_argument("--share", action="store_true",
                     help="Create a public link via Gradio")

    args = cli.parse_args()

    print("🌍 Language Detection App Runner")
    print("=" * 40)

    if args.test:
        run_tests()
        return 0

    # The huggingface backend needs transformers; fall back gracefully.
    if args.model == "huggingface":
        try:
            import transformers  # noqa: F401 -- availability probe only
            print("✅ Transformers available for Hugging Face model")
        except ImportError:
            print("⚠️  Transformers not available. Install with:")
            print("   pip install transformers torch")
            print("   Falling back to placeholder model...")
            args.model = "placeholder"

    return run_app(
        model_type=args.model,
        host=args.host,
        port=args.port,
        share=args.share
    )
138
+
139
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
test_demo_functionality.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for the demo functionality
4
+ """
5
+
6
+ from demo_page import initialize_models, detect_with_all_models, create_results_dataframe, run_demo_tests, DEMO_SAMPLES
7
+
8
def test_model_initialization():
    """Smoke-test detector construction and print each model's load status."""
    print("🔄 Testing model initialization...")
    models = initialize_models()

    print(f"✅ Initialized {len(models)} models:")
    for info in models.values():
        icon = "✅" if info["status"] == "Ready" else "❌"
        print(f"   {icon} {info['name']}: {info['status']}")

    return models
19
+
20
def test_single_detection():
    """Run one English sentence through every model and print the outcomes."""
    print("\n🔄 Testing single text detection...")

    models = initialize_models()
    sample = "Hello, how are you today?"
    results = detect_with_all_models(sample, models)

    print(f"Text: '{sample}'")
    print("Results:")
    for key, outcome in results.items():
        print(f"   {key}: {outcome['language_code']} ({outcome['confidence']:.3f}) - {outcome['status']}")

    return results
35
+
36
def test_category_samples():
    """Run the first sample of every demo category through all models and
    print per-model predictions with a rough pass/fail marker."""
    print("\n🔄 Testing category samples...")

    models = initialize_models()

    for category, samples in DEMO_SAMPLES.items():
        print(f"\n📊 Category: {category}")
        # Test first sample from each category
        text, expected, description = samples[0]
        results = detect_with_all_models(text, models)

        print(f"  Text: '{text}' (Expected: {expected})")
        print(f"  Description: {description}")
        for model_key, result in results.items():
            # Sentinel labels have no single right answer, so any prediction counts as a pass.
            match_icon = "✅" if result['language_code'] == expected or expected in ['ambiguous', 'mix', 'transliteration'] else "❌"
            print(f"    {model_key}: {result['language_code']} ({result['confidence']:.3f}) {match_icon}")
53
+
54
def test_dataframe_creation():
    """Exercise create_results_dataframe end-to-end on three known texts
    and print the resulting table's shape, columns, and first rows."""
    print("\n🔄 Testing DataFrame creation...")

    models = initialize_models()

    # Test with a few samples
    test_texts = [
        "Hello world",
        "Bonjour le monde",
        "Hola mundo"
    ]
    expected_langs = ["en", "fr", "es"]
    categories = ["Custom", "Custom", "Custom"]

    # One result dict per text, each mapping model key -> detection outcome.
    all_results = []
    for text in test_texts:
        results = detect_with_all_models(text, models)
        all_results.append(results)

    df = create_results_dataframe(test_texts, all_results, expected_langs, categories)

    print("DataFrame shape:", df.shape)
    print("Columns:", list(df.columns))
    print("\nFirst few rows:")
    print(df.head())

    return df
82
+
83
def test_demo_workflow():
    """Drive run_demo_tests exactly as the UI would: two built-in
    categories plus multi-line custom input; print the summary and table."""
    print("\n🔄 Testing complete demo workflow...")

    models = initialize_models()

    # Test with selected categories and custom text
    selected_categories = ["Easy/Obvious", "Short Text"]
    custom_texts = "Hello world\nBonjour\n你好"

    summary, df = run_demo_tests(selected_categories, custom_texts, models)

    print(f"Summary: {summary}")
    if df is not None:
        print(f"Results DataFrame shape: {df.shape}")
        print("Sample results:")
        print(df.head())
    else:
        # run_demo_tests returns None for df when there was nothing to analyze.
        print("❌ No DataFrame returned")

    return summary, df
104
+
105
def main():
    """Run all demo-functionality checks in sequence, printing progress.

    Any exception aborts the remaining checks; the traceback is printed
    rather than re-raised so the script always exits normally.
    """
    print("🚀 Starting demo functionality tests...\n")

    try:
        # Test 1: Model initialization
        models = test_model_initialization()

        # Test 2: Single detection
        single_results = test_single_detection()

        # Test 3: Category samples
        test_category_samples()

        # Test 4: DataFrame creation
        df = test_dataframe_creation()

        # Test 5: Complete workflow
        summary, demo_df = test_demo_workflow()

        print("\n✅ All tests completed successfully!")
        print(f"📊 Total categories available: {len(DEMO_SAMPLES)}")
        print(f"📝 Total sample texts: {sum(len(samples) for samples in DEMO_SAMPLES.values())}")

    except Exception as e:
        print(f"\n❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
133
+
134
if __name__ == "__main__":
    # Allow running this check script directly: python test_demo_functionality.py
    main()
test_model_a_dataset_a.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for Model A Dataset A - XLM-RoBERTa + Standard Dataset
4
+
5
+ This script tests the XLM-RoBERTa based language detection model
6
+ trained on the standard multilingual dataset to ensure it works correctly.
7
+ """
8
+
9
import sys
import os

# Add the project root (this script's directory) to the Python path so
# `backend.language_detector` resolves regardless of the current working
# directory. The original wrapped the path in a single-argument
# os.path.join(), which is a no-op, and os.path.dirname(__file__) can be
# an empty string when the script is launched from its own directory;
# abspath() makes the inserted entry unambiguous.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ from backend.language_detector import LanguageDetector
16
+
17
+
18
def test_model_a_dataset_a():
    """Test the Model A Dataset A implementation.

    Runs the XLM-RoBERTa + standard-dataset detector over a dozen sample
    sentences, prints a per-sentence pass/fail report plus overall accuracy,
    then dumps the model's metadata.

    Returns:
        bool: True if the run completed; False if an unexpected exception
        escaped (per-sentence detection errors are only logged).
    """
    print("🧪 Testing Model A Dataset A - XLM-RoBERTa + Standard Dataset")
    print("=" * 75)

    try:
        # Initialize detector with Model A Dataset A
        detector = LanguageDetector(model_key="model-a-dataset-a")
        print("✅ Successfully initialized Model A Dataset A")

        # Test texts in different languages, paired with the expected
        # language code for each sentence.
        test_texts = [
            ("Hello, how are you today?", "en"),
            ("Bonjour, comment allez-vous?", "fr"),
            ("Hola, ¿cómo estás?", "es"),
            ("Guten Tag, wie geht es Ihnen?", "de"),
            ("こんにちは、元気ですか?", "ja"),
            ("Привет, как дела?", "ru"),
            ("Ciao, come stai?", "it"),
            ("Olá, como você está?", "pt"),
            ("你好,你好吗?", "zh"),
            ("안녕하세요, 어떻게 지내세요?", "ko"),
            ("مرحبا، كيف حالك؟", "ar"),
            ("नमस्ते, आप कैसे हैं?", "hi")
        ]

        print("\n🔍 Running language detection tests:")
        print("-" * 75)

        correct_predictions = 0
        total_predictions = len(test_texts)

        for text, expected_lang in test_texts:
            try:
                result = detector.detect_language(text)
                predicted_lang = result['language_code']
                confidence = result['confidence']
                language_name = result['language']

                # Check if prediction is correct (allow some flexibility for Chinese variants)
                is_correct = (predicted_lang == expected_lang or
                              (expected_lang == "zh" and predicted_lang in ["zh-hans", "zh-hant", "zh-cn", "zh-tw"]))
                if is_correct:
                    correct_predictions += 1
                    status = "✅"
                else:
                    status = "❌"

                print(f"{status} Text: {text[:40]}{'...' if len(text) > 40 else ''}")
                print(f"   Expected: {expected_lang} | Predicted: {predicted_lang} ({language_name})")
                print(f"   Confidence: {confidence:.4f}")
                print()

            except Exception as e:
                # A failure on one sentence must not abort the whole run.
                print(f"❌ Error testing '{text[:30]}...': {str(e)}")
                print()

        # Calculate accuracy
        accuracy = (correct_predictions / total_predictions) * 100
        print(f"📊 Test Results: {correct_predictions}/{total_predictions} correct")
        print(f"📈 Accuracy: {accuracy:.1f}%")

        # Test model info
        print("\n📋 Model Information:")
        print("-" * 75)
        model_info = detector.get_current_model_info()
        for key, value in model_info.items():
            print(f"{key.title().replace('_', ' ')}: {value}")

        print("🎉 Model A Dataset A test completed successfully!")

    except Exception as e:
        # Setup or reporting failed: show the traceback and signal failure.
        print(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True
96
+
97
+
98
def test_model_architecture():
    """Test the model architecture information."""
    print("\n🏗️ Testing Model Architecture Information")
    print("=" * 75)

    try:
        reported = LanguageDetector(model_key="model-a-dataset-a").get_current_model_info()

        # Metadata this model variant is expected to report.
        expectations = {
            "architecture": "XLM-RoBERTa",
            "dataset": "Dataset A",
            "accuracy": "97.9%",
            "model_size": "278M parameters"
        }

        print("🔍 Verifying model architecture information:")
        print("-" * 50)

        mismatches = 0
        for field, wanted in expectations.items():
            found = reported.get(field, "Not found")
            if found != wanted:
                mismatches += 1
                print(f"❌ {field}: Expected '{wanted}', got '{found}'")
            else:
                print(f"✅ {field}: {found}")

        if mismatches:
            print("\n⚠️ Some architecture information mismatches found.")
        else:
            print("\n🎉 All architecture information verified successfully!")

        return mismatches == 0

    except Exception as e:
        # Any setup failure (e.g. the model cannot be loaded) counts as a miss.
        print(f"❌ Architecture test failed: {str(e)}")
        return False
137
+
138
+
139
# Script entry point: run both tests and exit non-zero on any failure so
# CI can detect a broken model.
if __name__ == "__main__":
    print("🚀 Starting Model A Dataset A Tests\n")

    # Run tests
    test1_passed = test_model_a_dataset_a()
    test2_passed = test_model_architecture()

    # Final results
    print("\n" + "=" * 75)
    if test1_passed and test2_passed:
        print("🎉 All tests passed! Model A Dataset A is ready to use.")
        print("⚖️ This model offers balanced performance with robust cross-lingual capabilities!")
    else:
        print("❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
test_model_b_dataset_b.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Test script for Model B Dataset B - BERT + Enhanced Dataset
4
+
5
+ This script tests the BERT based language detection model
6
+ trained on the enhanced dataset, achieving the highest accuracy (99.85%).
7
+ """
8
+
9
import sys
import os

# Add the project root (this script's directory) to the Python path so
# `backend.language_detector` resolves regardless of the current working
# directory. The original wrapped the path in a single-argument
# os.path.join(), which is a no-op, and os.path.dirname(__file__) can be
# an empty string when the script is launched from its own directory;
# abspath() makes the inserted entry unambiguous.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ from backend.language_detector import LanguageDetector
16
+
17
+
18
def test_model_b_dataset_b():
    """Test the Model B Dataset B implementation.

    Runs the BERT + enhanced-dataset detector over one sentence for each of
    its 20 supported languages, prints a per-sentence pass/fail report plus
    overall accuracy, then dumps the model's metadata.

    Returns:
        bool: True if the run completed; False if an unexpected exception
        escaped (per-sentence detection errors are only logged).
    """
    print("🧪 Testing Model B Dataset B - BERT + Enhanced Dataset")
    print("=" * 75)

    try:
        # Initialize detector with Model B Dataset B (highest accuracy)
        detector = LanguageDetector(model_key="model-b-dataset-b")
        print("✅ Successfully initialized Model B Dataset B")

        # Test texts in the 20 supported languages
        test_texts = [
            ("Hello, how are you today?", "en"),           # English
            ("Bonjour, comment allez-vous?", "fr"),        # French
            ("Hola, ¿cómo estás?", "es"),                  # Spanish
            ("Guten Tag, wie geht es Ihnen?", "de"),       # German
            ("Ciao, come stai?", "it"),                    # Italian
            ("Olá, como você está?", "pt"),                # Portuguese
            ("Привет, как дела?", "ru"),                   # Russian
            ("こんにちは、元気ですか?", "ja"),              # Japanese
            ("你好,你好吗?", "zh"),                        # Chinese
            ("مرحبا، كيف حالك؟", "ar"),                    # Arabic
            ("नमस्ते, आप कैसे हैं?", "hi"),                 # Hindi
            ("Hallo, hoe gaat het met je?", "nl"),         # Dutch
            ("Γεια σας, πώς είστε;", "el"),                # Greek
            ("Здравейте, как сте?", "bg"),                 # Bulgarian
            ("Witaj, jak się masz?", "pl"),                # Polish
            ("สวัสดี คุณเป็นอย่างไรบ้าง?", "th"),            # Thai
            ("Merhaba, nasılsınız?", "tr"),                # Turkish
            ("آپ کیسے ہیں؟", "ur"),                        # Urdu
            ("Xin chào, bạn khỏe không?", "vi"),           # Vietnamese
            ("Habari, unajehje?", "sw")                    # Swahili
        ]

        print("\n🔍 Running language detection tests on 20 supported languages:")
        print("-" * 75)

        correct_predictions = 0
        total_predictions = len(test_texts)

        for text, expected_lang in test_texts:
            try:
                result = detector.detect_language(text)
                predicted_lang = result['language_code']
                confidence = result['confidence']
                language_name = result['language']

                # Check if prediction is correct
                is_correct = predicted_lang == expected_lang
                if is_correct:
                    correct_predictions += 1
                    status = "✅"
                else:
                    status = "❌"

                print(f"{status} Text: {text[:40]}{'...' if len(text) > 40 else ''}")
                print(f"   Expected: {expected_lang} | Predicted: {predicted_lang} ({language_name})")
                print(f"   Confidence: {confidence:.4f}")
                print()

            except Exception as e:
                # A failure on one sentence must not abort the whole run.
                print(f"❌ Error testing '{text[:30]}...': {str(e)}")
                print()

        # Calculate accuracy
        accuracy = (correct_predictions / total_predictions) * 100
        print(f"📊 Test Results: {correct_predictions}/{total_predictions} correct")
        print(f"📈 Accuracy: {accuracy:.1f}%")

        # Test model info
        print("\n📋 Model Information:")
        print("-" * 75)
        model_info = detector.get_current_model_info()
        for key, value in model_info.items():
            print(f"{key.title().replace('_', ' ')}: {value}")

        print("🎉 Model B Dataset B test completed successfully!")

    except Exception as e:
        # Setup or reporting failed: show the traceback and signal failure.
        print(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True
103
+
104
+
105
def test_all_models_comprehensive():
    """Test and compare all four available model combinations.

    Loads each model/dataset pairing in turn, runs the same four sentences
    through it, and prints per-model results followed by an average-confidence
    summary. A failure in one model is recorded and does not stop the others.

    Returns:
        bool: True if the comparison loop ran to completion, False on an
        unexpected top-level failure.
    """
    print("\n🔄 Comprehensive All-Model Combinations Comparison")
    print("=" * 75)

    # (model_key, display name, architecture + dataset, accuracy claimed in docs)
    models_to_test = [
        ("model-a-dataset-a", "Model A Dataset A", "XLM-RoBERTa + Standard", "97.9%"),
        ("model-b-dataset-a", "Model B Dataset A", "BERT + Standard", "96.17%"),
        ("model-a-dataset-b", "Model A Dataset B", "XLM-RoBERTa + Enhanced", "99.72%"),
        ("model-b-dataset-b", "Model B Dataset B", "BERT + Enhanced", "99.85%")
    ]

    test_texts = [
        "Hello, this is a test in English.",
        "Bonjour, ceci est un test en français.",
        "Hola, esto es una prueba en español.",
        "Guten Tag, das ist ein Test auf Deutsch."
    ]

    print("🧪 Testing with multiple sentences across all model combinations:")
    print("-" * 75)

    try:
        # Display name -> list of per-text result dicts, or an error string
        # when that model could not be tested.
        results_summary = {}

        for model_key, model_name, description, claimed_accuracy in models_to_test:
            print(f"\n🤖 Testing {model_name} ({description}) - Claimed: {claimed_accuracy}")
            print("-" * 60)

            try:
                detector = LanguageDetector(model_key=model_key)
                model_results = []

                for text in test_texts:
                    result = detector.detect_language(text)
                    model_results.append({
                        'text': text[:30] + '...' if len(text) > 30 else text,
                        'language': result['language'],
                        'code': result['language_code'],
                        'confidence': result['confidence']
                    })

                    print(f"  Text: {text[:30]}{'...' if len(text) > 30 else ''}")
                    print(f"  → {result['language']} ({result['language_code']}) - {result['confidence']:.4f}")

                results_summary[model_name] = model_results
                print(f"✅ {model_name} completed successfully")

            except Exception as e:
                # One broken model must not stop the comparison of the others.
                print(f"❌ {model_name}: {str(e)}")
                results_summary[model_name] = f"Error: {str(e)}"

        print(f"\n📊 All Model Combinations Testing Summary:")
        print("-" * 75)
        for model_name, results in results_summary.items():
            if isinstance(results, str):
                # A string value is the error message captured above.
                print(f"❌ {model_name}: {results}")
            else:
                avg_confidence = sum(r['confidence'] for r in results) / len(results)
                print(f"✅ {model_name}: Avg Confidence: {avg_confidence:.4f}")

        print("🎉 Comprehensive model comparison completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Comprehensive test failed: {str(e)}")
        return False
172
+
173
+
174
def test_model_architecture():
    """Test the model architecture information for Model B Dataset B."""
    print("\n🏗️ Testing Model B Dataset B Architecture Information")
    print("=" * 75)

    try:
        reported = LanguageDetector(model_key="model-b-dataset-b").get_current_model_info()

        # Metadata this model variant is expected to report.
        expectations = {
            "architecture": "BERT",
            "dataset": "Dataset B",
            "accuracy": "99.85%",
            "model_size": "178M parameters"
        }

        print("🔍 Verifying model architecture information:")
        print("-" * 50)

        mismatches = 0
        for field, wanted in expectations.items():
            found = reported.get(field, "Not found")
            if found != wanted:
                mismatches += 1
                print(f"❌ {field}: Expected '{wanted}', got '{found}'")
            else:
                print(f"✅ {field}: {found}")

        if mismatches:
            print("\n⚠️ Some architecture information mismatches found.")
        else:
            print("\n🎉 All architecture information verified successfully!")

        return mismatches == 0

    except Exception as e:
        # Any setup failure (e.g. the model cannot be loaded) counts as a miss.
        print(f"❌ Architecture test failed: {str(e)}")
        return False
213
+
214
+
215
# Script entry point: run all three tests and exit non-zero on any failure
# so CI can detect a broken model.
if __name__ == "__main__":
    print("🚀 Starting Model B Dataset B Tests\n")

    # Run tests
    test1_passed = test_model_b_dataset_b()
    test2_passed = test_all_models_comprehensive()
    test3_passed = test_model_architecture()

    # Final results
    print("\n" + "=" * 75)
    if test1_passed and test2_passed and test3_passed:
        print("🎉 All tests passed! Model B Dataset B is ready to use.")
        print("🏆 This model offers the highest accuracy (99.85%) of all available models!")
        print("📝 Note: Optimized for 20 carefully selected languages for maximum precision.")
    else:
        print("❌ Some tests failed. Please check the implementation.")
        sys.exit(1)