yangding14
committed on
Commit
·
72f90b1
1
Parent(s):
f5560c6
first commit
Browse files- app.py +260 -0
- backend/.DS_Store +0 -0
- backend/__init__.py +1 -0
- backend/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/__pycache__/language_detector.cpython-313.pyc +0 -0
- backend/language_detector.py +269 -0
- backend/models/__init__.py +34 -0
- backend/models/__pycache__/__init__.cpython-313.pyc +0 -0
- backend/models/__pycache__/base_model.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_a_dataset_a.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_a_dataset_b.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_b_dataset_a.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_b_dataset_b.cpython-313.pyc +0 -0
- backend/models/__pycache__/model_config.cpython-313.pyc +0 -0
- backend/models/__pycache__/placeholder_model_1.cpython-313.pyc +0 -0
- backend/models/__pycache__/placeholder_model_2.cpython-313.pyc +0 -0
- backend/models/__pycache__/placeholder_model_3.cpython-313.pyc +0 -0
- backend/models/__pycache__/songjun.cpython-313.pyc +0 -0
- backend/models/__pycache__/xlm_roberta_detector.cpython-313.pyc +0 -0
- backend/models/base_model.py +73 -0
- backend/models/model_a_dataset_a.py +147 -0
- backend/models/model_a_dataset_b.py +149 -0
- backend/models/model_b_dataset_a.py +147 -0
- backend/models/model_b_dataset_b.py +149 -0
- backend/models/model_config.py +184 -0
- combined_app.py +414 -0
- demo_page.py +423 -0
- requirements.txt +64 -0
- run.py +140 -0
- test_demo_functionality.py +135 -0
- test_model_a_dataset_a.py +153 -0
- test_model_b_dataset_b.py +231 -0
app.py
ADDED
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from backend.language_detector import LanguageDetector
|
3 |
+
|
4 |
+
def main():
    """Build and return the Gradio Blocks UI for the language detection app.

    Creates a LanguageDetector (default backend: Model A Dataset A), lays out
    the model-selection and analysis sections, and wires the event handlers.

    Returns:
        gr.Blocks: The assembled (not yet launched) Gradio app.
    """
    # Initialize the language detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    # Create Gradio interface
    with gr.Blocks(title="Language Detection App", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🌍 Language Detection App")
        gr.Markdown("Select a model and enter text below to detect its language with confidence scores.")

        # Model Selection Section with visual styling
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.1), transparent); border-radius: 8px 8px 0 0;'>🤖 Model Selection</div>"
            )

            # Get available models; unavailable ones are still listed but
            # labelled "(Coming Soon)" so users can see the full roadmap.
            available_models = detector.get_available_models()
            model_choices = []
            model_info_map = {}

            for key, info in available_models.items():
                if info["status"] == "available":
                    model_choices.append((info["display_name"], key))
                else:
                    model_choices.append((f"{info['display_name']} (Coming Soon)", key))
                model_info_map[key] = info
            # NOTE(review): model_info_map is populated but never read in this
            # function — candidate for removal or future use.

            model_selector = gr.Dropdown(
                choices=model_choices,
                value="model-a-dataset-a",  # Default to Model A Dataset A
                label="Choose Language Detection Model",
                interactive=True
            )

            # Model Information Display (markdown rendered from model metadata)
            model_info_display = gr.Markdown(
                value=_format_model_info(detector.get_current_model_info()),
                label="Model Information"
            )

        # Add visual separator
        gr.Markdown(
            "<div style='margin: 24px 0; border-top: 3px solid rgba(99, 102, 241, 0.2); background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.05), transparent); height: 2px;'></div>"
        )

        # Analysis Section
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.1), transparent); border-radius: 8px 8px 0 0;'>🔍 Language Analysis</div>"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Input section
                    text_input = gr.Textbox(
                        label="Text to Analyze",
                        placeholder="Enter text here to detect its language...",
                        lines=5,
                        max_lines=10
                    )

                    detect_btn = gr.Button("🔍 Detect Language", variant="primary", size="lg")

                    # Example texts covering a spread of scripts and languages
                    gr.Examples(
                        examples=[
                            ["Hello, how are you today?"],
                            ["Bonjour, comment allez-vous?"],
                            ["Hola, ¿cómo estás?"],
                            ["Guten Tag, wie geht es Ihnen?"],
                            ["こんにちは、元気ですか?"],
                            ["Привет, как дела?"],
                            ["Ciao, come stai?"],
                            ["Olá, como você está?"],
                            ["你好,你好吗?"],
                            ["안녕하세요, 어떻게 지내세요?"]
                        ],
                        inputs=text_input,
                        label="Try these examples:"
                    )

                with gr.Column(scale=2):
                    # Output section
                    with gr.Group():
                        gr.Markdown(
                            "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 12px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.1), transparent); border-radius: 8px 8px 0 0;'>📊 Detection Results</div>"
                        )

                        detected_language = gr.Textbox(
                            label="Detected Language",
                            interactive=False
                        )

                        confidence_score = gr.Number(
                            label="Confidence Score",
                            interactive=False,
                            precision=4
                        )

                        language_code = gr.Textbox(
                            label="Language Code (ISO 639-1)",
                            interactive=False
                        )

                        # Top predictions table
                        top_predictions = gr.Dataframe(
                            headers=["Language", "Code", "Confidence"],
                            label="Top 5 Predictions",
                            interactive=False,
                            wrap=True
                        )

        # Status/Info section (hidden until the first analysis runs)
        with gr.Row():
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False
            )

        # Event handlers
        def detect_language_wrapper(text, selected_model):
            """Run detection for the UI.

            Returns a 5-tuple matching the outputs wiring:
            (detected_language, confidence_score, language_code,
             top_predictions rows, status_text update).
            """
            if not text.strip():
                return (
                    "No text provided",
                    0.0,
                    "",
                    [],
                    gr.update(value="Please enter some text to analyze.", visible=True)
                )

            try:
                # Switch model if needed
                if detector.current_model_key != selected_model:
                    try:
                        detector.switch_model(selected_model)
                    except NotImplementedError:
                        # Placeholder models raise NotImplementedError
                        return (
                            "Model unavailable",
                            0.0,
                            "",
                            [],
                            gr.update(value="This model is not yet implemented. Please select an available model.", visible=True)
                        )
                    except Exception as e:
                        return (
                            "Model error",
                            0.0,
                            "",
                            [],
                            gr.update(value=f"Error loading model: {str(e)}", visible=True)
                        )

                result = detector.detect_language(text)

                # Extract main prediction
                main_lang = result['language']
                main_confidence = result['confidence']
                main_code = result['language_code']

                # Format top predictions for table
                predictions_table = [
                    [pred['language'], pred['language_code'], f"{pred['confidence']:.4f}"]
                    for pred in result['top_predictions']
                ]

                model_info = result.get('metadata', {}).get('model_info', {})
                model_name = model_info.get('name', 'Unknown Model')

                return (
                    main_lang,
                    main_confidence,
                    main_code,
                    predictions_table,
                    gr.update(value=f"✅ Analysis Complete\n\nInput Text: {text[:100]}{'...' if len(text) > 100 else ''}\n\nDetected Language: {main_lang} ({main_code})\nConfidence: {main_confidence:.2%}\n\nModel: {model_name}", visible=True)
                )

            except Exception as e:
                return (
                    "Error occurred",
                    0.0,
                    "",
                    [],
                    gr.update(value=f"Error: {str(e)}", visible=True)
                )

        def update_model_info(selected_model):
            """Update model information display when model selection changes."""
            try:
                if detector.current_model_key != selected_model:
                    detector.switch_model(selected_model)
                model_info = detector.get_current_model_info()
                return _format_model_info(model_info)
            except NotImplementedError:
                return "**This model is not yet implemented.** Please select an available model."
            except Exception as e:
                return f"**Error loading model information:** {str(e)}"

        # Connect the button to the detection function
        detect_btn.click(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Also trigger on Enter key in text input
        text_input.submit(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Update model info when selection changes
        model_selector.change(
            fn=update_model_info,
            inputs=[model_selector],
            outputs=[model_info_display]
        )

    return app
|
224 |
+
|
225 |
+
|
226 |
+
def _format_model_info(model_info):
|
227 |
+
"""Format model information for display."""
|
228 |
+
if not model_info:
|
229 |
+
return "No model information available."
|
230 |
+
|
231 |
+
formatted_info = f"""
|
232 |
+
**{model_info.get('name', 'Unknown Model')}**
|
233 |
+
|
234 |
+
{model_info.get('description', 'No description available.')}
|
235 |
+
|
236 |
+
**📊 Performance:**
|
237 |
+
- Accuracy: {model_info.get('accuracy', 'N/A')}
|
238 |
+
- Model Size: {model_info.get('model_size', 'N/A')}
|
239 |
+
|
240 |
+
**🏗️ Architecture:**
|
241 |
+
- Model Architecture: {model_info.get('architecture', 'N/A')}
|
242 |
+
- Base Model: {model_info.get('base_model', 'N/A')}
|
243 |
+
- Training Dataset: {model_info.get('dataset', 'N/A')}
|
244 |
+
|
245 |
+
**🌐 Languages:** {model_info.get('languages_supported', 'N/A')}
|
246 |
+
|
247 |
+
**⚙️ Training Details:** {model_info.get('training_details', 'N/A')}
|
248 |
+
|
249 |
+
**💡 Use Cases:** {model_info.get('use_cases', 'N/A')}
|
250 |
+
|
251 |
+
**✅ Strengths:** {model_info.get('strengths', 'N/A')}
|
252 |
+
|
253 |
+
**⚠️ Limitations:** {model_info.get('limitations', 'N/A')}
|
254 |
+
"""
|
255 |
+
return formatted_info
|
256 |
+
|
257 |
+
|
258 |
+
if __name__ == "__main__":
    # Build the Blocks UI and start the Gradio server with default settings.
    demo = main()
    demo.launch()
|
backend/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
backend/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Backend package for language detection models
|
backend/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (173 Bytes). View file
|
|
backend/__pycache__/language_detector.cpython-313.pyc
ADDED
Binary file (10.1 kB). View file
|
|
backend/language_detector.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Language Detection Backend
|
3 |
+
|
4 |
+
This module provides the main LanguageDetector class and ModelRegistry
|
5 |
+
for managing multiple language detection models organized by architecture and dataset.
|
6 |
+
|
7 |
+
Model Architecture:
|
8 |
+
- Model A: XLM-RoBERTa based architectures
|
9 |
+
- Model B: BERT based architectures
|
10 |
+
|
11 |
+
Training Datasets:
|
12 |
+
- Dataset A: Standard multilingual language detection dataset
|
13 |
+
- Dataset B: Enhanced/specialized language detection dataset
|
14 |
+
"""
|
15 |
+
|
16 |
+
import logging
|
17 |
+
from typing import Dict, List, Any
|
18 |
+
|
19 |
+
from .models import (
|
20 |
+
BaseLanguageModel,
|
21 |
+
ModelADatasetA,
|
22 |
+
ModelBDatasetA,
|
23 |
+
ModelADatasetB,
|
24 |
+
ModelBDatasetB,
|
25 |
+
get_all_model_configs,
|
26 |
+
get_language_name,
|
27 |
+
LANGUAGE_MAPPINGS
|
28 |
+
)
|
29 |
+
|
30 |
+
|
31 |
+
class ModelRegistry:
    """
    Catalog of the language detection models this backend can build.

    Combines the centralized model configurations with their concrete
    implementation classes, keyed by architecture (A: XLM-RoBERTa,
    B: BERT) and training dataset (A: standard, B: enhanced).
    """

    def __init__(self):
        """Populate the registry from the centralized configuration."""
        self.model_configs = get_all_model_configs()

        # Implementation class for each registry key.
        self.model_classes = {
            "model-a-dataset-a": ModelADatasetA,  # XLM-RoBERTa + Dataset A
            "model-b-dataset-a": ModelBDatasetA,  # BERT + Dataset A
            "model-a-dataset-b": ModelADatasetB,  # XLM-RoBERTa + Dataset B
            "model-b-dataset-b": ModelBDatasetB,  # BERT + Dataset B
        }

        # Merge config metadata with the implementation class; configs
        # without a matching class are silently skipped.
        self.models = {
            key: {
                "class": self.model_classes[key],
                "display_name": cfg["display_name"],
                "description": cfg["description"],
                "status": cfg["status"],
            }
            for key, cfg in self.model_configs.items()
            if key in self.model_classes
        }

    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """
        Return a shallow copy of every registered model entry.

        Returns:
            Dict mapping model key to its class/metadata record.
        """
        return self.models.copy()

    def create_model(self, model_key: str) -> BaseLanguageModel:
        """
        Instantiate the model registered under *model_key*.

        Args:
            model_key (str): Registry key of the model to create.

        Returns:
            BaseLanguageModel: A fresh instance of the requested model.

        Raises:
            ValueError: If *model_key* is not registered.
        """
        if model_key not in self.models:
            raise ValueError(
                f"Unknown model: {model_key}. "
                f"Available models: {list(self.models.keys())}"
            )
        return self.models[model_key]["class"]()
|
94 |
+
|
95 |
+
|
96 |
+
class LanguageDetector:
    """
    High-level facade for language detection.

    Holds a ModelRegistry plus one active model instance and exposes a
    single detect_language() entry point with uniform output formatting,
    along with helpers for switching models and listing what is available.
    """

    def __init__(self, model_key: str = "model-a-dataset-a"):
        """
        Create a detector backed by the requested model.

        Args:
            model_key (str): Registry key of the model to load:
                - "model-a-dataset-a": XLM-RoBERTa + standard dataset
                - "model-b-dataset-a": BERT + standard dataset
                - "model-a-dataset-b": XLM-RoBERTa + enhanced dataset
                - "model-b-dataset-b": BERT + enhanced dataset
        """
        self.registry = ModelRegistry()
        self.current_model_key = model_key
        self.model = self.registry.create_model(model_key)

        # Shared ISO-code -> display-name mapping used across the backend.
        self.language_names = LANGUAGE_MAPPINGS

    def switch_model(self, model_key: str):
        """
        Replace the active model with a different registry entry.

        Args:
            model_key (str): Registry key of the new model.

        Raises:
            Exception: Propagated when building the new model fails; the
                previously active model remains in place in that case.
        """
        try:
            self.model = self.registry.create_model(model_key)
            self.current_model_key = model_key
            logging.info(f"Successfully switched to model: {model_key}")
        except Exception as e:
            logging.error(f"Failed to switch to model {model_key}: {e}")
            raise

    def get_current_model_info(self) -> Dict[str, Any]:
        """
        Describe the active model.

        Returns:
            Dict of metadata reported by the model itself.
        """
        return self.model.get_model_info()

    def get_available_models(self) -> Dict[str, Dict[str, Any]]:
        """
        List every model known to the registry.

        Returns:
            Dict keyed by model key with display name, description, status.
        """
        return self.registry.get_available_models()

    def detect_language(self, text: str) -> Dict[str, Any]:
        """
        Identify the language of *text* using the active model.

        Args:
            text (str): Input text to analyze.

        Returns:
            Dict containing:
                - language: main predicted language name
                - language_code: main predicted language code
                - confidence: confidence of the main prediction
                - top_predictions: up to 5 ranked predictions
                - metadata: extra context about the run

        Raises:
            ValueError: If *text* is empty or whitespace-only.
            RuntimeError: If the model yields no predictions.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        model_result = self.model.predict(text.strip())
        predictions = model_result['predictions']

        if not predictions:
            raise RuntimeError("Model returned no predictions")

        # The model ranks its output; the first entry is the winner.
        best = predictions[0]
        best_code = best['language_code']

        # Attach human-readable names via the centralized helper.
        top_predictions = [
            {
                'language': get_language_name(pred['language_code']),
                'language_code': pred['language_code'],
                'confidence': pred['confidence']
            }
            for pred in predictions[:5]
        ]

        metadata = {
            'text_length': model_result.get('text_length', len(text)),
            'model_name': model_result.get('model_version', 'unknown'),
            'model_type': model_result.get('model_type', 'unknown'),
            'current_model_key': self.current_model_key,
            'model_info': self.get_current_model_info()
        }

        return {
            'language': get_language_name(best_code),
            'language_code': best_code,
            'confidence': best['confidence'],
            'top_predictions': top_predictions,
            'metadata': metadata
        }

    def get_supported_languages(self) -> Dict[str, str]:
        """
        Map each supported language code to its display name.

        Returns:
            Dict of ISO 639-1 code -> language name.
        """
        return {
            code: get_language_name(code)
            for code in self.model.get_supported_languages()
        }
|
236 |
+
|
237 |
+
|
238 |
+
# Example usage and testing
|
239 |
+
if __name__ == "__main__":
    # Smoke-test the default detector (Model A Dataset A) on a few samples.
    detector = LanguageDetector()

    samples = [
        "Hello, how are you today?",
        "Bonjour, comment allez-vous?",
        "Hola, ¿cómo estás?",
        "Guten Tag, wie geht es Ihnen?"
    ]

    print("Language Detection Test - Model A Dataset A")
    print("=" * 60)

    for sample in samples:
        try:
            outcome = detector.detect_language(sample)
            print(f"Text: {sample}")
            print(f"Detected: {outcome['language']} ({outcome['language_code']}) - {outcome['confidence']:.3f}")
        except Exception as e:
            print(f"Error detecting language for '{sample}': {e}")
        print("---")

    # List everything the registry knows about, implemented or not.
    print("\nAvailable Models:")
    for key, info in detector.get_available_models().items():
        marker = "✅" if info["status"] == "available" else "🚧"
        print(f"{marker} {info['display_name']} ({key}): {info['description']}")
|
backend/models/__init__.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Language Detection Models Module
|
3 |
+
|
4 |
+
This module provides the base model interface and all available language detection models.
|
5 |
+
Models are organized by architecture (A: XLM-RoBERTa, B: BERT) and training dataset (A: standard, B: enhanced).
|
6 |
+
"""
|
7 |
+
|
8 |
+
from .base_model import BaseLanguageModel
|
9 |
+
from .model_config import (
|
10 |
+
get_model_config,
|
11 |
+
get_all_model_configs,
|
12 |
+
get_supported_languages,
|
13 |
+
get_language_name,
|
14 |
+
LANGUAGE_MAPPINGS
|
15 |
+
)
|
16 |
+
|
17 |
+
# Import all model implementations
|
18 |
+
from .model_a_dataset_a import ModelADatasetA
|
19 |
+
from .model_b_dataset_a import ModelBDatasetA
|
20 |
+
from .model_a_dataset_b import ModelADatasetB
|
21 |
+
from .model_b_dataset_b import ModelBDatasetB
|
22 |
+
|
23 |
+
__all__ = [
|
24 |
+
'BaseLanguageModel',
|
25 |
+
'ModelADatasetA',
|
26 |
+
'ModelBDatasetA',
|
27 |
+
'ModelADatasetB',
|
28 |
+
'ModelBDatasetB',
|
29 |
+
'get_model_config',
|
30 |
+
'get_all_model_configs',
|
31 |
+
'get_supported_languages',
|
32 |
+
'get_language_name',
|
33 |
+
'LANGUAGE_MAPPINGS'
|
34 |
+
]
|
backend/models/__pycache__/__init__.cpython-313.pyc
ADDED
Binary file (984 Bytes). View file
|
|
backend/models/__pycache__/base_model.cpython-313.pyc
ADDED
Binary file (2.39 kB). View file
|
|
backend/models/__pycache__/model_a_dataset_a.cpython-313.pyc
ADDED
Binary file (6.57 kB). View file
|
|
backend/models/__pycache__/model_a_dataset_b.cpython-313.pyc
ADDED
Binary file (6.71 kB). View file
|
|
backend/models/__pycache__/model_b_dataset_a.cpython-313.pyc
ADDED
Binary file (6.56 kB). View file
|
|
backend/models/__pycache__/model_b_dataset_b.cpython-313.pyc
ADDED
Binary file (6.92 kB). View file
|
|
backend/models/__pycache__/model_config.cpython-313.pyc
ADDED
Binary file (10.4 kB). View file
|
|
backend/models/__pycache__/placeholder_model_1.cpython-313.pyc
ADDED
Binary file (6.9 kB). View file
|
|
backend/models/__pycache__/placeholder_model_2.cpython-313.pyc
ADDED
Binary file (7 kB). View file
|
|
backend/models/__pycache__/placeholder_model_3.cpython-313.pyc
ADDED
Binary file (6.83 kB). View file
|
|
backend/models/__pycache__/songjun.cpython-313.pyc
ADDED
Binary file (6.89 kB). View file
|
|
backend/models/__pycache__/xlm_roberta_detector.cpython-313.pyc
ADDED
Binary file (6.3 kB). View file
|
|
backend/models/base_model.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Base Language Model Abstract Class
|
3 |
+
|
4 |
+
This module defines the interface that all language detection models must implement.
|
5 |
+
"""
|
6 |
+
|
7 |
+
from abc import ABC, abstractmethod
|
8 |
+
from typing import Dict, List, Any
|
9 |
+
|
10 |
+
|
11 |
+
class BaseLanguageModel(ABC):
    """
    Common interface for every language detection backend.

    Concrete detectors subclass this and implement predict(),
    get_supported_languages(), and get_model_info().
    """

    @abstractmethod
    def predict(self, text: str) -> Dict[str, Any]:
        """
        Run language identification on *text*.

        Args:
            text (str): Input text to analyze

        Returns:
            Dict shaped as:
            {
                'predictions': [
                    {'language_code': str, 'confidence': float},
                    ...
                ],
                'text_length': int,
                'model_version': str,
                'model_type': str
            }
        """

    @abstractmethod
    def get_supported_languages(self) -> List[str]:
        """
        List the languages this model can identify.

        Returns:
            List of ISO 639-1 language codes
        """

    @abstractmethod
    def get_model_info(self) -> Dict[str, Any]:
        """
        Describe this model for display purposes.

        Returns:
            Dict shaped as:
            {
                'name': str,
                'description': str,
                'accuracy': str,
                'model_size': str,
                'languages_supported': str,
                'training_details': str,
                'use_cases': str,
                'strengths': str,
                'limitations': str
            }
        """
|
backend/models/model_a_dataset_a.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model A Dataset A - XLM-RoBERTa Language Detection
|
3 |
+
|
4 |
+
This module implements the XLM-RoBERTa based language detection model
|
5 |
+
fine-tuned on Dataset A (standard multilingual language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: XLM-RoBERTa (Model A)
|
8 |
+
Training Dataset: Dataset A (standard multilingual)
|
9 |
+
Performance: 97.9% accuracy across 60+ languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
# Optional dependency guard: transformers is only needed when a model is
# actually constructed, so a missing install is downgraded to a warning here
# and re-raised as an ImportError inside ModelADatasetA.__init__.
try:
    from transformers import pipeline
    HF_AVAILABLE = True
except ImportError:
    HF_AVAILABLE = False
    logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelADatasetA(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset A.

    This model represents the XLM-RoBERTa architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 97.9% accuracy with
    robust cross-lingual performance across 100+ languages.

    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset A (standard multilingual)
    Base Model: xlm-roberta-base
    Accuracy: 97.9%
    Parameters: 278M
    """

    def __init__(self):
        """Initialize the Model A Dataset A language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-a-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset A. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset A: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset A (XLM-RoBERTa).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model A Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset A.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset A.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_a_dataset_b.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model A Dataset B - XLM-RoBERTa Language Detection
|
3 |
+
|
4 |
+
This module implements the XLM-RoBERTa based language detection model
|
5 |
+
fine-tuned on Dataset B (enhanced/specialized language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: XLM-RoBERTa (Model A)
|
8 |
+
Training Dataset: Dataset B (enhanced/specialized)
|
9 |
+
Performance: 99.72% accuracy across 100+ languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
try:
|
19 |
+
from transformers import pipeline
|
20 |
+
HF_AVAILABLE = True
|
21 |
+
except ImportError:
|
22 |
+
HF_AVAILABLE = False
|
23 |
+
logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelADatasetB(BaseLanguageModel):
    """
    XLM-RoBERTa based language detection model (Model A) trained on Dataset B.

    This model represents the XLM-RoBERTa architecture fine-tuned on an enhanced
    language detection dataset, achieving exceptional 99.72% accuracy with
    state-of-the-art performance on 20 carefully selected languages.

    Architecture: XLM-RoBERTa (Model A)
    Dataset: Dataset B (enhanced/specialized)
    Base Model: xlm-roberta-base
    Accuracy: 99.72%
    Parameters: 278M
    Training Loss: 0.0176
    """

    def __init__(self):
        """Initialize the Model A Dataset B language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-a-dataset-b"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model A Dataset B. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model A Dataset B: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model A Dataset B (XLM-RoBERTa enhanced).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model A Dataset B prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model A Dataset B.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model A Dataset B.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_b_dataset_a.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model B Dataset A - BERT Language Detection
|
3 |
+
|
4 |
+
This module implements the BERT based language detection model
|
5 |
+
fine-tuned on Dataset A (standard multilingual language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: BERT (Model B)
|
8 |
+
Training Dataset: Dataset A (standard multilingual)
|
9 |
+
Performance: 96.17% accuracy across 100+ languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
try:
|
19 |
+
from transformers import pipeline
|
20 |
+
HF_AVAILABLE = True
|
21 |
+
except ImportError:
|
22 |
+
HF_AVAILABLE = False
|
23 |
+
logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelBDatasetA(BaseLanguageModel):
    """
    BERT based language detection model (Model B) trained on Dataset A.

    This model represents the BERT architecture fine-tuned on a standard
    multilingual language detection dataset, achieving 96.17% accuracy with
    optimized efficiency and broad language coverage across 100+ languages.

    Architecture: BERT (Model B)
    Dataset: Dataset A (standard multilingual)
    Base Model: bert-base-multilingual-cased
    Accuracy: 96.17%
    Parameters: 178M
    """

    def __init__(self):
        """Initialize the Model B Dataset A language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-b-dataset-a"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model B Dataset A. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model B Dataset A: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model B Dataset A (BERT).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model B Dataset A prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model B Dataset A.

        Returns:
            List of ISO 639-1 language codes supported by the model
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model B Dataset A.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']}+ languages",
            "training_details": self.config["training_details"],
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_b_dataset_b.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Model B Dataset B - BERT Language Detection
|
3 |
+
|
4 |
+
This module implements the BERT based language detection model
|
5 |
+
fine-tuned on Dataset B (enhanced/specialized language detection dataset).
|
6 |
+
|
7 |
+
Model Architecture: BERT (Model B)
|
8 |
+
Training Dataset: Dataset B (enhanced/specialized)
|
9 |
+
Performance: 99.85% accuracy across 20 carefully selected languages
|
10 |
+
"""
|
11 |
+
|
12 |
+
import logging
|
13 |
+
from typing import Dict, List, Any
|
14 |
+
|
15 |
+
from .base_model import BaseLanguageModel
|
16 |
+
from .model_config import get_model_config, get_supported_languages, get_language_name
|
17 |
+
|
18 |
+
try:
|
19 |
+
from transformers import pipeline
|
20 |
+
HF_AVAILABLE = True
|
21 |
+
except ImportError:
|
22 |
+
HF_AVAILABLE = False
|
23 |
+
logging.warning("Transformers library not available. Please install with: pip install transformers torch")
|
24 |
+
|
25 |
+
|
26 |
+
class ModelBDatasetB(BaseLanguageModel):
    """
    BERT based language detection model (Model B) trained on Dataset B.

    This model represents the BERT architecture fine-tuned on an enhanced
    language detection dataset, achieving the highest accuracy (99.85%) with
    precision-optimized performance on 20 carefully selected languages.

    Architecture: BERT (Model B)
    Dataset: Dataset B (enhanced/specialized)
    Base Model: bert-base-multilingual-cased
    Accuracy: 99.85%
    Parameters: 178M
    Training Loss: 0.0125
    """

    def __init__(self):
        """Initialize the Model B Dataset B language detector.

        Raises:
            ImportError: If the transformers library is not installed.
            RuntimeError: If the Hugging Face pipeline cannot be created.
        """
        self.model_key = "model-b-dataset-b"
        self.config = get_model_config(self.model_key)
        self.model_name = self.config["huggingface_model"]

        # Check if transformers library is available
        if not HF_AVAILABLE:
            raise ImportError(
                "Transformers library required for Model B Dataset B. "
                "Install with: pip install transformers torch"
            )

        # Use the GPU only when one is actually present.  The previous
        # hard-coded device=0 raised an error on CPU-only hosts.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1  # no torch available: let the pipeline run on CPU

        # Initialize the model pipeline
        try:
            self.classifier = pipeline(
                "text-classification",
                model=self.model_name,
                device=device,
                top_k=None  # Return all scores
            )
            logging.info(f"Successfully loaded {self.config['display_name']} ({self.model_name})")
        except Exception as e:
            logging.error(f"Failed to load {self.config['display_name']}: {e}")
            raise RuntimeError(f"Could not initialize Model B Dataset B: {str(e)}") from e

    def predict(self, text: str) -> Dict[str, Any]:
        """
        Predict language using Model B Dataset B (BERT enhanced).

        Args:
            text (str): Input text to analyze

        Returns:
            Dict with 'predictions' (sorted by confidence, descending),
            'text_length', 'model_version' and 'model_type'.

        Raises:
            ValueError: If the input text is empty or whitespace-only.
            RuntimeError: If the underlying pipeline call fails.
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        try:
            # Run the model prediction
            results = self.classifier(text)

            # The pipeline returns either [[{'label': ..., 'score': ...}, ...]]
            # (nested) or [{'label': ..., 'score': ...}, ...] (flat).
            if isinstance(results, list) and len(results) > 0:
                prediction_list = results[0] if isinstance(results[0], list) else results
            else:
                raise ValueError("Unexpected pipeline output format")

            # Sort predictions by confidence score (descending)
            predictions = [
                {
                    'language_code': result['label'].lower(),
                    'confidence': result['score']
                }
                for result in sorted(prediction_list, key=lambda x: x['score'], reverse=True)
            ]

            return {
                'predictions': predictions,
                'text_length': len(text),
                'model_version': self.model_name,
                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
            }

        except Exception as e:
            logging.error(f"Model B Dataset B prediction failed: {e}")
            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

    def get_supported_languages(self) -> List[str]:
        """
        Get supported languages for Model B Dataset B.

        Returns:
            List of ISO 639-1 language codes supported by the model (20 languages)
        """
        return get_supported_languages(self.model_key)

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get detailed information about Model B Dataset B.

        Returns:
            Dict containing comprehensive model metadata
        """
        # Build comprehensive model info from centralized config
        return {
            "name": self.config["display_name"],
            "description": self.config["description"],
            "accuracy": self.config["accuracy"],
            "model_size": self.config["model_size"],
            "architecture": self.config["architecture"],
            "base_model": self.config["base_model"],
            "dataset": self.config["dataset"],
            "languages_supported": f"{self.config['languages_supported']} carefully selected languages: Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Italian, Japanese, Dutch, Polish, Portuguese, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese",
            "training_details": self.config["training_details"],
            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
            "use_cases": self.config["use_cases"],
            "strengths": self.config["strengths"],
            "limitations": self.config["limitations"]
        }
|
backend/models/model_config.py
ADDED
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Centralized Model Configuration
|
3 |
+
|
4 |
+
This module contains the configuration for all language detection models
|
5 |
+
organized by the model architecture and training dataset combinations.
|
6 |
+
|
7 |
+
Model Architecture:
|
8 |
+
- Model A: XLM-RoBERTa based architectures
|
9 |
+
- Model B: BERT based architectures
|
10 |
+
|
11 |
+
Training Datasets:
|
12 |
+
- Dataset A: Standard multilingual language detection dataset
|
13 |
+
- Dataset B: Enhanced/specialized language detection dataset
|
14 |
+
"""
|
15 |
+
|
16 |
+
from typing import Dict, Any, List
|
17 |
+
|
18 |
+
# Model configurations organized by architecture and dataset.
# Keys follow the pattern "model-<arch>-dataset-<set>" and are the model_key
# values used by the detector classes in this package; each value feeds
# get_model_config() and the classes' get_model_info() methods.
MODEL_CONFIGURATIONS = {
    # XLM-RoBERTa fine-tuned on the standard multilingual dataset.
    "model-a-dataset-a": {
        "huggingface_model": "ZheYu03/xlm-r-langdetect-model",
        "display_name": "XLM-RoBERTa Model A Dataset A",
        "short_name": "Model A Dataset A",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset A",
        "accuracy": "97.9%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "High-performance XLM-RoBERTa based language detection model fine-tuned on standard multilingual dataset. Delivers reliable 97.9% accuracy across 100+ languages with robust cross-lingual capabilities.",
        "training_details": "Fine-tuned XLM-RoBERTa base model with AdamW optimizer, 10 epochs training on comprehensive multilingual language detection dataset",
        "use_cases": "General-purpose language detection, multilingual content processing, cross-lingual text analysis",
        "strengths": "Excellent multilingual performance, robust cross-lingual transfer, proven reliability, comprehensive language coverage",
        "limitations": "Higher computational requirements, moderate inference speed",
        "status": "available"
    },

    # BERT fine-tuned on the standard multilingual dataset.
    # NOTE(review): architecture/base_model say BERT but the repo id contains
    # "xlm-r" — confirm the Hugging Face model actually matches this metadata.
    "model-b-dataset-a": {
        "huggingface_model": "SongJuNN/xlm-r-langdetect-model",
        "display_name": "BERT Model B Dataset A",
        "short_name": "Model B Dataset A",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset A",
        "accuracy": "96.17%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "languages_supported": 100,
        "description": "Efficient BERT-based language detection model trained on standard multilingual dataset. Optimized for speed and broad language coverage with 96.17% accuracy across 100+ languages.",
        "training_details": "BERT multilingual model fine-tuned with AdamW optimizer (lr=2e-05), mixed precision training, optimized for efficiency",
        "use_cases": "High-throughput language detection, real-time applications, resource-constrained environments",
        "strengths": "Fast inference speed, lower memory usage, broad language support, efficient processing",
        "limitations": "Slightly lower accuracy compared to XLM-RoBERTa variants",
        "status": "available"
    },

    # XLM-RoBERTa fine-tuned on the enhanced dataset (20 languages).
    "model-a-dataset-b": {
        "huggingface_model": "zues0102/xlmr-papluca-model",
        "display_name": "XLM-RoBERTa Model A Dataset B",
        "short_name": "Model A Dataset B",
        "architecture": "XLM-RoBERTa",
        "base_model": "xlm-roberta-base",
        "dataset": "Dataset B",
        "accuracy": "99.72%",
        "model_size": "278M parameters",
        "training_epochs": 10,
        "training_loss": 0.0176,  # final training loss reported for this run
        "languages_supported": 20,
        "description": "Ultra high-accuracy XLM-RoBERTa model fine-tuned on enhanced dataset. Achieves exceptional 99.72% accuracy with support for 20 carefully selected high-performance languages and state-of-the-art performance.",
        "training_details": "Advanced fine-tuning of XLM-RoBERTa on enhanced dataset with specialized training procedures, achieving loss of 0.0176",
        "use_cases": "Research applications, high-precision language detection, critical accuracy requirements",
        "strengths": "Exceptional accuracy (99.72%), focused language support, state-of-the-art results",
        "limitations": "Higher computational requirements, limited to 20 languages",
        "status": "available"
    },

    # BERT fine-tuned on the enhanced dataset (20 languages, highest accuracy).
    "model-b-dataset-b": {
        "huggingface_model": "zues0102/bert-base-multilingual-cased",
        "display_name": "BERT Model B Dataset B",
        "short_name": "Model B Dataset B",
        "architecture": "BERT",
        "base_model": "bert-base-multilingual-cased",
        "dataset": "Dataset B",
        "accuracy": "99.85%",
        "model_size": "178M parameters",
        "training_epochs": 10,
        "training_loss": 0.0125,  # final training loss reported for this run
        "languages_supported": 20,
        "description": "State-of-the-art BERT model achieving highest accuracy (99.85%) through specialized training on enhanced dataset. Optimized for 20 carefully selected high-performance languages.",
        "training_details": "Precision-optimized BERT training on enhanced dataset achieving ultra-low loss of 0.0125, specialized for maximum accuracy",
        "use_cases": "Maximum precision applications, research requiring highest accuracy, critical language identification",
        "strengths": "Highest accuracy (99.85%), ultra-low training loss, optimized precision, efficient architecture",
        "limitations": "Limited to 20 languages, specialized for specific language set",
        "status": "available"
    }
}
|
98 |
+
|
99 |
+
# Language mappings - comprehensive set.
# Maps lowercase ISO 639-1 codes (plus a few region/script variants such as
# 'zh-cn'/'zh-hant') to human-readable English names; consumed by
# get_language_name().
LANGUAGE_MAPPINGS = {
    'af': 'Afrikaans', 'am': 'Amharic', 'ar': 'Arabic', 'as': 'Assamese',
    'az': 'Azerbaijani', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bn': 'Bengali',
    'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'cs': 'Czech',
    'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dz': 'Dzongkha',
    'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish',
    'et': 'Estonian', 'eu': 'Basque', 'fa': 'Persian', 'fi': 'Finnish',
    'fr': 'French', 'fy': 'Frisian', 'ga': 'Irish', 'gd': 'Scottish Gaelic',
    'gl': 'Galician', 'gu': 'Gujarati', 'ha': 'Hausa', 'he': 'Hebrew',
    'hi': 'Hindi', 'hr': 'Croatian', 'ht': 'Haitian Creole', 'hu': 'Hungarian',
    'hy': 'Armenian', 'id': 'Indonesian', 'is': 'Icelandic', 'it': 'Italian',
    'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kk': 'Kazakh',
    'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean', 'ku': 'Kurdish',
    'ky': 'Kyrgyz', 'la': 'Latin', 'lb': 'Luxembourgish', 'lo': 'Lao',
    'lt': 'Lithuanian', 'lv': 'Latvian', 'mg': 'Malagasy', 'mk': 'Macedonian',
    'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay',
    'mt': 'Maltese', 'my': 'Myanmar (Burmese)', 'nb': 'Norwegian Bokmål',
    'ne': 'Nepali', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
    'oc': 'Occitan', 'or': 'Odia', 'pa': 'Punjabi', 'pl': 'Polish',
    'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'ro': 'Romanian',
    'ru': 'Russian', 'rw': 'Kinyarwanda', 'se': 'Northern Sami', 'si': 'Sinhala',
    'sk': 'Slovak', 'sl': 'Slovenian', 'so': 'Somali', 'sq': 'Albanian',
    'sr': 'Serbian', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil',
    'te': 'Telugu', 'th': 'Thai', 'tl': 'Filipino', 'tr': 'Turkish',
    'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'vi': 'Vietnamese',
    'vo': 'Volapük', 'wa': 'Walloon', 'xh': 'Xhosa', 'yi': 'Yiddish',
    # Chinese appears both as the bare 'zh' code and as region/script variants.
    'yo': 'Yoruba', 'zh': 'Chinese', 'zh-cn': 'Chinese (Simplified)',
    'zh-tw': 'Chinese (Traditional)', 'zh-hans': 'Chinese (Simplified)',
    'zh-hant': 'Chinese (Traditional)', 'zu': 'Zulu'
}
|
130 |
+
|
131 |
+
# Model-specific language support.
# The two Dataset A models share one language list and the two Dataset B
# models share another; the lists are defined once here instead of being
# duplicated verbatim per model key.

# Languages covered by models trained on Dataset A (standard multilingual).
_DATASET_A_LANGUAGES = [
    'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca',
    'cs', 'cy', 'da', 'de', 'dz', 'el', 'en', 'eo', 'es', 'et', 'eu',
    'fa', 'fi', 'fr', 'fy', 'ga', 'gd', 'gl', 'gu', 'ha', 'he', 'hi',
    'hr', 'ht', 'hu', 'hy', 'id', 'is', 'it', 'ja', 'jv', 'ka', 'kk',
    'km', 'kn', 'ko', 'ku', 'ky', 'la', 'lb', 'lo', 'lt', 'lv', 'mg',
    'mk', 'ml', 'mn', 'mr', 'ms', 'mt', 'my', 'nb', 'ne', 'nl', 'nn',
    'no', 'oc', 'or', 'pa', 'pl', 'ps', 'pt', 'qu', 'ro', 'ru', 'rw',
    'se', 'si', 'sk', 'sl', 'so', 'sq', 'sr', 'sv', 'sw', 'ta', 'te',
    'th', 'tl', 'tr', 'ug', 'uk', 'ur', 'vi', 'vo', 'wa', 'xh', 'yi',
    'yo', 'zh', 'zu'
]

# The 20 high-performance languages targeted by the Dataset B models.
_DATASET_B_LANGUAGES = [
    'ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'it', 'ja',
    'nl', 'pl', 'pt', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh'
]

# Each model key gets its own list copy so callers mutating one entry cannot
# accidentally affect another model (matches the original per-key literals).
MODEL_LANGUAGE_SUPPORT = {
    "model-a-dataset-a": list(_DATASET_A_LANGUAGES),
    "model-b-dataset-a": list(_DATASET_A_LANGUAGES),
    "model-a-dataset-b": list(_DATASET_B_LANGUAGES),
    "model-b-dataset-b": list(_DATASET_B_LANGUAGES)
}
|
169 |
+
|
170 |
+
def get_model_config(model_key: str) -> Dict[str, Any]:
    """Return the configuration dict for *model_key*, or {} if unregistered."""
    try:
        return MODEL_CONFIGURATIONS[model_key]
    except KeyError:
        return {}
|
173 |
+
|
174 |
+
def get_all_model_configs() -> Dict[str, Dict[str, Any]]:
    """Return a shallow copy of every registered model configuration."""
    # dict(...) produces the same shallow copy as .copy(); callers may
    # add/remove keys without touching the module-level registry.
    return dict(MODEL_CONFIGURATIONS)
|
177 |
+
|
178 |
+
def get_supported_languages(model_key: str) -> List[str]:
    """Return the ISO 639-1 codes supported by *model_key* ([] if unknown)."""
    try:
        return MODEL_LANGUAGE_SUPPORT[model_key]
    except KeyError:
        return []
|
181 |
+
|
182 |
+
def get_language_name(language_code: str) -> str:
    """Return the human-readable name for *language_code*.

    Lookup is case-insensitive; unknown codes yield "Unknown (<code>)"
    with the caller's original casing preserved.
    """
    normalized = language_code.lower()
    if normalized in LANGUAGE_MAPPINGS:
        return LANGUAGE_MAPPINGS[normalized]
    return f"Unknown ({language_code})"
|
combined_app.py
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from backend.language_detector import LanguageDetector
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
import time
|
6 |
+
|
7 |
+
# Import demo samples from demo_page
|
8 |
+
from demo_page import DEMO_SAMPLES, initialize_models, detect_with_all_models, create_results_dataframe, run_demo_tests
|
9 |
+
|
10 |
+
def create_single_model_interface():
    """Create the original single model interface.

    Builds a gr.Column containing a model selector, a text input with
    examples, and the detection-result widgets, then wires the button
    click, textbox submit and dropdown change events to one shared
    LanguageDetector instance. Returns the top-level gr.Column so the
    caller can embed it (e.g. inside a tab).
    """
    # Initialize the language detector with default model (Model A Dataset A)
    detector = LanguageDetector()

    with gr.Column() as single_interface:
        gr.Markdown("# 🌍 Language Detection App")
        gr.Markdown("Select a model and enter text below to detect its language with confidence scores.")

        # Model Selection Section with visual styling
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.1), transparent); border-radius: 8px 8px 0 0;'>🤖 Model Selection</div>"
            )

            # Get available models
            available_models = detector.get_available_models()
            model_choices = []
            # Metadata keyed by model id; populated below but not read
            # elsewhere in this function.
            model_info_map = {}

            for key, info in available_models.items():
                if info["status"] == "available":
                    model_choices.append((info["display_name"], key))
                else:
                    # Unimplemented models stay selectable but are labeled.
                    model_choices.append((f"{info['display_name']} (Coming Soon)", key))
                model_info_map[key] = info

            model_selector = gr.Dropdown(
                choices=model_choices,
                value="model-a-dataset-a",  # Default to Model A Dataset A
                label="Choose Language Detection Model",
                interactive=True
            )

            # Model Information Display
            model_info_display = gr.Markdown(
                value=_format_model_info(detector.get_current_model_info()),
                label="Model Information"
            )

        # Add visual separator
        gr.Markdown(
            "<div style='margin: 24px 0; border-top: 3px solid rgba(99, 102, 241, 0.2); background: linear-gradient(90deg, transparent, rgba(99, 102, 241, 0.05), transparent); height: 2px;'></div>"
        )

        # Analysis Section
        with gr.Group():
            gr.Markdown(
                "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 16px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(34, 197, 94, 0.1), transparent); border-radius: 8px 8px 0 0;'>🔍 Language Analysis</div>"
            )

            with gr.Row():
                with gr.Column(scale=2):
                    # Input section
                    text_input = gr.Textbox(
                        label="Text to Analyze",
                        placeholder="Enter text here to detect its language...",
                        lines=5,
                        max_lines=10
                    )

                    detect_btn = gr.Button("🔍 Detect Language", variant="primary", size="lg")

                    # Example texts
                    gr.Examples(
                        examples=[
                            ["Hello, how are you today?"],
                            ["Bonjour, comment allez-vous?"],
                            ["Hola, ¿cómo estás?"],
                            ["Guten Tag, wie geht es Ihnen?"],
                            ["こんにちは、元気ですか?"],
                            ["Привет, как дела?"],
                            ["Ciao, come stai?"],
                            ["Olá, como você está?"],
                            ["你好,你好吗?"],
                            ["안녕하세요, 어떻게 지내세요?"]
                        ],
                        inputs=text_input,
                        label="Try these examples:"
                    )

                with gr.Column(scale=2):
                    # Output section
                    with gr.Group():
                        gr.Markdown(
                            "<div style='text-align: center; padding: 16px 0 8px 0; margin-bottom: 12px; font-size: 18px; font-weight: 600; border-bottom: 2px solid; background: linear-gradient(90deg, transparent, rgba(168, 85, 247, 0.1), transparent); border-radius: 8px 8px 0 0;'>📊 Detection Results</div>"
                        )

                        detected_language = gr.Textbox(
                            label="Detected Language",
                            interactive=False
                        )

                        confidence_score = gr.Number(
                            label="Confidence Score",
                            interactive=False,
                            precision=4
                        )

                        language_code = gr.Textbox(
                            label="Language Code (ISO 639-1)",
                            interactive=False
                        )

                        # Top predictions table
                        top_predictions = gr.Dataframe(
                            headers=["Language", "Code", "Confidence"],
                            label="Top 5 Predictions",
                            interactive=False,
                            wrap=True
                        )

        # Status/Info section
        with gr.Row():
            status_text = gr.Textbox(
                label="Status",
                interactive=False,
                visible=False  # revealed only when there is something to report
            )

        # Event handlers
        def detect_language_wrapper(text, selected_model):
            # Returns a 5-tuple matching the wired outputs:
            # (detected_language, confidence_score, language_code,
            #  top_predictions rows, status_text gr.update).
            if not text.strip():
                return (
                    "No text provided",
                    0.0,
                    "",
                    [],
                    gr.update(value="Please enter some text to analyze.", visible=True)
                )

            try:
                # Switch model if needed
                if detector.current_model_key != selected_model:
                    try:
                        detector.switch_model(selected_model)
                    except NotImplementedError:
                        # Placeholder models signal themselves this way.
                        return (
                            "Model unavailable",
                            0.0,
                            "",
                            [],
                            gr.update(value="This model is not yet implemented. Please select an available model.", visible=True)
                        )
                    except Exception as e:
                        return (
                            "Model error",
                            0.0,
                            "",
                            [],
                            gr.update(value=f"Error loading model: {str(e)}", visible=True)
                        )

                result = detector.detect_language(text)

                # Extract main prediction
                main_lang = result['language']
                main_confidence = result['confidence']
                main_code = result['language_code']

                # Format top predictions for table
                predictions_table = [
                    [pred['language'], pred['language_code'], f"{pred['confidence']:.4f}"]
                    for pred in result['top_predictions']
                ]

                # Metadata is optional in the result dict; fall back gracefully.
                model_info = result.get('metadata', {}).get('model_info', {})
                model_name = model_info.get('name', 'Unknown Model')

                return (
                    main_lang,
                    main_confidence,
                    main_code,
                    predictions_table,
                    gr.update(value=f"✅ Analysis Complete\n\nInput Text: {text[:100]}{'...' if len(text) > 100 else ''}\n\nDetected Language: {main_lang} ({main_code})\nConfidence: {main_confidence:.2%}\n\nModel: {model_name}", visible=True)
                )

            except Exception as e:
                # Broad catch so a detector failure surfaces in the UI
                # instead of crashing the Gradio event loop.
                return (
                    "Error occurred",
                    0.0,
                    "",
                    [],
                    gr.update(value=f"Error: {str(e)}", visible=True)
                )

        def update_model_info(selected_model):
            """Update model information display when model selection changes."""
            try:
                # Eagerly switch so the info panel reflects the chosen model.
                if detector.current_model_key != selected_model:
                    detector.switch_model(selected_model)
                model_info = detector.get_current_model_info()
                return _format_model_info(model_info)
            except NotImplementedError:
                return "**This model is not yet implemented.** Please select an available model."
            except Exception as e:
                return f"**Error loading model information:** {str(e)}"

        # Connect the button to the detection function
        detect_btn.click(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Also trigger on Enter key in text input
        text_input.submit(
            fn=detect_language_wrapper,
            inputs=[text_input, model_selector],
            outputs=[detected_language, confidence_score, language_code, top_predictions, status_text]
        )

        # Update model info when selection changes
        model_selector.change(
            fn=update_model_info,
            inputs=[model_selector],
            outputs=[model_info_display]
        )

    return single_interface
|
230 |
+
|
231 |
+
def create_demo_comparison_interface():
    """Create the demo comparison interface.

    Builds a gr.Column that lets the user select sample categories
    and/or paste custom texts, then runs every initialized model over
    each text via run_demo_tests and shows the comparison table.
    Returns the top-level gr.Column for embedding in a tab.
    """

    # Initialize models
    models = initialize_models()

    with gr.Column() as demo_interface:
        gr.Markdown("# 🚀 Language Detection Demo - Model Comparison")
        gr.Markdown("Compare all four language detection models simultaneously across various difficulty categories.")

        # Model Status Section
        with gr.Group():
            gr.Markdown("## 🤖 Model Status")
            # One status line per model; initialization errors are shown verbatim.
            model_status_text = ""
            for model_key, model_info in models.items():
                status_icon = "✅" if model_info["status"] == "Ready" else "❌"
                model_status_text += f"{status_icon} **{model_info['name']}**: {model_info['status']}\n\n"
            gr.Markdown(model_status_text)

        # Category Selection Section
        with gr.Group():
            gr.Markdown("## 📊 Test Categories")
            gr.Markdown("Select categories to test different aspects of language detection difficulty:")

            category_checkboxes = gr.CheckboxGroup(
                choices=list(DEMO_SAMPLES.keys()),
                label="Select Test Categories",
                value=["Easy/Obvious", "Short Text"],  # Default selection
                interactive=True
            )

        # Custom Text Input Section
        with gr.Group():
            gr.Markdown("## ✏️ Custom Text Input")
            gr.Markdown("Enter your own texts to test (one per line):")

            custom_text_input = gr.Textbox(
                label="Custom Texts",
                placeholder="Enter custom texts here, one per line...\nExample:\nHello world\nBonjour le monde\n你好世界",
                lines=5,
                max_lines=10
            )

        # Control Buttons
        with gr.Row():
            run_demo_btn = gr.Button("🔍 Run Demo Tests", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear Results", variant="secondary")

        # Sample Preview Section (moved up, condensed)
        with gr.Group():
            gr.Markdown("## 📚 Category Explanations")
            gr.Markdown("Understanding what each test category evaluates:")

            category_explanations = """
**Easy/Obvious:** Clear, unambiguous sentences in their native language. Tests basic language detection capability.

**Short Text:** Single words or very short phrases. Tests model performance with minimal context.

**False Friends:** Words that look similar across languages but have different meanings. Tests ability to distinguish between closely related languages.

**Mixed Scripts:** Text containing multiple languages, numbers, symbols, or scripts. Tests handling of multilingual content.

**Proper Nouns:** Names of people, places, or entities that exist across multiple languages. Tests context-dependent detection.

**Common Words:** International words with similar spelling across languages (hotel, taxi, etc.). Tests disambiguation of universal terms.

**Technical Terms:** Specialized vocabulary that may be borrowed or translated across languages. Tests domain-specific detection.

**Code-switching:** Text that switches between languages mid-sentence. Tests handling of bilingual communication patterns.

**Transliterated Text:** Non-Latin scripts written in Latin characters. Tests recognition of transliteration vs. native language.

**Ambiguous Script:** Words that could belong to multiple languages with identical spelling. Tests the model's decision-making under uncertainty.
"""

            gr.Markdown(category_explanations)

        # Results Section (moved to bottom)
        with gr.Group():
            gr.Markdown("## 📈 Results")

            summary_output = gr.Textbox(
                label="Summary",
                interactive=False,
                visible=False  # hidden until a run produces output
            )

            results_dataframe = gr.Dataframe(
                label="Model Comparison Results (A-A: Model A Dataset A, B-A: Model B Dataset A, A-B: Model A Dataset B, B-B: Model B Dataset B)",
                wrap=True,
                interactive=False,
                visible=False
            )

        # Event Handlers
        def run_tests(selected_cats, custom_texts):
            # Delegates to run_demo_tests; df is None when there was
            # nothing to analyze, in which case the table stays hidden.
            summary, df = run_demo_tests(selected_cats, custom_texts, models)

            if df is not None:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(value=df, visible=True)
                )
            else:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(visible=False)
                )

        def clear_results():
            # Hide and empty both result widgets.
            return (
                gr.update(value="", visible=False),
                gr.update(value=None, visible=False)
            )

        # Connect event handlers
        run_demo_btn.click(
            fn=run_tests,
            inputs=[category_checkboxes, custom_text_input],
            outputs=[summary_output, results_dataframe]
        )

        clear_btn.click(
            fn=clear_results,
            outputs=[summary_output, results_dataframe]
        )

    return demo_interface
|
359 |
+
|
360 |
+
def _format_model_info(model_info):
|
361 |
+
"""Format model information for display."""
|
362 |
+
if not model_info:
|
363 |
+
return "No model information available."
|
364 |
+
|
365 |
+
formatted_info = f"""
|
366 |
+
**{model_info.get('name', 'Unknown Model')}**
|
367 |
+
|
368 |
+
{model_info.get('description', 'No description available.')}
|
369 |
+
|
370 |
+
**📊 Performance:**
|
371 |
+
- Accuracy: {model_info.get('accuracy', 'N/A')}
|
372 |
+
- Model Size: {model_info.get('model_size', 'N/A')}
|
373 |
+
|
374 |
+
**🏗️ Architecture:**
|
375 |
+
- Model Architecture: {model_info.get('architecture', 'N/A')}
|
376 |
+
- Base Model: {model_info.get('base_model', 'N/A')}
|
377 |
+
- Training Dataset: {model_info.get('dataset', 'N/A')}
|
378 |
+
|
379 |
+
**🌐 Languages:** {model_info.get('languages_supported', 'N/A')}
|
380 |
+
|
381 |
+
**⚙️ Training Details:** {model_info.get('training_details', 'N/A')}
|
382 |
+
|
383 |
+
**💡 Use Cases:** {model_info.get('use_cases', 'N/A')}
|
384 |
+
|
385 |
+
**✅ Strengths:** {model_info.get('strengths', 'N/A')}
|
386 |
+
|
387 |
+
**⚠️ Limitations:** {model_info.get('limitations', 'N/A')}
|
388 |
+
"""
|
389 |
+
return formatted_info
|
390 |
+
|
391 |
+
def main():
    """Assemble the tabbed Gradio application.

    Tab 1 hosts the single-model detector; tab 2 hosts the four-model
    comparison demo. Returns the (not yet launched) Blocks app.
    """
    with gr.Blocks(title="Language Detection App Suite", theme=gr.themes.Soft()) as suite:
        gr.Markdown("# 🌍 Language Detection App Suite")
        gr.Markdown("Choose between single model testing or comprehensive model comparison.")

        with gr.Tabs():
            with gr.TabItem("🔍 Single Model Detection"):
                # Components register themselves on the Blocks context;
                # the returned column does not need to be kept.
                create_single_model_interface()

            with gr.TabItem("🚀 Model Comparison Demo"):
                create_demo_comparison_interface()

    return suite
|
406 |
+
|
407 |
+
if __name__ == "__main__":
    # Build the tabbed app and serve it.
    app = main()
    app.launch(
        server_name="localhost",  # NOTE(review): binds loopback only; a container/Space deployment would need "0.0.0.0" — confirm target
        server_port=7860,  # Gradio's conventional default port
        share=False,  # no public Gradio share tunnel
        debug=True  # verbose errors; consider disabling for production
    )
|
demo_page.py
ADDED
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
from backend.language_detector import LanguageDetector
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
import time
|
6 |
+
|
7 |
+
# Sample text database organized by difficulty categories
#
# Each category maps to a list of (text, expected_language, description)
# tuples. `expected_language` is an ISO 639-1 code when a single correct
# answer exists; the sentinel values "ambiguous", "mix" and
# "transliteration" mark samples with no single ground truth (downstream
# scoring in create_results_dataframe skips the ✅/🚫 mark for those).
DEMO_SAMPLES = {
    "Easy/Obvious": [
        ("Hello, how are you doing today?", "en", "Clear English sentence"),
        ("Bonjour, comment allez-vous aujourd'hui?", "fr", "Clear French sentence"),
        ("Hola, ¿cómo estás hoy?", "es", "Clear Spanish sentence"),
        ("Guten Tag, wie geht es Ihnen heute?", "de", "Clear German sentence"),
        ("こんにちは、今日はどうですか?", "ja", "Clear Japanese sentence"),
        ("Привет, как дела сегодня?", "ru", "Clear Russian sentence"),
        ("Ciao, come stai oggi?", "it", "Clear Italian sentence"),
        ("Olá, como você está hoje?", "pt", "Clear Portuguese sentence"),
        ("你好,你今天怎么样?", "zh", "Clear Chinese sentence"),
        ("안녕하세요, 오늘 어떻게 지내세요?", "ko", "Clear Korean sentence"),
    ],

    "Short Text": [
        ("Hi", "en", "Very short greeting"),
        ("Oui", "fr", "Single word French"),
        ("Sí", "es", "Single word Spanish"),
        ("Ja", "de", "Single word German"),
        ("はい", "ja", "Single word Japanese"),
        ("Да", "ru", "Single word Russian"),
        ("Sì", "it", "Single word Italian"),
        ("Sim", "pt", "Single word Portuguese"),
        ("是", "zh", "Single character Chinese"),
        ("네", "ko", "Single word Korean"),
    ],

    "False Friends": [
        ("actual", "en", "English word, but means 'current' in Spanish"),
        ("embarazada", "es", "Spanish for pregnant, not embarrassed"),
        ("gift", "en", "English word, but means 'poison' in German"),
        ("preservativo", "es", "Spanish for condom, not preservative"),
        ("sensible", "en", "English word, but means 'sensitive' in Spanish"),
        ("sympathique", "fr", "French for nice, not sympathetic"),
        ("biblioteca", "es", "Spanish for library, not Bible place"),
        ("realizzare", "it", "Italian for to achieve, not realize"),
        ("parents", "en", "English word, but means 'relatives' in French"),
        ("attualmente", "it", "Italian for currently, not actually"),
    ],

    "Mixed Scripts": [
        ("Hello123世界", "mix", "Mixed English, numbers, Chinese"),
        ("Café #1 في العالم", "mix", "Mixed French, numbers, Arabic"),
        ("2023年は良い年です", "ja", "Japanese with numbers"),
        ("Prix: €50,000", "fr", "French with currency and numbers"),
        ("iPhone 15 Pro Max", "en", "Product name with numbers"),
        ("COVID-19 パンデミック", "mix", "Mixed English acronym and Japanese"),
        ("Wi-Fi пароль: 123456", "mix", "Mixed English tech term and Russian"),
        ("GPS координаты", "mix", "Mixed English acronym and Russian"),
        ("URL: https://example.com", "en", "Web address"),
        ("HTML <div>content</div>", "en", "Code with markup"),
    ],

    "Proper Nouns": [
        ("Paris", "ambiguous", "City name - French or English context?"),
        ("Berlin", "ambiguous", "City name - German or English context?"),
        ("Madrid", "ambiguous", "City name - Spanish or English context?"),
        ("Tokyo", "ambiguous", "City name - Japanese or English context?"),
        ("Maria", "ambiguous", "Common name in many languages"),
        ("Alexander", "ambiguous", "Name used in many languages"),
        ("David", "ambiguous", "Biblical name used worldwide"),
        ("Anna", "ambiguous", "Name common across languages"),
        ("Michael", "ambiguous", "International name"),
        ("Sofia", "ambiguous", "Name and city, multiple languages"),
    ],

    "Common Words": [
        ("hotel", "ambiguous", "Same spelling in many languages"),
        ("restaurant", "ambiguous", "French origin, used worldwide"),
        ("taxi", "ambiguous", "International word"),
        ("pizza", "ambiguous", "Italian origin, used worldwide"),
        ("chocolate", "ambiguous", "Similar in many languages"),
        ("hospital", "ambiguous", "Medical term used internationally"),
        ("radio", "ambiguous", "Technology term used worldwide"),
        ("metro", "ambiguous", "Transportation term"),
        ("cafe", "ambiguous", "French origin, international use"),
        ("photo", "ambiguous", "Greek origin, used worldwide"),
    ],

    # NOTE(review): "algoritmo" appears three times below with expected
    # es/it/pt — identical spellings, so at most one of those rows can
    # ever score as correct; consider marking them "ambiguous".
    "Technical Terms": [
        ("algorithm", "en", "Technical English term"),
        ("algorithme", "fr", "Technical French term"),
        ("algoritmo", "es", "Technical Spanish term"),
        ("Algorithmus", "de", "Technical German term"),
        ("アルゴリズム", "ja", "Technical Japanese term"),
        ("алгоритм", "ru", "Technical Russian term"),
        ("algoritmo", "it", "Technical Italian term"),
        ("algoritmo", "pt", "Technical Portuguese term"),
        ("算法", "zh", "Technical Chinese term"),
        ("알고리즘", "ko", "Technical Korean term"),
    ],

    "Code-switching": [
        ("I love sushi とても美味しい", "mix", "English-Japanese code switching"),
        ("C'est très nice aujourd'hui", "mix", "French-English code switching"),
        ("Me gusta this song mucho", "mix", "Spanish-English code switching"),
        ("Das ist very interessant", "mix", "German-English code switching"),
        ("Это really хорошо", "mix", "Russian-English code switching"),
        ("È molto beautiful oggi", "mix", "Italian-English code switching"),
        ("Está muito good today", "mix", "Portuguese-English code switching"),
        ("这个 is very 好", "mix", "Chinese-English code switching"),
        ("이것은 really 좋다", "mix", "Korean-English code switching"),
        ("Merci beaucoup for everything", "mix", "French-English code switching"),
    ],

    "Transliterated Text": [
        ("Konnichiwa", "transliteration", "Japanese こんにちは in Latin script"),
        ("Spasibo", "transliteration", "Russian спасибо in Latin script"),
        ("Arigato", "transliteration", "Japanese ありがとう in Latin script"),
        ("Privyet", "transliteration", "Russian привет in Latin script"),
        ("Sayonara", "transliteration", "Japanese さようなら in Latin script"),
        ("Dosvedanya", "transliteration", "Russian до свидания in Latin script"),
        ("Nihao", "transliteration", "Chinese 你好 in Latin script"),
        ("Annyeonghaseyo", "transliteration", "Korean 안녕하세요 in Latin script"),
        ("Zdravstvuyte", "transliteration", "Russian здравствуйте in Latin script"),
        ("Ohayo gozaimasu", "transliteration", "Japanese おはようございます in Latin script"),
    ],

    "Ambiguous Script": [
        ("casa", "ambiguous", "House in Spanish/Italian/Portuguese"),
        ("rose", "ambiguous", "Flower in English or pink in French"),
        ("more", "ambiguous", "English word or Italian 'deaths'"),
        ("come", "ambiguous", "English verb or Italian 'how/like'"),
        ("no", "ambiguous", "English word or Spanish 'no'"),
        ("si", "ambiguous", "Spanish 'if' or Italian 'yes'"),
        ("la", "ambiguous", "English 'la' or French/Spanish/Italian article"),
        ("me", "ambiguous", "English pronoun or Spanish 'me'"),
        ("le", "ambiguous", "French article or Italian article"),
        ("son", "ambiguous", "English word or Spanish 'they are'"),
    ]
}
|
139 |
+
|
140 |
+
def initialize_models():
    """Instantiate all four detector variants for side-by-side comparison.

    Returns a dict keyed by model id; each value holds the constructed
    detector (or None when construction failed), a display name, and a
    human-readable status string.
    """
    registry = {}
    variants = [
        ("model-a-dataset-a", "Model A Dataset A"),
        ("model-b-dataset-a", "Model B Dataset A"),
        ("model-a-dataset-b", "Model A Dataset B"),
        ("model-b-dataset-b", "Model B Dataset B")
    ]

    for key, display_name in variants:
        entry = {"name": display_name}
        try:
            entry["detector"] = LanguageDetector(model_key=key)
            entry["status"] = "Ready"
        except Exception as e:
            # A failed model is recorded, not raised, so the rest of the
            # comparison UI still works.
            entry["detector"] = None
            entry["status"] = f"Error: {str(e)}"
        registry[key] = entry

    return registry
|
165 |
+
|
166 |
+
def detect_with_all_models(text: str, models: Dict) -> Dict[str, Any]:
    """Run *text* through every model in *models*.

    Returns {model_key: {"language", "confidence", "language_code",
    "status"}}. Failures never propagate: an uninitialized detector
    reports its stored status, and a detection exception is folded into
    an "Error" row.
    """
    outcomes = {}

    for key, info in models.items():
        detector = info["detector"]

        if detector is None:
            # Model never loaded — surface its initialization status.
            outcomes[key] = {
                "language": "Error",
                "confidence": 0.0,
                "language_code": "error",
                "status": info["status"],
            }
            continue

        try:
            prediction = detector.detect_language(text)
        except Exception as e:
            outcomes[key] = {
                "language": "Error",
                "confidence": 0.0,
                "language_code": "error",
                "status": f"Error: {str(e)}",
            }
        else:
            outcomes[key] = {
                "language": prediction["language"],
                "confidence": prediction["confidence"],
                "language_code": prediction["language_code"],
                "status": "Success",
            }

    return outcomes
|
196 |
+
|
197 |
+
def create_results_dataframe(texts: List[str], all_results: List[Dict], expected_langs: List[str] = None, categories: List[str] = None) -> pd.DataFrame:
    """Build the comparison table shown in the demo UI.

    One row per input text with Text/Expected/Category columns plus one
    column per model ("A-A", "B-A", "A-B", "B-B"). Each model cell shows
    "<code> (<confidence>)", prefixed with ✅/🚫 only when the expected
    language is a single concrete code.
    """
    # Short column headers for the four known model keys; anything else
    # falls back to a truncated key.
    short_names = {
        "model-a-dataset-a": "A-A",
        "model-b-dataset-a": "B-A",
        "model-a-dataset-b": "A-B",
        "model-b-dataset-b": "B-B",
    }
    # Sentinel "expected" values with no single correct answer — unscored.
    unscored = ('ambiguous', 'mix', 'transliteration', 'unknown', 'N/A')

    rows = []
    for i, (text, results) in enumerate(zip(texts, all_results)):
        expected = expected_langs[i] if expected_langs else None
        row = {
            "Text": text[:40] + "..." if len(text) > 40 else text,  # keep cells compact
            "Expected": expected_langs[i] if expected_langs else "N/A",
            "Category": categories[i] if categories else "Custom",
        }

        for model_key, result in results.items():
            column = short_names.get(model_key, model_key[:6])
            predicted = result['language_code']
            cell = f"{predicted} ({result['confidence']:.3f})"
            if expected and expected not in unscored:
                mark = "✅" if predicted == expected else "🚫"
                cell = f"{mark} {cell}"
            row[column] = cell

        rows.append(row)

    return pd.DataFrame(rows)
|
245 |
+
|
246 |
+
def run_demo_tests(selected_categories: List[str], custom_texts: str, models: Dict):
    """Run tests on selected categories and custom texts.

    Returns a (summary, DataFrame) pair; the DataFrame is None when there
    is nothing to analyze.
    """
    if not selected_categories and not custom_texts.strip():
        return "Please select at least one category or enter custom text.", None

    all_texts: List[str] = []
    expected_langs: List[str] = []
    categories: List[str] = []

    # Gather the predefined samples for every selected category.
    for category in selected_categories:
        samples = DEMO_SAMPLES.get(category)
        if samples is None:
            continue
        for text, expected, description in samples:
            all_texts.append(text)
            expected_langs.append(expected)
            categories.append(category)

    # Append user-supplied texts, one per non-blank line.
    stripped_input = custom_texts.strip()
    if stripped_input:
        for raw_line in stripped_input.split('\n'):
            line = raw_line.strip()
            if not line:
                continue
            all_texts.append(line)
            expected_langs.append("unknown")
            categories.append("Custom")

    if not all_texts:
        return "No texts to analyze.", None

    # Query every model for every collected text.
    all_results = [detect_with_all_models(text, models) for text in all_texts]

    df = create_results_dataframe(all_texts, all_results, expected_langs, categories)
    summary = f"Analyzed {len(all_texts)} texts across {len(set(categories))} categories."
    return summary, df
287 |
+
def create_demo_interface():
    """Create the demo interface.

    Builds a Gradio Blocks app that compares all four language-detection
    models: model status, category checkboxes, a custom-text box, and a
    results table that is revealed after a run. Returns the gr.Blocks object.
    """

    # Initialize models once at build time; each entry carries a display
    # name and a "status" string used below.
    models = initialize_models()

    with gr.Blocks(title="Language Detection Demo - Model Comparison", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🚀 Language Detection Demo - Model Comparison")
        gr.Markdown("Compare all four language detection models simultaneously across various difficulty categories.")

        # Model Status Section: static markdown rendered from the load results.
        with gr.Group():
            gr.Markdown("## 🤖 Model Status")
            model_status_text = ""
            for model_key, model_info in models.items():
                status_icon = "✅" if model_info["status"] == "Ready" else "❌"
                model_status_text += f"{status_icon} **{model_info['name']}**: {model_info['status']}\n\n"
            gr.Markdown(model_status_text)

        # Category Selection Section
        with gr.Group():
            gr.Markdown("## 📊 Test Categories")
            gr.Markdown("Select categories to test different aspects of language detection difficulty:")

            category_checkboxes = gr.CheckboxGroup(
                choices=list(DEMO_SAMPLES.keys()),
                label="Select Test Categories",
                value=["Easy/Obvious", "Short Text"],  # Default selection
                interactive=True
            )

        # Custom Text Input Section
        with gr.Group():
            gr.Markdown("## ✏️ Custom Text Input")
            gr.Markdown("Enter your own texts to test (one per line):")

            custom_text_input = gr.Textbox(
                label="Custom Texts",
                placeholder="Enter custom texts here, one per line...\nExample:\nHello world\nBonjour le monde\n你好世界",
                lines=5,
                max_lines=10
            )

        # Control Buttons
        with gr.Row():
            run_demo_btn = gr.Button("🔍 Run Demo Tests", variant="primary", size="lg")
            clear_btn = gr.Button("🗑️ Clear Results", variant="secondary")

        # Sample Preview Section (moved up, condensed)
        with gr.Group():
            gr.Markdown("## 📚 Category Explanations")
            gr.Markdown("Understanding what each test category evaluates:")

            category_explanations = """
**Easy/Obvious:** Clear, unambiguous sentences in their native language. Tests basic language detection capability.

**Short Text:** Single words or very short phrases. Tests model performance with minimal context.

**False Friends:** Words that look similar across languages but have different meanings. Tests ability to distinguish between closely related languages.

**Mixed Scripts:** Text containing multiple languages, numbers, symbols, or scripts. Tests handling of multilingual content.

**Proper Nouns:** Names of people, places, or entities that exist across multiple languages. Tests context-dependent detection.

**Common Words:** International words with similar spelling across languages (hotel, taxi, etc.). Tests disambiguation of universal terms.

**Technical Terms:** Specialized vocabulary that may be borrowed or translated across languages. Tests domain-specific detection.

**Code-switching:** Text that switches between languages mid-sentence. Tests handling of bilingual communication patterns.

**Transliterated Text:** Non-Latin scripts written in Latin characters. Tests recognition of transliteration vs. native language.

**Ambiguous Script:** Words that could belong to multiple languages with identical spelling. Tests the model's decision-making under uncertainty.
"""

            gr.Markdown(category_explanations)

        # Results Section (moved to bottom); both outputs start hidden and
        # are toggled visible by the event handlers below.
        with gr.Group():
            gr.Markdown("## 📈 Results")

            summary_output = gr.Textbox(
                label="Summary",
                interactive=False,
                visible=False
            )

            results_dataframe = gr.Dataframe(
                label="Model Comparison Results (A-A: Model A Dataset A, B-A: Model B Dataset A, A-B: Model A Dataset B, B-B: Model B Dataset B)",
                wrap=True,
                interactive=False,
                visible=False
            )

        # Event Handlers
        def run_tests(selected_cats, custom_texts):
            # Delegate to run_demo_tests; reveal the table only when data exists.
            summary, df = run_demo_tests(selected_cats, custom_texts, models)

            if df is not None:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(value=df, visible=True)
                )
            else:
                return (
                    gr.update(value=summary, visible=True),
                    gr.update(visible=False)
                )

        def clear_results():
            # Hide and empty both result widgets.
            return (
                gr.update(value="", visible=False),
                gr.update(value=None, visible=False)
            )

        # Connect event handlers
        run_demo_btn.click(
            fn=run_tests,
            inputs=[category_checkboxes, custom_text_input],
            outputs=[summary_output, results_dataframe]
        )

        clear_btn.click(
            fn=clear_results,
            outputs=[summary_output, results_dataframe]
        )

    return demo
416 |
+
if __name__ == "__main__":
    # Build the Gradio app and serve it locally; no public share link,
    # debug=True enables verbose Gradio error output.
    demo = create_demo_interface()
    demo.launch(
        server_name="localhost",
        server_port=7860,
        share=False,
        debug=True
    )
|
requirements.txt
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiofiles==24.1.0
|
2 |
+
annotated-types==0.7.0
|
3 |
+
anyio==4.9.0
|
4 |
+
audioop-lts==0.2.1
|
5 |
+
certifi==2025.4.26
|
6 |
+
charset-normalizer==3.4.2
|
7 |
+
click==8.1.8
|
8 |
+
fastapi==0.115.12
|
9 |
+
ffmpy==0.5.0
|
10 |
+
filelock==3.18.0
|
11 |
+
fsspec==2025.5.1
|
12 |
+
gradio==5.31.0
|
13 |
+
gradio_client==1.10.1
|
14 |
+
groovy==0.1.2
|
15 |
+
h11==0.16.0
|
16 |
+
hf-xet==1.1.2
|
17 |
+
httpcore==1.0.9
|
18 |
+
httpx==0.28.1
|
19 |
+
huggingface-hub==0.32.0
|
20 |
+
idna==3.10
|
21 |
+
Jinja2==3.1.6
|
22 |
+
markdown-it-py==3.0.0
|
23 |
+
MarkupSafe==3.0.2
|
24 |
+
mdurl==0.1.2
|
25 |
+
mpmath==1.3.0
|
26 |
+
networkx==3.4.2
|
27 |
+
numpy==2.2.6
|
28 |
+
orjson==3.10.18
|
29 |
+
packaging==25.0
|
30 |
+
pandas==2.2.3
|
31 |
+
pillow==11.2.1
|
32 |
+
pydantic==2.11.5
|
33 |
+
pydantic_core==2.33.2
|
34 |
+
pydub==0.25.1
|
35 |
+
Pygments==2.19.1
|
36 |
+
python-dateutil==2.9.0.post0
|
37 |
+
python-multipart==0.0.20
|
38 |
+
pytz==2025.2
|
39 |
+
PyYAML==6.0.2
|
40 |
+
regex==2024.11.6
|
41 |
+
requests==2.32.3
|
42 |
+
rich==14.0.0
|
43 |
+
ruff==0.11.11
|
44 |
+
safehttpx==0.1.6
|
45 |
+
safetensors==0.5.3
|
46 |
+
semantic-version==2.10.0
|
47 |
+
setuptools==80.8.0
|
48 |
+
shellingham==1.5.4
|
49 |
+
six==1.17.0
|
50 |
+
sniffio==1.3.1
|
51 |
+
starlette==0.46.2
|
52 |
+
sympy==1.14.0
|
53 |
+
tokenizers==0.21.1
|
54 |
+
tomlkit==0.13.2
|
55 |
+
torch==2.7.0
|
56 |
+
tqdm==4.67.1
|
57 |
+
transformers==4.52.3
|
58 |
+
typer==0.15.4
|
59 |
+
typing-inspection==0.4.1
|
60 |
+
typing_extensions==4.13.2
|
61 |
+
tzdata==2025.2
|
62 |
+
urllib3==2.4.0
|
63 |
+
uvicorn==0.34.2
|
64 |
+
websockets==15.0.1
|
run.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Simple startup script for the Language Detection App
|
4 |
+
|
5 |
+
This script provides an easy way to run the app with different configurations.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import sys
|
9 |
+
import os
|
10 |
+
import argparse
|
11 |
+
from pathlib import Path
|
12 |
+
|
13 |
+
def check_dependencies():
    """Check if required dependencies are installed.

    Returns True when gradio is importable, otherwise prints an install
    hint and returns False.
    """
    try:
        import gradio  # noqa: F401 -- only importability matters here
        print("✅ Gradio is available")
    except ImportError:
        print("❌ Gradio not found. Install with: pip install -r requirements.txt")
        return False

    return True
24 |
+
def run_tests():
    """Run the test suite.

    Launches test_app.py in a child process using the same interpreter
    that started this script.
    """
    print("🧪 Running tests...")
    # subprocess.run with an argument list avoids shell interpretation and
    # sys.executable guarantees the same Python as the current process
    # (os.system("python ...") could pick up a different interpreter).
    import subprocess
    subprocess.run([sys.executable, "test_app.py"])
29 |
+
def run_app(model_type="placeholder", host="0.0.0.0", port=7860, share=False):
    """Run the main application.

    Args:
        model_type: Detector backend ("placeholder", "huggingface", "custom").
        host: Interface to bind the Gradio server to.
        port: TCP port for the server.
        share: Whether Gradio creates a public share link.

    Returns:
        Process-style exit code: 0 on clean shutdown, 1 on error.
    """

    if not check_dependencies():
        return 1

    # Set environment variables for configuration
    # NOTE(review): assumes app.main() reads these variables — confirm
    # against app.py, which in this commit defines create_demo_interface()
    # rather than main().
    os.environ["MODEL_TYPE"] = model_type
    os.environ["HOST"] = host
    os.environ["PORT"] = str(port)
    os.environ["SHARE"] = str(share).lower()

    print(f"🚀 Starting Language Detection App...")
    print(f"📊 Model: {model_type}")
    print(f"🌐 Host: {host}:{port}")
    print(f"🔗 Share: {share}")
    print("-" * 50)

    # Import and run the app
    try:
        from app import main
        app = main()
        app.launch(
            server_name=host,
            server_port=port,
            share=share,
            debug=True
        )
    except KeyboardInterrupt:
        # Ctrl-C is a normal shutdown, not an error.
        print("\n👋 App stopped by user")
    except Exception as e:
        print(f"❌ Error running app: {e}")
        return 1

    return 0
|
65 |
+
def main():
    """Main entry point.

    Parses command-line options, optionally runs the test suite, validates
    the model choice, and starts the app. Returns an exit code.
    """
    parser = argparse.ArgumentParser(
        description="Language Detection App Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python run.py                    # Run with default settings
  python run.py --test             # Run tests only
  python run.py --model huggingface # Use Hugging Face model (if available)
  python run.py --port 8080        # Run on port 8080
  python run.py --share            # Create public link
"""
    )

    parser.add_argument(
        "--test",
        action="store_true",
        help="Run tests instead of starting the app"
    )

    parser.add_argument(
        "--model",
        choices=["placeholder", "huggingface", "custom"],
        default="placeholder",
        help="Model type to use (default: placeholder)"
    )

    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)"
    )

    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port to bind to (default: 7860)"
    )

    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public link via Gradio"
    )

    args = parser.parse_args()

    print("🌍 Language Detection App Runner")
    print("=" * 40)

    # --test short-circuits: run the suite and exit successfully.
    if args.test:
        run_tests()
        return 0

    # Validate model choice: the huggingface backend needs transformers;
    # fall back to the placeholder model when it is missing.
    if args.model == "huggingface":
        try:
            import transformers
            print("✅ Transformers available for Hugging Face model")
        except ImportError:
            print("⚠️ Transformers not available. Install with:")
            print("   pip install transformers torch")
            print("   Falling back to placeholder model...")
            args.model = "placeholder"

    return run_app(
        model_type=args.model,
        host=args.host,
        port=args.port,
        share=args.share
    )
|
139 |
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    sys.exit(main())
|
test_demo_functionality.py
ADDED
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for the demo functionality
|
4 |
+
"""
|
5 |
+
|
6 |
+
from demo_page import initialize_models, detect_with_all_models, create_results_dataframe, run_demo_tests, DEMO_SAMPLES
|
7 |
+
|
8 |
+
def test_model_initialization():
    """Test that all models can be initialized."""
    print("🔄 Testing model initialization...")
    models = initialize_models()

    print(f"✅ Initialized {len(models)} models:")
    # Keys are unused for display, so iterate the info dicts directly.
    for info in models.values():
        icon = "✅" if info["status"] == "Ready" else "❌"
        print(f"  {icon} {info['name']}: {info['status']}")

    return models
|
20 |
+
def test_single_detection():
    """Test detection with a single text across all models."""
    print("\n🔄 Testing single text detection...")

    models = initialize_models()
    test_text = "Hello, how are you today?"
    results = detect_with_all_models(test_text, models)

    print(f"Text: '{test_text}'")
    print("Results:")
    for model_key in results:
        outcome = results[model_key]
        print(f"  {model_key}: {outcome['language_code']} ({outcome['confidence']:.3f}) - {outcome['status']}")

    return results
|
36 |
+
def test_category_samples():
    """Test a few samples from each category.

    Runs the first sample of every DEMO_SAMPLES category through all
    models and prints per-model agreement with the expected label.
    """
    print("\n🔄 Testing category samples...")

    models = initialize_models()

    for category, samples in DEMO_SAMPLES.items():
        print(f"\n📊 Category: {category}")
        # Test first sample from each category
        text, expected, description = samples[0]
        results = detect_with_all_models(text, models)

        print(f"  Text: '{text}' (Expected: {expected})")
        print(f"  Description: {description}")
        for model_key, result in results.items():
            # Ambiguous/mixed/transliteration categories are always counted
            # as a match since any answer is acceptable for them.
            match_icon = "✅" if result['language_code'] == expected or expected in ['ambiguous', 'mix', 'transliteration'] else "❌"
            print(f"    {model_key}: {result['language_code']} ({result['confidence']:.3f}) {match_icon}")
54 |
+
def test_dataframe_creation():
    """Test DataFrame creation with sample data.

    Detects three short texts with all models and checks that
    create_results_dataframe builds a displayable table.
    """
    print("\n🔄 Testing DataFrame creation...")

    models = initialize_models()

    # Test with a few samples
    test_texts = [
        "Hello world",
        "Bonjour le monde",
        "Hola mundo"
    ]
    expected_langs = ["en", "fr", "es"]
    categories = ["Custom", "Custom", "Custom"]

    all_results = []
    for text in test_texts:
        results = detect_with_all_models(text, models)
        all_results.append(results)

    df = create_results_dataframe(test_texts, all_results, expected_langs, categories)

    print("DataFrame shape:", df.shape)
    print("Columns:", list(df.columns))
    print("\nFirst few rows:")
    print(df.head())

    return df
|
83 |
+
def test_demo_workflow():
    """Test the complete demo workflow.

    Exercises run_demo_tests end-to-end with two preset categories plus
    three custom lines and prints the resulting summary/table.
    """
    print("\n🔄 Testing complete demo workflow...")

    models = initialize_models()

    # Test with selected categories and custom text
    selected_categories = ["Easy/Obvious", "Short Text"]
    custom_texts = "Hello world\nBonjour\n你好"

    summary, df = run_demo_tests(selected_categories, custom_texts, models)

    print(f"Summary: {summary}")
    if df is not None:
        print(f"Results DataFrame shape: {df.shape}")
        print("Sample results:")
        print(df.head())
    else:
        print("❌ No DataFrame returned")

    return summary, df
|
105 |
+
def main():
    """Run all tests.

    Executes the five demo-functionality checks in order and prints a
    traceback (without re-raising) when any of them fails.
    """
    print("🚀 Starting demo functionality tests...\n")

    try:
        # Test 1: Model initialization
        models = test_model_initialization()

        # Test 2: Single detection
        single_results = test_single_detection()

        # Test 3: Category samples
        test_category_samples()

        # Test 4: DataFrame creation
        df = test_dataframe_creation()

        # Test 5: Complete workflow
        summary, demo_df = test_demo_workflow()

        print("\n✅ All tests completed successfully!")
        print(f"📊 Total categories available: {len(DEMO_SAMPLES)}")
        print(f"📝 Total sample texts: {sum(len(samples) for samples in DEMO_SAMPLES.values())}")

    except Exception as e:
        # Intentionally swallows the failure after reporting it; the script
        # always exits 0. NOTE(review): consider re-raising or sys.exit(1)
        # if CI should treat failures as fatal.
        print(f"\n❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
134 |
+
if __name__ == "__main__":
    # Script entry point: run the full demo-functionality test sequence.
    main()
|
test_model_a_dataset_a.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for Model A Dataset A - XLM-RoBERTa + Standard Dataset
|
4 |
+
|
5 |
+
This script tests the XLM-RoBERTa based language detection model
|
6 |
+
trained on the standard multilingual dataset to ensure it works correctly.
|
7 |
+
"""
|
8 |
+
|
9 |
+
import sys
|
10 |
+
import os
|
11 |
+
|
12 |
+
# Add the project root to the Python path
|
13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
|
14 |
+
|
15 |
+
from backend.language_detector import LanguageDetector
|
16 |
+
|
17 |
+
|
18 |
+
def test_model_a_dataset_a():
    """Test the Model A Dataset A implementation.

    Detects 12 sample sentences, prints per-sample results and overall
    accuracy, then dumps the detector's model-info dict.
    Returns True on success, False if any unexpected exception occurred.
    """
    print("🧪 Testing Model A Dataset A - XLM-RoBERTa + Standard Dataset")
    print("=" * 75)

    try:
        # Initialize detector with Model A Dataset A
        detector = LanguageDetector(model_key="model-a-dataset-a")
        print("✅ Successfully initialized Model A Dataset A")

        # Test texts in different languages: (text, expected ISO 639-1 code)
        test_texts = [
            ("Hello, how are you today?", "en"),
            ("Bonjour, comment allez-vous?", "fr"),
            ("Hola, ¿cómo estás?", "es"),
            ("Guten Tag, wie geht es Ihnen?", "de"),
            ("こんにちは、元気ですか?", "ja"),
            ("Привет, как дела?", "ru"),
            ("Ciao, come stai?", "it"),
            ("Olá, como você está?", "pt"),
            ("你好,你好吗?", "zh"),
            ("안녕하세요, 어떻게 지내세요?", "ko"),
            ("مرحبا، كيف حالك؟", "ar"),
            ("नमस्ते, आप कैसे हैं?", "hi")
        ]

        print("\n🔍 Running language detection tests:")
        print("-" * 75)

        correct_predictions = 0
        total_predictions = len(test_texts)

        for text, expected_lang in test_texts:
            try:
                result = detector.detect_language(text)
                predicted_lang = result['language_code']
                confidence = result['confidence']
                language_name = result['language']

                # Check if prediction is correct (allow some flexibility for Chinese variants)
                is_correct = (predicted_lang == expected_lang or
                              (expected_lang == "zh" and predicted_lang in ["zh-hans", "zh-hant", "zh-cn", "zh-tw"]))
                if is_correct:
                    correct_predictions += 1
                    status = "✅"
                else:
                    status = "❌"

                print(f"{status} Text: {text[:40]}{'...' if len(text) > 40 else ''}")
                print(f"   Expected: {expected_lang} | Predicted: {predicted_lang} ({language_name})")
                print(f"   Confidence: {confidence:.4f}")
                print()

            except Exception as e:
                # Per-sample failures are reported but do not abort the run.
                print(f"❌ Error testing '{text[:30]}...': {str(e)}")
                print()

        # Calculate accuracy
        accuracy = (correct_predictions / total_predictions) * 100
        print(f"📊 Test Results: {correct_predictions}/{total_predictions} correct")
        print(f"📈 Accuracy: {accuracy:.1f}%")

        # Test model info
        print("\n📋 Model Information:")
        print("-" * 75)
        model_info = detector.get_current_model_info()
        for key, value in model_info.items():
            print(f"{key.title().replace('_', ' ')}: {value}")

        print("🎉 Model A Dataset A test completed successfully!")

    except Exception as e:
        print(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True
|
97 |
+
|
98 |
+
def test_model_architecture():
    """Test the model architecture information.

    Compares selected keys of the detector's model-info dict against the
    values expected for Model A / Dataset A. Returns True when every key
    matches, False otherwise.
    """
    print("\n🏗️ Testing Model Architecture Information")
    print("=" * 75)

    try:
        detector = LanguageDetector(model_key="model-a-dataset-a")
        model_info = detector.get_current_model_info()

        # Verify key architecture information
        # NOTE(review): values mirror backend/models/model_config.py — keep in sync.
        expected_info = {
            "architecture": "XLM-RoBERTa",
            "dataset": "Dataset A",
            "accuracy": "97.9%",
            "model_size": "278M parameters"
        }

        print("🔍 Verifying model architecture information:")
        print("-" * 50)

        all_correct = True
        for key, expected_value in expected_info.items():
            actual_value = model_info.get(key, "Not found")
            if actual_value == expected_value:
                print(f"✅ {key}: {actual_value}")
            else:
                print(f"❌ {key}: Expected '{expected_value}', got '{actual_value}'")
                all_correct = False

        if all_correct:
            print("\n🎉 All architecture information verified successfully!")
        else:
            print("\n⚠️ Some architecture information mismatches found.")

        return all_correct

    except Exception as e:
        print(f"❌ Architecture test failed: {str(e)}")
        return False
+
|
138 |
+
|
139 |
+
if __name__ == "__main__":
    print("🚀 Starting Model A Dataset A Tests\n")

    # Run tests; each returns True/False rather than raising.
    test1_passed = test_model_a_dataset_a()
    test2_passed = test_model_architecture()

    # Final results: non-zero exit code signals failure to CI.
    print("\n" + "=" * 75)
    if test1_passed and test2_passed:
        print("🎉 All tests passed! Model A Dataset A is ready to use.")
        print("⚖️ This model offers balanced performance with robust cross-lingual capabilities!")
    else:
        print("❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
|
test_model_b_dataset_b.py
ADDED
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for Model B Dataset B - BERT + Enhanced Dataset
|
4 |
+
|
5 |
+
This script tests the BERT based language detection model
|
6 |
+
trained on the enhanced dataset, achieving the highest accuracy (99.85%).
|
7 |
+
"""
|
8 |
+
|
9 |
+
import sys
|
10 |
+
import os
|
11 |
+
|
12 |
+
# Add the project root to the Python path
|
13 |
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__)))
|
14 |
+
|
15 |
+
from backend.language_detector import LanguageDetector
|
16 |
+
|
17 |
+
|
18 |
+
def test_model_b_dataset_b():
    """Test the Model B Dataset B implementation.

    Detects one sample sentence per supported language (20 total), prints
    per-sample results and overall accuracy, then dumps the detector's
    model-info dict. Returns True on success, False on unexpected error.
    """
    print("🧪 Testing Model B Dataset B - BERT + Enhanced Dataset")
    print("=" * 75)

    try:
        # Initialize detector with Model B Dataset B (highest accuracy)
        detector = LanguageDetector(model_key="model-b-dataset-b")
        print("✅ Successfully initialized Model B Dataset B")

        # Test texts in the 20 supported languages: (text, expected code)
        test_texts = [
            ("Hello, how are you today?", "en"),       # English
            ("Bonjour, comment allez-vous?", "fr"),    # French
            ("Hola, ¿cómo estás?", "es"),              # Spanish
            ("Guten Tag, wie geht es Ihnen?", "de"),   # German
            ("Ciao, come stai?", "it"),                # Italian
            ("Olá, como você está?", "pt"),            # Portuguese
            ("Привет, как дела?", "ru"),               # Russian
            ("こんにちは、元気ですか?", "ja"),          # Japanese
            ("你好,你好吗?", "zh"),                    # Chinese
            ("مرحبا، كيف حالك؟", "ar"),                # Arabic
            ("नमस्ते, आप कैसे हैं?", "hi"),              # Hindi
            ("Hallo, hoe gaat het met je?", "nl"),     # Dutch
            ("Γεια σας, πώς είστε;", "el"),            # Greek
            ("Здравейте, как сте?", "bg"),             # Bulgarian
            ("Witaj, jak się masz?", "pl"),            # Polish
            ("สวัสดี คุณเป็นอย่างไรบ้าง?", "th"),        # Thai
            ("Merhaba, nasılsınız?", "tr"),            # Turkish
            ("آپ کیسے ہیں؟", "ur"),                    # Urdu
            ("Xin chào, bạn khỏe không?", "vi"),       # Vietnamese
            # NOTE(review): "unajehje" looks misspelled (cf. Swahili
            # "unaendeleaje") — this sample may fail for spelling, not model,
            # reasons. Confirm before changing.
            ("Habari, unajehje?", "sw")                # Swahili
        ]

        print("\n🔍 Running language detection tests on 20 supported languages:")
        print("-" * 75)

        correct_predictions = 0
        total_predictions = len(test_texts)

        for text, expected_lang in test_texts:
            try:
                result = detector.detect_language(text)
                predicted_lang = result['language_code']
                confidence = result['confidence']
                language_name = result['language']

                # Check if prediction is correct
                is_correct = predicted_lang == expected_lang
                if is_correct:
                    correct_predictions += 1
                    status = "✅"
                else:
                    status = "❌"

                print(f"{status} Text: {text[:40]}{'...' if len(text) > 40 else ''}")
                print(f"   Expected: {expected_lang} | Predicted: {predicted_lang} ({language_name})")
                print(f"   Confidence: {confidence:.4f}")
                print()

            except Exception as e:
                # Per-sample failures are reported but do not abort the run.
                print(f"❌ Error testing '{text[:30]}...': {str(e)}")
                print()

        # Calculate accuracy
        accuracy = (correct_predictions / total_predictions) * 100
        print(f"📊 Test Results: {correct_predictions}/{total_predictions} correct")
        print(f"📈 Accuracy: {accuracy:.1f}%")

        # Test model info
        print("\n📋 Model Information:")
        print("-" * 75)
        model_info = detector.get_current_model_info()
        for key, value in model_info.items():
            print(f"{key.title().replace('_', ' ')}: {value}")

        print("🎉 Model B Dataset B test completed successfully!")

    except Exception as e:
        print(f"❌ Test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

    return True
|
104 |
+
|
105 |
+
def test_all_models_comprehensive():
    """Run the same multilingual sentences through every model/dataset pairing.

    Prints a per-model detection trace followed by a summary of average
    confidences. Returns True when the whole comparison completed, False
    if a top-level error aborted it (per-model failures are recorded but
    do not abort the run).
    """
    print("\n🔄 Comprehensive All-Model Combinations Comparison")
    print("=" * 75)

    # (registry key, display name, architecture description, claimed accuracy)
    combos = [
        ("model-a-dataset-a", "Model A Dataset A", "XLM-RoBERTa + Standard", "97.9%"),
        ("model-b-dataset-a", "Model B Dataset A", "BERT + Standard", "96.17%"),
        ("model-a-dataset-b", "Model A Dataset B", "XLM-RoBERTa + Enhanced", "99.72%"),
        ("model-b-dataset-b", "Model B Dataset B", "BERT + Enhanced", "99.85%")
    ]

    samples = [
        "Hello, this is a test in English.",
        "Bonjour, ceci est un test en français.",
        "Hola, esto es una prueba en español.",
        "Guten Tag, das ist ein Test auf Deutsch."
    ]

    print("🧪 Testing with multiple sentences across all model combinations:")
    print("-" * 75)

    try:
        summary = {}

        for key, name, description, claimed in combos:
            print(f"\n🤖 Testing {name} ({description}) - Claimed: {claimed}")
            print("-" * 60)

            try:
                det = LanguageDetector(model_key=key)
                rows = []

                for sample in samples:
                    outcome = det.detect_language(sample)
                    rows.append({
                        'text': sample[:30] + '...' if len(sample) > 30 else sample,
                        'language': outcome['language'],
                        'code': outcome['language_code'],
                        'confidence': outcome['confidence']
                    })

                    print(f"  Text: {sample[:30]}{'...' if len(sample) > 30 else ''}")
                    print(f"  → {outcome['language']} ({outcome['language_code']}) - {outcome['confidence']:.4f}")

                summary[name] = rows
                print(f"✅ {name} completed successfully")

            except Exception as e:
                # Record the failure string so the summary section can report it.
                print(f"❌ {name}: {str(e)}")
                summary[name] = f"Error: {str(e)}"

        print(f"\n📊 All Model Combinations Testing Summary:")
        print("-" * 75)
        for name, rows in summary.items():
            if isinstance(rows, str):
                # A stored string means that model errored out above.
                print(f"❌ {name}: {rows}")
            else:
                mean_conf = sum(row['confidence'] for row in rows) / len(rows)
                print(f"✅ {name}: Avg Confidence: {mean_conf:.4f}")

        print("🎉 Comprehensive model comparison completed successfully!")
        return True

    except Exception as e:
        print(f"❌ Comprehensive test failed: {str(e)}")
        return False
|
172 |
+
|
173 |
+
|
174 |
+
def test_model_architecture():
    """Verify Model B Dataset B reports the expected architecture metadata.

    Compares the detector's `get_current_model_info()` output against the
    documented values (architecture, dataset, accuracy, model size) and
    returns True only when every field matches; False on mismatch or error.
    """
    print("\n🏗️ Testing Model B Dataset B Architecture Information")
    print("=" * 75)

    try:
        det = LanguageDetector(model_key="model-b-dataset-b")
        reported = det.get_current_model_info()

        # Ground truth the model card advertises for this combination.
        expected_info = {
            "architecture": "BERT",
            "dataset": "Dataset B",
            "accuracy": "99.85%",
            "model_size": "178M parameters"
        }

        print("🔍 Verifying model architecture information:")
        print("-" * 50)

        all_correct = True
        for field, wanted in expected_info.items():
            got = reported.get(field, "Not found")
            matched = got == wanted
            if matched:
                print(f"✅ {field}: {got}")
            else:
                print(f"❌ {field}: Expected '{wanted}', got '{got}'")
                all_correct = False

        if all_correct:
            print("\n🎉 All architecture information verified successfully!")
        else:
            print("\n⚠️ Some architecture information mismatches found.")

        return all_correct

    except Exception as e:
        print(f"❌ Architecture test failed: {str(e)}")
        return False
|
213 |
+
|
214 |
+
|
215 |
+
if __name__ == "__main__":
    print("🚀 Starting Model B Dataset B Tests\n")

    # Run the three suites in order; list literals evaluate left-to-right,
    # so the execution order matches the original sequential calls.
    outcomes = [
        test_model_b_dataset_b(),
        test_all_models_comprehensive(),
        test_model_architecture(),
    ]

    # Final results
    print("\n" + "=" * 75)
    if all(outcomes):
        print("🎉 All tests passed! Model B Dataset B is ready to use.")
        print("🏆 This model offers the highest accuracy (99.85%) of all available models!")
        print("📝 Note: Optimized for 20 carefully selected languages for maximum precision.")
    else:
        # Non-zero exit so CI treats any suite failure as a failed run.
        print("❌ Some tests failed. Please check the implementation.")
        sys.exit(1)
|