Spaces:

yangdingcheok
/

language-detection-assignment

Running

language-detection-assignment / backend /models /model_b_dataset_b.py

yangding14

first commit

72f90b1 5 days ago

5.9 kB

	"""
	Model B Dataset B - BERT Language Detection

	This module implements the BERT-based language detection model
	fine-tuned on Dataset B (an enhanced/specialized language detection dataset).

	Model Architecture: BERT (Model B)
	Training Dataset: Dataset B (enhanced/specialized)
	Performance: 99.85% accuracy across 20 carefully selected languages
	"""

	import logging
	from typing import Dict, List, Any

	from .base_model import BaseLanguageModel
	from .model_config import get_model_config, get_supported_languages, get_language_name

	# Optional dependency guard: the module must stay importable without
	# transformers so the failure can be reported when the model is constructed.
	try:
	    from transformers import pipeline
	except ImportError:
	    HF_AVAILABLE = False
	    logging.warning("Transformers library not available. Please install with: pip install transformers torch")
	else:
	    # Only reached when the import succeeded, so `pipeline` is bound.
	    HF_AVAILABLE = True


	class ModelBDatasetB(BaseLanguageModel):
	    """
	    BERT-based language detection model (Model B) trained on Dataset B.

	    Wraps a Hugging Face ``text-classification`` pipeline. The model name
	    and all descriptive metadata are read from the centralized model
	    configuration under the key ``"model-b-dataset-b"``.

	    Architecture: BERT (Model B)
	    Dataset: Dataset B (enhanced/specialized)
	    Base Model: bert-base-multilingual-cased
	    Accuracy: 99.85%
	    Parameters: 178M
	    Training Loss: 0.0125
	    """

	    def __init__(self):
	        """
	        Load the model configuration and build the classification pipeline.

	        Raises:
	            ImportError: If the transformers library could not be imported.
	            RuntimeError: If the pipeline cannot be created; the underlying
	                error is chained as ``__cause__``.
	        """
	        self.model_key = "model-b-dataset-b"
	        self.config = get_model_config(self.model_key)
	        self.model_name = self.config["huggingface_model"]

	        # Check if transformers library is available
	        if not HF_AVAILABLE:
	            raise ImportError(
	                "Transformers library required for Model B Dataset B. "
	                "Install with: pip install transformers torch"
	            )

	        # Initialize the model pipeline
	        try:
	            self.classifier = pipeline(
	                "text-classification",
	                model=self.model_name,
	                # GPU 0 when usable, otherwise CPU; a hard-coded GPU index
	                # makes construction fail on hosts without CUDA.
	                device=self._select_device(),
	                top_k=None,  # Return all scores
	            )
	            logging.info(
	                "Successfully loaded %s (%s)",
	                self.config['display_name'],
	                self.model_name,
	            )
	        except Exception as e:
	            logging.error("Failed to load %s: %s", self.config['display_name'], e)
	            raise RuntimeError(f"Could not initialize Model B Dataset B: {str(e)}") from e

	    @staticmethod
	    def _select_device() -> int:
	        """Return 0 (first CUDA GPU) when torch reports CUDA, else -1 (CPU)."""
	        try:
	            import torch
	        except ImportError:
	            # Without torch there is no way to probe for CUDA here.
	            return -1
	        return 0 if torch.cuda.is_available() else -1

	    @staticmethod
	    def _flatten_pipeline_output(results: Any) -> List[Dict[str, Any]]:
	        """Return the flat list of label/score dicts from raw pipeline output."""
	        if not isinstance(results, list) or not results:
	            raise ValueError("Unexpected pipeline output format")
	        # Nested list format: [[{'label': 'en', 'score': 0.99}, ...]]
	        # Direct list format: [{'label': 'en', 'score': 0.99}, ...]
	        first = results[0]
	        return first if isinstance(first, list) else results

	    def predict(self, text: str) -> Dict[str, Any]:
	        """
	        Predict language using Model B Dataset B (BERT enhanced).

	        Args:
	            text (str): Input text to analyze

	        Returns:
	            Dict with the keys ``'predictions'`` (list of
	            ``{'language_code', 'confidence'}`` dicts sorted by descending
	            confidence), ``'text_length'`` (length of the untruncated
	            input), ``'model_version'`` and ``'model_type'``.

	        Raises:
	            ValueError: If ``text`` is empty or only whitespace.
	            RuntimeError: If the pipeline call or result handling fails; the
	                underlying error is chained as ``__cause__``.
	        """
	        if not text or not text.strip():
	            raise ValueError("Input text cannot be empty")

	        try:
	            # Truncate to the tokenizer's maximum length so long inputs do
	            # not exceed the model's sequence limit and fail.
	            results = self.classifier(text, truncation=True)

	            prediction_list = self._flatten_pipeline_output(results)

	            # Sort predictions by confidence score (descending)
	            ranked = sorted(prediction_list, key=lambda x: x['score'], reverse=True)
	            predictions = [
	                {
	                    'language_code': result['label'].lower(),
	                    'confidence': result['score'],
	                }
	                for result in ranked
	            ]

	            return {
	                'predictions': predictions,
	                'text_length': len(text),
	                'model_version': self.model_name,
	                'model_type': f"{self.config['architecture'].lower()}-{self.config['dataset'].lower().replace(' ', '-')}"
	            }

	        except Exception as e:
	            logging.error("Model B Dataset B prediction failed: %s", e)
	            raise RuntimeError(f"Model prediction failed: {str(e)}") from e

	    def get_supported_languages(self) -> List[str]:
	        """
	        Get supported languages for Model B Dataset B.

	        Returns:
	            The language codes registered for this model key in the
	            centralized configuration (documented as 20 ISO 639-1 codes).
	        """
	        # Inside the method body this name resolves to the module-level
	        # helper imported from model_config, not to this method.
	        return get_supported_languages(self.model_key)

	    def get_model_info(self) -> Dict[str, Any]:
	        """
	        Get detailed information about Model B Dataset B.

	        Returns:
	            Dict of model metadata built from the centralized config. Every
	            key except ``training_loss`` is required in the config;
	            ``training_loss`` falls back to ``'N/A'`` and is always a string.
	        """
	        # Build comprehensive model info from centralized config
	        model_info = {
	            "name": self.config["display_name"],
	            "description": self.config["description"],
	            "accuracy": self.config["accuracy"],
	            "model_size": self.config["model_size"],
	            "architecture": self.config["architecture"],
	            "base_model": self.config["base_model"],
	            "dataset": self.config["dataset"],
	            "languages_supported": f"{self.config['languages_supported']} carefully selected languages: Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Italian, Japanese, Dutch, Polish, Portuguese, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese",
	            "training_details": self.config["training_details"],
	            "training_loss": f"{self.config.get('training_loss', 'N/A')}",
	            "use_cases": self.config["use_cases"],
	            "strengths": self.config["strengths"],
	            "limitations": self.config["limitations"]
	        }

	        return model_info