|
import logging |
|
from typing import Dict |
|
from datetime import datetime |
|
from deep_translator import GoogleTranslator |
|
from langdetect import detect, DetectorFactory |
|
import time |
|
import re |
|
|
|
|
|
# Module-wide logger, named after the importing module path.
logger = logging.getLogger(__name__)

# langdetect is non-deterministic by default; pinning the seed makes repeated
# detections of the same text return the same language code.
DetectorFactory.seed = 0
|
|
|
class MultilingualTranslationModel:
    """Detect the language of chat messages and translate them.

    Translation is delegated to ``GoogleTranslator`` and language detection
    to ``langdetect.detect``.  Romanised Hindi mixed with English
    ("Hinglish") is recognised with a small keyword heuristic and handled
    through a dedicated path when the target language is English.
    """

    # Minimum fraction of words that must be Hinglish markers for a text to
    # be classified as Hinglish.
    HINGLISH_THRESHOLD = 0.3

    def __init__(self):
        """Set up the supported-language table and the Hinglish marker patterns."""
        # Language code -> display name; also the whitelist of target
        # languages accepted by ``process_message``.
        self.supported_languages = {
            'en': 'English',
            'hi': 'Hindi',
            'bn': 'Bengali',
            'te': 'Telugu',
            'ta': 'Tamil',
            'mr': 'Marathi',
            'gu': 'Gujarati',
            'kn': 'Kannada',
            'ml': 'Malayalam',
            'pa': 'Punjabi',
        }

        # Regex for a common Hinglish word -> rough English replacement.
        # Used both to recognise Hinglish and as a last-resort word-by-word
        # substitution when the translator gives nothing useful back.
        self.hinglish_patterns = {
            r'\b(nahi|nhi)\b': 'not',
            r'\b(hai|he)\b': 'is',
            r'\bpaani\b': 'water',
            r'\bkharab\b': 'broken',
            r'\bgaya\b': 'gone',
            r'\braha\b': 'staying',
            r'\bho\b': 'happening',
            r'\bme\b': 'in',
            r'\bka\b': 'of',
            r'\bki\b': 'of',
        }

    @staticmethod
    def _utc_timestamp() -> str:
        """Return the current UTC time as a naive ISO-8601 string.

        The output has the same shape as ``datetime.utcnow().isoformat()``
        (no UTC offset suffix) without calling the deprecated ``utcnow``.
        """
        # Imported locally because the file-level import only brings in
        # ``datetime`` itself.
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    def _apply_hinglish_patterns(self, text: str) -> str:
        """Lower-case ``text`` and substitute every known Hinglish marker."""
        converted = text.lower()
        for pattern, replacement in self.hinglish_patterns.items():
            converted = re.sub(pattern, replacement, converted)
        return converted

    def is_hinglish(self, text: str) -> bool:
        """Check if text is likely Hinglish.

        A text counts as Hinglish when more than ``HINGLISH_THRESHOLD`` of
        its whitespace-separated words match one of ``hinglish_patterns``.

        Args:
            text: The message to inspect.  ``None`` or blank text is never
                Hinglish.

        Returns:
            ``True`` if the share of marker words exceeds the threshold.
        """
        words = text.lower().split() if text else []
        if not words:
            return False

        # Count marker *words*, not distinct patterns: the number of patterns
        # is fixed, so a per-pattern count could never reach the threshold on
        # long messages and ignored repeated marker words.
        marker_words = sum(
            1
            for word in words
            if any(re.search(pattern, word) for pattern in self.hinglish_patterns)
        )
        return marker_words / len(words) > self.HINGLISH_THRESHOLD

    def convert_hinglish_to_english(self, text: str) -> str:
        """Convert Hinglish text to proper English.

        The translator is tried first.  If it returns nothing, or simply
        echoes the input back, the marker patterns are applied word by word
        instead.

        Args:
            text: Hinglish input text.

        Returns:
            The translated text, the pattern-substituted (lower-cased) text,
            or the unchanged input if the translator raised.
        """
        try:
            result = GoogleTranslator(source='auto', target='en').translate(text)

            # An empty or unchanged result means the translator did not
            # understand the input; fall back to keyword substitution.
            if not result or result.lower() == text.lower():
                return self._apply_hinglish_patterns(text)

            return result

        except Exception as exc:
            # Best effort: a translator failure must not break the caller.
            logger.error("Hinglish conversion failed: %s", exc)
            return text

    def detect_language(self, text: str, max_retries=3) -> str:
        """Detect the language of input text with retry logic.

        Args:
            text: The message whose language should be identified.
            max_retries: Maximum number of ``detect`` attempts.

        Returns:
            ``'hi'`` for Hinglish text, otherwise the code reported by
            ``detect``; ``'en'`` if every attempt failed.
        """
        # Romanised Hindi is reported as Hindi without consulting langdetect.
        if self.is_hinglish(text):
            logger.info("Detected Hinglish text")
            return 'hi'

        for attempt in range(max_retries):
            try:
                detected_lang = detect(text)
                logger.info("Detected language: %s", detected_lang)
                return detected_lang

            except Exception as exc:
                logger.error(
                    "Language detection attempt %d failed: %s", attempt + 1, exc
                )
                # Pause between attempts, but not after the last one.
                if attempt < max_retries - 1:
                    time.sleep(1)

        # Detection never succeeded; default to English.
        return 'en'

    def translate(self, text: str, target_lang: str = 'en', max_retries=3) -> Dict:
        """Translate text to target language with retry logic.

        Args:
            text: The text to translate.
            target_lang: Language code to translate into.
            max_retries: Maximum number of translation attempts.

        Returns:
            On success a dict with ``translated_text``, ``source_language``,
            ``target_language``, ``confidence`` (fixed at 1.0 when the text
            is already in the target language, 0.9 otherwise) and
            ``timestamp``.  After all attempts fail, a dict with ``error``
            and ``timestamp`` only.
        """
        source_lang = None
        hinglish_to_english = False

        for attempt in range(max_retries):
            try:
                if source_lang is None:
                    # Detect only once: detection has its own retry/sleep
                    # loop and its outcome does not change between
                    # translation attempts.
                    source_lang = self.detect_language(text)
                    logger.info("Source language detected: %s", source_lang)
                    hinglish_to_english = (
                        target_lang == 'en' and self.is_hinglish(text)
                    )

                if hinglish_to_english:
                    translated_text = self.convert_hinglish_to_english(text)
                elif source_lang == target_lang:
                    # Already in the requested language: return it untouched.
                    return {
                        'translated_text': text,
                        'source_language': source_lang,
                        'target_language': target_lang,
                        'confidence': 1.0,
                        'timestamp': self._utc_timestamp(),
                    }
                else:
                    translator = GoogleTranslator(source='auto', target=target_lang)
                    translated_text = translator.translate(text)

                logger.info("Translation result: %s", translated_text)

                return {
                    'translated_text': translated_text,
                    'source_language': source_lang,
                    'target_language': target_lang,
                    'confidence': 0.9,
                    'timestamp': self._utc_timestamp(),
                }

            except Exception as exc:
                logger.error("Translation attempt %d failed: %s", attempt + 1, exc)
                # Pause between attempts, but not after the last one.
                if attempt < max_retries - 1:
                    time.sleep(1)

        return {
            'error': 'Translation failed after maximum retries',
            'timestamp': self._utc_timestamp(),
        }

    def process_message(self, message_data: Dict) -> Dict:
        """Process a chat message and return translation.

        Args:
            message_data: Mapping with ``user_message`` (required) and an
                optional ``target_language`` code.  A missing, ``None`` or
                empty target defaults to ``'en'``; the code is compared
                case-insensitively and ignoring surrounding whitespace.

        Returns:
            A dict with the original and translated message, source and
            target language, the target language's display name, confidence,
            timestamp and ``error`` (``None`` on success).  For a missing
            message, an unsupported target language or an unexpected
            exception, a dict with only ``error`` and ``timestamp``.
        """
        try:
            user_message = message_data.get('user_message')
            if not user_message:
                return {
                    'error': 'No message provided',
                    'timestamp': self._utc_timestamp(),
                }

            # Normalise defensively: callers may send None, padded or
            # non-string values for the language code.
            raw_target = message_data.get('target_language') or 'en'
            target_lang = str(raw_target).strip().lower()

            if target_lang not in self.supported_languages:
                return {
                    'error': f'Unsupported target language: {target_lang}',
                    'timestamp': self._utc_timestamp(),
                }

            result = self.translate(user_message, target_lang)

            # ``translate`` returns either a success dict or an error dict;
            # ``.get`` with defaults lets both shapes map onto one response.
            return {
                'original_message': user_message,
                'translated_message': result.get('translated_text', ''),
                'source_language': result.get('source_language', ''),
                'target_language': target_lang,
                'language_name': self.supported_languages.get(target_lang, ''),
                'confidence': result.get('confidence', 0.0),
                'timestamp': result.get('timestamp', self._utc_timestamp()),
                'error': result.get('error'),
            }

        except Exception as exc:
            logger.error("Message processing error: %s", exc)
            return {
                'error': str(exc),
                'timestamp': self._utc_timestamp(),
            }
|
|