Delete language_detection.py

language_detection.py (DELETED, +0 -373)

@@ -1,373 +0,0 @@
# Standard library imports
import logging
import re
from typing import List

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class LanguageDetector:
    """
    A language detection system that provides balanced detection across multiple languages
    using an enhanced statistical approach.
    """

    def __init__(self):
        """Initialize the language detector with statistical language models"""
        logger.info("Initializing language detector with statistical models")

        # Initialize language indicators dictionary for statistical detection
        self._init_language_indicators()
        # Set thresholds for language detection confidence
        self.single_lang_confidence = 65  # Minimum score to consider a language detected
        self.secondary_lang_threshold = 0.75  # Secondary language must score at least this fraction of the primary score
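        # Note: the four scoring phases in _detect_statistically award up to
        # 30 (characters) + 30 (words) + 20 (n-grams) + 20 (historical) = 100
        # points per language, so a confidence of 65 requires strong signals
        # from several phases at once.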

    def _init_language_indicators(self):
        """Initialize language indicators for statistical detection with historical markers"""
        # Define indicators for all supported languages with equal detail level
        # Each language has:
        # - Distinctive characters
        # - Common words (including historical forms)
        # - N-grams (character sequences)
        # - Historical markers specific to older forms of the language
        self.language_indicators = {
            "English": {
                "chars": [],  # English uses basic Latin alphabet without special chars
                "words": ['the', 'and', 'of', 'to', 'in', 'a', 'is', 'that', 'for', 'it',
                          'with', 'as', 'be', 'on', 'by', 'at', 'this', 'have', 'from', 'or',
                          'an', 'but', 'not', 'what', 'all', 'were', 'when', 'we', 'there', 'can',
                          'would', 'who', 'you', 'been', 'one', 'their', 'has', 'more', 'if', 'no'],
                "ngrams": ['th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd', 'ti', 'es', 'or',
                           'ing', 'tion', 'the', 'and', 'tha', 'ent', 'ion'],
                "historical": {
                    "chars": ['þ', 'ȝ', 'æ', 'ſ'],  # Thorn, yogh, ash, long s
                    "words": ['thou', 'thee', 'thy', 'thine', 'hath', 'doth', 'ere', 'whilom', 'betwixt',
                              'ye', 'art', 'wast', 'dost', 'hast', 'shalt', 'mayst', 'verily'],
                    "patterns": ['eth$', '^y[^a-z]', 'ck$', 'aught', 'ought']  # -eth endings, y- prefixes
                }
            },
            "French": {
                "chars": ['é', 'è', 'ê', 'à', 'ç', 'ù', 'â', 'î', 'ô', 'û', 'ë', 'ï', 'ü'],
                "words": ['le', 'la', 'les', 'et', 'en', 'de', 'du', 'des', 'un', 'une', 'ce', 'cette',
                          'ces', 'dans', 'par', 'pour', 'sur', 'qui', 'que', 'quoi', 'où', 'quand', 'comment',
                          'est', 'sont', 'ont', 'nous', 'vous', 'ils', 'elles', 'avec', 'sans', 'mais', 'ou'],
                "ngrams": ['es', 'le', 'de', 'en', 'on', 'nt', 'qu', 'ai', 'an', 'ou', 'ur', 're', 'me',
                           'les', 'ent', 'que', 'des', 'ons', 'ant', 'ion'],
                "historical": {
                    "chars": ['ſ', 'æ', 'œ'],  # Long s and ligatures
                    "words": ['aultre', 'avecq', 'icelluy', 'oncques', 'moult', 'estre', 'mesme', 'ceste',
                              'ledict', 'celuy', 'ceulx', 'aulcun', 'ainſi', 'touſiours', 'eſtre',
                              'eſt', 'meſme', 'felon', 'auec', 'iufques', 'chofe', 'fcience'],
                    "patterns": ['oi[ts]$', 'oi[re]$', 'f[^aeiou]', 'ff', 'ſ', 'auoit', 'eſtoit',
                                 'ſi', 'ſur', 'ſa', 'cy', 'ayant', 'oy', 'uſ', 'auſ']
                }
            },
            "German": {
                "chars": ['ä', 'ö', 'ü', 'ß'],
                "words": ['der', 'die', 'das', 'und', 'in', 'zu', 'den', 'ein', 'eine', 'mit', 'ist', 'von',
                          'des', 'sich', 'auf', 'für', 'als', 'auch', 'werden', 'bei', 'durch', 'aus', 'sind',
                          'nicht', 'nur', 'wurde', 'wie', 'wenn', 'aber', 'noch', 'nach', 'so', 'sein', 'über'],
                "ngrams": ['en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge', 'un', 'sch', 'ich',
                           'den', 'die', 'und', 'der', 'ein', 'ung', 'cht'],
                "historical": {
                    "chars": ['ſ', 'ů', 'ė', 'ÿ'],
                    "words": ['vnnd', 'vnter', 'vnd', 'seyn', 'thun', 'auff', 'auß', 'deß', 'diß'],
                    "patterns": ['^v[nd]', 'th', 'vnter', 'ſch']
                }
            },
            "Spanish": {
                "chars": ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', '¿', '¡'],
                "words": ['el', 'la', 'los', 'las', 'de', 'en', 'y', 'a', 'que', 'por', 'un', 'una', 'no',
                          'es', 'con', 'para', 'su', 'al', 'se', 'del', 'como', 'más', 'pero', 'lo', 'mi',
                          'si', 'ya', 'todo', 'esta', 'cuando', 'hay', 'muy', 'bien', 'sin', 'así'],
                "ngrams": ['de', 'en', 'os', 'es', 'la', 'ar', 'el', 'er', 'ra', 'as', 'an', 'do', 'or',
                           'que', 'nte', 'los', 'ado', 'con', 'ent', 'ien'],
                "historical": {
                    "chars": ['ſ', 'ç', 'ñ'],
                    "words": ['facer', 'fijo', 'fermoso', 'agora', 'asaz', 'aver', 'caſa', 'deſde', 'eſte',
                              'eſta', 'eſto', 'deſto', 'deſta', 'eſſo', 'muger', 'dixo', 'fazer'],
                    "patterns": ['^f[aei]', 'ſſ', 'ſc', '^deſ', 'xo$', 'xe$']
                }
            },
            "Italian": {
                "chars": ['à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú'],
                "words": ['il', 'la', 'i', 'le', 'e', 'di', 'a', 'in', 'che', 'non', 'per', 'con', 'un',
                          'una', 'del', 'della', 'è', 'sono', 'da', 'si', 'come', 'anche', 'più', 'ma', 'ci',
                          'se', 'ha', 'mi', 'lo', 'ti', 'al', 'tu', 'questo', 'questi'],
                "ngrams": ['di', 'la', 'er', 'to', 're', 'co', 'de', 'in', 'ra', 'on', 'li', 'no', 'ri',
                           'che', 'ent', 'con', 'per', 'ion', 'ato', 'lla']
            },
            "Portuguese": {
                "chars": ['á', 'â', 'ã', 'à', 'é', 'ê', 'í', 'ó', 'ô', 'õ', 'ú', 'ç'],
                "words": ['o', 'a', 'os', 'as', 'de', 'em', 'e', 'do', 'da', 'dos', 'das', 'no', 'na',
                          'para', 'que', 'um', 'uma', 'por', 'com', 'se', 'não', 'mais', 'como', 'mas',
                          'você', 'eu', 'este', 'isso', 'ele', 'seu', 'sua', 'ou', 'já', 'me'],
                "ngrams": ['de', 'os', 'em', 'ar', 'es', 'ra', 'do', 'da', 'en', 'co', 'nt', 'ad', 'to',
                           'que', 'nto', 'ent', 'com', 'ção', 'ado', 'ment']
            },
            "Dutch": {
                "chars": ['ë', 'ï', 'ö', 'ü', 'é', 'è', 'ê', 'ç', 'á', 'à', 'ä', 'ó', 'ô', 'ú', 'ù', 'û', 'ij'],
                "words": ['de', 'het', 'een', 'en', 'van', 'in', 'is', 'dat', 'op', 'te', 'zijn', 'met',
                          'voor', 'niet', 'aan', 'er', 'die', 'maar', 'dan', 'ik', 'je', 'hij', 'zij', 'we',
                          'kunnen', 'wordt', 'nog', 'door', 'over', 'als', 'uit', 'bij', 'om', 'ook'],
                "ngrams": ['en', 'de', 'er', 'ee', 'ge', 'an', 'aa', 'in', 'te', 'et', 'ng', 'ee', 'or',
                           'van', 'het', 'een', 'ing', 'ver', 'den', 'sch']
            },
            "Russian": {
                # Russian (Cyrillic alphabet) characters
                "chars": ['а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
                          'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'],
                "words": ['и', 'в', 'не', 'на', 'что', 'я', 'с', 'а', 'то', 'он', 'как', 'этот', 'по',
                          'но', 'из', 'к', 'у', 'за', 'вы', 'все', 'так', 'же', 'от', 'для', 'о', 'его',
                          'мы', 'было', 'она', 'бы', 'мне', 'еще', 'есть', 'быть', 'был'],
                "ngrams": ['о', 'е', 'а', 'н', 'и', 'т', 'р', 'с', 'в', 'л', 'к', 'м', 'д',
                           'ст', 'но', 'то', 'ни', 'на', 'по', 'ет']
            },
            "Chinese": {
                "chars": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
                          '个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就',
                          '年', '生', '对', '能', '自', '那', '都', '得', '说', '过', '子', '家', '后', '多'],
                # Chinese doesn't have "words" in the same way as alphabetic languages
                "words": ['的', '是', '不', '了', '在', '和', '有', '我', '们', '人', '这', '上', '中',
                          '个', '大', '来', '到', '国', '时', '要', '地', '出', '会', '可', '也', '就'],
                "ngrams": ['的', '是', '不', '了', '在', '我', '有', '和', '人', '这', '中', '大', '来', '上',
                           '国', '个', '到', '说', '们', '为']
            },
            "Japanese": {
                # A mix of hiragana, katakana, and common kanji
                "chars": ['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ',
                          'ア', 'イ', 'ウ', 'エ', 'オ', 'カ', 'キ', 'ク', 'ケ', 'コ', 'サ', 'シ', 'ス', 'セ', 'ソ',
                          '日', '本', '人', '大', '小', '中', '山', '川', '田', '子', '女', '男', '月', '火', '水'],
                "words": ['は', 'を', 'に', 'の', 'が', 'で', 'へ', 'から', 'より', 'まで', 'だ', 'です', 'した',
                          'ます', 'ません', 'これ', 'それ', 'あれ', 'この', 'その', 'あの', 'わたし'],
                "ngrams": ['の', 'は', 'た', 'が', 'を', 'に', 'て', 'で', 'と', 'し', 'か', 'ま', 'こ', 'い',
                           'する', 'いる', 'れる', 'なる', 'れて', 'した']
            },
            "Korean": {
                "chars": ['가', '나', '다', '라', '마', '바', '사', '아', '자', '차', '카', '타', '파', '하',
                          '그', '는', '을', '이', '에', '에서', '로', '으로', '와', '과', '또는', '하지만'],
                "words": ['이', '그', '저', '나', '너', '우리', '그들', '이것', '그것', '저것', '은', '는',
                          '가', '을', '를', '에', '에서', '으로', '로', '와', '과', '의', '하다', '되다'],
                "ngrams": ['이', '다', '는', '에', '하', '고', '지', '서', '의', '가', '을', '로', '을', '으',
                           '니다', '습니', '하는', '이다', '에서', '하고']
            },
            "Arabic": {
                "chars": ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض',
                          'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ء', 'ة', 'ى'],
                "words": ['في', 'من', 'على', 'إلى', 'هذا', 'هذه', 'ذلك', 'تلك', 'هو', 'هي', 'هم', 'أنا',
                          'أنت', 'نحن', 'كان', 'كانت', 'يكون', 'لا', 'لم', 'ما', 'أن', 'و', 'أو', 'ثم', 'بعد'],
                "ngrams": ['ال', 'ان', 'في', 'من', 'ون', 'ين', 'ات', 'ار', 'ور', 'ما', 'لا', 'ها', 'ان',
                           'الم', 'لان', 'علا', 'الح', 'الس', 'الع', 'الت']
            },
            "Hindi": {
                "chars": ['अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ए', 'ऐ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ',
                          'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न',
                          'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ल', 'व', 'श', 'ष', 'स', 'ह', 'ा', 'ि', 'ी',
                          'ु', 'ू', 'े', 'ै', 'ो', 'ौ', '्', 'ं', 'ः'],
                "words": ['और', 'का', 'के', 'की', 'एक', 'में', 'है', 'यह', 'हैं', 'से', 'को', 'पर', 'इस',
                          'हो', 'गया', 'कर', 'मैं', 'या', 'हुआ', 'था', 'वह', 'अपने', 'सकता', 'ने', 'बहुत'],
                "ngrams": ['का', 'के', 'की', 'है', 'ने', 'से', 'मे', 'को', 'पर', 'हा', 'रा', 'ता', 'या',
                           'ार', 'ान', 'कार', 'राज', 'ारा', 'जाए', 'ेजा']
            },
            "Latin": {
                "chars": [],  # Latin uses basic Latin alphabet
                "words": ['et', 'in', 'ad', 'est', 'sunt', 'non', 'cum', 'sed', 'qui', 'quod', 'ut', 'si',
                          'nec', 'ex', 'per', 'quam', 'pro', 'iam', 'hoc', 'aut', 'esse', 'enim', 'de',
                          'atque', 'ac', 'ante', 'post', 'sub', 'ab'],
                "ngrams": ['us', 'is', 'um', 'er', 'it', 'nt', 'am', 'em', 're', 'at', 'ti', 'es', 'ur',
                           'tur', 'que', 'ere', 'ent', 'ius', 'rum', 'tus']
            },
            "Greek": {
                "chars": ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π',
                          'ρ', 'σ', 'ς', 'τ', 'υ', 'φ', 'χ', 'ψ', 'ω', 'ά', 'έ', 'ή', 'ί', 'ό', 'ύ', 'ώ'],
                "words": ['και', 'του', 'της', 'των', 'στο', 'στη', 'με', 'από', 'για', 'είναι', 'να',
                          'ότι', 'δεν', 'στον', 'μια', 'που', 'ένα', 'έχει', 'θα', 'το', 'ο', 'η', 'τον'],
                "ngrams": ['αι', 'τα', 'ου', 'τη', 'οι', 'το', 'ης', 'αν', 'ος', 'ον', 'ις', 'ει', 'ερ',
                           'και', 'την', 'τον', 'ους', 'νου', 'εντ', 'μεν']
            }
        }

    def detect_languages(self, text: str, filename: str = None, current_languages: List[str] = None) -> List[str]:
        """
        Detect languages in text using an enhanced statistical approach

        Args:
            text: Text to analyze
            filename: Optional filename to provide additional context
            current_languages: Optional list of languages already detected

        Returns:
            List of detected languages
        """
        logger = logging.getLogger("language_detector")

        # If no text provided, return current languages or default
        if not text or len(text.strip()) < 10:
            return current_languages if current_languages else ["English"]

        # If we already have detected languages, use them
        if current_languages and len(current_languages) > 0:
            logger.info(f"Using already detected languages: {current_languages}")
            return current_languages

        # Use enhanced statistical detection
        detected_languages = self._detect_statistically(text, filename)
        logger.info(f"Statistical language detection results: {detected_languages}")
        return detected_languages

    def _detect_statistically(self, text: str, filename: str = None) -> List[str]:
        """
        Detect languages using enhanced statistical analysis with historical language indicators

        Args:
            text: Text to analyze
            filename: Optional filename for additional context

        Returns:
            List of detected languages
        """
        logger = logging.getLogger("language_detector")

        # Normalize text to lowercase for consistent analysis
        text_lower = text.lower()
        words = re.findall(r'\b\w+\b', text_lower)  # Extract words

        # Score each language based on characters, words, n-grams, and historical markers
        language_scores = {}
        historical_bonus = {}

        # PHASE 1: Special character analysis
        # Count special characters for each language
        special_char_counts = {}
        total_special_chars = 0

        for language, indicators in self.language_indicators.items():
            chars = indicators["chars"]
            count = 0
            for char in chars:
                if char in text_lower:
                    count += text_lower.count(char)
            special_char_counts[language] = count
            total_special_chars += count

        # Normalize character scores (0-30 points)
        for language, count in special_char_counts.items():
            if total_special_chars > 0:
                # Scale score to 0-30 range (reduced from 35 to make room for historical markers)
                normalized_score = (count / total_special_chars) * 30
                language_scores[language] = normalized_score
            else:
                language_scores[language] = 0

        # PHASE 2: Word analysis (0-30 points)
        # Count common words for each language
        for language, indicators in self.language_indicators.items():
            word_list = indicators["words"]
            word_matches = sum(1 for word in words if word in word_list)

            # Normalize word score based on text length and word list size
            # (guard against texts that contain no word characters at all)
            word_score_factor = min(1.0, word_matches / (len(words) * 0.1)) if words else 0.0  # Max 1.0 if 10% match
            language_scores[language] = language_scores.get(language, 0) + (word_score_factor * 30)
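            # Illustrative arithmetic: in a 200-word text with 25 stop-word matches,
            # word_score_factor = min(1.0, 25 / 20) = 1.0, adding the full 30 points;
            # 10 matches would give 10 / 20 = 0.5, adding 15 points.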

        # PHASE 3: N-gram analysis (0-20 points)
        for language, indicators in self.language_indicators.items():
            ngram_list = indicators["ngrams"]
            ngram_matches = 0

            # Count ngram occurrences
            for ngram in ngram_list:
                ngram_matches += text_lower.count(ngram)

            # Normalize ngram score based on text length
            if len(text_lower) > 0:
                ngram_score_factor = min(1.0, ngram_matches / (len(text_lower) * 0.05))  # Max 1.0 if 5% match
                language_scores[language] = language_scores.get(language, 0) + (ngram_score_factor * 20)

        # PHASE 4: Historical language markers (0-20 points)
        for language, indicators in self.language_indicators.items():
            if "historical" in indicators:
                historical_indicators = indicators["historical"]
                historical_score = 0

                # Check for historical chars
                if "chars" in historical_indicators:
                    for char in historical_indicators["chars"]:
                        if char in text_lower:
                            historical_score += text_lower.count(char) * 0.5

                # Check for historical words
                if "words" in historical_indicators:
                    hist_words = historical_indicators["words"]
                    hist_word_matches = sum(1 for word in words if word in hist_words)
                    if hist_word_matches > 0:
                        # Historical words are strong indicators
                        historical_score += min(10, hist_word_matches * 2)

                # Check for historical patterns
                if "patterns" in historical_indicators:
                    for pattern in historical_indicators["patterns"]:
                        matches = len(re.findall(pattern, text_lower))
                        if matches > 0:
                            historical_score += min(5, matches * 0.5)

                # Cap historical score at 20 points
                historical_score = min(20, historical_score)
                historical_bonus[language] = historical_score

                # Apply historical bonus
                language_scores[language] += historical_score

            # Apply language-specific exclusivity multiplier if present
            if "exclusivity" in indicators:
                exclusivity = indicators["exclusivity"]
                language_scores[language] *= exclusivity
                logger.info(f"Applied exclusivity multiplier {exclusivity} to {language}")

        # Log historical bonuses for debugging
        for language, bonus in historical_bonus.items():
            if bonus > 0:
                logger.info(f"Historical language bonus for {language}: {bonus} points")

        # Final language selection with more stringent criteria
        # Get languages with scores above the threshold
        threshold = self.single_lang_confidence  # Higher minimum score
        candidates = [(lang, score) for lang, score in language_scores.items() if score >= threshold]
        candidates.sort(key=lambda x: x[1], reverse=True)
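        # Illustrative example: with scores {"French": 82, "Spanish": 64} and a
        # threshold of 65, only French becomes a candidate; Spanish misses the
        # cutoff even though 64/82 ≈ 0.78 would clear the 0.75 secondary ratio.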

        logger.info(f"Language candidates: {candidates}")

        # If we have candidate languages, return the top 1-2, with a higher bar for the secondary
        if candidates:
            # Always take the top language
            result = [candidates[0][0]]

            # Add a second language only if it is significantly strong compared to the primary
            # and doesn't have a historical/exclusivity conflict
            if len(candidates) > 1:
                primary_lang = candidates[0][0]
                secondary_lang = candidates[1][0]
                primary_score = candidates[0][1]
                secondary_score = candidates[1][1]

                # Only add the secondary if it meets the ratio threshold and doesn't conflict
                ratio = secondary_score / primary_score

                # Check for a French/Spanish conflict (historical French often gets misidentified as Spanish)
                historical_conflict = False
                if (primary_lang == "French" and secondary_lang == "Spanish" and
                        historical_bonus.get("French", 0) > 5):
                    historical_conflict = True
                    logger.info("Historical French markers detected, suppressing Spanish detection")

                if ratio >= self.secondary_lang_threshold and not historical_conflict:
                    result.append(secondary_lang)
                    logger.info(f"Added secondary language {secondary_lang} (score ratio: {ratio:.2f})")
                else:
                    logger.info(f"Rejected secondary language {secondary_lang} (score ratio: {ratio:.2f})")

            return result

        # Default to English if no clear signals
        return ["English"]
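
For orientation, a minimal usage sketch of the deleted class (illustrative only: the sample strings, and the assumption that the module was importable as language_detection, are not part of the commit):

from language_detection import LanguageDetector

detector = LanguageDetector()

# A longer accented sentence is scored across all four phases; the top one or two languages are returned.
print(detector.detect_languages("El niño está en la casa y no quiere salir porque está lloviendo."))

# Text under 10 characters falls back to current_languages, else ["English"].
print(detector.detect_languages("hola", current_languages=["Spanish"]))  # -> ["Spanish"]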