Update app.py
app.py CHANGED
@@ -15,6 +15,51 @@ load_dotenv()
 
 DEFAULT_FAST_TEXT_MODEL = "laurievb/OpenLID"
 
+# Language code mapping - feel free to expand this
+LANGUAGE_MAPPING = {
+    "spa_Latn": {"name": "Spanish", "iso_639_1": "es", "full_code": "es_ES"},
+    "eng_Latn": {"name": "English", "iso_639_1": "en", "full_code": "en_US"},
+    "fra_Latn": {"name": "French", "iso_639_1": "fr", "full_code": "fr_FR"},
+    "deu_Latn": {"name": "German", "iso_639_1": "de", "full_code": "de_DE"},
+    "ita_Latn": {"name": "Italian", "iso_639_1": "it", "full_code": "it_IT"},
+    "por_Latn": {"name": "Portuguese", "iso_639_1": "pt", "full_code": "pt_PT"},
+    "rus_Cyrl": {"name": "Russian", "iso_639_1": "ru", "full_code": "ru_RU"},
+    "zho_Hans": {"name": "Chinese (Simplified)", "iso_639_1": "zh", "full_code": "zh_CN"},
+    "zho_Hant": {"name": "Chinese (Traditional)", "iso_639_1": "zh", "full_code": "zh_TW"},
+    "jpn_Jpan": {"name": "Japanese", "iso_639_1": "ja", "full_code": "ja_JP"},
+    "kor_Hang": {"name": "Korean", "iso_639_1": "ko", "full_code": "ko_KR"},
+    "ara_Arab": {"name": "Arabic", "iso_639_1": "ar", "full_code": "ar_SA"},
+    "hin_Deva": {"name": "Hindi", "iso_639_1": "hi", "full_code": "hi_IN"},
+    "cat_Latn": {"name": "Catalan", "iso_639_1": "ca", "full_code": "ca_ES"},
+    "glg_Latn": {"name": "Galician", "iso_639_1": "gl", "full_code": "gl_ES"},
+    "nld_Latn": {"name": "Dutch", "iso_639_1": "nl", "full_code": "nl_NL"},
+    "swe_Latn": {"name": "Swedish", "iso_639_1": "sv", "full_code": "sv_SE"},
+    "nor_Latn": {"name": "Norwegian", "iso_639_1": "no", "full_code": "no_NO"},
+    "dan_Latn": {"name": "Danish", "iso_639_1": "da", "full_code": "da_DK"},
+    "fin_Latn": {"name": "Finnish", "iso_639_1": "fi", "full_code": "fi_FI"},
+    "pol_Latn": {"name": "Polish", "iso_639_1": "pl", "full_code": "pl_PL"},
+    "ces_Latn": {"name": "Czech", "iso_639_1": "cs", "full_code": "cs_CZ"},
+    "hun_Latn": {"name": "Hungarian", "iso_639_1": "hu", "full_code": "hu_HU"},
+    "tur_Latn": {"name": "Turkish", "iso_639_1": "tr", "full_code": "tr_TR"},
+    "heb_Hebr": {"name": "Hebrew", "iso_639_1": "he", "full_code": "he_IL"},
+    "tha_Thai": {"name": "Thai", "iso_639_1": "th", "full_code": "th_TH"},
+    "vie_Latn": {"name": "Vietnamese", "iso_639_1": "vi", "full_code": "vi_VN"},
+    "ukr_Cyrl": {"name": "Ukrainian", "iso_639_1": "uk", "full_code": "uk_UA"},
+    "ell_Grek": {"name": "Greek", "iso_639_1": "el", "full_code": "el_GR"},
+    "bul_Cyrl": {"name": "Bulgarian", "iso_639_1": "bg", "full_code": "bg_BG"},
+    "ron_Latn": {"name": "Romanian", "iso_639_1": "ro", "full_code": "ro_RO"},
+    "hrv_Latn": {"name": "Croatian", "iso_639_1": "hr", "full_code": "hr_HR"},
+    "srp_Cyrl": {"name": "Serbian", "iso_639_1": "sr", "full_code": "sr_RS"},
+    "slv_Latn": {"name": "Slovenian", "iso_639_1": "sl", "full_code": "sl_SI"},
+    "slk_Latn": {"name": "Slovak", "iso_639_1": "sk", "full_code": "sk_SK"},
+    "est_Latn": {"name": "Estonian", "iso_639_1": "et", "full_code": "et_EE"},
+    "lav_Latn": {"name": "Latvian", "iso_639_1": "lv", "full_code": "lv_LV"},
+    "lit_Latn": {"name": "Lithuanian", "iso_639_1": "lt", "full_code": "lt_LT"},
+    "msa_Latn": {"name": "Malay", "iso_639_1": "ms", "full_code": "ms_MY"},
+    "ind_Latn": {"name": "Indonesian", "iso_639_1": "id", "full_code": "id_ID"},
+    "tgl_Latn": {"name": "Filipino", "iso_639_1": "tl", "full_code": "tl_PH"},
+}
+
 def load_model(repo_id: str) -> fasttext.FastText._FastText:
     model_path = hf_hub_download(repo_id, filename="model.bin")
     return fasttext.load_model(model_path)
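Not part of the commit, just a small standalone sketch of how the new mapping is typically consumed: OpenLID returns labels such as "__label__spa_Latn", so the "__label__" prefix has to be stripped before looking the short code up in LANGUAGE_MAPPING (the file's existing FASTTEXT_PREFIX_LENGTH = 9 constant, visible as context in the next hunk, corresponds to that prefix). The one-entry mapping below is copied from the diff so the snippet runs on its own; the raw label is an illustrative input, not real model output.

# Standalone sketch (not part of the diff): strip fastText's "__label__" prefix,
# then look the short code up in LANGUAGE_MAPPING.
LANGUAGE_MAPPING = {
    "spa_Latn": {"name": "Spanish", "iso_639_1": "es", "full_code": "es_ES"},
}
FASTTEXT_PREFIX_LENGTH = 9  # len("__label__")

raw_label = "__label__spa_Latn"                    # illustrative label, not real model output
code = raw_label[FASTTEXT_PREFIX_LENGTH:]          # -> "spa_Latn"
info = LANGUAGE_MAPPING.get(code, {"name": code})  # unmapped codes fall back to the raw code
print(info["name"])                                # -> Spanish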
@@ -39,6 +84,62 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
 
 FASTTEXT_PREFIX_LENGTH = 9 # fasttext labels are formatted like "__label__eng_Latn"
 
+def format_language_info(fasttext_code):
+    """Convert FastText language code to human readable format"""
+    if fasttext_code in LANGUAGE_MAPPING:
+        lang_info = LANGUAGE_MAPPING[fasttext_code]
+        return {
+            "name": lang_info["name"],
+            "iso_code": lang_info["iso_639_1"],
+            "full_code": lang_info["full_code"],
+            "fasttext_code": fasttext_code
+        }
+    else:
+        # Graceful fallback for unmapped languages
+        return {
+            "name": fasttext_code,
+            "iso_code": "unknown",
+            "full_code": "unknown",
+            "fasttext_code": fasttext_code
+        }
+
+def detect_language_segments(text, confidence_threshold=0.3):
+    """Detect language changes in text segments"""
+    # Split text into logical segments (sentences, clauses)
+    import re
+
+    # More sophisticated splitting on common separators
+    segments = re.split(r'[.!?;/|]\s+|\s+/\s+|\s+\|\s+', text.strip())
+    segments = [seg.strip() for seg in segments if seg.strip() and len(seg.strip()) > 10]
+
+    if len(segments) < 2:
+        return None
+
+    segment_results = []
+    for i, segment in enumerate(segments):
+        predictions = model_predict(segment, k=1)
+        if predictions and predictions[0]['score'] > confidence_threshold:
+            lang_info = format_language_info(predictions[0]['label'])
+            segment_results.append({
+                "segment_number": i + 1,
+                "text": segment,
+                "language": lang_info,
+                "confidence": predictions[0]['score']
+            })
+
+    # Check if we found different languages
+    languages_found = set(result['language']['fasttext_code'] for result in segment_results)
+
+    if len(languages_found) > 1:
+        return {
+            "is_multilingual": True,
+            "languages_detected": list(languages_found),
+            "segments": segment_results
+        }
+
+    return None
+
+
 # Load the model
 Path("code/models").mkdir(parents=True, exist_ok=True)
 model = fasttext.load_model(
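A rough illustration of what the two helpers added above return. format_language_info only needs LANGUAGE_MAPPING; detect_language_segments also calls the app's model_predict helper, which is not shown in this diff, so the sketch below stubs it with a hypothetical fake. Pasting the two helpers and the mapping into a scratch file together with this stub is enough to run it; the stub's labels and scores are made up.

# Hypothetical stub standing in for the real model_predict defined elsewhere in app.py.
def model_predict(segment, k=1):
    label = "spa_Latn" if "hola" in segment.lower() else "eng_Latn"
    return [{"label": label, "score": 0.95}]

# format_language_info("spa_Latn") returns:
# {"name": "Spanish", "iso_code": "es", "full_code": "es_ES", "fasttext_code": "spa_Latn"}

result = detect_language_segments(
    "Hello, how are you doing today? Hola, como estas hoy amigo?"
)
# With the stub above, result comes back roughly as:
# {"is_multilingual": True,
#  "languages_detected": ["eng_Latn", "spa_Latn"],   # order depends on set iteration
#  "segments": [{"segment_number": 1, "text": "Hello, ...", "language": {...}, "confidence": 0.95}, ...]}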
@@ -97,19 +198,37 @@ def simple_predict(text, num_predictions=3):
         # Calculate average scores for each language
         language_scores = valmap(get_mean_score, predictions_by_lang)
 
+        # Format results
+        # Format with human-readable language info
+        formatted_languages = {}
+        for fasttext_code, score in language_scores.items():
+            lang_info = format_language_info(fasttext_code)
+            formatted_languages[fasttext_code] = {
+                "score": score,
+                "language_info": lang_info
+            }
+
+        # Check for multilingual segments
+        segment_analysis = detect_language_segments(text)
+
         # Format results
         results = {
-            "detected_languages":
+            "detected_languages": formatted_languages,
             "language_counts": dict(language_counts),
             "total_predictions": len(all_predictions),
             "text_lines_analyzed": len(cleaned_lines)
         }
 
-
+        # Add segment analysis if multilingual
+        if segment_analysis:
+            results["segment_analysis"] = segment_analysis
 
+        return results
+
     except Exception as e:
         return {"error": f"Error during prediction: {str(e)}"}
 
+
 def batch_predict(text, threshold_percent=0.2):
     """More advanced prediction with filtering"""
     if not text or not text.strip():
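For reference, a hedged sketch of the payload simple_predict now assembles for multilingual input, following the keys built in the hunk above; every value below is invented for illustration, and segment_analysis is only attached when detect_language_segments reports more than one language.

# Illustrative shape only - not real output from the Space.
example_simple_predict_result = {
    "detected_languages": {
        "eng_Latn": {
            "score": 0.91,
            "language_info": {"name": "English", "iso_code": "en",
                              "full_code": "en_US", "fasttext_code": "eng_Latn"},
        },
        "spa_Latn": {
            "score": 0.88,
            "language_info": {"name": "Spanish", "iso_code": "es",
                              "full_code": "es_ES", "fasttext_code": "spa_Latn"},
        },
    },
    "language_counts": {"eng_Latn": 3, "spa_Latn": 2},
    "total_predictions": 5,
    "text_lines_analyzed": 5,
    "segment_analysis": {  # present only for multilingual input
        "is_multilingual": True,
        "languages_detected": ["eng_Latn", "spa_Latn"],
        "segments": [],    # per-segment dicts as built in detect_language_segments
    },
}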
@@ -135,18 +254,35 @@ def batch_predict(text, threshold_percent=0.2):
         keys_to_keep = filter_by_frequency(language_counts, threshold_percent=threshold_percent)
         filtered_dict = {k: v for k, v in predictions_by_lang.items() if k in keys_to_keep}
 
+        # Format with human-readable language info
+        formatted_predictions = {}
+        for fasttext_code, score in valmap(get_mean_score, filtered_dict).items():
+            lang_info = format_language_info(fasttext_code)
+            formatted_predictions[fasttext_code] = {
+                "score": score,
+                "language_info": lang_info
+            }
+
+        # Check for multilingual segments
+        segment_analysis = detect_language_segments(text)
+
         results = {
-            "predictions":
+            "predictions": formatted_predictions,
             "all_language_counts": dict(language_counts),
            "filtered_languages": list(keys_to_keep),
             "threshold_used": threshold_percent
         }
 
+        # Add segment analysis if multilingual
+        if segment_analysis:
+            results["segment_analysis"] = segment_analysis
+
         return results
 
     except Exception as e:
         return {"error": f"Error during prediction: {str(e)}"}
 
+
 def build_demo_interface():
     app_title = "Language Detection Tool"
     with gr.Blocks(title=app_title) as demo:
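The diff ends before the body of build_demo_interface, so the wiring below is only a hypothetical sketch of how a prediction function like simple_predict could be exposed through a Gradio Blocks UI with a JSON output; the Space's actual interface is not shown in this commit and will differ. The stubbed simple_predict exists only so the snippet runs standalone.

import gradio as gr

def simple_predict(text):
    # Stand-in for the real simple_predict in app.py, just so this sketch runs.
    return {"detected_languages": {}, "language_counts": {},
            "total_predictions": 0, "text_lines_analyzed": 0}

with gr.Blocks(title="Language Detection Tool") as demo:
    text_in = gr.Textbox(label="Text to analyze", lines=5)
    detect_btn = gr.Button("Detect languages")
    json_out = gr.JSON(label="Detection results")
    detect_btn.click(fn=simple_predict, inputs=text_in, outputs=json_out)

if __name__ == "__main__":
    demo.launch()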
|