Update app.py

app.py CHANGED
@@ -15,6 +15,51 @@ load_dotenv()
 
 DEFAULT_FAST_TEXT_MODEL = "laurievb/OpenLID"
 
+# Language code mapping - feel free to expand this
+LANGUAGE_MAPPING = {
+    "spa_Latn": {"name": "Spanish", "iso_639_1": "es", "full_code": "es_ES"},
+    "eng_Latn": {"name": "English", "iso_639_1": "en", "full_code": "en_US"},
+    "fra_Latn": {"name": "French", "iso_639_1": "fr", "full_code": "fr_FR"},
+    "deu_Latn": {"name": "German", "iso_639_1": "de", "full_code": "de_DE"},
+    "ita_Latn": {"name": "Italian", "iso_639_1": "it", "full_code": "it_IT"},
+    "por_Latn": {"name": "Portuguese", "iso_639_1": "pt", "full_code": "pt_PT"},
+    "rus_Cyrl": {"name": "Russian", "iso_639_1": "ru", "full_code": "ru_RU"},
+    "zho_Hans": {"name": "Chinese (Simplified)", "iso_639_1": "zh", "full_code": "zh_CN"},
+    "zho_Hant": {"name": "Chinese (Traditional)", "iso_639_1": "zh", "full_code": "zh_TW"},
+    "jpn_Jpan": {"name": "Japanese", "iso_639_1": "ja", "full_code": "ja_JP"},
+    "kor_Hang": {"name": "Korean", "iso_639_1": "ko", "full_code": "ko_KR"},
+    "ara_Arab": {"name": "Arabic", "iso_639_1": "ar", "full_code": "ar_SA"},
+    "hin_Deva": {"name": "Hindi", "iso_639_1": "hi", "full_code": "hi_IN"},
+    "cat_Latn": {"name": "Catalan", "iso_639_1": "ca", "full_code": "ca_ES"},
+    "glg_Latn": {"name": "Galician", "iso_639_1": "gl", "full_code": "gl_ES"},
+    "nld_Latn": {"name": "Dutch", "iso_639_1": "nl", "full_code": "nl_NL"},
+    "swe_Latn": {"name": "Swedish", "iso_639_1": "sv", "full_code": "sv_SE"},
+    "nor_Latn": {"name": "Norwegian", "iso_639_1": "no", "full_code": "no_NO"},
+    "dan_Latn": {"name": "Danish", "iso_639_1": "da", "full_code": "da_DK"},
+    "fin_Latn": {"name": "Finnish", "iso_639_1": "fi", "full_code": "fi_FI"},
+    "pol_Latn": {"name": "Polish", "iso_639_1": "pl", "full_code": "pl_PL"},
+    "ces_Latn": {"name": "Czech", "iso_639_1": "cs", "full_code": "cs_CZ"},
+    "hun_Latn": {"name": "Hungarian", "iso_639_1": "hu", "full_code": "hu_HU"},
+    "tur_Latn": {"name": "Turkish", "iso_639_1": "tr", "full_code": "tr_TR"},
+    "heb_Hebr": {"name": "Hebrew", "iso_639_1": "he", "full_code": "he_IL"},
+    "tha_Thai": {"name": "Thai", "iso_639_1": "th", "full_code": "th_TH"},
+    "vie_Latn": {"name": "Vietnamese", "iso_639_1": "vi", "full_code": "vi_VN"},
+    "ukr_Cyrl": {"name": "Ukrainian", "iso_639_1": "uk", "full_code": "uk_UA"},
+    "ell_Grek": {"name": "Greek", "iso_639_1": "el", "full_code": "el_GR"},
+    "bul_Cyrl": {"name": "Bulgarian", "iso_639_1": "bg", "full_code": "bg_BG"},
+    "ron_Latn": {"name": "Romanian", "iso_639_1": "ro", "full_code": "ro_RO"},
+    "hrv_Latn": {"name": "Croatian", "iso_639_1": "hr", "full_code": "hr_HR"},
+    "srp_Cyrl": {"name": "Serbian", "iso_639_1": "sr", "full_code": "sr_RS"},
+    "slv_Latn": {"name": "Slovenian", "iso_639_1": "sl", "full_code": "sl_SI"},
+    "slk_Latn": {"name": "Slovak", "iso_639_1": "sk", "full_code": "sk_SK"},
+    "est_Latn": {"name": "Estonian", "iso_639_1": "et", "full_code": "et_EE"},
+    "lav_Latn": {"name": "Latvian", "iso_639_1": "lv", "full_code": "lv_LV"},
+    "lit_Latn": {"name": "Lithuanian", "iso_639_1": "lt", "full_code": "lt_LT"},
+    "msa_Latn": {"name": "Malay", "iso_639_1": "ms", "full_code": "ms_MY"},
+    "ind_Latn": {"name": "Indonesian", "iso_639_1": "id", "full_code": "id_ID"},
+    "tgl_Latn": {"name": "Filipino", "iso_639_1": "tl", "full_code": "tl_PH"},
+}
+
 def load_model(repo_id: str) -> fasttext.FastText._FastText:
     model_path = hf_hub_download(repo_id, filename="model.bin")
     return fasttext.load_model(model_path)
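A note on how this table is keyed: OpenLID is a fastText model, so raw predictions come back as labels like "__label__spa_Latn", and slicing off the 9-character "__label__" prefix yields the LANGUAGE_MAPPING keys. A minimal sketch, illustration only and not part of this commit, assuming it runs in app.py's module namespace:

# Illustration: resolve a raw fastText label through LANGUAGE_MAPPING.
label = "__label__spa_Latn"
code = label[len("__label__"):]  # "spa_Latn"
info = LANGUAGE_MAPPING.get(code, {})
print(info.get("name"), info.get("iso_639_1"), info.get("full_code"))
# -> Spanish es es_ES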
@@ -39,6 +84,62 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
 
 FASTTEXT_PREFIX_LENGTH = 9  # fasttext labels are formatted like "__label__eng_Latn"
 
+def format_language_info(fasttext_code):
+    """Convert FastText language code to human readable format"""
+    if fasttext_code in LANGUAGE_MAPPING:
+        lang_info = LANGUAGE_MAPPING[fasttext_code]
+        return {
+            "name": lang_info["name"],
+            "iso_code": lang_info["iso_639_1"],
+            "full_code": lang_info["full_code"],
+            "fasttext_code": fasttext_code
+        }
+    else:
+        # Graceful fallback for unmapped languages
+        return {
+            "name": fasttext_code,
+            "iso_code": "unknown",
+            "full_code": "unknown",
+            "fasttext_code": fasttext_code
+        }
+
+def detect_language_segments(text, confidence_threshold=0.3):
+    """Detect language changes in text segments"""
+    # Split text into logical segments (sentences, clauses)
+    import re
+
+    # More sophisticated splitting on common separators
+    segments = re.split(r'[.!?;/|]\s+|\s+/\s+|\s+\|\s+', text.strip())
+    segments = [seg.strip() for seg in segments if seg.strip() and len(seg.strip()) > 10]
+
+    if len(segments) < 2:
+        return None
+
+    segment_results = []
+    for i, segment in enumerate(segments):
+        predictions = model_predict(segment, k=1)
+        if predictions and predictions[0]['score'] > confidence_threshold:
+            lang_info = format_language_info(predictions[0]['label'])
+            segment_results.append({
+                "segment_number": i + 1,
+                "text": segment,
+                "language": lang_info,
+                "confidence": predictions[0]['score']
+            })
+
+    # Check if we found different languages
+    languages_found = set(result['language']['fasttext_code'] for result in segment_results)
+
+    if len(languages_found) > 1:
+        return {
+            "is_multilingual": True,
+            "languages_detected": list(languages_found),
+            "segments": segment_results
+        }
+
+    return None
+
+
 # Load the model
 Path("code/models").mkdir(parents=True, exist_ok=True)
 model = fasttext.load_model(
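For illustration, assuming model_predict (defined elsewhere in app.py) returns entries like {"label": "eng_Latn", "score": 0.98} with the "__label__" prefix already stripped, as the FASTTEXT_PREFIX_LENGTH constant and the format_language_info lookup above imply, a bilingual input would yield roughly this (scores made up):

# Illustration only; exact scores depend on the OpenLID model.
result = detect_language_segments(
    "The weather is lovely today. El tiempo es precioso hoy."
)
# Plausible shape of the return value:
# {
#     "is_multilingual": True,
#     "languages_detected": ["eng_Latn", "spa_Latn"],
#     "segments": [
#         {"segment_number": 1,
#          "text": "The weather is lovely today",
#          "language": {"name": "English", "iso_code": "en",
#                       "full_code": "en_US", "fasttext_code": "eng_Latn"},
#          "confidence": 0.98},
#         ...
#     ],
# }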
@@ -97,19 +198,37 @@ def simple_predict(text, num_predictions=3):
         # Calculate average scores for each language
         language_scores = valmap(get_mean_score, predictions_by_lang)
 
+        # Format with human-readable language info
+        formatted_languages = {}
+        for fasttext_code, score in language_scores.items():
+            lang_info = format_language_info(fasttext_code)
+            formatted_languages[fasttext_code] = {
+                "score": score,
+                "language_info": lang_info
+            }
+
+        # Check for multilingual segments
+        segment_analysis = detect_language_segments(text)
+
         # Format results
         results = {
-            "detected_languages":
+            "detected_languages": formatted_languages,
             "language_counts": dict(language_counts),
             "total_predictions": len(all_predictions),
             "text_lines_analyzed": len(cleaned_lines)
         }
 
+        # Add segment analysis if multilingual
+        if segment_analysis:
+            results["segment_analysis"] = segment_analysis
 
+        return results
+
     except Exception as e:
         return {"error": f"Error during prediction: {str(e)}"}
 
+
 def batch_predict(text, threshold_percent=0.2):
     """More advanced prediction with filtering"""
     if not text or not text.strip():
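The net effect on simple_predict is that "detected_languages" now nests a mean score plus human-readable info per language code. A sketch of the expected shape (values illustrative):

out = simple_predict("Hello world.\nThis is a test.")
# {
#     "detected_languages": {
#         "eng_Latn": {
#             "score": 0.97,
#             "language_info": {"name": "English", "iso_code": "en",
#                               "full_code": "en_US",
#                               "fasttext_code": "eng_Latn"}
#         }
#     },
#     "language_counts": {"eng_Latn": 2},
#     "total_predictions": 2,
#     "text_lines_analyzed": 2,
#     # "segment_analysis": {...}  # only present for multilingual input
# }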
@@ -135,18 +254,35 @@ def batch_predict(text, threshold_percent=0.2):
         keys_to_keep = filter_by_frequency(language_counts, threshold_percent=threshold_percent)
         filtered_dict = {k: v for k, v in predictions_by_lang.items() if k in keys_to_keep}
 
+        # Format with human-readable language info
+        formatted_predictions = {}
+        for fasttext_code, score in valmap(get_mean_score, filtered_dict).items():
+            lang_info = format_language_info(fasttext_code)
+            formatted_predictions[fasttext_code] = {
+                "score": score,
+                "language_info": lang_info
+            }
+
+        # Check for multilingual segments
+        segment_analysis = detect_language_segments(text)
+
         results = {
-            "predictions":
+            "predictions": formatted_predictions,
             "all_language_counts": dict(language_counts),
             "filtered_languages": list(keys_to_keep),
             "threshold_used": threshold_percent
         }
 
+        # Add segment analysis if multilingual
+        if segment_analysis:
+            results["segment_analysis"] = segment_analysis
+
         return results
 
     except Exception as e:
         return {"error": f"Error during prediction: {str(e)}"}
 
+
 def build_demo_interface():
     app_title = "Language Detection Tool"
     with gr.Blocks(title=app_title) as demo:
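From the caller's side the API is unchanged; a usage sketch (illustrative, assuming the Space's model is loaded):

out = batch_predict("Hola, ¿cómo estás? / Hello, how are you?", threshold_percent=0.2)
# filter_by_frequency drops low-frequency languages from "predictions";
# they remain visible in "all_language_counts".
for code, entry in out.get("predictions", {}).items():
    info = entry["language_info"]
    print(code, info["name"], info["iso_code"], round(entry["score"], 3))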