bwingenroth committed on
Commit
0f7e916
·
verified ·
1 Parent(s): bd40e81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -3
app.py CHANGED
@@ -15,6 +15,51 @@ load_dotenv()
15
 
16
  DEFAULT_FAST_TEXT_MODEL = "laurievb/OpenLID"
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def load_model(repo_id: str) -> fasttext.FastText._FastText:
19
  model_path = hf_hub_download(repo_id, filename="model.bin")
20
  return fasttext.load_model(model_path)
@@ -39,6 +84,62 @@ def yield_clean_rows(rows: Union[list[str], str], min_length: int = 3) -> Iterat
39
 
40
  FASTTEXT_PREFIX_LENGTH = 9 # fasttext labels are formatted like "__label__eng_Latn"
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # Load the model
43
  Path("code/models").mkdir(parents=True, exist_ok=True)
44
  model = fasttext.load_model(
@@ -97,19 +198,37 @@ def simple_predict(text, num_predictions=3):
97
  # Calculate average scores for each language
98
  language_scores = valmap(get_mean_score, predictions_by_lang)
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  # Format results
101
  results = {
102
- "detected_languages": dict(language_scores),
103
  "language_counts": dict(language_counts),
104
  "total_predictions": len(all_predictions),
105
  "text_lines_analyzed": len(cleaned_lines)
106
  }
107
 
108
- return results
 
 
109
 
 
 
110
  except Exception as e:
111
  return {"error": f"Error during prediction: {str(e)}"}
112
 
 
113
  def batch_predict(text, threshold_percent=0.2):
114
  """More advanced prediction with filtering"""
115
  if not text or not text.strip():
@@ -135,18 +254,35 @@ def batch_predict(text, threshold_percent=0.2):
135
  keys_to_keep = filter_by_frequency(language_counts, threshold_percent=threshold_percent)
136
  filtered_dict = {k: v for k, v in predictions_by_lang.items() if k in keys_to_keep}
137
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  results = {
139
- "predictions": dict(valmap(get_mean_score, filtered_dict)),
140
  "all_language_counts": dict(language_counts),
141
  "filtered_languages": list(keys_to_keep),
142
  "threshold_used": threshold_percent
143
  }
144
 
 
 
 
 
145
  return results
146
 
147
  except Exception as e:
148
  return {"error": f"Error during prediction: {str(e)}"}
149
 
 
150
  def build_demo_interface():
151
  app_title = "Language Detection Tool"
152
  with gr.Blocks(title=app_title) as demo:
 
15
 
16
  DEFAULT_FAST_TEXT_MODEL = "laurievb/OpenLID"
17
 
18
# Maps FastText/OpenLID language labels (FLORES-200 style, "<iso639-3>_<script>")
# to display metadata: human-readable name, two-letter ISO 639-1 code, and a
# locale-style full code. Unmapped labels fall back gracefully elsewhere.
LANGUAGE_MAPPING = {
    "spa_Latn": {"name": "Spanish", "iso_639_1": "es", "full_code": "es_ES"},
    "eng_Latn": {"name": "English", "iso_639_1": "en", "full_code": "en_US"},
    "fra_Latn": {"name": "French", "iso_639_1": "fr", "full_code": "fr_FR"},
    "deu_Latn": {"name": "German", "iso_639_1": "de", "full_code": "de_DE"},
    "ita_Latn": {"name": "Italian", "iso_639_1": "it", "full_code": "it_IT"},
    "por_Latn": {"name": "Portuguese", "iso_639_1": "pt", "full_code": "pt_PT"},
    "rus_Cyrl": {"name": "Russian", "iso_639_1": "ru", "full_code": "ru_RU"},
    "zho_Hans": {"name": "Chinese (Simplified)", "iso_639_1": "zh", "full_code": "zh_CN"},
    "zho_Hant": {"name": "Chinese (Traditional)", "iso_639_1": "zh", "full_code": "zh_TW"},
    "jpn_Jpan": {"name": "Japanese", "iso_639_1": "ja", "full_code": "ja_JP"},
    "kor_Hang": {"name": "Korean", "iso_639_1": "ko", "full_code": "ko_KR"},
    "ara_Arab": {"name": "Arabic", "iso_639_1": "ar", "full_code": "ar_SA"},
    "hin_Deva": {"name": "Hindi", "iso_639_1": "hi", "full_code": "hi_IN"},
    "cat_Latn": {"name": "Catalan", "iso_639_1": "ca", "full_code": "ca_ES"},
    "glg_Latn": {"name": "Galician", "iso_639_1": "gl", "full_code": "gl_ES"},
    "nld_Latn": {"name": "Dutch", "iso_639_1": "nl", "full_code": "nl_NL"},
    "swe_Latn": {"name": "Swedish", "iso_639_1": "sv", "full_code": "sv_SE"},
    "nor_Latn": {"name": "Norwegian", "iso_639_1": "no", "full_code": "no_NO"},
    "dan_Latn": {"name": "Danish", "iso_639_1": "da", "full_code": "da_DK"},
    "fin_Latn": {"name": "Finnish", "iso_639_1": "fi", "full_code": "fi_FI"},
    "pol_Latn": {"name": "Polish", "iso_639_1": "pl", "full_code": "pl_PL"},
    "ces_Latn": {"name": "Czech", "iso_639_1": "cs", "full_code": "cs_CZ"},
    "hun_Latn": {"name": "Hungarian", "iso_639_1": "hu", "full_code": "hu_HU"},
    "tur_Latn": {"name": "Turkish", "iso_639_1": "tr", "full_code": "tr_TR"},
    "heb_Hebr": {"name": "Hebrew", "iso_639_1": "he", "full_code": "he_IL"},
    "tha_Thai": {"name": "Thai", "iso_639_1": "th", "full_code": "th_TH"},
    "vie_Latn": {"name": "Vietnamese", "iso_639_1": "vi", "full_code": "vi_VN"},
    "ukr_Cyrl": {"name": "Ukrainian", "iso_639_1": "uk", "full_code": "uk_UA"},
    "ell_Grek": {"name": "Greek", "iso_639_1": "el", "full_code": "el_GR"},
    "bul_Cyrl": {"name": "Bulgarian", "iso_639_1": "bg", "full_code": "bg_BG"},
    "ron_Latn": {"name": "Romanian", "iso_639_1": "ro", "full_code": "ro_RO"},
    "hrv_Latn": {"name": "Croatian", "iso_639_1": "hr", "full_code": "hr_HR"},
    "srp_Cyrl": {"name": "Serbian", "iso_639_1": "sr", "full_code": "sr_RS"},
    "slv_Latn": {"name": "Slovenian", "iso_639_1": "sl", "full_code": "sl_SI"},
    "slk_Latn": {"name": "Slovak", "iso_639_1": "sk", "full_code": "sk_SK"},
    "est_Latn": {"name": "Estonian", "iso_639_1": "et", "full_code": "et_EE"},
    "lav_Latn": {"name": "Latvian", "iso_639_1": "lv", "full_code": "lv_LV"},
    "lit_Latn": {"name": "Lithuanian", "iso_639_1": "lt", "full_code": "lt_LT"},
    "msa_Latn": {"name": "Malay", "iso_639_1": "ms", "full_code": "ms_MY"},
    "ind_Latn": {"name": "Indonesian", "iso_639_1": "id", "full_code": "id_ID"},
    "tgl_Latn": {"name": "Filipino", "iso_639_1": "tl", "full_code": "tl_PH"},
    # FIX: OpenLID's label set (FLORES-200) uses per-variant codes for
    # Norwegian and the macro code "zsm_Latn" for Standard Malay; the model
    # never emits "nor_Latn" or "msa_Latn", so without the entries below
    # those languages always hit the "unknown" fallback. The old keys are
    # kept above for backward compatibility with any existing callers.
    "nob_Latn": {"name": "Norwegian Bokm\u00e5l", "iso_639_1": "nb", "full_code": "nb_NO"},
    "nno_Latn": {"name": "Norwegian Nynorsk", "iso_639_1": "nn", "full_code": "nn_NO"},
    "zsm_Latn": {"name": "Malay", "iso_639_1": "ms", "full_code": "ms_MY"},
}
62
+
63
def load_model(repo_id: str) -> fasttext.FastText._FastText:
    """Fetch the ``model.bin`` for *repo_id* from the Hugging Face Hub and load it."""
    local_path = hf_hub_download(repo_id, filename="model.bin")
    return fasttext.load_model(local_path)
 
84
 
85
  FASTTEXT_PREFIX_LENGTH = 9 # fasttext labels are formatted like "__label__eng_Latn"
86
 
87
def format_language_info(fasttext_code):
    """Convert a FastText language label into a human-readable info dict.

    Looks the label up in LANGUAGE_MAPPING; labels without an entry get a
    placeholder record so callers never have to special-case unknowns.
    """
    entry = LANGUAGE_MAPPING.get(fasttext_code)
    if entry is None:
        # Graceful fallback for unmapped languages.
        return {
            "name": fasttext_code,
            "iso_code": "unknown",
            "full_code": "unknown",
            "fasttext_code": fasttext_code,
        }
    return {
        "name": entry["name"],
        "iso_code": entry["iso_639_1"],
        "full_code": entry["full_code"],
        "fasttext_code": fasttext_code,
    }
105
+
106
def detect_language_segments(text, confidence_threshold=0.3):
    """Detect language changes across segments of *text*.

    Splits the text on sentence punctuation and slash/pipe separators,
    classifies each segment, and returns a multilingual summary dict when
    more than one language is detected above the confidence threshold.
    Returns None when there are fewer than two usable segments or only a
    single language is found.
    """
    import re

    # Split on common separators, then drop fragments too short (<= 10
    # chars) to classify reliably.
    raw_parts = re.split(r'[.!?;/|]\s+|\s+/\s+|\s+\|\s+', text.strip())
    segments = [part.strip() for part in raw_parts
                if part.strip() and len(part.strip()) > 10]

    if len(segments) < 2:
        return None

    segment_results = []
    for position, segment in enumerate(segments, start=1):
        # NOTE(review): assumes model_predict returns a list of
        # {'label': ..., 'score': ...} dicts — defined elsewhere in the file.
        predictions = model_predict(segment, k=1)
        if predictions and predictions[0]['score'] > confidence_threshold:
            segment_results.append({
                "segment_number": position,
                "text": segment,
                "language": format_language_info(predictions[0]['label']),
                "confidence": predictions[0]['score'],
            })

    # Only report when at least two distinct languages were found.
    languages_found = {entry['language']['fasttext_code'] for entry in segment_results}
    if len(languages_found) > 1:
        return {
            "is_multilingual": True,
            "languages_detected": list(languages_found),
            "segments": segment_results,
        }

    return None
141
+
142
+
143
  # Load the model
144
  Path("code/models").mkdir(parents=True, exist_ok=True)
145
  model = fasttext.load_model(
 
198
  # Calculate average scores for each language
199
  language_scores = valmap(get_mean_score, predictions_by_lang)
200
 
201
+ # Format results
202
+ # Format with human-readable language info
203
+ formatted_languages = {}
204
+ for fasttext_code, score in language_scores.items():
205
+ lang_info = format_language_info(fasttext_code)
206
+ formatted_languages[fasttext_code] = {
207
+ "score": score,
208
+ "language_info": lang_info
209
+ }
210
+
211
+ # Check for multilingual segments
212
+ segment_analysis = detect_language_segments(text)
213
+
214
  # Format results
215
  results = {
216
+ "detected_languages": formatted_languages,
217
  "language_counts": dict(language_counts),
218
  "total_predictions": len(all_predictions),
219
  "text_lines_analyzed": len(cleaned_lines)
220
  }
221
 
222
+ # Add segment analysis if multilingual
223
+ if segment_analysis:
224
+ results["segment_analysis"] = segment_analysis
225
 
226
+ return results
227
+
228
  except Exception as e:
229
  return {"error": f"Error during prediction: {str(e)}"}
230
 
231
+
232
  def batch_predict(text, threshold_percent=0.2):
233
  """More advanced prediction with filtering"""
234
  if not text or not text.strip():
 
254
  keys_to_keep = filter_by_frequency(language_counts, threshold_percent=threshold_percent)
255
  filtered_dict = {k: v for k, v in predictions_by_lang.items() if k in keys_to_keep}
256
 
257
+ # Format with human-readable language info
258
+ formatted_predictions = {}
259
+ for fasttext_code, score in valmap(get_mean_score, filtered_dict).items():
260
+ lang_info = format_language_info(fasttext_code)
261
+ formatted_predictions[fasttext_code] = {
262
+ "score": score,
263
+ "language_info": lang_info
264
+ }
265
+
266
+ # Check for multilingual segments
267
+ segment_analysis = detect_language_segments(text)
268
+
269
  results = {
270
+ "predictions": formatted_predictions,
271
  "all_language_counts": dict(language_counts),
272
  "filtered_languages": list(keys_to_keep),
273
  "threshold_used": threshold_percent
274
  }
275
 
276
+ # Add segment analysis if multilingual
277
+ if segment_analysis:
278
+ results["segment_analysis"] = segment_analysis
279
+
280
  return results
281
 
282
  except Exception as e:
283
  return {"error": f"Error during prediction: {str(e)}"}
284
 
285
+
286
  def build_demo_interface():
287
  app_title = "Language Detection Tool"
288
  with gr.Blocks(title=app_title) as demo: