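"""Word-level language identification with fastText.

Assigns a language label to every character of a text, distinguishing
South, North and Lule Sámi from Finnish, Estonian and English.
"""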
import fasttext
import re
from collections import Counter
# fastText label ids mapped to ISO 639-3 codes: South, North and Lule Sámi,
# Finnish, Estonian and English.
languages = {
    "0": "sma",
    "1": "sme",
    "2": "smj",
    "3": "fin",
    "4": "est",
    "5": "eng"
}

class WordLid:
    def __init__(self, model_path):
        # Load the trained fastText language identification model.
        self.model = fasttext.load_model(model_path)
        # Minimum probability margin by which a word's best language must beat
        # the text's main language before the word is re-labelled.
        self.threshold = 0.5

    def set_threshold(self, threshold):
        self.threshold = threshold

    def _clean_word(self, word):
        # Lowercase and strip punctuation, extra whitespace and digits.
        word = word.lower()
        word = re.sub(r'[^\w\s]', '', word)
        word = re.sub(r'\s+', ' ', word)
        word = re.sub(r'\d', '', word)
        return word.strip()

    def _predict_all_languages(self, word):
        # Return a dict mapping each label id to its probability for the word.
        cleaned_word = self._clean_word(word)
        labels, probabilities = self.model.predict(cleaned_word, k=-1)
        # Debug output: per-language probabilities for the word.
        print(word)
        for l, p in zip(labels, probabilities):
            print(f'{languages[l.replace("__label__", "")]} {p:.4f}')
        return {label.replace('__label__', ''): prob for label, prob in zip(labels, probabilities)}

    def _get_main_language(self, text):
        # The main language is the label that wins the most words in the text.
        # Predict each word once and count which language comes out on top.
        words = [self._clean_word(word) for word in text.split() if word]
        language_counts = Counter()
        for word in words:
            predictions = self._predict_all_languages(word)
            if predictions:
                language_counts[max(predictions, key=predictions.get)] += 1
        return language_counts.most_common(1)[0][0] if language_counts else None

    def get_lang_array(self, text):
        # Return one language label per character of the input text.
        main_language = self._get_main_language(text)
        if main_language is None:
            # No prediction possible (e.g. empty text): mark everything unknown.
            return ['unk'] * len(text)
        # Start by labelling every character with the main language.
        lang_array = [main_language] * len(text)
        word_start_index = 0
        for word in text.split():
            word_start_index = text.find(word, word_start_index)
            cleaned_word = self._clean_word(word)
            if not cleaned_word:
                word_start_index += len(word)
                continue
            predictions = self._predict_all_languages(cleaned_word)
            if not predictions:
                word_start_index += len(word)
                continue
            best_word_lang = max(predictions, key=predictions.get)
            main_lang_prob_for_word = predictions.get(main_language, 0.0)  # main-language probability *for this word*
            best_lang_prob = predictions[best_word_lang]
            # Re-label the word only if its best language beats the main
            # language by at least the configured threshold (default 0.5).
            if best_lang_prob >= main_lang_prob_for_word + self.threshold:
                for i in range(len(word)):
                    lang_array[word_start_index + i] = best_word_lang
            word_start_index += len(word)
        return [int(x) for x in lang_array]
        #return lang_array

if __name__ == '__main__':
    model_path = 'lang_id_model_q.bin'
    identifier = WordLid(model_path)

    test_texts = [
        "Mumenvákki ođđasamos badji ii gávdno vuos sámegillii. Áigumuššan lea goit dubbet dan maiddái sámegielaide, lohká Yle Draama hoavda Jarmo Lampela."
    ]

    for text in test_texts:
        lang_array = identifier.get_lang_array(text)
        print(f"\nText: '{text}'")
        print(f"Language Array: {lang_array}")
        #for i in range(0, len(text)):
        #    print(text[i], lang_array[i])
        assert len(lang_array) == len(text), "Length mismatch!"

    # Example of changing the threshold:
    identifier.set_threshold(0.8)
    lang_array = identifier.get_lang_array("Bonjour le monde!")
    print(f"\nText: 'Bonjour le monde!' (with threshold 0.8)")
    print(f"Language Array: {lang_array}")