Upload langid.py

langid/langid.py (ADDED, +101 -0)
import fasttext
import re
from collections import Counter

languages = {
    "0": "sma",
    "1": "sme",
    "2": "smj",
    "3": "fin",
    "4": "est",
    "5": "eng"
}
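# Note (an assumption, not stated in this file): the numeric keys above correspond to
# the fastText labels the model predicts ("__label__0" ... "__label__5"), which
# _predict_all_languages strips back to bare ids. The values are ISO 639-3 codes:
# sma = South Sami, sme = North Sami, smj = Lule Sami, fin = Finnish,
# est = Estonian, eng = English.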
class WordLid:
    """Word-level language identification on top of a fastText model."""

    def __init__(self, model_path):
        self.model = fasttext.load_model(model_path)
        self.threshold = 0.5

    def set_threshold(self, threshold):
        self.threshold = threshold

    def _clean_word(self, word):
        # Lowercase and strip punctuation, repeated whitespace and digits.
        word = word.lower()
        word = re.sub(r'[^\w\s]', '', word)
        word = re.sub(r'\s+', ' ', word)
        word = re.sub(r'\d', '', word)
        return word.strip()
    def _predict_all_languages(self, word):
        cleaned_word = self._clean_word(word)
        labels, probabilities = self.model.predict(cleaned_word, k=-1)

        # Debug output: print the probability of every language for this word.
        print(word)
        for l, p in zip(labels, probabilities):
            print(f'{languages[l.replace("__label__", "")]} {p:.4f}')
        return {label.replace('__label__', ''): prob for label, prob in zip(labels, probabilities)}

    def _get_main_language(self, text):
        # Majority vote: the language that wins the most words is the text's main language.
        words = [self._clean_word(word) for word in text.split() if word]
        language_counts = Counter()
        for word in words:
            if not word:
                continue
            predictions = self._predict_all_languages(word)
            if predictions:
                language_counts[max(predictions, key=predictions.get)] += 1
        return language_counts.most_common(1)[0][0] if language_counts else None
    def get_lang_array(self, text):
        """Return one language label per character of `text`."""
        main_language = self._get_main_language(text)
        if main_language is None:
            # Nothing identifiable at all: mark every character as unknown.
            return ['unk'] * len(text)

        lang_array = [main_language] * len(text)
        word_start_index = 0

        for word in text.split():
            word_start_index = text.find(word, word_start_index)
            cleaned_word = self._clean_word(word)
            if not cleaned_word:
                word_start_index += len(word)
                continue

            predictions = self._predict_all_languages(cleaned_word)
            if not predictions:
                word_start_index += len(word)
                continue

            best_word_lang = max(predictions, key=predictions.get)
            main_lang_prob_for_word = predictions.get(main_language, 0.0)  # main-language probability *for this word*
            best_lang_prob = predictions[best_word_lang]

            # Only switch a word away from the main language when its best language
            # beats the main-language probability for this word by at least
            # self.threshold (0.5 by default, adjustable via set_threshold).
            if best_lang_prob >= main_lang_prob_for_word + self.threshold:
                for i in range(len(word)):
                    lang_array[word_start_index + i] = best_word_lang

            word_start_index += len(word)
        return [int(x) for x in lang_array]
        # return lang_array  # alternative: return the raw label strings instead of ints
if __name__ == '__main__':
    model_path = 'lang_id_model_q.bin'
    identifier = WordLid(model_path)

    test_texts = [
        "Mumenvákki ođđasamos badji ii gávdno vuos sámegillii. Áigumuššan lea goit dubbet dan maiddái sámegielaide, lohká Yle Draama hoavda Jarmo Lampela."
    ]

    for text in test_texts:
        lang_array = identifier.get_lang_array(text)
        print(f"\nText: '{text}'")
        print(f"Language Array: {lang_array}")

        # Uncomment to inspect the prediction for each character:
        # for i in range(len(text)):
        #     print(text[i], lang_array[i])
        assert len(lang_array) == len(text), "Length mismatch!"

    # Example of changing the threshold:
    identifier.set_threshold(0.8)
    lang_array = identifier.get_lang_array("Bonjour le monde!")
    print("\nText: 'Bonjour le monde!' (with threshold 0.8)")
    print(f"Language Array: {lang_array}")
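The array returned by get_lang_array holds one integer label id per character. A small helper along these lines (illustrative only, not part of the commit; the name spans_from_lang_array is made up here) sketches how those ids could be mapped back to language codes via the module-level languages dict and grouped into contiguous spans:

from itertools import groupby

def spans_from_lang_array(text, lang_array):
    # Map each per-character label id back to its language code ("sme", "fin", ...),
    # leaving ids without an entry (e.g. the 'unk' fallback) as they are.
    codes = [languages.get(str(lang_id), str(lang_id)) for lang_id in lang_array]
    # Collapse runs of identical codes into (code, substring) spans.
    spans = []
    start = 0
    for code, run in groupby(codes):
        length = len(list(run))
        spans.append((code, text[start:start + length]))
        start += length
    return spans

# Usage, assuming `identifier` and `text` from the __main__ block above:
# for code, chunk in spans_from_lang_array(text, identifier.get_lang_array(text)):
#     print(code, repr(chunk))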