Moore-Language-Space-ZeroGPU

Sleeping

App Files Files Community

ArissBandoss committed on May 19

Commit

4a24c4f

verified ·

1 Parent(s): 7cc5f39

Update goai_helpers/goai_traduction.py

Browse files

Files changed (1) hide show

goai_helpers/goai_traduction.py +21 -26

goai_helpers/goai_traduction.py CHANGED Viewed

@@ -15,53 +15,48 @@ login(token=auth_token)
 def goai_traduction(text, src_lang, tgt_lang):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    if src_lang == "fra_Latn" and tgt_lang == "mos_Latn":
-        model_id = "ArissBandoss/nllb-200-distilled-600M-finetuned-fr-to-mos-V4"
-    elif src_lang == "mos_Latn" and tgt_lang == "fra_Latn":
-        model_id = "ArissBandoss/3b-new-400"
     else:
         model_id = "ArissBandoss/nllb-200-distilled-600M-finetuned-fr-to-mos-V4"
-    print(f"Chargement du modèle: {model_id}")
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
     model = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=auth_token).to(device)
-    print(f"Texte brut ({len(text)} caractères / {len(text.split())} mots):")
-    print(text)
-    print(f"Configuration du modèle:")
-    print(f"- tokenizer.model_max_length: {tokenizer.model_max_length}")
-    print(f"- Position embeddings shape: {model.model.encoder.embed_positions.weights.shape}")
-    print(f"- decoder.embed_positions shape: {model.model.decoder.embed_positions.weights.shape}")
     # Configure the tokenizer's source language
     tokenizer.src_lang = src_lang
     # Tokenization (truncation disabled)
     inputs = tokenizer(text, return_tensors="pt", truncation=False).to(device)
-    input_ids = inputs["input_ids"][0]
-    print("Tokens d'entrée:")
-    print(f"- Nombre de tokens: {input_ids.shape[0]}")
-    print(f"- Premiers tokens: {input_ids[:10].tolist()}")
-    print(f"- Derniers tokens: {input_ids[-10:].tolist()}")
     # Token ID of the target language
     tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
-    print(f"Token ID de la langue cible ({tgt_lang}): {tgt_lang_id}")
-    bad_words_ids = [[tokenizer.eos_token_id]]
     outputs = model.generate(
         **inputs,
         forced_bos_token_id=tgt_lang_id,
-        max_length=max_length,
-        min_length=max_length,
         num_beams=5,
-        no_repeat_ngram_size=0,
-        length_penalty=2.0
     )
     translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
     return translation
 def real_time_traduction(input_text, src_lang, tgt_lang):

 def goai_traduction(text, src_lang, tgt_lang):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    if src_lang == "mos_Latn" and tgt_lang == "fra_Latn":
+        model_id = "ArissBandoss/3b-new-400"
     else:
         model_id = "ArissBandoss/nllb-200-distilled-600M-finetuned-fr-to-mos-V4"
     tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
     model = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=auth_token).to(device)
     # Configure the tokenizer's source language
     tokenizer.src_lang = src_lang
     # Tokenization (truncation disabled)
     inputs = tokenizer(text, return_tensors="pt", truncation=False).to(device)
+    input_length = inputs["input_ids"].shape[1]
+    # Heuristic estimate of the expected output length
+    # For Mooré to French, a factor of 1.2-1.5 is generally good; NOTE(review): the 1.3 factor below is applied to every language pair, not only mos_Latn -> fra_Latn
+    expected_output_length = int(input_length * 1.3)
     # Token ID of the target language
     tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
+    # EOS token ID; NOTE(review): not referenced again in the lines shown here
+    eos_token_id = tokenizer.eos_token_id
+    # Block the EOS token completely up to a certain point (presumably via min_length below; confirm)
     outputs = model.generate(
         **inputs,
         forced_bos_token_id=tgt_lang_id,
+        max_new_tokens=1024,
+        min_length=expected_output_length,
         num_beams=5,
+        no_repeat_ngram_size=4,
+        repetition_penalty=2.0,
+        length_penalty=1.5,
+        diversity_penalty=0.5,
+        num_beam_groups=5
     )
+    # Decoding
     translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
     return translation
 def real_time_traduction(input_text, src_lang, tgt_lang):