import os
import re

import torch
import spaces
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from huggingface_hub import login

from goai_helpers.utils import MooreConverter, mark_numbers, unmark_numbers

# Maximum sequence length for tokenization and generation.
max_length = 512

# Authenticate against the Hugging Face Hub with the Space's token.
auth_token = os.getenv('HF_SPACE_TOKEN')
login(token=auth_token)

def split_text_intelligently(text, max_chunk_length=80):
    """
    Split the text into chunks while keeping sentences intact.
    """
    # Sentence-based split (on periods, question marks, exclamation marks,
    # colons); the capturing group keeps each punctuation mark in the result.
    sentences = re.split(r'([.!?:])', text)
    chunks = []
    current_chunk = ""

    for i in range(0, len(sentences), 2):
        # Reattach each sentence to its trailing punctuation.
        if i + 1 < len(sentences):
            sentence = sentences[i] + sentences[i + 1]
        else:
            sentence = sentences[i]

        # If adding this sentence would exceed the maximum chunk length,
        # close the current chunk and start a new one.
        if len(current_chunk) + len(sentence) > max_chunk_length and current_chunk:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
        else:
            current_chunk += sentence

    # Append the last chunk if any text remains.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

@spaces.GPU
def goai_traduction(text, src_lang, tgt_lang, max_chunk_length=80):
    # If the text is too long, split it into chunks, translate each chunk,
    # then join the partial translations.
    if len(text) > max_chunk_length:
        chunks = split_text_intelligently(text, max_chunk_length)
        translations = []
        for chunk in chunks:
            translated_chunk = translate_chunk(chunk, src_lang, tgt_lang)
            translations.append(translated_chunk)
        return " ".join(translations)
    else:
        return translate_chunk(text, src_lang, tgt_lang)

def translate_chunk(text, src_lang, tgt_lang):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Select the checkpoint for the translation direction.
    if src_lang == "mos_Latn" and tgt_lang == "fra_Latn":
        model_id = "ArissBandoss/nllb-200-3.3B-mos2fr"
    else:
        model_id = "ArissBandoss/nllb-200-3.3B-fr2mos"
        # model_id = "ArissBandoss/nllb-200-3.3B-mos-fr-bidirectional-peft"

    # Mark numbers so they can be restored verbatim after translation.
    text = mark_numbers(text)

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=auth_token)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id, token=auth_token).to(device)

    # Tokenizer configuration: set the source language.
    tokenizer.src_lang = src_lang

    # Tokenization
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length).to(device)

    # ID of the target-language token, forced as the first generated token so
    # the model decodes into the requested language.
    tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    outputs = model.generate(
        **inputs,
        forced_bos_token_id=tgt_lang_id,
        max_new_tokens=max_length,
    )

    # Decoding
    translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    translation = unmark_numbers(translation)

    # Spell out digits in the translation using their Moore form.
    number_converter = MooreConverter()
    numbers = re.findall(r'\b\d+\b', translation)
    for number in numbers:
        moore_number = number_converter.number_to_moore(int(number))
        if moore_number:  # only replace if the conversion succeeded
            # Match on word boundaries so "1" does not also rewrite the "1" in "12".
            translation = re.sub(r'\b' + re.escape(number) + r'\b', moore_number, translation)

    return translation

def real_time_traduction(input_text, src_lang, tgt_lang):
    return goai_traduction(input_text, src_lang, tgt_lang)
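
# A minimal usage sketch, not part of the Space's UI wiring. It assumes that
# HF_SPACE_TOKEN grants access to the ArissBandoss model repositories and that
# the goai_helpers package is importable; language codes follow NLLB's
# FLORES-200 naming ("fra_Latn" for French, "mos_Latn" for Moore).
if __name__ == "__main__":
    sample = "Bonjour. Comment allez-vous aujourd'hui ?"
    print(split_text_intelligently(sample, max_chunk_length=20))
    print(goai_traduction(sample, "fra_Latn", "mos_Latn"))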