|
import os |
|
import re |
|
import time |
|
import torch |
|
import spaces |
|
import requests |
|
import tempfile |
|
import concurrent |
|
import numpy as np |
|
from tqdm import tqdm |
|
from huggingface_hub import hf_hub_download, hf_hub_url, login |
|
|
|
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer |
|
from TTS.tts.configs.xtts_config import XttsConfig |
|
from TTS.tts.models.xtts import Xtts |
|
|
|
|
|
def download_file(url: str, destination: str, token: str = None):
    """
    Download a file from a URL while displaying a progress bar.

    Supports Hugging Face API tokens for gated models: when a token is
    available it is sent as a ``Bearer`` authorization header.

    :param url: The URL to download from.
    :param destination: Path where the downloaded file is written.
    :param token: Hugging Face API token (optional). When omitted, the
        ``HF_SPACE_TOKEN`` environment variable is used if it is set.
    :raises requests.HTTPError: If the server answers with a 4xx/5xx status,
        so that an error page is never saved as if it were the requested file.
    """
    if token is None:
        token = os.getenv("HF_SPACE_TOKEN")

    headers = {}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    # Using the response as a context manager releases the streamed
    # connection even when writing to disk fails half-way through.
    with requests.get(url, stream=True, headers=headers) as response:
        # Fail before the destination file is created or truncated.
        response.raise_for_status()

        # The Content-Length header may be absent; fall back to 0 in that case.
        total_size = int(response.headers.get('content-length', 0))

        with open(destination, 'wb') as file, tqdm(
            desc=destination,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            # 64 KiB chunks keep the number of write/progress calls low on
            # large model files.
            for data in response.iter_content(chunk_size=64 * 1024):
                size = file.write(data)
                bar.update(size)
|
|
|
|
|
def diviser_phrases_moore(texte: str) -> list:
    """
    Split a text into sentences at sentence-ending punctuation.

    The text is cut at every run of whitespace that directly follows a
    period, an exclamation mark or a question mark; the punctuation mark
    stays attached to the sentence it ends. Each piece is stripped of
    surrounding whitespace and empty pieces are dropped.

    Args:
        texte (str): The text to split into sentences.

    Returns:
        list: The cleaned, non-empty sentences in their original order.
    """
    morceaux = re.split(r'(?<=[.!?])\s+', texte)
    return [morceau for morceau in map(str.strip, morceaux) if morceau]
|
|
|
|
|
class MooreConverter:
    """
    Convert Arabic numerals into their Mooré textual representation.

    The word tables are plain instance attributes created in ``__init__``.
    ``number_to_moore`` converts a single non-negative integer and
    ``expand_number`` rewrites every standalone digit run found in a text.
    """

    # Short unit forms used after the "la a" connector, e.g. 21 -> "pisi la a ye".
    _SHORT_UNITS = {"yembo": "ye", "yiibu": "yi", "tãabo": "tã"}

    # Compiled once for the class instead of on every expand_number() call.
    _NUMBER_PATTERN = re.compile(r'\b\d+\b')

    def __init__(self):
        # Index i holds the word for digit i (index 0 is intentionally empty).
        self.units = ["", "yembo", "yiibu", "tãabo", "naase", "nu", "yoobe", "yopoe", "nii", "wae"]
        # Index i holds the word for i * 10.
        self.tens_base = ["", "piiga", "pisi", "pis-tã", "pis-naase", "pis-nu", "pis-yoobe", "pis-yopoe", "pis-nii", "pis-wae"]
        # hundreds[1] is the word for exactly one hundred; 200-900 are built
        # from hundreds_prefix followed by a unit word.
        self.hundreds = ["", "koabga"]
        self.hundreds_prefix = "kobs-"
        # thousands[1] is the word for exactly one thousand.
        self.thousands = ["", "tusri"]

    def _short_unit(self, digit: int) -> str:
        """Return the unit word for *digit* in the short form used after "la a"."""
        unit = self.units[digit]
        return self._SHORT_UNITS.get(unit, unit)

    def _below_hundred(self, n: int) -> str:
        """Convert a number in the range 1-99."""
        if n < 10:
            return self.units[n]

        tens, unit = divmod(n, 10)
        if unit == 0:
            return self.tens_base[tens]

        # 11-19 use the contracted form "piig" instead of "piiga".
        tens_word = "piig" if tens == 1 else self.tens_base[tens]
        return tens_word + " la a " + self._short_unit(unit)

    def _hundreds(self, n: int) -> str:
        """Convert a number in the range 100-999."""
        hundreds_count, remainder = divmod(n, 100)

        if hundreds_count == 1:
            result = self.hundreds[1]
        else:
            unit_name = self.units[hundreds_count]
            # After the "kobs-" prefix only "tãabo" is shortened; "yiibu"
            # keeps its full form (200 -> "kobs-yiibu").
            if unit_name == "tãabo":
                unit_name = "tã"
            result = self.hundreds_prefix + unit_name

        if remainder > 0:
            result += " la " + self.number_to_moore(remainder)
        return result

    def _thousands(self, n: int) -> str:
        """Convert a number in the range 1000-999999."""
        thousands_count, remainder = divmod(n, 1000)

        if thousands_count == 1:
            result = self.thousands[1]
        else:
            # The thousand word becomes "tusa" for 2-9 thousands and "tus"
            # from 10 thousands upwards, followed by the multiplier.
            thousand_word = "tus" if thousands_count >= 10 else "tusa"
            result = thousand_word + " " + self.number_to_moore(thousands_count)

        if remainder > 0:
            result += " la " + self.number_to_moore(remainder)
        return result

    def number_to_moore(self, n: int, is_price: bool = False) -> str:
        """
        Convert a non-negative integer into Mooré words.

        Args:
            n: The number to convert.
            is_price: When true, ``n`` is first divided by 5 (remainder
                discarded) and the quotient is converted instead.
                NOTE(review): presumably because prices are counted in
                units of five -- confirm with a Mooré speaker.

        Returns:
            The Mooré wording, or an empty string when the value to convert
            is 0 (including prices below 5).

        Raises:
            ValueError: If ``n`` is negative. Negative values would otherwise
                index the word tables from the end and yield wrong words.
        """
        if n < 0:
            raise ValueError(f"number_to_moore() expects a non-negative number, got {n!r}")

        if is_price:
            # Floor division stays exact for arbitrarily large integers,
            # whereas float division loses precision above 2**53.
            n = int(n // 5)

        if n == 0:
            return ""
        if n < 100:
            return self._below_hundred(n)
        if n < 1000:
            return self._hundreds(n)
        if n < 1_000_000:
            return self._thousands(n)

        millions_count, remainder = divmod(n, 1_000_000)
        result = self.number_to_moore(millions_count) + " milyɔɔng"
        if remainder > 0:
            result += " " + self.number_to_moore(remainder)
        return result

    def expand_number(self, text: str) -> str:
        """
        Replace every standalone digit run in *text* with Mooré words.

        Each number is converted with ``is_price=True``, i.e. it is divided
        by 5 before being spelled out. When the conversion yields an empty
        string (values below 5) or fails, the original digits are kept.
        NOTE(review): the unconditional ``is_price=True`` means plain
        (non-price) numbers are also divided by 5 -- confirm this is intended.

        Args:
            text: The text to process. Non-string values are returned as-is.

        Returns:
            The text with numbers spelled out in Mooré.
        """
        if not isinstance(text, str):
            return text

        def replace_number_with_text(match):
            digits = match.group()
            try:
                moore = self.number_to_moore(int(digits), True)
            except (ValueError, RecursionError):
                # Best-effort fallback: int() rejects extremely long digit
                # runs and the conversion recurses once per six digits, so
                # pathological input is left untouched rather than crashing.
                return digits
            return moore or digits

        return self._NUMBER_PATTERN.sub(replace_number_with_text, text)
|
|
|
def mark_numbers(text):
    """
    Wrap every standalone run of digits in asterisks.

    A run counts as standalone when it is delimited by word boundaries, so
    digits glued to letters (e.g. ``x1``) are left untouched.
    Example: ``"a 12 b"`` becomes ``"a *12* b"``.
    """
    standalone_digits = re.compile(r'\b(\d+)\b')
    return standalone_digits.sub(r'*\1*', text)
|
|
|
def unmark_numbers(text):
    """
    Remove every asterisk from *text*.

    All ``*`` characters are dropped, whether or not they surround a number.
    """
    pieces = text.split("*")
    return "".join(pieces)