# Source file of a Hugging Face Space (page header: "Running on Zero"; file size: 6,861 bytes).
import os
import re
import time
import torch
import spaces
import requests
import tempfile
import concurrent
import numpy as np
from tqdm import tqdm
from huggingface_hub import hf_hub_download, hf_hub_url, login
from TTS.tts.layers.xtts.tokenizer import VoiceBpeTokenizer
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
def download_file(url: str, destination: str, token: str = None):
    """
    Download a file from a URL to disk while displaying a progress bar.

    Supports Hugging Face API tokens for protected models.

    :param url: The URL to download from.
    :param destination: The path where the downloaded file is written.
    :param token: The Hugging Face API token (optional). When omitted, the
        ``HF_SPACE_TOKEN`` environment variable is used if it is set.
    :raises requests.HTTPError: If the server answers with a 4xx/5xx status.
        In that case ``destination`` is not created or overwritten.
    """
    # Use the token that was passed in, otherwise fall back to the environment.
    if token is None:
        token = os.getenv("HF_SPACE_TOKEN")

    # Only send an Authorization header when a token is actually available.
    headers = {}
    if token:
        headers['Authorization'] = f'Bearer {token}'

    # Streaming GET; the context manager releases the connection once the
    # body has been consumed or an error occurred.
    with requests.get(url, stream=True, headers=headers) as response:
        # Fail before touching the destination, so an HTTP error page
        # (e.g. 401 for a gated model) is never saved as if it were the file.
        response.raise_for_status()

        # Total size in bytes; zero when the server does not announce it.
        total_size = int(response.headers.get('content-length', 0))

        # Write the body chunk by chunk and advance the progress bar by the
        # number of bytes actually written.
        with open(destination, 'wb') as file, tqdm(
            desc=destination,
            total=total_size,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(chunk_size=1024):
                size = file.write(data)
                bar.update(size)
def diviser_phrases_moore(texte: str) -> list:
    """
    Split a text into sentences at sentence-ending punctuation.

    The text is cut at every run of whitespace that directly follows a
    '.', '!' or '?'. Each piece is stripped of surrounding whitespace and
    pieces that end up empty are dropped.

    Args:
        texte (str): The text to split into sentences.

    Returns:
        list: The cleaned sentences, in their original order.
    """
    # The lookbehind keeps the punctuation mark attached to its sentence;
    # only the whitespace after it is consumed by the split.
    pieces = re.split(r'(?<=[.!?])\s+', texte)

    sentences = []
    for piece in pieces:
        cleaned = piece.strip()
        if cleaned:
            sentences.append(cleaned)
    return sentences
class MooreConverter:
    """
    Convert Arabic numerals into their Mooré textual representation.

    ``number_to_moore`` spells out a single non-negative integer and
    ``expand_number`` replaces every standalone run of digits inside a text
    with its spelled-out form.
    """

    # Short forms used for 1, 2 and 3 when they are the trailing part of a
    # compound number (after "la a") or follow the hundreds prefix.
    _SHORT_UNITS = {"yembo": "ye", "yiibu": "yi", "tãabo": "tã"}

    # Standalone runs of digits; compiled once instead of on every call.
    _NUMBER_PATTERN = re.compile(r'\b\d+\b')

    def __init__(self):
        self.units = ["", "yembo", "yiibu", "tãabo", "naase", "nu", "yoobe", "yopoe", "nii", "wae"]
        self.tens_base = ["", "piiga", "pisi", "pis-tã", "pis-naase", "pis-nu", "pis-yoobe", "pis-yopoe", "pis-nii", "pis-wae"]
        self.hundreds = ["", "koabga"]
        self.hundreds_prefix = "kobs-"
        self.thousands = ["", "tusri"]
        # No thousands prefix: multiples of a thousand are built by recursion.

    def _short_unit(self, digit: int) -> str:
        """Return the unit word for ``digit`` (1-9), shortened for 1, 2 and 3."""
        name = self.units[digit]
        return self._SHORT_UNITS.get(name, name)

    def number_to_moore(self, n: int) -> str:
        """
        Spell out a non-negative integer in Mooré.

        Args:
            n (int): The number to convert.

        Returns:
            str: The spelled-out number, or an empty string for 0
            (``expand_number`` then keeps the digit as it is).

        Raises:
            ValueError: If ``n`` is negative. Negative values used to index
                the word tables from the end and return an unrelated word.
        """
        if n < 0:
            raise ValueError(f"number_to_moore expects a non-negative integer, got {n}")
        if n == 0:
            return ""
        if n < 10:
            return self.units[n]
        if n == 10:
            return self.tens_base[1]
        if n < 20:
            return "piig la a " + self._short_unit(n - 10)

        if n < 100:
            tens, unit = divmod(n, 10)
            tens_part = self.tens_base[tens]
            if unit == 0:
                return tens_part
            return tens_part + " la a " + self._short_unit(unit)

        if n < 1000:
            hundreds_count, remainder = divmod(n, 100)
            if hundreds_count == 1:
                result = self.hundreds[1]
            elif hundreds_count == 2:
                # NOTE(review): the previous code explicitly left "yiibu"
                # unshortened here (200 -> "kobs-yiibu") although 3 is
                # shortened to "tã". Output preserved; confirm with a Mooré
                # speaker whether "kobs-yi" was intended.
                result = self.hundreds_prefix + self.units[2]
            else:
                result = self.hundreds_prefix + self._short_unit(hundreds_count)
            if remainder > 0:
                result += " la " + self.number_to_moore(remainder)
            return result

        if n < 1_000_000:
            thousands_count, remainder = divmod(n, 1000)
            if thousands_count == 1:
                result = self.thousands[1]
            else:
                # The multiplier can itself be a compound number (up to 999),
                # so it is spelled out recursively and followed by "tusri".
                result = self.number_to_moore(thousands_count) + " " + self.thousands[1]
            if remainder > 0:
                result += " la " + self.number_to_moore(remainder)
            return result

        # Millions and above.
        millions_count, remainder = divmod(n, 1_000_000)
        result = self.number_to_moore(millions_count) + " milyɔɔng"
        if remainder > 0:
            # NOTE(review): unlike hundreds and thousands, the remainder is
            # joined with a plain space (no "la"). Output preserved; confirm
            # whether that difference is intended.
            result += " " + self.number_to_moore(remainder)
        return result

    def expand_number(self, text: str) -> str:
        """
        Replace plain numbers (e.g. '123') in ``text`` with Mooré words.

        Digits are kept unchanged when the conversion yields an empty string
        (the case for 0) or when the number cannot be converted.

        Args:
            text (str): The text to process. Non-string values are returned
                unchanged.

        Returns:
            str: The text with every standalone number spelled out.
        """
        if not isinstance(text, str):
            return text

        def replace_number_with_text(match):
            digits = match.group()
            try:
                words = self.number_to_moore(int(digits))
            except (ValueError, RecursionError):
                # Oversized digit runs can exceed the interpreter's int
                # conversion or recursion limits; keep the original digits.
                return digits
            # An empty conversion (0) falls back to the original digits.
            return words or digits

        return self._NUMBER_PATTERN.sub(replace_number_with_text, text)
def mark_numbers(text):
    """Wrap every standalone run of digits in ``text`` in asterisks, e.g. ``12`` -> ``*12*``."""
    def _wrap(found):
        # Group 1 is the digit run itself; surround it with the markers.
        return "*" + found.group(1) + "*"

    digit_run = re.compile(r'\b(\d+)\b')
    return digit_run.sub(_wrap, text)
def unmark_numbers(text):
    """
    Remove every asterisk from ``text``.

    All ``*`` characters are dropped, including any that were present
    before the text was marked.
    """
    fragments = text.split("*")
    return "".join(fragments)