File size: 4,476 Bytes
6940bfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1a999a
6940bfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7317760
6940bfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text processor and vocoder shared by both voices.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# One TTS checkpoint per selectable voice.
model_male = SpeechT5ForTextToSpeech.from_pretrained("HusseinBashir/xus23")
model_male = model_male.to(device)
model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad")
model_female = model_female.to(device)

# Speaker encoder used by get_embedding() to turn reference audio into a
# speaker embedding; it is configured with ./spk_model as its save directory.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="./spk_model",
    run_opts={"device": device},
)

# Auto-generate embedding
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for a reference recording, cached on disk.

    When ``pt_path`` already exists, the tensor stored there is loaded and
    moved to ``device``. Otherwise the audio at ``wav_path`` is resampled to
    16 kHz, averaged over its first dimension (channels), encoded with
    ``speaker_model``, L2-normalized, written to ``pt_path`` as a CPU tensor
    and returned.

    Args:
        wav_path: Path of the reference audio file; only read on a cache miss.
        pt_path: Path of the cached embedding file.

    Returns:
        The speaker embedding tensor. A cached embedding is moved to
        ``device``; a freshly computed one is returned as produced by the
        encoder.
    """
    if os.path.exists(pt_path):
        # map_location lets a cache file that was written from a CUDA tensor
        # still load on a CPU-only host instead of failing to deserialize.
        return torch.load(pt_path, map_location=device).to(device)

    waveform, sample_rate = torchaudio.load(wav_path)
    # Resample to 16 kHz, collapse the channels, then add a batch dimension.
    mono = torchaudio.functional.resample(waveform, sample_rate, 16000).mean(dim=0)
    batch = mono.unsqueeze(0).to(device)
    with torch.no_grad():
        emb = speaker_model.encode_batch(batch)
        emb = torch.nn.functional.normalize(emb, dim=2).squeeze()
    # Store on the CPU so the cache file does not depend on the device in use.
    torch.save(emb.cpu(), pt_path)
    return emb

# Load (or compute and cache) one speaker embedding per voice, male first.
# Each pair is (reference recording, cached embedding file).
embedding_male, embedding_female = (
    get_embedding(reference_wav, cache_file)
    for reference_wav, cache_file in (
        ("498-enhanced-v2.wav", "male_embedding.pt"),
        ("caasho.wav", "female_embedding.pt"),
    )
)

# Somali numbers to words
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}

def number_to_words(number):
    """Spell out a non-negative integer using the words in ``number_words``.

    Values below 20 and the round tens come straight from the table. Larger
    values are built recursively from the scale words "boqol" (hundred),
    "kun" (thousand), "malyan" (million) and "milyaar" (billion). A leading
    multiplier of one is omitted for hundreds and thousands, so 100 is
    "boqol" and 1000 is "kun".

    The result is always lower-case, matching the lower-cased text it is
    substituted into.

    Args:
        number: Non-negative integer to spell out.

    Returns:
        The spelled-out number, or ``str(number)`` unchanged when ``number``
        is 10**12 or larger.
    """
    if number < 20:
        return number_words[number]
    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return f"{words} {number_words[unit]}" if unit else words

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        head = f"{number_words[hundreds]} boqol" if hundreds > 1 else "boqol"
    elif number < 1000000:
        thousands, remainder = divmod(number, 1000)
        head = f"{number_to_words(thousands)} kun" if thousands > 1 else "kun"
    elif number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        head = f"{number_to_words(millions)} malyan"
    elif number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        head = f"{number_to_words(billions)} milyaar"
    else:
        # Beyond the supported scale words: leave the digits as they are.
        return str(number)

    return f"{head} {number_to_words(remainder)}" if remainder else head

def replace_numbers_with_words(text):
    """Replace each word-bounded run of digits in ``text`` with its spelled-out form.

    Every match is converted to ``int`` and passed to ``number_to_words``;
    the rest of the text is returned unchanged.
    """
    def _spell_out(digits_match):
        value = int(digits_match.group())
        return number_to_words(value)

    return re.sub(r'\b\d+\b', _spell_out, text)

def normalize_text(text):
    """Prepare raw input text for synthesis.

    Steps, in order: lower-case the text, spell out digit runs via
    ``replace_numbers_with_words``, then delete every character that is
    neither a word character nor whitespace.
    """
    spelled_out = replace_numbers_with_words(text.lower())
    return re.sub(r'[^\w\s]', '', spelled_out)

# Main TTS function
def text_to_speech(text, voice):
    """Synthesize speech for ``text`` with the selected voice.

    Args:
        text: Raw input text; it is normalized with ``normalize_text`` before
            being tokenized. ``None`` is treated as empty text.
        voice: ``"Male"`` selects the male model and embedding; any other
            value selects the female ones.

    Returns:
        A ``(sample_rate, samples)`` tuple: the sample rate 16000 and the
        generated waveform as a NumPy array.

    Raises:
        gr.Error: If no text is left to speak after normalization.
    """
    text = normalize_text(text or "")
    if not text.strip():
        # Empty input (or input that was only punctuation): report it in the
        # UI rather than running the model on an empty sequence.
        # The message means "Please enter text."
        raise gr.Error("Fadlan geli qoraal.")

    inputs = processor(text=text, return_tensors="pt").to(device)

    if voice == "Male":
        model, embedding = model_male, embedding_male
    else:
        model, embedding = model_female, embedding_female

    with torch.no_grad():
        # generate_speech is given the embedding with a leading batch dimension.
        speech = model.generate_speech(
            inputs["input_ids"], embedding.unsqueeze(0), vocoder=vocoder
        )
    return (16000, speech.cpu().numpy())

# Gradio interface: a text box and a voice selector in, generated audio out.
text_input = gr.Textbox(
    label="Geli qoraalka Af-Soomaaliga",
    placeholder="Tusaale: Baro aqoonta casriga ah...",
)
voice_input = gr.Radio(["Male", "Female"], label="Dooro Codka", value="Female")
audio_output = gr.Audio(label="Codka la abuuray", type="numpy")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[text_input, voice_input],
    outputs=audio_output,
    title="Somali TTS (Lab & Dhedig)",
    description="Dooro codka aad rabto, geli qoraal af-soomaali ah, codka ayaa la abuuri doonaa adigoo isticmaalaya Somali TTS (SpeechT5).",
)

# Start the web app as soon as the script runs.
iface.launch()