|
"""
Copyright (c) 2025 Xposed73

All rights reserved.

This file is part of the Manim Voiceover project.
"""
|
|
|
import hashlib |
|
import json |
|
import numpy as np |
|
from pathlib import Path |
|
from manim_voiceover.services.base import SpeechService |
|
from kokoro_onnx import Kokoro |
|
from manim_voiceover.helper import remove_bookmarks, wav2mp3 |
|
from scipy.io.wavfile import write as write_wav |
|
from src.config.config import Config |
|
|
|
|
|
class KokoroService(SpeechService):
    """Speech service class for kokoro_self (using text_to_speech via Kokoro ONNX).

    Text is synthesized with a ``Kokoro`` instance, written as a 16-bit WAV
    file, converted to MP3 with ``wav2mp3`` and described by the dictionary
    returned from :meth:`generate_from_text`.
    """

    def __init__(
        self,
        engine=None,
        model_path: str = Config.KOKORO_MODEL_PATH,
        voices_path: str = Config.KOKORO_VOICES_PATH,
        voice: str = Config.KOKORO_DEFAULT_VOICE,
        speed: float = Config.KOKORO_DEFAULT_SPEED,
        lang: str = Config.KOKORO_DEFAULT_LANG,
        **kwargs,
    ):
        """Create the Kokoro backend and store the synthesis settings.

        Parameters:
            engine: Optional callable used to synthesize audio. It is invoked
                with the keyword arguments ``text``, ``output_file``,
                ``voice_name``, ``speed`` and ``lang``. Defaults to
                :meth:`text_to_speech`.
            model_path (str): Passed to ``Kokoro`` as its first argument.
            voices_path (str): Passed to ``Kokoro`` as its second argument.
            voice (str): Voice name forwarded to the engine.
            speed (float): Speed value forwarded to the engine.
            lang (str): Language value forwarded to the engine.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
        """
        self.kokoro = Kokoro(model_path, voices_path)
        self.voice = voice
        self.speed = speed
        self.lang = lang

        # Fall back to the built-in Kokoro pipeline when no engine is injected.
        self.engine = self.text_to_speech if engine is None else engine

        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """Return a SHA-256 hex digest identifying ``input_data``.

        The dictionary is serialized with ``json.dumps(..., sort_keys=True)``
        so that key order does not influence the digest.

        Parameters:
            input_data (dict): JSON-serializable description of a request
                (e.g., text, voice, etc.).

        Returns:
            str: The hexadecimal digest.
        """
        data_str = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(data_str.encode("utf-8")).hexdigest()

    def text_to_speech(self, text, output_file, voice_name, speed, lang):
        """Synthesize ``text`` with Kokoro ONNX and save it as a 16-bit WAV file.

        The samples are peak-normalized (scaled so the largest absolute value
        becomes 1.0) before conversion to ``int16`` to make the audio audible.

        Parameters:
            text: Text to synthesize.
            output_file: Destination path of the WAV file.
            voice_name: Forwarded to ``Kokoro.create`` as ``voice``.
            speed: Forwarded to ``Kokoro.create`` as ``speed``.
            lang: Forwarded to ``Kokoro.create`` as ``lang``.

        Returns:
            The ``output_file`` argument, unchanged.
        """
        samples, sample_rate = self.kokoro.create(
            text, voice=voice_name, speed=speed, lang=lang
        )

        # Work on an ndarray so the arithmetic below is element-wise even if
        # the backend hands back a plain sequence.
        samples = np.asarray(samples)

        # np.max raises on an empty array, so only normalize when there is
        # something to measure; silent (all-zero) audio is left untouched to
        # avoid dividing by zero.
        if samples.size:
            peak = np.max(np.abs(samples))
            if peak > 0:
                samples = samples / peak

        # Scale the [-1, 1] range to the full int16 range expected by write_wav.
        samples = (samples * 32767).astype("int16")

        write_wav(output_file, sample_rate, samples)
        print(f"Saved at {output_file}")

        return output_file

    def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
        """Generate (or fetch from cache) the MP3 voiceover for ``text``.

        Parameters:
            text (str): Text to speak. Bookmark tags are stripped with
                ``remove_bookmarks`` before synthesis.
            cache_dir (str): Directory for the audio files. Defaults to
                ``self.cache_dir``.
            path (str): File name of the MP3 relative to ``cache_dir``.
                Defaults to ``<hash of the request>.mp3``.

        Returns:
            dict: The cached result when ``get_cached_result`` finds one,
            otherwise a dictionary with the keys ``input_text`` (the original
            text), ``input_data`` (the cache key) and ``original_audio`` (the
            MP3 path relative to ``cache_dir``).
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # Bookmark tags must not be spoken, so synthesize and cache on the
        # stripped text (the convention used by the manim_voiceover services —
        # NOTE(review): confirm against the installed library version).
        input_text = remove_bookmarks(text)

        # Every setting that changes the rendered audio belongs in the cache
        # key; without "speed" a changed speed would return stale audio.
        input_data = {
            "input_text": input_text,
            "service": "kokoro_self",
            "voice": self.voice,
            "lang": self.lang,
            "speed": self.speed,
        }

        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        audio_path = self.get_data_hash(input_data) + ".mp3" if path is None else path

        # Derive the intermediate WAV name from the suffix rather than a
        # substring replace, and make sure it never collides with the final
        # file (a caller-supplied path need not end in ".mp3").
        final_path = Path(cache_dir) / audio_path
        wav_path = final_path.with_suffix(".wav")
        if wav_path == final_path:
            wav_path = final_path.with_name(final_path.stem + ".tmp.wav")

        self.engine(
            text=input_text,
            output_file=str(wav_path),
            voice_name=self.voice,
            speed=self.speed,
            lang=self.lang,
        )

        wav2mp3(str(wav_path), str(final_path))

        # Drop the intermediate WAV if the converter left it behind.
        # NOTE(review): wav2mp3 presumably deletes it already; this is a no-op
        # in that case.
        wav_path.unlink(missing_ok=True)

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }