# File size: 3,945 Bytes
# 9b5ca29
"""
Copyright (c) 2025 Xposed73
All rights reserved.
This file is part of the Manim Voiceover project.
"""
import hashlib
import json
import numpy as np
from pathlib import Path
from manim_voiceover.services.base import SpeechService
from kokoro_onnx import Kokoro
from manim_voiceover.helper import remove_bookmarks, wav2mp3
from scipy.io.wavfile import write as write_wav
from src.config.config import Config
class KokoroService(SpeechService):
    """Speech service class for kokoro_self (using text_to_speech via Kokoro ONNX).

    Audio is synthesized by a Kokoro ONNX model, peak-normalized, written as a
    16-bit PCM .wav file and then converted to .mp3 inside the cache directory.
    """

    def __init__(
        self,
        engine=None,
        model_path: str = Config.KOKORO_MODEL_PATH,
        voices_path: str = Config.KOKORO_VOICES_PATH,
        voice: str = Config.KOKORO_DEFAULT_VOICE,
        speed: float = Config.KOKORO_DEFAULT_SPEED,
        lang: str = Config.KOKORO_DEFAULT_LANG,
        **kwargs,
    ):
        """
        Creates the Kokoro model and stores the synthesis settings.

        Parameters:
            engine: Optional callable used to synthesize audio. It is invoked with
                the keyword arguments ``text``, ``output_file``, ``voice_name``,
                ``speed`` and ``lang``. Defaults to ``self.text_to_speech``.
            model_path (str): Path of the Kokoro ONNX model file.
            voices_path (str): Path of the Kokoro voices file.
            voice (str): Voice name passed to the engine.
            speed (float): Speaking speed passed to the engine.
            lang (str): Language code passed to the engine.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
        """
        self.kokoro = Kokoro(model_path, voices_path)
        self.voice = voice
        self.speed = speed
        self.lang = lang
        # Fall back to the built-in Kokoro synthesis when no engine is injected.
        self.engine = self.text_to_speech if engine is None else engine
        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """
        Generates a hash based on the input data dictionary.

        The hash is used to create a unique identifier for the input data.

        Parameters:
            input_data (dict): A dictionary of input data (e.g., text, voice, etc.).

        Returns:
            str: The SHA-256 hex digest of the dictionary's JSON form.
        """
        # Keys are sorted so that equal dictionaries always serialize identically.
        data_str = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(data_str.encode("utf-8")).hexdigest()

    def text_to_speech(self, text, output_file, voice_name, speed, lang):
        """
        Generates speech from text using Kokoro ONNX and saves the audio file.

        The samples are scaled so that the loudest one reaches full scale, then
        stored as 16-bit PCM.

        Parameters:
            text: Text to synthesize.
            output_file: Destination path of the .wav file.
            voice_name: Voice passed to ``Kokoro.create``.
            speed: Speaking speed passed to ``Kokoro.create``.
            lang: Language code passed to ``Kokoro.create``.

        Returns:
            The ``output_file`` path that was written.
        """
        samples, sample_rate = self.kokoro.create(
            text, voice=voice_name, speed=speed, lang=lang
        )
        samples = np.asarray(samples)

        # np.max raises ValueError on a zero-length array, so treat "no samples"
        # like silence instead of crashing.
        peak = np.max(np.abs(samples)) if samples.size else 0
        if peak > 0:
            # Normalize audio to the range [-1, 1].
            samples = samples / peak

        # Convert to 16-bit integer PCM format.
        pcm_samples = (samples * 32767).astype("int16")
        write_wav(output_file, sample_rate, pcm_samples)
        print(f"Saved at {output_file}")
        return output_file

    def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
        """
        Synthesizes ``text`` into an .mp3 file in the cache directory.

        Parameters:
            text (str): Text to speak; may contain bookmark tags.
            cache_dir (str): Directory for the audio files. Defaults to
                ``self.cache_dir``.
            path (str): Optional file name (relative to ``cache_dir``) for the
                resulting audio. Defaults to ``<hash of input data>.mp3``.

        Returns:
            dict: The cached result when one exists, otherwise a dictionary with
            the keys ``input_text``, ``input_data`` and ``original_audio``.
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # remove_bookmarks is a text helper (it strips bookmark tags from a
        # string), so it is applied to the text that is actually spoken and
        # hashed. The untouched text is still returned as "input_text" below.
        spoken_text = remove_bookmarks(text)

        # Every setting that changes the generated audio must be part of the
        # cache key; "speed" is included so a speed change is not answered with
        # stale cached audio.
        input_data = {
            "input_text": spoken_text,
            "service": "kokoro_self",
            "voice": self.voice,
            "lang": self.lang,
            "speed": self.speed,
        }

        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_data_hash(input_data) + ".mp3"
        else:
            audio_path = path

        mp3_path = Path(cache_dir) / audio_path
        # Swap only the final suffix. A plain str.replace(".mp3", ".wav") leaves
        # the name unchanged when a caller-supplied path has no ".mp3" in it,
        # which would make the intermediate and final files identical.
        wav_path = mp3_path.with_suffix(".wav")
        if wav_path == mp3_path:
            wav_path = mp3_path.with_name(mp3_path.name + ".tmp.wav")

        # Generate the intermediate .wav file.
        self.engine(
            text=spoken_text,
            output_file=str(wav_path),
            voice_name=self.voice,
            speed=self.speed,
            lang=self.lang,
        )

        # Convert .wav to .mp3.
        wav2mp3(str(wav_path), str(mp3_path))

        # Remove the intermediate .wav file. wav2mp3 may already have deleted it
        # (depending on the manim-voiceover version), so a missing file is fine.
        try:
            wav_path.unlink()
        except FileNotFoundError:
            pass

        return {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }