Spaces:

thanhkt
/

t2m

Running

App Files Files Community

t2m / src /utils /kokoro_voiceover.py

thanhkt

Upload 75 files

9b5ca29 verified 17 days ago

raw

history blame

3.95 kB

	"""
	Copyright (c) 2025 Xposed73
	All rights reserved.
	This file is part of the Manim Voiceover project.
	"""

	import hashlib
	import json
	import numpy as np
	from pathlib import Path
	from manim_voiceover.services.base import SpeechService
	from kokoro_onnx import Kokoro
	from manim_voiceover.helper import remove_bookmarks, wav2mp3
	from scipy.io.wavfile import write as write_wav
	from src.config.config import Config


	class KokoroService(SpeechService):
	    """Speech service for ``kokoro_self``: local text-to-speech via Kokoro ONNX.

	    Audio is produced by ``self.engine`` (by default :meth:`text_to_speech`),
	    written as a ``.wav`` file, converted to ``.mp3`` with ``wav2mp3`` and
	    described by the dict returned from :meth:`generate_from_text`.
	    """

	    def __init__(self, engine=None,
	                 model_path: str = Config.KOKORO_MODEL_PATH,
	                 voices_path: str = Config.KOKORO_VOICES_PATH,
	                 voice: str = Config.KOKORO_DEFAULT_VOICE,
	                 speed: float = Config.KOKORO_DEFAULT_SPEED,
	                 lang: str = Config.KOKORO_DEFAULT_LANG,
	                 **kwargs):
	        """Load the Kokoro model and store the synthesis settings.

	        Parameters:
	            engine: Optional callable used to synthesize audio. It is invoked
	                with the keyword arguments ``text``, ``output_file``,
	                ``voice_name``, ``speed`` and ``lang``. Defaults to
	                :meth:`text_to_speech`.
	            model_path (str): Passed to ``Kokoro`` as its first argument.
	            voices_path (str): Passed to ``Kokoro`` as its second argument.
	            voice (str): Voice name forwarded to the engine.
	            speed (float): Speed value forwarded to the engine.
	            lang (str): Language value forwarded to the engine.
	            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
	        """
	        self.kokoro = Kokoro(model_path, voices_path)
	        self.voice = voice
	        self.speed = speed
	        self.lang = lang

	        # Fall back to the built-in Kokoro synthesis when no engine is given.
	        self.engine = self.text_to_speech if engine is None else engine

	        # The base class is initialised last, after this service's own state.
	        super().__init__(**kwargs)

	    def get_data_hash(self, input_data: dict) -> str:
	        """Return a SHA-256 hex digest identifying ``input_data``.

	        Parameters:
	            input_data (dict): JSON-serializable description of the request
	                (e.g., text, voice, etc.).

	        Returns:
	            str: Hex digest of the key-sorted JSON encoding of ``input_data``.
	        """
	        # Sorting the keys makes the digest independent of insertion order.
	        data_str = json.dumps(input_data, sort_keys=True)
	        return hashlib.sha256(data_str.encode('utf-8')).hexdigest()

	    def text_to_speech(self, text, output_file, voice_name, speed, lang):
	        """Synthesize ``text`` with Kokoro ONNX and save it as 16-bit PCM WAV.

	        The samples are peak-normalized to the range [-1, 1] before being
	        scaled to ``int16`` so that quiet output becomes audible.

	        Parameters:
	            text: Text to synthesize.
	            output_file: Destination path handed to ``scipy.io.wavfile.write``.
	            voice_name: Forwarded to ``Kokoro.create`` as ``voice``.
	            speed: Forwarded to ``Kokoro.create`` as ``speed``.
	            lang: Forwarded to ``Kokoro.create`` as ``lang``.

	        Returns:
	            ``output_file``, unchanged.
	        """
	        samples, sample_rate = self.kokoro.create(
	            text, voice=voice_name, speed=speed, lang=lang
	        )
	        samples = np.asarray(samples)

	        # np.max raises on a zero-size array, so empty output is treated like
	        # silence and written as an empty clip instead of crashing.
	        peak = np.max(np.abs(samples)) if samples.size else 0
	        if peak > 0:
	            samples = samples / peak

	        # Convert to 16-bit integer PCM format.
	        samples = (samples * 32767).astype("int16")

	        write_wav(output_file, sample_rate, samples)
	        print(f"Saved at {output_file}")

	        return output_file

	    def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
	        """Generate (or fetch from cache) the voiceover audio for ``text``.

	        Parameters:
	            text (str): Text to synthesize.
	            cache_dir (str): Directory for the audio files. Defaults to
	                ``self.cache_dir`` (presumably set by the base class).
	            path (str): Optional audio file name relative to ``cache_dir``.
	                When omitted, the name is ``<hash of input_data>.mp3``.

	        Returns:
	            dict: The cached entry when ``self.get_cached_result`` finds one,
	            otherwise a dict with the keys ``input_text``, ``input_data`` and
	            ``original_audio``.
	        """
	        if cache_dir is None:
	            cache_dir = self.cache_dir

	        # Everything that changes the rendered audio belongs in the cache key.
	        # ``speed`` was previously missing, so changing it returned stale audio.
	        input_data = {
	            "input_text": text,
	            "service": "kokoro_self",
	            "voice": self.voice,
	            "lang": self.lang,
	            "speed": self.speed,
	        }
	        cached_result = self.get_cached_result(input_data, cache_dir)
	        if cached_result is not None:
	            return cached_result

	        audio_path = self.get_data_hash(input_data) + ".mp3" if path is None else path

	        # Derive the intermediate .wav path from the final path by swapping only
	        # the file extension (a plain str.replace would also rewrite ".mp3"
	        # inside directory names).
	        mp3_path = Path(cache_dir) / audio_path
	        wav_path = mp3_path.with_suffix(".wav")
	        if wav_path == mp3_path:
	            # A caller-supplied ``path`` already ending in ".wav" would make the
	            # intermediate file collide with the final output.
	            wav_path = mp3_path.with_name(mp3_path.name + ".tmp.wav")

	        # ``remove_bookmarks`` is a text helper (presumably strips
	        # ``<bookmark .../>`` tags -- confirm against manim_voiceover.helper);
	        # applying it here keeps bookmark markup out of the synthesized speech.
	        # ``input_text`` in the returned dict keeps the original text.
	        self.engine(
	            text=remove_bookmarks(text),
	            output_file=str(wav_path),
	            voice_name=self.voice,
	            speed=self.speed,
	            lang=self.lang,
	        )

	        # Convert .wav to .mp3
	        wav2mp3(str(wav_path), str(mp3_path))

	        # Remove the intermediate .wav file. The previous code passed the path
	        # to ``remove_bookmarks`` and discarded the result, which deleted nothing.
	        # NOTE(review): ``wav2mp3`` may already delete the .wav depending on its
	        # defaults -- ``missing_ok`` keeps this cleanup safe either way.
	        wav_path.unlink(missing_ok=True)

	        return {
	            "input_text": text,
	            "input_data": input_data,
	            "original_audio": audio_path,
	        }