# Spaces: thanhkt/t2m (Running)
# t2m
# File size: 3,945 Bytes
# 9b5ca29

"""
Copyright (c) 2025 Xposed73
All rights reserved.
This file is part of the Manim Voiceover project.
"""

import hashlib
import json
import numpy as np
from pathlib import Path
from manim_voiceover.services.base import SpeechService
from kokoro_onnx import Kokoro
from manim_voiceover.helper import remove_bookmarks, wav2mp3
from scipy.io.wavfile import write as write_wav
from src.config.config import Config


class KokoroService(SpeechService):
    """Speech service class for kokoro_self (using text_to_speech via Kokoro ONNX).

    Audio is synthesized to a temporary ``.wav`` file, converted to ``.mp3``
    and described by a dictionary that the base ``SpeechService`` consumes.
    """

    def __init__(self, engine=None, 
                 model_path: str = Config.KOKORO_MODEL_PATH,
                 voices_path: str = Config.KOKORO_VOICES_PATH,
                 voice: str = Config.KOKORO_DEFAULT_VOICE,
                 speed: float = Config.KOKORO_DEFAULT_SPEED,
                 lang: str = Config.KOKORO_DEFAULT_LANG,
                 **kwargs):
        """
        Creates the Kokoro backend and stores the synthesis settings.

        Parameters:
            engine: Optional callable used to synthesize audio. It is invoked
                with the keyword arguments ``text``, ``output_file``,
                ``voice_name``, ``speed`` and ``lang``. Defaults to
                ``self.text_to_speech``.
            model_path (str): Path passed to ``Kokoro`` as the model file.
            voices_path (str): Path passed to ``Kokoro`` as the voices file.
            voice (str): Voice name forwarded to the engine.
            speed (float): Speed forwarded to the engine.
            lang (str): Language forwarded to the engine.
            **kwargs: Forwarded unchanged to ``SpeechService.__init__``.
        """
        self.kokoro = Kokoro(model_path, voices_path)
        self.voice = voice
        self.speed = speed
        self.lang = lang

        if engine is None:
            engine = self.text_to_speech  # Default to local function

        self.engine = engine
        super().__init__(**kwargs)

    def get_data_hash(self, input_data: dict) -> str:
        """
        Generates a hash based on the input data dictionary.
        The hash is used to create a unique identifier for the input data.

        Parameters:
            input_data (dict): A JSON-serializable dictionary of input data
                (e.g., text, voice, etc.).

        Returns:
            str: The SHA-256 hex digest of the key-sorted JSON encoding.
        """
        # Sorting the keys makes the digest independent of insertion order.
        data_str = json.dumps(input_data, sort_keys=True)
        return hashlib.sha256(data_str.encode('utf-8')).hexdigest()

    def text_to_speech(self, text, output_file, voice_name, speed, lang):
        """
        Generates speech from text using Kokoro ONNX and saves the audio file.
        Normalizes the audio to make it audible.

        Parameters:
            text: Text to synthesize.
            output_file: Destination path of the ``.wav`` file.
            voice_name: Voice passed to ``Kokoro.create``.
            speed: Speed passed to ``Kokoro.create``.
            lang: Language passed to ``Kokoro.create``.

        Returns:
            The ``output_file`` path that was written.
        """
        # Generate audio samples using Kokoro
        samples, sample_rate = self.kokoro.create(
            text, voice=voice_name, speed=speed, lang=lang
        )
        samples = np.asarray(samples)

        # Peak-normalize to the range [-1, 1]. Empty or all-zero output is
        # left untouched: np.max raises on an empty array and dividing by a
        # zero peak would produce NaNs.
        max_val = np.max(np.abs(samples)) if samples.size else 0
        if max_val > 0:
            samples = samples / max_val

        # Convert to 16-bit integer PCM format
        samples = (samples * 32767).astype("int16")

        # Save the normalized audio as a .wav file
        write_wav(output_file, sample_rate, samples)
        print(f"Saved at {output_file}")

        return output_file

    def generate_from_text(self, text: str, cache_dir: str = None, path: str = None) -> dict:
        """
        Synthesizes ``text`` to an ``.mp3`` file, reusing a cached result if one exists.

        Parameters:
            text (str): Text to speak. Bookmark tags are stripped before the
                text is handed to the engine.
            cache_dir (str): Directory for the audio files. Defaults to
                ``self.cache_dir``.
            path (str): File name of the ``.mp3`` relative to ``cache_dir``.
                Defaults to the hash of the input data plus ``.mp3``.

        Returns:
            dict: The cached entry when one matches, otherwise a dictionary
            with the keys ``input_text``, ``input_data`` and ``original_audio``.
        """
        if cache_dir is None:
            cache_dir = self.cache_dir

        # Every setting that changes the produced audio belongs in the cache
        # key; without "speed" a changed speed would return stale audio.
        input_data = {
            "input_text": text,
            "service": "kokoro_self",
            "voice": self.voice,
            "lang": self.lang,
            "speed": self.speed,
        }
        cached_result = self.get_cached_result(input_data, cache_dir)
        if cached_result is not None:
            return cached_result

        if path is None:
            audio_path = self.get_data_hash(input_data) + ".mp3"
        else:
            audio_path = path

        mp3_audio_path = Path(cache_dir) / audio_path
        # with_suffix only touches the extension, unlike str.replace, which
        # would rewrite any ".mp3" substring and do nothing when it is absent.
        wav_audio_path = mp3_audio_path.with_suffix(".wav")

        # Generate .wav file using the configured engine. Bookmark tags are
        # markup for the voiceover tracker and must not be spoken.
        self.engine(
            text=remove_bookmarks(text),
            output_file=str(wav_audio_path),
            voice_name=self.voice,
            speed=self.speed,
            lang=self.lang,
        )

        # Convert .wav to .mp3
        wav2mp3(str(wav_audio_path), str(mp3_audio_path))

        # Remove the intermediate .wav file. wav2mp3 may already have deleted
        # it, hence missing_ok; never delete the result if both paths coincide.
        if wav_audio_path != mp3_audio_path:
            wav_audio_path.unlink(missing_ok=True)

        json_dict = {
            "input_text": text,
            "input_data": input_data,
            "original_audio": audio_path,
        }

        return json_dict