# Author: Michael Hu — initial check in (commit 05b45a5)
from enum import Enum
from typing import List, Literal, Optional, Union
from pydantic import BaseModel, Field
class VoiceCombineRequest(BaseModel):
    """Payload for the voice-combination endpoint.

    The ``voices`` field accepts either a single '+'-delimited string
    (e.g. ``'voice1+voice2'``) or an explicit list of voice names.
    """

    # Required field (Ellipsis sentinel); validation accepts str or list of str.
    voices: Union[str, List[str]] = Field(
        ...,
        description="Either a string with voices separated by + (e.g. 'voice1+voice2') or a list of voice names to combine",
    )
class TTSStatus(str, Enum):
    """String-valued lifecycle states for a TTS generation job.

    Inherits from ``str`` so members compare/serialize as plain strings.
    """

    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"
    # Marks files that the cleanup process has removed.
    DELETED = "deleted"
# OpenAI-compatible schemas
class WordTimestamp(BaseModel):
    """Timing information for a single word (or token) of generated audio.

    All fields are required; times are expressed in seconds.
    """

    word: str = Field(..., description="The word or token")
    start_time: float = Field(..., description="Start time in seconds")
    end_time: float = Field(..., description="End time in seconds")
class CaptionedSpeechResponse(BaseModel):
    """Response schema for captioned speech endpoint.

    Carries the base64-encoded audio, its format, and (optionally)
    word-level timestamps for captioning.
    """

    audio: str = Field(..., description="The generated audio data encoded in base 64")
    audio_format: str = Field(..., description="The format of the output audio")
    # Was annotated Optional[...] yet declared required (...). Defaulting to
    # None makes the declaration consistent with the annotation and lets the
    # field be omitted when no timestamps are produced — backward-compatible
    # for all existing constructors that pass it explicitly.
    timestamps: Optional[List[WordTimestamp]] = Field(
        default=None, description="Word-level timestamps"
    )
class NormalizationOptions(BaseModel):
    """Options for the normalization system.

    Each flag toggles one text-normalization pass applied before synthesis.
    """

    # Master switch for the normalization pipeline.
    normalize: bool = Field(
        default=True,
        description="Normalizes input text to make it easier for the model to say",
    )
    unit_normalization: bool = Field(
        default=False, description="Transforms units like 10KB to 10 kilobytes"
    )
    url_normalization: bool = Field(
        default=True,
        description="Changes urls so they can be properly pronounced by kokoro",
    )
    # Fixed typo in user-facing description: "pronouced" -> "pronounced".
    email_normalization: bool = Field(
        default=True,
        description="Changes emails so they can be properly pronounced by kokoro",
    )
    optional_pluralization_normalization: bool = Field(
        default=True,
        description="Replaces (s) with s so some words get pronounced correctly",
    )
    # Fixed typo in user-facing description: "pronouced" -> "pronounced".
    phone_normalization: bool = Field(
        default=True,
        description="Changes phone numbers so they can be properly pronounced by kokoro",
    )
class OpenAISpeechRequest(BaseModel):
    """Request schema for OpenAI-compatible speech endpoint.

    Mirrors the OpenAI ``/v1/audio/speech`` request shape with extensions
    (download format/link, language code, normalization options).
    """

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    # The only required field: the text to synthesize.
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    download_format: Optional[Literal["mp3", "opus", "aac", "flac", "wav", "pcm"]] = (
        Field(
            default=None,
            description="Optional different format for the final download. If not provided, uses response_format.",
        )
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # Use default_factory instead of a shared model instance as the default:
    # behavior-identical in pydantic, but each request gets its own object.
    normalization_options: Optional[NormalizationOptions] = Field(
        default_factory=NormalizationOptions,
        description="Options for the normalization system",
    )
class CaptionedSpeechRequest(BaseModel):
    """Request schema for captioned speech endpoint.

    Same shape as the OpenAI-compatible speech request, plus
    ``return_timestamps`` to request word-level timing data.
    """

    model: str = Field(
        default="kokoro",
        description="The model to use for generation. Supported models: tts-1, tts-1-hd, kokoro",
    )
    # The only required field: the text to synthesize.
    input: str = Field(..., description="The text to generate audio for")
    voice: str = Field(
        default="af_heart",
        description="The voice to use for generation. Can be a base voice or a combined voice name.",
    )
    response_format: Literal["mp3", "opus", "aac", "flac", "wav", "pcm"] = Field(
        default="mp3",
        description="The format to return audio in. Supported formats: mp3, opus, flac, wav, pcm. PCM format returns raw 16-bit samples without headers. AAC is not currently supported.",
    )
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="The speed of the generated audio. Select a value from 0.25 to 4.0.",
    )
    stream: bool = Field(
        default=True,  # Default to streaming for OpenAI compatibility
        description="If true (default), audio will be streamed as it's generated. Each chunk will be a complete sentence.",
    )
    return_timestamps: bool = Field(
        default=True,
        description="If true (default), returns word-level timestamps in the response",
    )
    return_download_link: bool = Field(
        default=False,
        description="If true, returns a download link in X-Download-Path header after streaming completes",
    )
    lang_code: Optional[str] = Field(
        default=None,
        description="Optional language code to use for text processing. If not provided, will use first letter of voice name.",
    )
    # Use default_factory instead of a shared model instance as the default:
    # behavior-identical in pydantic, but each request gets its own object.
    normalization_options: Optional[NormalizationOptions] = Field(
        default_factory=NormalizationOptions,
        description="Options for the normalization system",
    )