dev-mode-orpheus-tts

Paused

App Files Files Community

dev-mode-orpheus-tts / orpheus-tts /engine_class.py

Tomtom84

Update orpheus-tts/engine_class.py

66fc62a verified about 1 month ago

raw

history blame

10.8 kB

	import asyncio
	import torch
	import os
	from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
	from transformers import AutoTokenizer
	import threading
	import queue
	from decoder import tokens_decoder_sync
	from kartoffel_decoder import tokens_decoder_kartoffel_sync

	class OrpheusModel:
	    """Wrap a vLLM ``AsyncLLMEngine`` and a tokenizer for Orpheus-style TTS.

	    The class resolves a model alias to a repository id, builds the engine,
	    picks the voice list and tokenizer that match the model name, formats
	    prompts, and exposes synchronous generators for token text and audio.
	    """

	    # Token ids placed around the tokenized prompt. The values are taken
	    # unchanged from the previous implementation.
	    # NOTE(review): their meaning is defined by the model vocabulary, which is
	    # not visible in this file - confirm before changing them.
	    _PROMPT_START_TOKEN_ID = 128259
	    _KARTOFFEL_END_TOKEN_IDS = (128009, 128260)
	    _ORPHEUS_END_TOKEN_IDS = (128009, 128260, 128261, 128257)

	    # Voice names offered per model family (selected by substring of the name).
	    _KARTOFFEL_VOICES = ("Jakob", "Anton", "Julian", "Sophie", "Marie", "Mia")
	    _GERMAN_FT_VOICES = ("jana", "thomas", "max")
	    _ENGLISH_VOICES = ("zoe", "zac", "jess", "leo", "mia", "julia", "leah", "tara")

	    # Tokenizer used when none is given and the model is not a
	    # "german"/"kartoffel" model.
	    _DEFAULT_TOKENIZER = 'canopylabs/orpheus-3b-0.1-pretrained'

	    def __init__(self, model_name, dtype=torch.bfloat16, tokenizer=None, **engine_kwargs):
	        """Resolve the model, start the engine and load the tokenizer.

	        Args:
	            model_name: Alias known to ``_map_model_params`` or a model
	                path/repository id that is used as-is.
	            dtype: Value forwarded to the engine as ``dtype``.
	            tokenizer: Optional tokenizer path or repository id. When omitted,
	                the model itself is used for names containing "german" or
	                "kartoffel", otherwise the default Orpheus tokenizer.
	            **engine_kwargs: Extra vLLM engine arguments; they override the
	                built-in engine defaults.

	        Raises:
	            ValueError: If ``model_name`` is a known but unsupported alias.
	        """
	        self.model_name = self._map_model_params(model_name)
	        self.dtype = dtype
	        self.engine_kwargs = engine_kwargs  # vLLM engine kwargs
	        self.engine = self._setup_engine()

	        # Decide on the *resolved* name so that an alias such as "medium-3b"
	        # gets the voices of the repository it actually maps to.
	        self.available_voices = self._voices_for_model(self.model_name)

	        resolved_lower = self.model_name.lower()
	        if tokenizer:
	            tokenizer_path = tokenizer
	        elif "german" in resolved_lower or "kartoffel" in resolved_lower:
	            tokenizer_path = self.model_name  # Try the model itself as tokenizer
	        else:
	            tokenizer_path = self._DEFAULT_TOKENIZER  # Original fallback

	        self.tokenizer = self._load_tokenizer(tokenizer_path)

	    @classmethod
	    def _voices_for_model(cls, model_name):
	        """Return a fresh list of voice names for the given model name."""
	        lowered = model_name.lower()
	        if "kartoffel" in lowered:
	            return list(cls._KARTOFFEL_VOICES)
	        if "3b-de-ft" in lowered:
	            return list(cls._GERMAN_FT_VOICES)
	        # Original English voices as fallback
	        return list(cls._ENGLISH_VOICES)

	    def _load_tokenizer(self, tokenizer_path):
	        """Load tokenizer from local path or HuggingFace hub.

	        On any failure the error is printed and the "gpt2" tokenizer is
	        returned instead.
	        NOTE(review): the prompt code appends token ids above 128000; confirm
	        that the "gpt2" fallback is usable with them or replace the fallback.
	        """
	        try:
	            # A local directory must not trigger a hub lookup.
	            if os.path.isdir(tokenizer_path):
	                return AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
	            return AutoTokenizer.from_pretrained(tokenizer_path)
	        except Exception as e:
	            print(f"Error loading tokenizer: {e}")
	            print("Falling back to default tokenizer")
	            return AutoTokenizer.from_pretrained("gpt2")

	    def _map_model_params(self, model_name):
	        """Translate a model alias into a repository id.

	        Returns:
	            The mapped repository id for a known alias, otherwise
	            ``model_name`` unchanged.

	        Raises:
	            ValueError: If ``model_name`` is one of the unsupported aliases.
	        """
	        model_map = {
	            "medium-3b": {
	                "repo_id": "canopylabs/3b-de-ft-research_release",
	            },
	        }
	        unsupported_models = ["nano-150m", "micro-400m", "small-1b"]
	        if model_name in unsupported_models:
	            raise ValueError(f"Model {model_name} is not supported. Only medium-3b is supported, small, micro and nano models will be released very soon")
	        if model_name in model_map:
	            return model_map[model_name]["repo_id"]
	        return model_name

	    def _engine_options(self):
	        """Return the tunable engine options with caller overrides applied.

	        The built-in values come first and ``self.engine_kwargs`` is merged on
	        top, so a caller-supplied option replaces the default instead of
	        colliding with it as a duplicate keyword argument.
	        """
	        options = {
	            "gpu_memory_utilization": 0.85,
	            "max_model_len": 8192,
	            "trust_remote_code": True,
	            "enforce_eager": True,  # Disable CUDA graphs for better compatibility
	        }
	        options.update(self.engine_kwargs)
	        return options

	    def _setup_engine(self):
	        """Build the ``AsyncLLMEngine`` for ``self.model_name``.

	        The defaults were chosen for Hugging Face Spaces with an L4 GPU.
	        ``model`` and ``dtype`` stay explicit keywords, so passing them through
	        ``engine_kwargs`` still fails loudly rather than silently diverging
	        from ``self.model_name`` / ``self.dtype``.
	        """
	        engine_args = AsyncEngineArgs(
	            model=self.model_name,
	            dtype=self.dtype,
	            **self._engine_options()
	        )
	        return AsyncLLMEngine.from_engine_args(engine_args)

	    def validate_voice(self, voice):
	        """Check that ``voice`` is offered by the loaded model.

	        A falsy ``voice`` (``None`` or empty) is accepted without a check.

	        Raises:
	            ValueError: If ``voice`` is not in ``self.available_voices``.
	        """
	        # The voice list lives on this object, not on the vLLM engine.
	        if voice and voice not in self.available_voices:
	            raise ValueError(f"Voice {voice} is not available for model {self.model_name}")

	    def _wrap_prompt_ids(self, text, end_token_ids):
	        """Tokenize ``text`` and surround it with the start/end token ids.

	        Returns:
	            A ``(input_ids, wrapped_ids)`` pair of tensors, where
	            ``wrapped_ids`` is ``[start] + input_ids + end_token_ids``
	            concatenated along dimension 1.
	        """
	        start_token = torch.tensor([[self._PROMPT_START_TOKEN_ID]], dtype=torch.int64)
	        end_tokens = torch.tensor([list(end_token_ids)], dtype=torch.int64)
	        input_ids = self.tokenizer(text, return_tensors="pt").input_ids
	        return input_ids, torch.cat([start_token, input_ids, end_tokens], dim=1)

	    def _format_kartoffel_prompt(self, prompt, voice):
	        """Build the prompt string for Kartoffel models.

	        The layout follows the reference implementation cited by the original
	        author: start token, tokenized "voice: prompt", then two end tokens.
	        """
	        full_prompt = f"{voice}: {prompt}" if voice else prompt
	        input_ids, modified_input_ids = self._wrap_prompt_ids(
	            full_prompt, self._KARTOFFEL_END_TOKEN_IDS
	        )

	        print(f"DEBUG KARTOFFEL: Original prompt: '{full_prompt}'")
	        print(f"DEBUG KARTOFFEL: input_ids shape: {input_ids.shape}")
	        print(f"DEBUG KARTOFFEL: modified_input_ids shape: {modified_input_ids.shape}")
	        print(f"DEBUG KARTOFFEL: modified_input_ids: {modified_input_ids[0].tolist()}")

	        # Decode back to text, keeping the special tokens in the string.
	        decoded_text = self.tokenizer.decode(modified_input_ids[0], skip_special_tokens=False)
	        print(f"DEBUG KARTOFFEL: Final decoded prompt: '{decoded_text}'")
	        return decoded_text

	    def _format_prompt(self, prompt, voice="Jakob", model_type="larger"):
	        """Turn user text into the prompt string expected by the model.

	        Args:
	            prompt: Text to be spoken.
	            voice: Speaker name; a falsy value omits the speaker part.
	            model_type: ``"smaller"`` selects the ``<custom_token_N>`` text
	                template; any other value uses the token-id wrapping. Ignored
	                for Kartoffel models.

	        Returns:
	            The formatted prompt string.
	        """
	        # Different model families need different prompt layouts.
	        print(f"DEBUG: Model name for format check: {self.model_name}")
	        if "kartoffel" in self.model_name.lower():
	            print("DEBUG: Using Kartoffel format")
	            return self._format_kartoffel_prompt(prompt, voice)

	        # Original Orpheus format (used for the Canopy German and English models).
	        if model_type == "smaller":
	            speaker_tag = f"[{voice}]" if voice else ""
	            return f"<custom_token_3>{prompt}{speaker_tag}<custom_token_4><custom_token_5>"

	        adapted_prompt = f"{voice}: {prompt}" if voice else prompt
	        _, all_input_ids = self._wrap_prompt_ids(adapted_prompt, self._ORPHEUS_END_TOKEN_IDS)
	        return self.tokenizer.decode(all_input_ids[0])

	    def generate_tokens_sync(self, prompt, voice=None, request_id="req-001", temperature=0.6, top_p=0.95, max_tokens=4000, stop_token_ids=(128258,), repetition_penalty=1.1):
	        """Yield generated text from the async engine through a blocking generator.

	        The engine is consumed by a coroutine running in a helper thread with
	        its own event loop; results are handed over through a queue and a
	        ``None`` sentinel marks the end of the stream.

	        Args:
	            prompt: Text to be spoken.
	            voice: Optional speaker name forwarded to ``_format_prompt``.
	            request_id: Identifier passed to the engine for this request.
	                NOTE(review): the default is a fixed string - confirm that
	                callers running requests concurrently pass distinct ids.
	            temperature, top_p, max_tokens, repetition_penalty: Forwarded to
	                ``SamplingParams``.
	            stop_token_ids: Iterable of token ids that stop generation, or
	                ``None`` to pass ``None`` through.

	        Yields:
	            The ``text`` of the first output of every engine result.

	        Note:
	            Errors raised during generation are printed with a traceback and
	            end the stream early; they are not re-raised to the consumer.
	        """
	        print(f"DEBUG: Original prompt: {prompt}")
	        print(f"DEBUG: Voice: {voice}")
	        print(f"DEBUG: Model name: {self.model_name}")
	        prompt_string = self._format_prompt(prompt, voice)
	        print(f"DEBUG: Formatted prompt: {prompt_string}")

	        sampling_params = SamplingParams(
	            temperature=temperature,
	            top_p=top_p,
	            max_tokens=max_tokens,  # Adjust max_tokens as needed.
	            # Copy so the shared default (or a caller's list) is never aliased.
	            stop_token_ids=None if stop_token_ids is None else list(stop_token_ids),
	            repetition_penalty=repetition_penalty,
	        )

	        token_queue = queue.Queue()
	        token_count = 0

	        async def async_producer():
	            nonlocal token_count
	            print(f"DEBUG: Starting vLLM generation with prompt: {repr(prompt_string[:100])}...")
	            print(f"DEBUG: Sampling params: temp={sampling_params.temperature}, top_p={sampling_params.top_p}, max_tokens={sampling_params.max_tokens}")

	            try:
	                async for result in self.engine.generate(prompt=prompt_string, sampling_params=sampling_params, request_id=request_id):
	                    # NOTE(review): whether `text` is cumulative or incremental
	                    # depends on the engine's output settings - confirm that
	                    # the decoders expect what is delivered here.
	                    token_text = result.outputs[0].text
	                    print(f"DEBUG: Generated token {token_count}: {repr(token_text)}")
	                    token_queue.put(token_text)
	                    token_count += 1

	                    # Show progress every 10 tokens
	                    if token_count % 10 == 0:
	                        print(f"DEBUG: Generated {token_count} tokens so far...")

	                print(f"DEBUG: Generation completed. Total tokens: {token_count}")
	            except Exception as e:
	                print(f"DEBUG: Error during generation: {e}")
	                import traceback
	                traceback.print_exc()
	            finally:
	                # Always release the consumer, even after a failure.
	                token_queue.put(None)  # Sentinel to indicate completion.

	        def run_async():
	            asyncio.run(async_producer())

	        thread = threading.Thread(target=run_async)
	        thread.start()

	        while True:
	            token = token_queue.get()
	            if token is None:
	                break
	            yield token

	        thread.join()

	    def generate_speech(self, **kwargs):
	        """Create the audio generator for a request.

	        Args:
	            **kwargs: Forwarded unchanged to ``generate_tokens_sync``.

	        Returns:
	            The object returned by the decoder selected for this model.

	        Note:
	            ``generate_tokens_sync`` is a generator, so its body only runs
	            once the result is iterated; the ``try`` below therefore covers
	            setting up the pipeline, not the token generation itself.
	        """
	        print("DEBUG: Starting generate_speech")
	        try:
	            token_generator = self.generate_tokens_sync(**kwargs)
	            print("DEBUG: Token generator created successfully")

	            # Kartoffel decoder only for the Kartoffel model; the original
	            # decoder for the Canopy German and English models.
	            model_lower = self.model_name.lower()
	            if "kartoffel" in model_lower:
	                print("DEBUG: Using Kartoffel decoder for Kartoffel model")
	                audio_generator = tokens_decoder_kartoffel_sync(token_generator, self.tokenizer)
	            elif "3b-de-ft" in model_lower or "german" in model_lower:
	                print("DEBUG: Using original decoder for Canopy German model")
	                audio_generator = tokens_decoder_sync(token_generator)
	            else:
	                print("DEBUG: Using original decoder for English model")
	                audio_generator = tokens_decoder_sync(token_generator)

	            print("DEBUG: Audio decoder called successfully")
	            return audio_generator
	        except Exception as e:
	            print(f"DEBUG: Error in generate_speech: {e}")
	            raise