File size: 6,601 Bytes
63ebe58 c17f584 63ebe58 c17f584 63ebe58 291cd8b 63ebe58 291cd8b 63ebe58 e77779d d662a4e 63ebe58 d662a4e 63ebe58 d662a4e 63ebe58 d660fca 63ebe58 d660fca 63ebe58 d660fca 63ebe58 e77779d 63ebe58 d660fca 63ebe58 e77779d 63ebe58 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import asyncio
import torch
import os
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams
from transformers import AutoTokenizer
import threading
import queue
from decoder import tokens_decoder_sync
class OrpheusModel:
    """Text-to-speech wrapper around a vLLM ``AsyncLLMEngine``.

    The instance owns the engine, a tokenizer, and the list of voice names
    accepted for the selected model. ``generate_speech`` is the entry point:
    it streams generated text from the engine through ``tokens_decoder_sync``.
    """

    def __init__(self, model_name, dtype=torch.bfloat16, tokenizer=None, **engine_kwargs):
        """Create the engine and load the tokenizer.

        Args:
            model_name: A short alias known to ``_map_model_params`` or any
                other model identifier, which is passed to vLLM unchanged.
            dtype: Value forwarded as ``dtype`` to ``AsyncEngineArgs``.
            tokenizer: Optional tokenizer path or hub id. When omitted, the
                German/Kartoffel models use ``model_name`` itself and every
                other model uses the original Orpheus tokenizer.
            **engine_kwargs: Extra keyword arguments for ``AsyncEngineArgs``.

        Raises:
            ValueError: If ``model_name`` is one of the unsupported aliases.
        """
        self.model_name = self._map_model_params(model_name)
        self.dtype = dtype
        self.engine_kwargs = engine_kwargs  # vLLM engine kwargs
        self.engine = self._setup_engine()

        # The voice list and the default tokenizer both depend on whether the
        # caller asked for the German Kartoffel model, so decide that once.
        lowered_name = model_name.lower()
        is_german = "german" in lowered_name or "kartoffel" in lowered_name

        if is_german:
            # Available voices for German Kartoffel model
            self.available_voices = ["Jakob", "Anton", "Julian", "Sophie", "Marie", "Mia"]
        else:
            # Original English voices as fallback
            self.available_voices = ["zoe", "zac", "jess", "leo", "mia", "julia", "leah", "tara"]

        # An explicitly provided tokenizer always wins.
        if tokenizer:
            tokenizer_path = tokenizer
        elif is_german:
            tokenizer_path = model_name  # Try using the same model as tokenizer
        else:
            tokenizer_path = 'canopylabs/orpheus-3b-0.1-pretrained'  # Original fallback

        self.tokenizer = self._load_tokenizer(tokenizer_path)

    def _load_tokenizer(self, tokenizer_path):
        """Load tokenizer from local path or HuggingFace hub.

        A local directory is loaded with ``local_files_only=True``; anything
        else is handed to ``AutoTokenizer.from_pretrained`` as-is. If loading
        fails for any reason, the error is printed and the ``"gpt2"``
        tokenizer is returned instead (best-effort fallback).
        """
        try:
            # Check if tokenizer_path is a local directory
            if os.path.isdir(tokenizer_path):
                return AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
            return AutoTokenizer.from_pretrained(tokenizer_path)
        except Exception as e:
            print(f"Error loading tokenizer: {e}")
            print("Falling back to default tokenizer")
            # NOTE(review): _format_prompt uses special token ids in the
            # 128xxx range; presumably the GPT-2 vocabulary does not cover
            # them, so confirm this fallback is actually usable.
            return AutoTokenizer.from_pretrained("gpt2")

    def _map_model_params(self, model_name):
        """Translate a short model alias into its repository id.

        Returns:
            The mapped ``repo_id`` for a known alias, otherwise ``model_name``
            unchanged.

        Raises:
            ValueError: If ``model_name`` is an alias that is not supported.
        """
        model_map = {
            "medium-3b": {
                "repo_id": "canopylabs/orpheus-tts-0.1-finetune-prod",
            },
        }
        unsupported_models = ["nano-150m", "micro-400m", "small-1b"]

        if model_name in unsupported_models:
            raise ValueError(f"Model {model_name} is not supported. Only medium-3b is supported, small, micro and nano models will be released very soon")
        if model_name in model_map:
            return model_map[model_name]["repo_id"]
        return model_name

    def _setup_engine(self):
        """Build the ``AsyncLLMEngine`` for ``self.model_name``.

        The values below are defaults only: any key present in
        ``self.engine_kwargs`` overrides the matching default instead of
        colliding with it as a duplicate keyword argument.
        """
        # Configure for Hugging Face Spaces with L4 GPU
        engine_options = {
            "gpu_memory_utilization": 0.85,
            "max_model_len": 8192,
            "trust_remote_code": True,
            "enforce_eager": True,  # Disable CUDA graphs for better compatibility
        }
        engine_options.update(self.engine_kwargs)

        engine_args = AsyncEngineArgs(
            model=self.model_name,
            dtype=self.dtype,
            **engine_options
        )
        return AsyncLLMEngine.from_engine_args(engine_args)

    def validate_voice(self, voice):
        """Check that ``voice`` is one of ``self.available_voices``.

        A falsy ``voice`` (``None`` or ``""``) is accepted without a check.

        Raises:
            ValueError: If ``voice`` is set but not in the available list.
        """
        # The voice list is stored on this object, not on the vLLM engine.
        if voice and voice not in self.available_voices:
            raise ValueError(f"Voice {voice} is not available for model {self.model_name}")

    def _format_prompt(self, prompt, voice="Sophie", model_type="larger"):
        """Wrap ``prompt`` in the model's special tokens and return it as text.

        Args:
            prompt: The text to be spoken.
            voice: Voice name prefixed as ``"<voice>: <prompt>"``; a falsy
                value leaves the prompt without a prefix.
            model_type: Unused; kept so existing callers keep working.

        Returns:
            The decoded string for: start token, tokenized prompt, end tokens.
        """
        # Use Kartoffel model format based on documentation
        full_prompt = f"{voice}: {prompt}" if voice else prompt

        # Kartoffel model token format
        start_token = torch.tensor([[128259]], dtype=torch.int64)
        end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)

        input_ids = self.tokenizer(full_prompt, return_tensors="pt").input_ids
        modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
        return self.tokenizer.decode(modified_input_ids[0])

    def generate_tokens_sync(
        self,
        prompt,
        voice=None,
        request_id="req-001",
        temperature=0.6,
        top_p=0.95,
        max_tokens=4000,
        stop_token_ids=(128258,),
        repetition_penalty=1.1,
    ):
        """Run generation on a worker thread and yield its text synchronously.

        The async engine is driven by ``asyncio.run`` inside a separate
        thread; results travel back through a queue so that this method can
        be consumed as a plain generator.

        Args:
            prompt: The text to be spoken.
            voice: Optional voice name passed to ``_format_prompt``.
            request_id: Id forwarded to ``engine.generate``.
                NOTE(review): the default is identical for every call;
                presumably the engine expects ids of in-flight requests to be
                distinct, so confirm before issuing concurrent requests.
            temperature, top_p, max_tokens, repetition_penalty: Forwarded to
                ``SamplingParams``.
            stop_token_ids: Iterable of stop token ids (copied into a fresh
                list for each call), or ``None`` to pass ``None`` through.

        Yields:
            ``result.outputs[0].text`` for each result produced by the engine.

        Raises:
            Exception: Whatever the generation thread raised, re-raised here
                once the stream has ended.
        """
        prompt_string = self._format_prompt(prompt, voice)
        print(f"DEBUG: Original prompt: {prompt}")
        print(f"DEBUG: Formatted prompt: {prompt_string}")

        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,  # Adjust max_tokens as needed.
            # Copy so the shared default (or a caller's list) is never aliased.
            stop_token_ids=None if stop_token_ids is None else list(stop_token_ids),
            repetition_penalty=repetition_penalty,
        )

        token_queue = queue.Queue()
        token_count = 0
        producer_errors = []  # filled by the worker thread if generation fails

        async def async_producer():
            nonlocal token_count
            async for result in self.engine.generate(prompt=prompt_string, sampling_params=sampling_params, request_id=request_id):
                # Place each token text into the queue.
                token_text = result.outputs[0].text
                print(f"DEBUG: Generated token {token_count}: {repr(token_text)}")
                token_queue.put(token_text)
                token_count += 1
            print(f"DEBUG: Generation completed. Total tokens: {token_count}")

        def run_async():
            try:
                asyncio.run(async_producer())
            except Exception as exc:
                producer_errors.append(exc)
            finally:
                # Sentinel to indicate completion. It is sent on failure too,
                # otherwise the consumer below would block on get() forever.
                token_queue.put(None)

        thread = threading.Thread(target=run_async)
        thread.start()

        while True:
            token = token_queue.get()
            if token is None:
                break
            yield token

        thread.join()
        if producer_errors:
            raise producer_errors[0]

    def generate_speech(self, **kwargs):
        """Return the audio generator for a request.

        ``kwargs`` are forwarded to ``generate_tokens_sync``; the resulting
        generator is handed to ``tokens_decoder_sync`` and that call's result
        is returned. Because ``generate_tokens_sync`` is a generator, its body
        only starts running when something iterates over it.
        """
        print("DEBUG: Starting generate_speech")
        try:
            token_generator = self.generate_tokens_sync(**kwargs)
            print("DEBUG: Token generator created successfully")

            audio_generator = tokens_decoder_sync(token_generator)
            print("DEBUG: Audio decoder called successfully")

            return audio_generator
        except Exception as e:
            print(f"DEBUG: Error in generate_speech: {e}")
            raise
|