import asyncio
import os
import queue
import threading

import torch
from transformers import AutoTokenizer
from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams

from decoder import tokens_decoder_sync

class OrpheusModel:
    def __init__(self, model_name, dtype=torch.bfloat16, tokenizer=None, **engine_kwargs):
        self.model_name = self._map_model_params(model_name)
        self.dtype = dtype
        self.engine_kwargs = engine_kwargs  # vLLM engine kwargs
        self.engine = self._setup_engine()
        # Available voices for German Kartoffel model
        if "german" in model_name.lower() or "kartoffel" in model_name.lower():
            self.available_voices = ["Jakob", "Anton", "Julian", "Sophie", "Marie", "Mia"]
        else:
            # Original English voices as fallback
            self.available_voices = ["zoe", "zac", "jess", "leo", "mia", "julia", "leah", "tara"]
        
        # Use provided tokenizer path or default to model_name
        # For German models, try the model itself first, then fallback to original tokenizer
        if tokenizer:
            tokenizer_path = tokenizer
        elif "german" in model_name.lower() or "kartoffel" in model_name.lower():
            tokenizer_path = model_name  # Try using the same model as tokenizer
        else:
            tokenizer_path = 'canopylabs/orpheus-3b-0.1-pretrained'  # Original fallback
        
        self.tokenizer = self._load_tokenizer(tokenizer_path)
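
    # Example (illustrative): a tokenizer path, local or on the Hub, can be
    # supplied explicitly instead of relying on the fallback logic above:
    #   OrpheusModel("medium-3b", tokenizer="canopylabs/orpheus-3b-0.1-pretrained")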

    def _load_tokenizer(self, tokenizer_path):
        """Load tokenizer from local path or HuggingFace hub"""
        try:
            # Check if tokenizer_path is a local directory
            if os.path.isdir(tokenizer_path):
                return AutoTokenizer.from_pretrained(tokenizer_path, local_files_only=True)
            else:
                return AutoTokenizer.from_pretrained(tokenizer_path)
        except Exception as e:
            print(f"Error loading tokenizer: {e}")
            print(f"Falling back to default tokenizer")
            return AutoTokenizer.from_pretrained("gpt2")
    
    def _map_model_params(self, model_name):
        model_map = {
            # "nano-150m":{
            #     "repo_id": "canopylabs/orpheus-tts-0.1-finetune-prod",
            # }, 
            # "micro-400m":{
            #     "repo_id": "canopylabs/orpheus-tts-0.1-finetune-prod",
            # }, 
            # "small-1b":{
            #     "repo_id": "canopylabs/orpheus-tts-0.1-finetune-prod",
            # },
            "medium-3b":{
                "repo_id": "canopylabs/orpheus-tts-0.1-finetune-prod",
            },
        }
        unsupported_models = ["nano-150m", "micro-400m", "small-1b"]
        if model_name in unsupported_models:
            raise ValueError(f"Model {model_name} is not supported yet. Only medium-3b is currently available; the small, micro, and nano models will be released soon.")
        elif model_name in model_map:
            return model_map[model_name]["repo_id"]
        else:
            return model_name
        
    def _setup_engine(self):
        # Defaults tuned for Hugging Face Spaces with an L4 GPU. Caller-supplied
        # engine_kwargs override these defaults; passing both sets directly to
        # AsyncEngineArgs would raise a duplicate keyword argument error instead.
        engine_config = dict(
            model=self.model_name,
            dtype=self.dtype,
            gpu_memory_utilization=0.85,
            max_model_len=8192,
            trust_remote_code=True,
            enforce_eager=True,  # Disable CUDA graphs for better compatibility
        )
        engine_config.update(self.engine_kwargs)

        return AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**engine_config))
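
    # Example (illustrative): OrpheusModel("medium-3b", max_model_len=4096)
    # lowers the context window by overriding the default above; any other
    # AsyncEngineArgs field can be overridden through engine_kwargs the same way.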
    
    def validate_voice(self, voice):
        # Voices are tracked on the model itself, not on the vLLM engine.
        if voice and voice not in self.available_voices:
            raise ValueError(f"Voice {voice} is not available for model {self.model_name}")
    
    def _format_prompt(self, prompt, voice="Sophie"):
        # Use the Kartoffel prompt format described in its documentation
        if voice:
            full_prompt = f"{voice}: {prompt}"
        else:
            full_prompt = prompt

        # Kartoffel token framing (Llama-3-style special tokens, matching the
        # upstream Orpheus prompt layout): 128259 opens the human turn,
        # 128009 marks end-of-text, 128260 closes the human turn.
        start_token = torch.tensor([[128259]], dtype=torch.int64)
        end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
        
        input_ids = self.tokenizer(full_prompt, return_tensors="pt").input_ids
        modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
        
        prompt_string = self.tokenizer.decode(modified_input_ids[0])
        return prompt_string
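
    # Example (illustrative): _format_prompt("Wie geht es dir?", voice="Sophie")
    # returns the decoded string for the token sequence
    # [128259] + tokenize("Sophie: Wie geht es dir?") + [128009, 128260].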

 
    # 128258 is assumed to be the model's end-of-speech token (as in upstream
    # Orpheus) and is used as the default stop id.
    def generate_tokens_sync(self, prompt, voice=None, request_id="req-001", temperature=0.6, top_p=0.95, max_tokens=4000, stop_token_ids=[128258], repetition_penalty=1.1):
        prompt_string = self._format_prompt(prompt, voice)
        print(f"DEBUG: Original prompt: {prompt}")
        print(f"DEBUG: Formatted prompt: {prompt_string}")
        
        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,  # Adjust as needed for longer utterances.
            stop_token_ids=stop_token_ids,
            repetition_penalty=repetition_penalty,
        )

        token_queue = queue.Queue()
        token_count = 0

        async def async_producer():
            nonlocal token_count
            async for result in self.engine.generate(prompt=prompt_string, sampling_params=sampling_params, request_id=request_id):
                # vLLM yields the cumulative text generated so far on each step;
                # queue it for the consumer thread to decode.
                token_text = result.outputs[0].text
                print(f"DEBUG: Stream update {token_count}: {repr(token_text)}")
                token_queue.put(token_text)
                token_count += 1
            print(f"DEBUG: Generation completed. Total stream updates: {token_count}")
            token_queue.put(None)  # Sentinel to indicate completion.

        def run_async():
            asyncio.run(async_producer())

        thread = threading.Thread(target=run_async)
        thread.start()

        while True:
            token = token_queue.get()
            if token is None:
                break
            yield token

        thread.join()
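
    # Example (illustrative): the raw text stream can also be consumed directly,
    # e.g. to inspect model output before audio decoding:
    #   for text in model.generate_tokens_sync(prompt="Hello there", voice="tara"):
    #       print(repr(text))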
    
    def generate_speech(self, **kwargs):
        print("DEBUG: Starting generate_speech")
        try:
            token_generator = self.generate_tokens_sync(**kwargs)
            print("DEBUG: Token generator created successfully")
            
            audio_generator = tokens_decoder_sync(token_generator)
            print("DEBUG: Audio decoder called successfully")
            
            return audio_generator
        except Exception as e:
            print(f"DEBUG: Error in generate_speech: {e}")
            raise  # Re-raise with the original traceback intact.
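

# Minimal usage sketch (illustrative, not part of the library): streams the
# generated audio into a WAV file. Assumes tokens_decoder_sync yields 16-bit
# mono PCM chunks at 24 kHz, as in the upstream Orpheus decoder; adjust the
# WAV parameters if your decoder differs.
if __name__ == "__main__":
    import wave

    model = OrpheusModel(model_name="medium-3b")  # resolves to canopylabs/orpheus-tts-0.1-finetune-prod
    with wave.open("output.wav", "wb") as wf:
        wf.setnchannels(1)      # mono
        wf.setsampwidth(2)      # 16-bit samples
        wf.setframerate(24000)  # assumed Orpheus decoder sample rate
        for chunk in model.generate_speech(prompt="Hello, how are you today?", voice="tara"):
            wf.writeframes(chunk)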