dev-mode-orpheus-tts

Paused

App Files Files Community

Tomtom84 committed on Jun 9

Commit

d4e3b98

verified ·

1 Parent(s): 91da710

Create kartoffel_decoder.py

Browse files

Files changed (1) hide show

orpheus-tts/kartoffel_decoder.py +196 -0

orpheus-tts/kartoffel_decoder.py ADDED Viewed

	@@ -0,0 +1,196 @@

+from snac import SNAC
+import numpy as np
+import torch
+import asyncio
+import threading
+import queue
+import os
+# Kartoffel-specific constants
+# Token ids at or above this value are kept as audio codes, and this offset is
+# subtracted from them (see extract_kartoffel_tokens).
+CODE_TOKEN_OFFSET = 128266
+# Token id after which audio codes are collected from the encoded text.
+CODE_START_TOKEN_ID = 128257
+# Token id that is explicitly excluded when collecting audio codes.
+CODE_REMOVE_TOKEN_ID = 128258
+# The SNAC model is loaded once at import time and used by
+# redistribute_codes_kartoffel for decoding.
+print("DEBUG KARTOFFEL: Loading SNAC model...")
+model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
+# The SNAC_DEVICE environment variable overrides the device; otherwise CUDA is
+# used when available and CPU as the fallback.
+snac_device = os.environ.get("SNAC_DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(snac_device)
+# Half precision is enabled only when the device string is exactly "cuda";
+# a value such as "cuda:0" keeps the model in its loaded precision.
+if snac_device == "cuda":
+    model = model.half()
+# Repeats the eval() call already made on the load line above.
+model.eval()
+print(f"DEBUG KARTOFFEL: SNAC model loaded successfully on device: {snac_device}")
+def redistribute_codes_kartoffel(code_list):
+    """Split flat Kartoffel audio codes into three layers and decode them.
+
+    The codes are consumed in groups of 7. Within each group, position 0 goes
+    to the first layer, positions 1 and 4 to the second layer, and positions
+    2, 3, 5 and 6 to the third layer; the code at position k is shifted down
+    by k * 4096 before it is stored. Trailing codes that do not fill a
+    complete group are ignored.
+
+    Args:
+        code_list: Sequence of integer codes (as returned by
+            extract_kartoffel_tokens). May be empty or None.
+
+    Returns:
+        The tensor returned by ``model.decode`` for the three layers, or an
+        empty float32 tensor of shape (1, 0) on ``snac_device`` when there is
+        no complete group or when any shifted code is outside the expected
+        range.
+    """
+    group_size = 7
+    # Every group position is shifted by a multiple of 4096, so each layer
+    # value is expected to land in [0, 4096) after the shift.
+    # NOTE(review): assumes the SNAC codebooks hold 4096 entries - confirm.
+    codebook_size = 4096
+    num_groups = len(code_list) // group_size if code_list else 0
+    if num_groups == 0:
+        return torch.tensor([[]], device=snac_device, dtype=torch.float32)
+    layer_1, layer_2, layer_3 = [], [], []
+    # Only complete groups of 7 are visited; a trailing partial group is skipped.
+    for base_idx in range(0, num_groups * group_size, group_size):
+        c0, c1, c2, c3, c4, c5, c6 = code_list[base_idx:base_idx + group_size]
+        layer_1.append(c0)
+        layer_2.extend((c1 - codebook_size, c4 - 4 * codebook_size))
+        layer_3.extend((
+            c2 - 2 * codebook_size,
+            c3 - 3 * codebook_size,
+            c5 - 5 * codebook_size,
+            c6 - 6 * codebook_size,
+        ))
+    # Reject the chunk instead of handing out-of-range values to the model:
+    # a malformed code would otherwise raise inside decode and end the whole
+    # stream instead of costing a single chunk.
+    out_of_range = [
+        code
+        for layer in (layer_1, layer_2, layer_3)
+        for code in layer
+        if not 0 <= code < codebook_size
+    ]
+    if out_of_range:
+        print(f"DEBUG KARTOFFEL: {len(out_of_range)} code(s) outside [0, {codebook_size}). Skipping chunk.")
+        return torch.tensor([[]], device=snac_device, dtype=torch.float32)
+    codes = [
+        torch.tensor(layer_1, device=snac_device).unsqueeze(0),
+        torch.tensor(layer_2, device=snac_device).unsqueeze(0),
+        torch.tensor(layer_3, device=snac_device).unsqueeze(0),
+    ]
+    # Inference only: no gradients are needed for decoding.
+    with torch.no_grad():
+        audio_hat = model.decode(codes)
+    return audio_hat
+def convert_to_audio_kartoffel(audio_tensor):
+    """Turn an audio tensor into raw PCM16 bytes.
+
+    The tensor is squeezed, moved to the CPU, cast to float32, scaled by
+    32767, clipped to the int16 range and cast to int16.
+
+    Returns:
+        The int16 samples as bytes, or ``b''`` when the tensor is None or has
+        no elements.
+    """
+    has_samples = audio_tensor is not None and audio_tensor.numel() != 0
+    if not has_samples:
+        return b''
+    samples = audio_tensor.squeeze().cpu().to(torch.float32).numpy()
+    # Scale to the int16 amplitude range, then clamp anything that overshoots
+    scaled = samples * 32767
+    pcm16 = np.clip(scaled, -32768, 32767).astype(np.int16)
+    return pcm16.tobytes()
+def extract_kartoffel_tokens(token_text, tokenizer):
+    """Return the audio codes contained in a piece of generated text.
+
+    The text is encoded with ``tokenizer.encode``. Everything up to and
+    including the first CODE_START_TOKEN_ID is discarded. Of the remaining
+    ids, those equal to CODE_REMOVE_TOKEN_ID or below CODE_TOKEN_OFFSET are
+    dropped, and CODE_TOKEN_OFFSET is subtracted from the rest.
+
+    Returns:
+        A list of codes; empty when no start token is present or when any
+        step raises (the error is printed, not propagated).
+    """
+    try:
+        ids = tokenizer.encode(token_text)
+        # Position of the first start marker, or -1 when there is none
+        marker_pos = next(
+            (pos for pos, tid in enumerate(ids) if tid == CODE_START_TOKEN_ID),
+            -1,
+        )
+        if marker_pos < 0:
+            return []
+        # Filter and shift in one pass over everything after the marker
+        return [
+            tid - CODE_TOKEN_OFFSET
+            for tid in ids[marker_pos + 1:]
+            if tid != CODE_REMOVE_TOKEN_ID and tid >= CODE_TOKEN_OFFSET
+        ]
+    except Exception as err:
+        print(f"DEBUG KARTOFFEL: Error extracting tokens: {err}")
+        return []
+async def tokens_decoder_kartoffel(token_gen, tokenizer):
+    """Stream PCM16 audio chunks decoded from an async stream of text pieces.
+
+    Each incoming piece is appended to the accumulated text, which is passed
+    to extract_kartoffel_tokens to obtain all audio codes seen so far. Codes
+    that have not been buffered yet are appended to a buffer, and the buffer
+    is decoded in chunks of 28 codes (4 groups of 7). Once the stream ends,
+    the remaining complete groups of 7 are decoded; a trailing partial group
+    is dropped.
+
+    Args:
+        token_gen: Async iterable yielding text pieces.
+        tokenizer: Tokenizer forwarded to extract_kartoffel_tokens.
+
+    Yields:
+        The bytes returned by convert_to_audio_kartoffel for each decoded
+        chunk; chunks that produce no bytes are skipped.
+    """
+    buffer = []
+    accumulated_text = ""
+    # Number of codes from the accumulated text already moved into the buffer
+    buffered_count = 0
+    chunk_size = 28  # 4 groups of 7 tokens
+    print("DEBUG KARTOFFEL: Starting token decoding")
+    async for token_text in token_gen:
+        accumulated_text += token_text
+        print(f"DEBUG KARTOFFEL: Accumulated text length: {len(accumulated_text)}")
+        # Extract the audio codes from the whole accumulated text
+        valid_codes = extract_kartoffel_tokens(accumulated_text, tokenizer)
+        if len(valid_codes) > buffered_count:
+            new_codes = valid_codes[buffered_count:]
+            buffer.extend(new_codes)
+            # Advance by what was buffered, not by what was decoded; otherwise
+            # codes still waiting in the buffer would be appended again on the
+            # next token and the audio would contain duplicated codes.
+            buffered_count = len(valid_codes)
+            print(f"DEBUG KARTOFFEL: Added {len(new_codes)} new codes. Buffer size: {len(buffer)}")
+            # Decode for as long as a full chunk is available
+            while len(buffer) >= chunk_size:
+                codes_to_process = buffer[:chunk_size]
+                buffer = buffer[chunk_size:]
+                print(f"DEBUG KARTOFFEL: Processing {len(codes_to_process)} codes")
+                # Generate audio
+                audio_tensor = redistribute_codes_kartoffel(codes_to_process)
+                audio_bytes = convert_to_audio_kartoffel(audio_tensor)
+                if audio_bytes:
+                    print(f"DEBUG KARTOFFEL: Generated {len(audio_bytes)} bytes of audio")
+                    yield audio_bytes
+                else:
+                    print("DEBUG KARTOFFEL: No audio bytes generated")
+    # Process the remaining codes
+    if len(buffer) >= 7:  # at least one complete group
+        final_count = (len(buffer) // 7) * 7
+        final_codes = buffer[:final_count]
+        print(f"DEBUG KARTOFFEL: Processing final {len(final_codes)} codes")
+        audio_tensor = redistribute_codes_kartoffel(final_codes)
+        audio_bytes = convert_to_audio_kartoffel(audio_tensor)
+        if audio_bytes:
+            print(f"DEBUG KARTOFFEL: Generated final {len(audio_bytes)} bytes of audio")
+            yield audio_bytes
+    print("DEBUG KARTOFFEL: Token decoding completed")
+def tokens_decoder_kartoffel_sync(syn_token_gen, tokenizer):
+    """Expose tokens_decoder_kartoffel as a regular (blocking) generator.
+
+    A worker thread runs the async decoder through ``asyncio.run`` and pushes
+    every audio chunk onto a queue. This generator pulls chunks off the queue
+    and yields them until the worker posts the ``None`` sentinel, then joins
+    the thread. Exceptions raised in the worker are printed with a traceback
+    and end the stream; they are not re-raised to the caller.
+    """
+    chunks = queue.Queue()
+    async def _as_async(sync_iter):
+        # Adapt the synchronous token iterator for use with ``async for``
+        for piece in sync_iter:
+            yield piece
+    async def _pump():
+        try:
+            async for pcm in tokens_decoder_kartoffel(_as_async(syn_token_gen), tokenizer):
+                chunks.put(pcm)
+        except Exception as err:
+            print(f"DEBUG KARTOFFEL: Error in async producer: {err}")
+            import traceback
+            traceback.print_exc()
+        finally:
+            chunks.put(None)  # sentinel: no more audio will follow
+    worker = threading.Thread(target=lambda: asyncio.run(_pump()))
+    worker.start()
+    item = chunks.get()
+    while item is not None:
+        yield item
+        item = chunks.get()
+    worker.join()