Tomtom84 committed
Commit 90d77aa · verified · 1 Parent(s): 3fee54e

Update orpheus-tts/engine_class.py

Files changed (1)
  1. orpheus-tts/engine_class.py +24 -9
orpheus-tts/engine_class.py CHANGED
@@ -100,7 +100,8 @@ class OrpheusModel:
         input_ids = self.tokenizer(full_prompt, return_tensors="pt").input_ids
         modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
 
-        prompt_string = self.tokenizer.decode(modified_input_ids[0])
+        # Decode back to string for vLLM
+        prompt_string = self.tokenizer.decode(modified_input_ids[0], skip_special_tokens=False)
         return prompt_string
 
 
@@ -122,14 +123,28 @@ class OrpheusModel:
 
         async def async_producer():
             nonlocal token_count
-            async for result in self.engine.generate(prompt=prompt_string, sampling_params=sampling_params, request_id=request_id):
-                # Place each token text into the queue.
-                token_text = result.outputs[0].text
-                print(f"DEBUG: Generated token {token_count}: {repr(token_text)}")
-                token_queue.put(token_text)
-                token_count += 1
-            print(f"DEBUG: Generation completed. Total tokens: {token_count}")
-            token_queue.put(None)  # Sentinel to indicate completion.
+            print(f"DEBUG: Starting vLLM generation with prompt: {repr(prompt_string[:100])}...")
+            print(f"DEBUG: Sampling params: temp={sampling_params.temperature}, top_p={sampling_params.top_p}, max_tokens={sampling_params.max_tokens}")
+
+            try:
+                async for result in self.engine.generate(prompt=prompt_string, sampling_params=sampling_params, request_id=request_id):
+                    # Place each token text into the queue.
+                    token_text = result.outputs[0].text
+                    print(f"DEBUG: Generated token {token_count}: {repr(token_text)}")
+                    token_queue.put(token_text)
+                    token_count += 1
+
+                    # Show progress every 10 tokens
+                    if token_count % 10 == 0:
+                        print(f"DEBUG: Generated {token_count} tokens so far...")
+
+                print(f"DEBUG: Generation completed. Total tokens: {token_count}")
+            except Exception as e:
+                print(f"DEBUG: Error during generation: {e}")
+                import traceback
+                traceback.print_exc()
+            finally:
+                token_queue.put(None)  # Sentinel to indicate completion.
 
         def run_async():
             asyncio.run(async_producer())
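
Note on the first hunk (not part of the commit): a minimal sketch of the ids-to-string round trip, using GPT-2 as a stand-in tokenizer and its <|endoftext|> id in place of the Orpheus start/end marker tokens. With skip_special_tokens=False the manually concatenated markers survive in the decoded prompt string; with True they would be stripped before the prompt reaches vLLM.

import torch
from transformers import AutoTokenizer

# Stand-in tokenizer and marker ids; Orpheus uses its own tokenizer and audio markers.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

input_ids = tokenizer("Hello there", return_tensors="pt").input_ids
start_token = torch.tensor([[tokenizer.bos_token_id]], dtype=torch.int64)
end_tokens = torch.tensor([[tokenizer.eos_token_id]], dtype=torch.int64)
modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)

# Keeps the <|endoftext|> markers in the decoded prompt
print(repr(tokenizer.decode(modified_input_ids[0], skip_special_tokens=False)))
# Strips them, yielding a different prompt string
print(repr(tokenizer.decode(modified_input_ids[0], skip_special_tokens=True)))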
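
Note on the second hunk: wrapping the producer in try/except/finally ensures the None sentinel is always enqueued, even if vLLM raises, so the consumer cannot hang. Below is a minimal, self-contained sketch of that queue-plus-sentinel pattern with a fake async generator standing in for self.engine.generate; the consumer loop is illustrative, not the file's exact code.

import asyncio
import queue
import threading

def stream_tokens():
    token_queue = queue.Queue()

    async def fake_engine():
        # Stand-in for the vLLM async generator used in the commit.
        for text in ["hel", "lo ", "wor", "ld"]:
            await asyncio.sleep(0.01)
            yield text

    async def async_producer():
        try:
            async for token_text in fake_engine():
                token_queue.put(token_text)
        finally:
            token_queue.put(None)  # sentinel: generation finished (or failed)

    def run_async():
        asyncio.run(async_producer())

    # Run the asyncio producer in a background thread and drain the queue
    # synchronously until the sentinel arrives.
    thread = threading.Thread(target=run_async)
    thread.start()
    while True:
        token = token_queue.get()
        if token is None:
            break
        yield token
    thread.join()

if __name__ == "__main__":
    print(list(stream_tokens()))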