Spaces:
Running
on
Zero
Running
on
Zero
Alias whisper to whisperx
Browse files
- .gitignore +5 -1
- alias.py +116 -0
- app.py +4 -2
- pyproject.toml +1 -3
- requirements.txt +2 -1
- uv.lock +0 -0
.gitignore
CHANGED
|
@@ -1 +1,5 @@
|
|
| 1 |
-
.venv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv
|
| 2 |
+
__pycache__
|
| 3 |
+
.DS_Store
|
| 4 |
+
.gradio
|
| 5 |
+
venv
|
alias.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Alias module to redirect whisper imports to whisperx.

This allows OuteTTS to use whisperx instead of the standard whisper package.
Importing this module installs the alias automatically (see the call at the
bottom of the file).
"""

import sys
import importlib.util


def setup_whisper_alias():
    """Register a whisper-compatible facade over whisperx in sys.modules.

    After this runs successfully, ``import whisper`` yields an object whose
    ``load_model(name, **kwargs)`` returns a wrapper exposing the classic
    ``transcribe(audio, **kwargs)`` interface, backed by whisperx.

    Never raises: any failure is reported with a printed warning so the
    regular whisper package (if installed) remains usable.
    """
    try:
        # Bail out early if whisperx is not installed at all.
        whisperx_spec = importlib.util.find_spec("whisperx")
        if whisperx_spec is None:
            print("Warning: whisperx not found, falling back to regular whisper")
            return

        import whisperx

        class WhisperAlias:
            """Object installed as sys.modules['whisper'] to mimic the whisper module."""

            def __init__(self):
                # Kept for API parity; whisperx may not expose WhisperModel.
                self.model = whisperx.WhisperModel if hasattr(whisperx, 'WhisperModel') else None
                self.load_model = self._load_model

            def _load_model(self, name, **kwargs):
                """Load a whisperx model with a whisper-compatible signature.

                Args:
                    name: model name/size (e.g. "large-v3-turbo").
                    **kwargs: supports 'device' ("auto", "cuda", or "cpu";
                        default "auto"). Other whisper options are ignored.

                Returns:
                    WhisperXModelWrapper exposing ``transcribe``.
                """
                requested = kwargs.get("device", "auto")
                if requested == "auto":
                    # Fix: actually auto-detect CUDA for the default "auto"
                    # value instead of silently always falling back to CPU.
                    import torch  # whisperx depends on torch, so it is available here
                    device = "cuda" if torch.cuda.is_available() else "cpu"
                else:
                    device = "cuda" if requested == "cuda" else "cpu"
                # float16 needs CUDA; int8 keeps CPU inference memory-friendly.
                compute_type = "float16" if device == "cuda" else "int8"

                model = whisperx.load_model(
                    name,
                    device=device,
                    compute_type=compute_type
                )
                return WhisperXModelWrapper(model, device)

        class WhisperXModelWrapper:
            """Wrapper to make whisperx compatible with whisper interface."""

            def __init__(self, model, device):
                self.model = model
                self.device = device

            def transcribe(self, audio, **kwargs):
                """Transcribe audio with whisper-compatible interface.

                Args:
                    audio: file path (str) or a pre-loaded audio array.
                    **kwargs: supports 'word_timestamps' (bool, default False)
                        and 'batch_size' (int, default 16); other whisper
                        options are ignored.

                Returns:
                    dict with at least 'segments' and 'text' keys, matching
                    the shape whisper callers expect.
                """
                original_word_timestamps = kwargs.get('word_timestamps', False)

                # Accept both file paths and already-decoded audio arrays.
                if isinstance(audio, str):
                    audio_data = whisperx.load_audio(audio)
                else:
                    audio_data = audio

                batch_size = kwargs.get('batch_size', 16)
                result = self.model.transcribe(audio_data, batch_size=batch_size)

                # Word-level timestamps require whisperx's separate alignment pass.
                if original_word_timestamps and result.get("segments"):
                    try:
                        model_a, metadata = whisperx.load_align_model(
                            language_code=result.get("language", "en"),
                            device=self.device
                        )
                        result = whisperx.align(
                            result["segments"],
                            model_a,
                            metadata,
                            audio_data,
                            self.device,
                            return_char_alignments=False
                        )
                    except Exception as e:
                        # Best-effort: keep the unaligned transcription.
                        print(f"Warning: Could not perform alignment: {e}")

                # Normalize the result to the classic whisper shape.
                if "segments" not in result:
                    result["segments"] = []

                # Whisper callers expect a flat 'text' field; build it from segments.
                if "text" not in result:
                    result["text"] = " ".join([segment.get("text", "") for segment in result.get("segments", [])])

                # Guarantee a 'words' list per segment when word timestamps were requested.
                for segment in result.get("segments", []):
                    if original_word_timestamps and "words" not in segment:
                        segment["words"] = []

                return result

        # Install the facade so 'import whisper' resolves to it.
        whisper_alias = WhisperAlias()
        sys.modules['whisper'] = whisper_alias

        print("✅ Successfully aliased whisper to whisperx")

    except ImportError as e:
        print(f"Warning: Could not setup whisper alias: {e}")
        print("Falling back to regular whisper (if available)")
    except Exception as e:
        print(f"Warning: Error setting up whisper alias: {e}")


# Auto-setup when module is imported
setup_whisper_alias()
|
app.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
import outetts
|
| 3 |
import json
|
| 4 |
import tempfile
|
|
@@ -14,7 +16,7 @@ def initialize_interface(model_name: str):
|
|
| 14 |
config = outetts.ModelConfig.auto_config(
|
| 15 |
model=model,
|
| 16 |
backend=outetts.Backend.LLAMACPP,
|
| 17 |
-
quantization=outetts.LlamaCppQuantization.
|
| 18 |
)
|
| 19 |
|
| 20 |
# Initialize the interface
|
|
@@ -30,7 +32,7 @@ def create_speaker_and_generate(model_name, audio_file, test_text="", temperatur
|
|
| 30 |
interface = initialize_interface(model_name)
|
| 31 |
|
| 32 |
# Create speaker profile from audio
|
| 33 |
-
speaker = interface.create_speaker(audio_file)
|
| 34 |
|
| 35 |
# Convert speaker dict to formatted JSON
|
| 36 |
speaker_json = json.dumps(speaker, indent=2, ensure_ascii=False)
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
# Import alias module before outetts to setup whisper redirection
|
| 3 |
+
import alias
|
| 4 |
import outetts
|
| 5 |
import json
|
| 6 |
import tempfile
|
|
|
|
| 16 |
config = outetts.ModelConfig.auto_config(
|
| 17 |
model=model,
|
| 18 |
backend=outetts.Backend.LLAMACPP,
|
| 19 |
+
quantization=outetts.LlamaCppQuantization.Q5_0,
|
| 20 |
)
|
| 21 |
|
| 22 |
# Initialize the interface
|
|
|
|
| 32 |
interface = initialize_interface(model_name)
|
| 33 |
|
| 34 |
# Create speaker profile from audio
|
| 35 |
+
speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo")
|
| 36 |
|
| 37 |
# Convert speaker dict to formatted JSON
|
| 38 |
speaker_json = json.dumps(speaker, indent=2, ensure_ascii=False)
|
pyproject.toml
CHANGED
|
@@ -8,7 +8,5 @@ dependencies = [
|
|
| 8 |
"gradio>=5.35.0",
|
| 9 |
"numba==0.61.2",
|
| 10 |
"outetts",
|
|
|
|
| 11 |
]
|
| 12 |
-
|
| 13 |
-
[tool.uv.sources]
|
| 14 |
-
outetts = { git = "https://github.com/edwko/OuteTTS.git" }
|
|
|
|
| 8 |
"gradio>=5.35.0",
|
| 9 |
"numba==0.61.2",
|
| 10 |
"outetts",
|
| 11 |
+
"whisperx>=3.4.2",
|
| 12 |
]
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 2 |
llama-cpp-python
|
|
|
|
| 3 |
numba==0.61.2
|
| 4 |
-
outetts
|
| 5 |
gradio
|
|
|
|
| 1 |
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 2 |
llama-cpp-python
|
| 3 |
+
whisperx
|
| 4 |
numba==0.61.2
|
| 5 |
+
outetts>=0.4.4
|
| 6 |
gradio
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|