import gradio as gr
import whisperx
import torch
import librosa
import logging
import os
import time

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("whisperx_app")

# Device setup (force CPU inference)
device = "cpu"
compute_type = "int8"  # int8 quantization keeps memory use and CPU latency down
torch.set_num_threads(os.cpu_count())  # let PyTorch use every available core

# Pre-load models
models = {
    "tiny": whisperx.load_model("tiny", device, compute_type=compute_type, vad_method='silero'),
    "base": whisperx.load_model("base", device, compute_type=compute_type, vad_method='silero'),
    "small": whisperx.load_model("small", device, compute_type=compute_type, vad_method='silero'),
    "large": whisperx.load_model("large", device, compute_type=compute_type, vad_method='silero'),
}
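# Note: keeping all four models resident trades RAM for responsiveness; if memory
# is tight, the selected model could instead be loaded lazily inside transcribe()
# and cached after first use.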

def transcribe(audio_file, model_size="base", debug=False):
    start_time = time.time()
    result = ""
    debug_log = []
    
    try:
        # Load audio as 16 kHz mono float32, the sample rate Whisper models expect
        audio, _ = librosa.load(audio_file, sr=16000)
        
        # Run inference; the smallest model tolerates a larger batch size on CPU
        model = models[model_size]
        batch_size = 8 if model_size == "tiny" else 4
        transcript = model.transcribe(audio, batch_size=batch_size)
        
        # Align the Whisper segments with a wav2vec2 model to get word-level timestamps
        model_a, metadata = whisperx.load_align_model(
            language_code=transcript["language"], device=device
        )
        transcript_aligned = whisperx.align(
            transcript["segments"], model_a, metadata, audio, device
        )
        
        # Format word-level output; words WhisperX cannot align (e.g. numerals)
        # may come back without timestamps, so guard against missing keys
        for segment in transcript_aligned["segments"]:
            for word in segment["words"]:
                if "start" in word and "end" in word:
                    result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
                else:
                    result += f"{word['word']}\n"
        
        debug_log.append(f"Processed in {time.time()-start_time:.2f}s")
        debug_log.append(f"Language detected: {transcript['language']}")
        debug_log.append(f"Batch size: {batch_size}")
        
    except Exception as e:
        logger.error("Error during transcription:", exc_info=True)
        result = "Error occurred during transcription"
        debug_log.append(f"ERROR: {str(e)}")
    
    # The click handler is wired to two outputs, so always return a pair;
    # the debug pane receives an empty string when debug mode is off.
    return result, ("\n".join(debug_log) if debug else "")

# Gradio Interface
with gr.Blocks(title="WhisperX CPU Transcription") as demo:
    gr.Markdown("# WhisperX CPU Transcription with Word-Level Timestamps")
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload", "microphone"],
                interactive=True,
            )
            model_selector = gr.Dropdown(
                choices=list(models.keys()),
                value="base",
                label="Model Size",
                interactive=True,
            )
            debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
            transcribe_btn = gr.Button("Transcribe", variant="primary")
            
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription Output",
                lines=20,
                placeholder="Transcription will appear here...",
            )
            debug_output = gr.Textbox(
                label="Debug Information",
                lines=10,
                placeholder="Debug logs will appear here...",
                visible=False,
            )
    
    # Toggle debug visibility
    def toggle_debug(debug_enabled):
        return gr.update(visible=debug_enabled)
    
    debug_checkbox.change(
        toggle_debug,
        inputs=[debug_checkbox],
        outputs=[debug_output]
    )
    
    # Process transcription
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox],
        outputs=[output_text, debug_output]
    )

# Launch configuration
if __name__ == "__main__":
    demo.queue(max_size=4).launch()
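
# When deploying in a container or a hosted Space, the launch call would typically
# bind to all interfaces explicitly, e.g.:
#     demo.queue(max_size=4).launch(server_name="0.0.0.0", server_port=7860)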