import gradio as gr
import whisperx
import torch
import librosa
import logging
import os
import time

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("whisperx_app")

# Device setup (force CPU)
device = "cpu"
torch.set_num_threads(os.cpu_count())

# Pre-load models; CTranslate2's default float16 compute type is not
# supported on CPU, so request int8 quantization explicitly
models = {
    "tiny": whisperx.load_model("tiny", device, compute_type="int8"),
    "base": whisperx.load_model("base", device, compute_type="int8"),
    "small": whisperx.load_model("small", device, compute_type="int8"),
}
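
# Preloading all three models keeps first requests fast at the cost of RAM.
# A minimal lazy-loading sketch, assuming memory is tight (the lru_cache
# approach below is an assumption, not part of the original app):
#
#   from functools import lru_cache
#
#   @lru_cache(maxsize=3)
#   def get_model(size):
#       return whisperx.load_model(size, device, compute_type="int8")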

def transcribe(audio_file, model_size="base", debug=False):
    start_time = time.time()
    result = ""
    debug_log = []
    
    try:
        # Load audio as 16 kHz mono, the sample rate Whisper expects
        audio, _ = librosa.load(audio_file, sr=16000)
        
        # Run inference
        model = models[model_size]
        batch_size = 8 if model_size == "tiny" else 4
        transcript = model.transcribe(audio, batch_size=batch_size)
        
        # Align whisper output
        model_a, metadata = whisperx.load_align_model(
            language_code=transcript["language"], device=device
        )
        transcript_aligned = whisperx.align(
            transcript["segments"], model_a, metadata, audio, device
        )
        
        # Format word-level output; skip tokens the aligner could not
        # assign timestamps to (e.g. digits and punctuation)
        for segment in transcript_aligned["segments"]:
            for word in segment["words"]:
                if "start" not in word or "end" not in word:
                    continue
                result += f"[{word['start']:5.2f}s-{word['end']:5.2f}s] {word['word']}\n"
        
        debug_log.append(f"Processed in {time.time()-start_time:.2f}s")
        debug_log.append(f"Language detected: {transcript['language']}")
        debug_log.append(f"Batch size: {batch_size}")
        
    except Exception as e:
        logger.error("Error during transcription:", exc_info=True)
        result = "Error occurred during transcription"
        debug_log.append(f"ERROR: {str(e)}")
    
    # The click handler wires two outputs, so always return two values;
    # a lone string here would raise a Gradio error when debug is off
    return result, "\n".join(debug_log) if debug else ""
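
# A quick smoke test of transcribe() outside the UI; "sample.wav" below is
# a placeholder path, not a file shipped with this script:
#
#   text, log = transcribe("sample.wav", model_size="tiny", debug=True)
#   print(text)
#   print(log)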

# Gradio Interface
with gr.Blocks(title="WhisperX CPU Transcription") as demo:
    gr.Markdown("# WhisperX CPU Transcription with Word-Level Timestamps")
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath",
                sources=["upload", "microphone"],
                interactive=True,
            )
            model_selector = gr.Dropdown(
                choices=["tiny", "base", "small"],
                value="base",
                label="Model Size",
                interactive=True,
            )
            debug_checkbox = gr.Checkbox(label="Enable Debug Mode", value=False)
            transcribe_btn = gr.Button("Transcribe", variant="primary")
            
        with gr.Column():
            output_text = gr.Textbox(
                label="Transcription Output",
                lines=20,
                placeholder="Transcription will appear here...",
            )
            debug_output = gr.Textbox(
                label="Debug Information",
                lines=10,
                placeholder="Debug logs will appear here...",
                visible=False,
            )
    
    # Toggle debug visibility
    def toggle_debug(debug_enabled):
        return gr.update(visible=debug_enabled)
    
    debug_checkbox.change(
        toggle_debug,
        inputs=[debug_checkbox],
        outputs=[debug_output]
    )
    
    # Process transcription
    transcribe_btn.click(
        transcribe,
        inputs=[audio_input, model_selector, debug_checkbox],
        outputs=[output_text, debug_output]
    )

# Launch configuration
if __name__ == "__main__":
    demo.queue(max_size=4).launch()
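
    # For container or remote deployment, Gradio's standard launch parameters
    # can expose the app beyond localhost; the values below are illustrative
    # assumptions, not part of the original configuration:
    #
    #   demo.queue(max_size=4).launch(server_name="0.0.0.0", server_port=7860)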