File size: 4,240 Bytes
32b6530
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import torch
import gradio as gr
import time  
import numpy as np
import scipy.io.wavfile
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# ✅ 1️⃣ Force Model to Run on CPU
MODEL_NAME = "openai/whisper-tiny"  # smallest Whisper checkpoint, chosen for speed
device = "cpu"
torch_dtype = torch.float32  # CPU-friendly float type

# ✅ 2️⃣ Load Whisper Tiny Model on CPU
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# ✅ 3️⃣ Load Processor & Pipeline
# The processor supplies both the tokenizer and the feature extractor
# that the pipeline below is wired up with.
processor = AutoProcessor.from_pretrained(MODEL_NAME)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=2,  # process audio in 2-second chunks to keep latency low
    torch_dtype=torch_dtype,
    device=device,
)

# ✅ 4️⃣ Real-Time Streaming Transcription (Microphone)
def stream_transcribe(stream, new_chunk):
    """Append a microphone chunk to the running audio and re-transcribe it.

    Args:
        stream: Float32 samples accumulated by earlier calls, or ``None`` on
            the first call.
        new_chunk: ``(sample_rate, samples)`` pair where ``samples`` is a
            NumPy array; arrays with more than one dimension are down-mixed
            by averaging over axis 1.

    Returns:
        A ``(stream, text, latency)`` tuple: the updated sample buffer, the
        transcription of the whole buffer, and the elapsed time formatted as
        ``"<seconds> sec"``. If anything raises, the buffer as it stood when
        the error occurred, the error message and the string ``"Error"`` are
        returned instead.
    """
    start_time = time.time()
    try:
        sr, y = new_chunk

        # Convert multi-channel audio to mono.
        if y.ndim > 1:
            y = y.mean(axis=1)

        y = y.astype(np.float32)

        # Peak-normalise the chunk. An empty or all-zero (silent) chunk has no
        # usable peak: dividing by it would raise or fill the buffer with NaN,
        # which would then corrupt every later transcription of the stream.
        peak = np.max(np.abs(y)) if y.size else 0.0
        if peak > 0:
            y /= peak

        # Append to the accumulated stream.
        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y

        # Transcribe everything recorded so far, not just the new chunk.
        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        latency = time.time() - start_time

        return stream, transcription, f"{latency:.2f} sec"

    except Exception as e:
        # UI boundary: report the failure in the textbox instead of crashing
        # the streaming event.
        print(f"Error: {e}")
        return stream, str(e), "Error"

# ✅ 5️⃣ Transcription for File Upload
def transcribe(inputs, previous_transcription):
    """Transcribe an uploaded clip and append the text to the transcript.

    Args:
        inputs: ``(sample_rate, samples)`` pair where ``samples`` is a NumPy
            array, or ``None`` when no audio has been provided.
        previous_transcription: Text already shown in the output box; the new
            transcription is appended to it.

    Returns:
        A ``(transcript, latency)`` tuple. ``latency`` is the elapsed time
        formatted as ``"<seconds> sec"``, ``"No audio provided"`` when
        ``inputs`` is ``None``, or ``"Error"`` if transcription raised (the
        transcript is then returned unchanged).
    """
    start_time = time.time()

    # Nothing to transcribe: say so explicitly rather than failing on the
    # tuple unpacking below and reporting a generic error.
    if inputs is None:
        return previous_transcription, "No audio provided"

    try:
        sample_rate, audio_data = inputs

        # The speech-recognition pipeline expects single-channel float audio,
        # so down-mix multi-channel clips and convert integer PCM to float32.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        audio_data = audio_data.astype(np.float32)

        # Peak-normalise; skip empty or silent clips to avoid dividing by zero.
        peak = np.max(np.abs(audio_data)) if audio_data.size else 0.0
        if peak > 0:
            audio_data /= peak

        transcription = pipe({"sampling_rate": sample_rate, "raw": audio_data})["text"]

        previous_transcription += transcription
        latency = time.time() - start_time

        return previous_transcription, f"{latency:.2f} sec"

    except Exception as e:
        # UI boundary: keep the existing transcript and flag the failure.
        print(f"Error: {e}")
        return previous_transcription, "Error"

# ✅ 6️⃣ Clear Function
def clear():
    """Return an empty string, used to blank a transcription textbox."""
    cleared_text = ""
    return cleared_text

# ✅ 7️⃣ Gradio Interface (Microphone Streaming)
with gr.Blocks() as microphone:
    gr.Markdown("# Whisper Tiny - Real-Time Transcription (CPU) 🎙️")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for ultra-fast speech-to-text.")

    with gr.Row():
        input_audio_microphone = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
        output = gr.Textbox(label="Live Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        clear_button = gr.Button("Clear Output")

    # Holds the audio accumulated so far; stream_transcribe receives it as
    # its first argument and returns the extended buffer.
    state = gr.State()
    input_audio_microphone.stream(
        stream_transcribe,
        [state, input_audio_microphone],
        [state, output, latency_textbox],
        time_limit=30,
        stream_every=1,
    )

    # Reset the audio buffer together with the textbox. Clearing only the
    # text would let the next chunk re-transcribe the whole old recording
    # and bring the "cleared" text straight back.
    clear_button.click(lambda: (None, ""), outputs=[state, output])

# ✅ 8️⃣ Gradio Interface (File Upload)
with gr.Blocks() as file:
    # Heading emoji restored from its mis-encoded form; plain string because
    # it has no placeholders.
    gr.Markdown("# Upload Audio File for Transcription 🎵")
    gr.Markdown(f"Using [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) for speech-to-text.")

    with gr.Row():
        input_audio = gr.Audio(sources=["upload"], type="numpy")
        output = gr.Textbox(label="Transcription", value="")
        latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0")

    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_button = gr.Button("Clear Output")

    # The current textbox content is passed in as the previous transcription,
    # so each submission appends to what is already shown.
    submit_button.click(transcribe, [input_audio, output], [output, latency_textbox])
    clear_button.click(clear, outputs=[output])

# ✅ 9️⃣ Final Gradio App (Supports Microphone & File Upload)
# Top-level app: one tab per sub-interface, in the order listed.
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface(
        interface_list=[microphone, file],
        tab_names=["Microphone", "Upload Audio"],
    )

# ✅ 1️⃣0️⃣ Run Gradio Locally
# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()