File size: 2,227 Bytes
ff83bcc
 
bbaed16
ff83bcc
 
 
b70bc19
 
ff83bcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e844023
 
ff83bcc
 
bbaed16
 
ff83bcc
 
 
 
 
bbaed16
ff83bcc
 
 
 
bbaed16
 
ff83bcc
 
 
 
 
bbaed16
 
 
 
 
 
14351e5
bbaed16
 
2acd67b
ff83bcc
bbaed16
ff83bcc
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# app.py
# =============
# Automatic Speech Recognition (ASR) demo built on the
# openai/whisper-large-v3-turbo model, served through a Gradio web UI.
# Inference is deliberately pinned to the CPU so no GPU/video memory is used.

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr

# Force CPU inference; float32 is the safe dtype on CPU (half precision is
# generally unsupported or slow for CPU inference).
device = "cpu"
torch_dtype = torch.float32

# Hugging Face Hub identifier of the checkpoint to download/load.
model_id = "openai/whisper-large-v3-turbo"

# Load the seq2seq speech model (downloads weights on first run).
# low_cpu_mem_usage=True reduces peak RAM while loading weights;
# use_safetensors=True prefers the safetensors serialization format.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer (text side) and the feature
# extractor (audio side) for this checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

# Assemble the high-level ASR pipeline consumed by the UI callback.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    chunk_length_s=30,  # Process audio in 30-second chunks (long-form audio support)
    return_timestamps=True  # Enable timestamp prediction for long-form generation
)

# Define the transcription function
def transcribe_audio(audio_file, language):
    """
    Transcribe the given audio file using the Whisper model.

    Parameters:
    audio_file (str or None): Path to the audio file. Gradio's Audio
        component passes None when the user submits without uploading.
    language (str): Language code for transcription (e.g. "en").

    Returns:
    str: Transcribed text, or a user-facing message when no audio
        was provided.
    """
    # Guard: without this, pipe(None) raises an opaque error deep inside
    # the pipeline when the user clicks Submit with no file uploaded.
    if not audio_file:
        return "No audio file provided. Please upload an audio file."
    # Pass the selected language to Whisper's generate() so it skips
    # automatic language detection.
    generate_kwargs = {"language": language}
    result = pipe(audio_file, generate_kwargs=generate_kwargs)
    return result["text"]

# --- Gradio UI --------------------------------------------------------------
# Input widgets: an audio uploader (passed to the callback as a file path)
# and a dropdown selecting the transcription language.
audio_input = gr.Audio(label="Upload Audio", type="filepath")
language_input = gr.Dropdown(
    label="Select Language",
    choices=["en", "ru", "es", "fr", "de", "zh", "ja", "ko", "pt", "it"],
    value="en",
    info="Select the language for transcription.",
)

# Wire the widgets to the transcription callback.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=[audio_input, language_input],
    outputs=gr.Textbox(label="Transcription"),
    title="Whisper ASR Demo",
    description=(
        "Upload an audio file and select the language to get the transcribed"
        " text using the openai/whisper-large-v3-turbo model."
    ),
)

# Start the web server only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()