"""Whisper Konkani: a demo for Konkani speech recognition using a fine-tuned Whisper-small model."""
import os

import gradio as gr
from transformers import WhisperTokenizer, pipeline

# Tokenizer from the base Whisper checkpoint; the fine-tuned model below reuses it.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

# Fine-tuned Whisper-small model for Konkani (gom) speech recognition.
# Alternative checkpoints: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom".
pipe = pipeline(
    task="automatic-speech-recognition",
    model="thak123/gom-stt-v3",
    tokenizer=tokenizer,
)
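
# Optional sanity check (a sketch, not part of the original app): the pipeline's
# feature extractor reports the sampling rate it expects, 16 kHz for Whisper.
# print(pipe.feature_extractor.sampling_rate)  # -> 16000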

# Optional: pin the decoder prompt instead of letting Whisper auto-detect the
# language. Whisper has no Konkani language token, so Marathi is the closest fit:
# pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(
#     language="marathi", task="transcribe"
# )
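
# Reworked from the commented-out resampling lines that originally sat inside
# transcribe_speech(). It is optional: the pipeline's feature extractor already
# resamples inputs to 16 kHz, so call this only if you want to do it explicitly.
def resample_to_16k(filepath):
    """Rewrite the audio file in place at Whisper's expected 16 kHz rate."""
    import torchaudio

    waveform, sample_rate = torchaudio.load(filepath)
    if sample_rate != 16_000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16_000)
        torchaudio.save(filepath, resampler(waveform), 16_000)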

def transcribe_speech(filepath):
    """Transcribe an audio file; the pipeline resamples internally (see helper above)."""
    output = pipe(
        filepath,
        max_new_tokens=128,  # was 3, which cut transcripts off after a few tokens
        generate_kwargs={
            "task": "transcribe",
            # "language": "marathi",  # Whisper has no "konkani" code; uncomment to pin the language
        },
        chunk_length_s=30,  # split long recordings into 30-second windows
        batch_size=8,       # decode up to 8 chunks in parallel
    )
    print(output)
    return output["text"]
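
# A hedged sketch (not in the original app): the same pipeline can also return
# chunk-level timestamps via the standard `return_timestamps` argument of the
# ASR pipeline, useful when inspecting long-form transcriptions.
# out = pipe("audio/ekdonteen.flac", chunk_length_s=30, return_timestamps=True)
# for chunk in out["chunks"]:
#     print(chunk["timestamp"], chunk["text"])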


demo = gr.Blocks()

mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Textbox(),
)

file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(),
    examples=[
        [os.path.join("audio", "chalyaami.mp3")],
        [os.path.join("audio", "ekdonteen.flac")],
        [os.path.join("audio", "heyatachadjaale.mp3")],
        [os.path.join("audio", "panaji1920-9.mp3")],
    ],
)
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
