import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # sentence tokenizer model used by sent_tokenize


import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces

warnings.filterwarnings("ignore")

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
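# NOTE: read for gated/private checkpoints; pass token=HF_AUTH_TOKEN to from_pretrained below if required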


# Norwegian Whisper checkpoint for speech recognition
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#torch_dtype = torch.float32
model.to(device)


# Abstractive summarization model (T5) -- unused: summarize_text below is
# extractive (TextRank). Commented out to avoid loading an extra model.
# summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
# summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# summarization_model.to(device)

def convert_to_wav(audio_file):
    # Decode the .m4a upload and resample to 16 kHz mono, the rate Whisper expects
    audio = AudioSegment.from_file(audio_file, format="m4a")
    audio = audio.set_frame_rate(16000).set_channels(1)
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file


# Configure tokenizer to have distinct pad_token_id and eos_token_id
if processor.tokenizer.pad_token_id is None:
    # Setting pad_token_id explicitly to ensure distinction from eos_token_id
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + 1

    # Sanity check to confirm distinct pad and eos tokens
    assert processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id, \
        "pad_token_id and eos_token_id must be distinct!"

# Main transcription function
# @spaces.GPU(duration=120, queue=False)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    
    # Read audio; assumed 16 kHz mono (m4a uploads are resampled in convert_to_wav)
    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)  # downmix stereo to mono
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
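        # Feature-extract the batch: each chunk is padded to 30 s and converted to log-mel features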
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder caps out at 448 token positions
                num_beams=8,
                task="transcribe",
                attention_mask=attention_mask,
                language="no",
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )
        
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    
    return transcription.strip(), result
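
# Example (hypothetical file): transcribe_audio("upload.m4a", batch_size=8)
# returns ("transcribed text", "Transcription: ...\n\nTime taken: ...\nNumber of words: ...")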




# Graph-based summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
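    # TextRank: sentences are graph nodes, TF-IDF cosine similarities are edge
    # weights, and PageRank scores each sentence's centrality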

    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Select top N sentences (e.g., 3 sentences for the summary)
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
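
# Example (hypothetical input): summarize_text("A long transcript. ...") returns the
# three top-ranked sentences joined into one string (rank order, not document order).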

# HTML syntax for imagery
image_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
</div>
"""

def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    # include transcription (skipped when empty, e.g. for summary-only PDFs)
    if transcription:
        pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
        pdf.ln(10)  # paragraph space

    # include summary (skipped when empty, e.g. for transcription-only PDFs)
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)

    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
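
# Note: FPDF's built-in Arial font covers latin-1 only; Norwegian æ/ø/å render fine,
# but characters outside latin-1 would need a registered Unicode TTF font.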

# Gradio UI
iface = gr.Blocks()
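# Three tabs; the transcription textbox feeds both the Summary and Download PDF tabs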

with iface:
    gr.HTML(image_html)
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
    
    with gr.Tabs():
        
        # First Tab: Transcription
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            batch_size_input = gr.Slider(minimum=7, maximum=16, step=1, label="Batch Size")
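            # Larger batches are faster on long files but need more GPU memory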
            transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")

            def transcribe(audio_file, batch_size):
                transcription, result = transcribe_audio(audio_file, batch_size)
                return transcription, result

            transcribe_button.click(
                fn=transcribe,
                inputs=[audio_input, batch_size_input],
                outputs=[transcription_output, result_output]
            )

        # Second Tab: Summary
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
            summarize_button = gr.Button("Summarize")

            def summarize(transcription):
                if not transcription:
                    return "Warning: no transcription available. Run the Transcription tab first."
                summary = summarize_text(transcription)
                return summary

            summarize_button.click(
                fn=summarize,
                inputs=[transcription_output],  # Use the transcription from the first tab
                outputs=summary_output
            )

        # Third Tab: PDF Download Options
        with gr.TabItem("Download PDF"):
            pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")

            pdf_output_transcription_only = gr.File(label="Download PDF")
            pdf_output_summary_only = gr.File(label="Download PDF")
            pdf_output_both = gr.File(label="Download PDF")

            def generate_pdf_transcription_only(transcription):
                return save_to_pdf(transcription, "")

            def generate_pdf_summary_only(summary):
                return save_to_pdf("", summary)

            def generate_pdf_both(transcription, summary):
                return save_to_pdf(transcription, summary)

            pdf_transcription_only.click(
                fn=generate_pdf_transcription_only,
                inputs=[transcription_output],
                outputs=[pdf_output_transcription_only]
            )

            pdf_summary_only.click(
                fn=generate_pdf_summary_only,
                inputs=[summary_output],
                outputs=[pdf_output_summary_only]
            )

            pdf_both.click(
                fn=generate_pdf_both,
                inputs=[transcription_output, summary_output],
                outputs=[pdf_output_both]
            )


# run
iface.launch(share=True, debug=True)