File size: 6,374 Bytes
d4b107b
 
 
 
 
 
 
8cb8264
592f7e1
 
b98f4ad
3d3ff49
 
c835efb
3d3ff49
d4b107b
dda0718
 
440d6b7
 
3d3ff49
440d6b7
a5bd743
440d6b7
dda0718
b98f4ad
 
dda0718
440d6b7
 
 
 
 
47661bd
dda0718
 
 
 
b98f4ad
440d6b7
 
d2774a4
b98f4ad
d2774a4
b98f4ad
 
592f7e1
b98f4ad
14c8f51
b98f4ad
 
14c8f51
 
b98f4ad
d2774a4
2d9e081
 
dda0718
d2774a4
 
dda0718
d2774a4
badcd8d
dda0718
55eafca
d4b107b
 
 
d2774a4
d4b107b
 
 
 
 
 
 
 
dda0718
d4b107b
 
440d6b7
2d9e081
dda0718
 
 
 
 
 
 
 
 
440d6b7
 
 
 
d2774a4
b98f4ad
 
d2774a4
dda0718
440d6b7
d2774a4
b98f4ad
 
d2774a4
440d6b7
 
 
 
dda0718
 
 
 
 
 
 
 
 
 
8ec53db
 
 
 
7735671
d2774a4
7735671
b98f4ad
7735671
 
 
 
 
 
b98f4ad
 
7735671
 
 
 
b98f4ad
7735671
 
8ec53db
b98f4ad
7735671
 
 
 
 
 
 
 
 
 
 
 
b98f4ad
7735671
 
8ec53db
b98f4ad
7735671
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8ec53db
dda0718
 
869e885
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
# Import-time side effect: fetch the punkt sentence tokenizer required by
# sent_tokenize below. No-op once cached, but runs on every cold start.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
from fpdf import FPDF
from PIL import Image
import time
import os
# import spaces


warnings.filterwarnings("ignore")

# Prefer GPU when available; fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): torch_dtype is assigned but never used — the pipeline call
# below passes torch.float32 directly. Consider passing this variable instead.
torch_dtype = torch.float32

# ASR pipeline
# Norwegian Whisper model; downloading/loading happens here at import time.
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch.float32)

# Convert an audio container (originally only .m4a) to a temporary WAV file.
def convert_to_wav(audio_file):
    """Convert *audio_file* to WAV and return the path of the converted file.

    The pydub container format is inferred from the file extension, so this
    also handles formats other than the original hard-coded ".m4a" case
    (falls back to "m4a" when the path has no extension).

    Args:
        audio_file: Path to the input audio file.

    Returns:
        Path of the temporary WAV file ("temp.wav", overwritten per call).
    """
    # e.g. "clip.m4a" -> "m4a"; empty extension falls back to "m4a".
    fmt = os.path.splitext(audio_file)[1].lstrip(".").lower() or "m4a"
    audio = AudioSegment.from_file(audio_file, format=fmt)
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

# @spaces.GPU(queue=True)


# Transcription funct.@ASR pipeline
def transcribe_audio(audio_file):
    """Transcribe *audio_file* with the Norwegian Whisper ASR pipeline.

    Returns:
        A ``(transcription, result)`` pair where *result* is a human-readable
        report containing the transcription, elapsed time, and word count.
    """
    # Normalise .m4a uploads to WAV before feeding the pipeline.
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    started = time.time()

    # Inference only — no gradients needed.
    with torch.no_grad():
        asr_out = pipe(
            audio_file,
            chunk_length_s=30,
            generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
        )

    text = asr_out["text"]
    elapsed = time.time() - started
    clean = text.strip()

    report = (
        f"Transcription: {clean}\n\n"
        f"Time taken: {elapsed:.2f} seconds\n"
        f"Number of words: {len(text.split())}"
    )

    return clean, report

# t5-base model@summary funct.
# NOTE(review): this t5-base tokenizer/model pair is loaded (and moved to the
# device) but never referenced again in this file — summarize_text() below
# uses TextRank instead. Confirm nothing external imports these names before
# removing the (costly) load.
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# t5-base to device
summarization_model.to(device)

# Graph-based summarization (TextRank, method)
def summarize_text(text, top_n=3):
    """Produce an extractive summary of *text* via TextRank.

    Sentences are scored with PageRank over a TF-IDF cosine-similarity
    graph; the *top_n* highest-scoring sentences (in score order) form the
    summary.

    Args:
        text: Input text to summarize.
        top_n: Number of sentences to keep (default 3, the value that was
            previously hard-coded).

    Returns:
        The summary string; "" when *text* has no sentences.
    """
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    # Too few sentences to rank meaningfully — return them all, preserving
    # the original reading order, and skip the TF-IDF/PageRank work.
    if len(sentences) <= top_n:
        return " ".join(sentences)

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # PageRank over the sentence-similarity graph — the TextRank scoring step.
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return " ".join(s for _, s in ranked_sentences[:top_n])


# HTML syntax for imagery
# Banner image rendered at the top of the Gradio UI; served straight from the
# Hugging Face Space repository.
image_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
</div>
"""

# Transcription and summary@PDF option(s)
def save_to_pdf(transcription, summary):
    """Write *transcription* and/or *summary* into a single-page PDF.

    Either argument may be an empty string, in which case its section is
    omitted. Returns the path of the generated PDF (fixed name, overwritten
    on every call).

    NOTE(review): FPDF's core "Arial" font is Latin-1 only — text containing
    characters outside Latin-1 will fail on output; register a Unicode font
    if that becomes an issue.
    """
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    if transcription:
        pdf.multi_cell(0, 10, "Transcription:\n" + transcription)

    # Paragraph gap between the two sections. Previously emitted
    # unconditionally, which left a stray leading gap in summary-only PDFs.
    if transcription and summary:
        pdf.ln(10)

    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)

    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path

# Gradio
# NOTE: an earlier revision constructed a gr.Interface here and then
# immediately overwrote the variable with gr.Blocks(), so the Interface was
# dead code (never launched, its outputs contract never exercised). Only the
# Blocks app assembled below is used.
iface = gr.Blocks()

# Assemble the Blocks UI: three tabs sharing the transcription/summary state.
with iface:
    gr.HTML(image_html)
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")

    with gr.Tabs():
        # Transcription Tab
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")

            # Thin wrapper so the click handler maps onto the two textboxes.
            def transcribe(audio_file):
                transcription, result = transcribe_audio(audio_file)
                return transcription, result

            transcribe_button.click(
                fn=transcribe,
                inputs=[audio_input],
                outputs=[transcription_output, result_output]
            )

        # Summary Tab
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
            summarize_button = gr.Button("Summarize")

            # Summarize the current transcription textbox; warn when empty.
            def summarize(transcription):
                if not transcription:
                    return "Warning: a transcription must be available."
                summary = summarize_text(transcription)
                return summary

            summarize_button.click(
                fn=summarize,
                inputs=[transcription_output],
                outputs=summary_output
            )

        # PDF Download Tab
        with gr.TabItem("Download PDF"):
            pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")

            pdf_output_transcription_only = gr.File(label="Download PDF")
            pdf_output_summary_only = gr.File(label="Download PDF")
            pdf_output_both = gr.File(label="Download PDF")

            # All three handlers funnel into save_to_pdf, which writes the
            # same fixed output path each time.
            def generate_pdf_transcription_only(transcription):
                return save_to_pdf(transcription, "")

            def generate_pdf_summary_only(summary):
                return save_to_pdf("", summary)

            def generate_pdf_both(transcription, summary):
                return save_to_pdf(transcription, summary)

            pdf_transcription_only.click(
                fn=generate_pdf_transcription_only,
                inputs=[transcription_output],
                outputs=[pdf_output_transcription_only]
            )

            pdf_summary_only.click(
                fn=generate_pdf_summary_only,
                inputs=[summary_output],
                outputs=[pdf_output_summary_only]
            )

            pdf_both.click(
                fn=generate_pdf_both,
                inputs=[transcription_output, summary_output],
                outputs=[pdf_output_both]
            )


# run
# share=True requests a public tunnel link; debug=True streams server logs.
iface.launch(share=True, debug=True)