File size: 8,984 Bytes
0ca8cef
 
 
 
 
 
 
3d3ff49
d4b107b
9e722fb
8c6ad91
 
de0e923
dd58d2e
0201e30
8c6ad91
 
 
 
 
 
 
 
 
 
 
 
 
d353554
8c6ad91
 
d353554
7ec9f42
8c6ad91
 
7ec9f42
 
3d3ff49
8c6ad91
440d6b7
 
 
 
 
47661bd
8c6ad91
a10f544
329c8dd
0ca8cef
 
 
 
 
 
 
 
 
 
 
 
 
 
ca866cd
9769005
440d6b7
 
b3d3679
b98f4ad
d2774a4
0ca8cef
 
f4108af
0ca8cef
 
f4108af
0ca8cef
3698f30
b3d3679
0ca8cef
3698f30
 
0ca8cef
b3d3679
3698f30
8c6ad91
3698f30
0ac786e
f4108af
0ca8cef
8c6ad91
9e722fb
8c6ad91
 
 
9e722fb
 
 
7ec9f42
8c6ad91
 
 
badcd8d
8c6ad91
55eafca
9e722fb
8c6ad91
9e722fb
 
8c6ad91
d2774a4
8c6ad91
9e722fb
 
 
 
 
 
 
 
 
8c6ad91
9e722fb
8c6ad91
9e722fb
 
8c6ad91
9e722fb
 
 
 
d262ec1
9e722fb
 
8c6ad91
d4b107b
8c6ad91
f7e87b9
9e722fb
 
 
8c6ad91
9e722fb
 
 
 
f7e87b9
8c6ad91
 
d4b107b
f7e87b9
9e722fb
8c6ad91
f7e87b9
8c6ad91
9e722fb
 
 
 
f7e87b9
9e722fb
 
 
 
2d9e081
8c6ad91
 
 
 
9e722fb
8c6ad91
0ac786e
440d6b7
8c6ad91
440d6b7
d2774a4
0ac786e
8c6ad91
d2774a4
8c6ad91
d2774a4
b98f4ad
 
d2774a4
9e722fb
440d6b7
 
 
8c6ad91
8ec53db
 
 
8c6ad91
 
b3d3679
8c6ad91
 
2920f00
d2774a4
7735671
 
 
8c6ad91
7735671
 
 
8c6ad91
8ec53db
8c6ad91
9e722fb
7735671
 
8c6ad91
8ec53db
8c6ad91
9e722fb
 
 
8c6ad91
9e722fb
8c6ad91
9e722fb
 
 
8c6ad91
9e722fb
7735671
8c6ad91
7735671
 
 
8c6ad91
7735671
8c6ad91
 
 
8ec53db
9e722fb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""

This application processes audio files, transcribes them using a pretrained model (Whisper), and provides multiple summarization options for the transcribed text. The application also includes a PDF generation feature and is built with Gradio for the user interface.

Webapp, transkribering (norsk), NbAiLab/nb-whisper-large, oppsummering, pdf-download.
"""

import time
import os
import warnings
from pydub import AudioSegment
import torch
from torch import torchaudio
from transformers import pipeline, WhisperForConditionalGeneration, WhisperProcessor
from huggingface_hub import model_info
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
from fpdf import FPDF
from PIL import Image

# Silence every Python warning for the whole process (blanket "ignore" filter).
warnings.filterwarnings("ignore")

# Fetch the NLTK resources used below: the "punkt" tokenizer models and the
# "stopwords" corpus. The downloads must run before stopwords.words() is called.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Module-level Norwegian stop-word set. Several functions further down build
# their own local set of the same name instead of reading this global.
stop_words = set(stopwords.words('norwegian'))


# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    """Decode an m4a file and re-export it as WAV.

    Args:
        audio_file: Path of the m4a file to read.

    Returns:
        The path of the written WAV file. This is always the fixed name
        ``temp.wav`` (relative to the current working directory), so each
        call overwrites the previous conversion.
    """
    wav_file = "temp.wav"
    AudioSegment.from_file(audio_file, format="m4a").export(wav_file, format="wav")
    return wav_file

# Device selection happens below, next to the pipeline that consumes it.
# (An earlier string-based variant, "cuda"/"cpu", was disabled here.)

# Speech-recognition model and target language for the transcription pipeline.
MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"

# Index 0 when CUDA is available, otherwise the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
# Built once at import time and shared by every transcribe_audio() call.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Pin the decoder prompt so generation transcribes in `lang`.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")


def transcribe_audio(audio_file):
    """Transcribe an audio file and report timing and word count.

    Args:
        audio_file: Path to the audio file (the Gradio audio component is
            configured with ``type="filepath"``). Files ending in ``.m4a``
            (any letter case) are converted to WAV before transcription.

    Returns:
        A ``(text, result)`` tuple. ``text`` is the transcription and
        ``result`` is a two-line status string with the elapsed seconds and
        the number of whitespace-separated words. When no file is supplied
        the tuple is ``("", "No audio file provided.")``.
    """
    # The click handler can fire without any audio loaded; in that case the
    # value is None/empty and calling .endswith() on it would crash.
    if not audio_file:
        return "", "No audio file provided."

    # Compare case-insensitively so uploads named "*.M4A" are converted too.
    if audio_file.lower().endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    # Only the model call is timed; the optional conversion above is excluded.
    start_time = time.perf_counter()
    text = pipe(audio_file)["text"]
    output_time = time.perf_counter() - start_time

    result = f"Time taken: {output_time:.2f} seconds\nNumber of words: {len(text.split())}"

    return text, result


# Clean and preprocess text for summarization
def clean_text(text):
    """Strip URLs and punctuation from *text* and collapse whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The text with http/https URLs removed, every character that is
        neither a word character nor whitespace removed, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace trimmed.
    """
    # Remove only the URL token itself. A greedy ``.*`` here would also
    # delete every word that follows the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

def preprocess_text(text):
    """Drop Norwegian stop words from *text*.

    The text is split with NLTK's word tokenizer (Norwegian models), every
    token whose lower-cased form is a Norwegian stop word is discarded, and
    the remaining tokens are joined back together with single spaces.
    """
    tokens = word_tokenize(text, language="norwegian")
    norwegian_stop_words = set(stopwords.words('norwegian'))
    kept = []
    for token in tokens:
        if token.lower() in norwegian_stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Summarize text using the T5 model
def summarize_text(text):
    """Generate an abstractive summary of *text* with a seq2seq model.

    The text is first passed through ``preprocess_text``, tokenized
    (truncated to 1024 tokens), moved to ``device`` and summarized with
    beam search (5 beams, at most 150 generated tokens).

    NOTE(review): ``summarization_tokenizer`` and ``summarization_model`` are
    not defined in the visible part of this file; they must exist at module
    level before this function is called, otherwise it raises NameError.

    Returns:
        The decoded summary string, with special tokens skipped.
    """
    encoded = summarization_tokenizer(
        preprocess_text(text),
        max_length=1024,
        return_tensors="pt",
        truncation=True,
    )
    encoded = encoded.to(device)
    generated_ids = summarization_model.generate(
        encoded.input_ids,
        num_beams=5,
        max_length=150,
        early_stopping=True,
    )
    best_sequence = generated_ids[0]
    return summarization_tokenizer.decode(best_sequence, skip_special_tokens=True)

# Build similarity matrix for graph-based summary
def build_similarity_matrix(sentences, stop_words):
    """Build an undirected sentence graph weighted by shared-token counts.

    Args:
        sentences: A sequence of token lists, one per sentence.
        stop_words: Unused; kept so existing callers keep working. Callers
            are expected to have filtered stop words out of ``sentences``
            already.

    Returns:
        A ``networkx.Graph`` with one node per sentence index and, for every
        pair of distinct sentences, an edge whose ``weight`` is the number of
        distinct tokens the two sentences have in common (possibly 0).
    """
    # Convert each sentence to a set once instead of once per pair.
    token_sets = [set(tokens) for tokens in sentences]
    count = len(token_sets)

    graph = nx.Graph()
    # Register the nodes explicitly so a single sentence still yields a node
    # (edges alone would leave the graph empty in that case).
    graph.add_nodes_from(range(count))

    # The graph is undirected, so each unordered pair is visited only once.
    for i in range(count):
        for j in range(i + 1, count):
            graph.add_edge(i, j, weight=len(token_sets[i] & token_sets[j]))
    return graph

# Graph-based summarization
def graph_based_summary(text, num_paragraphs=3):
    """Extractive summary using PageRank over a shared-word sentence graph.

    Args:
        text: The text to summarize.
        num_paragraphs: Number of sentences to keep (despite the name, the
            unit is sentences).

    Returns:
        A single string. Empty or whitespace-only input gives ``""``. When
        the text has fewer sentences than ``num_paragraphs``, all sentences
        are returned joined by spaces. Otherwise the ``num_paragraphs``
        highest-scoring sentences are joined by spaces, best first.
    """
    # Nothing to summarize; also avoids passing None to the tokenizer.
    if not text or not text.strip():
        return ""

    # Use the Norwegian models, consistent with preprocess_text().
    sentences = nltk.sent_tokenize(text, language="norwegian")
    if len(sentences) < num_paragraphs:
        # Always return a string so the result type does not depend on length.
        return ' '.join(sentences)

    stop_words = set(stopwords.words('norwegian'))
    filtered_tokens = [
        [
            word
            for word in word_tokenize(sent, language="norwegian")
            if word.lower() not in stop_words
        ]
        for sent in sentences
    ]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)

    scores = nx.pagerank(similarity_matrix)
    # .get() guards against a sentence index that is missing from the graph.
    ranked_sentences = sorted(
        ((scores.get(i, 0.0), sent) for i, sent in enumerate(sentences)),
        reverse=True,
    )
    return ' '.join(sent for _, sent in ranked_sentences[:num_paragraphs])

# LexRank summarization
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    """Extractive summary using PageRank over thresholded TF-IDF similarity.

    Args:
        text: The text to summarize.
        num_paragraphs: Number of sentences to keep (despite the name, the
            unit is sentences).
        threshold: Cosine similarities below this value are set to 0 before
            the sentence graph is built.

    Returns:
        A single string. Empty or whitespace-only input gives ``""``. When
        the text has fewer sentences than ``num_paragraphs``, all sentences
        are returned joined by spaces. Otherwise the ``num_paragraphs``
        highest-scoring sentences are joined by spaces, best first.
    """
    # Nothing to summarize; also avoids passing None to the tokenizer.
    if not text or not text.strip():
        return ""

    # Use the Norwegian models, consistent with preprocess_text().
    sentences = nltk.sent_tokenize(text, language="norwegian")
    if len(sentences) < num_paragraphs:
        # Always return a string so the result type does not depend on length.
        return ' '.join(sentences)

    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)

    # Drop weak links so only sufficiently similar sentences stay connected.
    similarity_matrix[similarity_matrix < threshold] = 0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)),
        reverse=True,
    )
    # max(..., 0) keeps a negative count meaning "no sentences".
    return ' '.join(s for _, s in ranked_sentences[:max(num_paragraphs, 0)])

# TextRank summarization
def text_rank_summary(text, num_paragraphs=3):
    """Extractive summary using PageRank over TF-IDF cosine similarity.

    Args:
        text: The text to summarize.
        num_paragraphs: Number of sentences to keep (despite the name, the
            unit is sentences).

    Returns:
        A single string. Empty or whitespace-only input gives ``""``. When
        the text has fewer sentences than ``num_paragraphs``, all sentences
        are returned joined by spaces. Otherwise the ``num_paragraphs``
        highest-scoring sentences are joined by spaces, best first.
    """
    # Nothing to summarize; also avoids passing None to the tokenizer.
    if not text or not text.strip():
        return ""

    # Use the Norwegian models, consistent with preprocess_text().
    sentences = nltk.sent_tokenize(text, language="norwegian")
    if len(sentences) < num_paragraphs:
        # Always return a string so the result type does not depend on length.
        return ' '.join(sentences)

    stop_words = set(stopwords.words('norwegian'))
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)

    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)),
        reverse=True,
    )
    # max(..., 0) keeps a negative count meaning "no sentences".
    return ' '.join(s for _, s in ranked_sentences[:max(num_paragraphs, 0)])

# Save text and summary to PDF
def save_to_pdf(text, summary):
    """Write the transcription and/or summary to a one-document PDF.

    Args:
        text: Transcribed text; skipped when empty.
        summary: Summary text; skipped when empty.

    Returns:
        The path of the written PDF. This is always the fixed name
        ``transcription.pdf`` (relative to the current working directory),
        so each call overwrites the previous file.
    """
    def _latin1_safe(value):
        # The built-in "Arial" core font can only encode Latin-1. Characters
        # outside that range (curly quotes, dashes, emoji, ...) would make
        # PDF generation raise, so they are replaced with "?" instead.
        return value.encode("latin-1", "replace").decode("latin-1")

    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    if text:
        pdf.multi_cell(0, 10, _latin1_safe("Text:\n" + text))

    pdf.ln(10)  # Paragraph space

    if summary:
        pdf.multi_cell(0, 10, _latin1_safe("Summary:\n" + summary))

    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path

# Gradio Interface
iface = gr.Blocks()


def _merge_summaries(*summaries):
    """Join every non-empty summary text, separated by a blank line."""
    return "\n\n".join(s for s in summaries if s)


with iface:
    gr.HTML("""
    <div style="text-align: center;">
        <img src="https://huggingface.co/spaces/camparchimedes/transcription_app/raw/main/banner_trans.png" alt="" width="100%" height="auto">
    </div>
    """)
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")

    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Text")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")

            transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])

        # Each summary tab gets its own uniquely named components. Reusing one
        # variable name for all three would leave only the last tab's textbox
        # reachable from the PDF tab.
        with gr.TabItem("Summary | Graph-based"):
            graph_summary_output = gr.Textbox(label="Summary | Graph-based")
            graph_summarize_button = gr.Button("Summarize")

            graph_summarize_button.click(fn=graph_based_summary, inputs=[text_output], outputs=[graph_summary_output])

        with gr.TabItem("Summary | LexRank"):
            lexrank_summary_output = gr.Textbox(label="Summary | LexRank")
            lexrank_summarize_button = gr.Button("Summarize")

            lexrank_summarize_button.click(fn=lex_rank_summary, inputs=[text_output], outputs=[lexrank_summary_output])

        with gr.TabItem("Summary | TextRank"):
            textrank_summary_output = gr.Textbox(label="Summary | TextRank")
            textrank_summarize_button = gr.Button("Summarize")

            textrank_summarize_button.click(fn=text_rank_summary, inputs=[text_output], outputs=[textrank_summary_output])

        with gr.TabItem("Download PDF"):
            pdf_text_only = gr.Button("Download PDF with Text Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")

            pdf_output = gr.File(label="Download PDF")

            # The PDF includes whichever summaries have been generated; with
            # only the TextRank box filled in, the output is that summary alone.
            summary_boxes = [graph_summary_output, lexrank_summary_output, textrank_summary_output]

            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
            pdf_summary_only.click(
                fn=lambda graph, lexrank, textrank: save_to_pdf("", _merge_summaries(graph, lexrank, textrank)),
                inputs=summary_boxes,
                outputs=[pdf_output],
            )
            pdf_both.click(
                fn=lambda text, graph, lexrank, textrank: save_to_pdf(text, _merge_summaries(graph, lexrank, textrank)),
                inputs=[text_output] + summary_boxes,
                outputs=[pdf_output],
            )

iface.launch(share=True, debug=True)