Update app.py
app.py CHANGED
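The commit, in brief: it completes a dangling `from transformers import` line, comments out the unused `HF_AUTH_TOKEN` lookup, drops the old `pipeline()`-based transcriber and its `gr.Interface` UI in favor of a chunked, batched `model.generate()` loop, moves the t5-base summarizer setup below the transcription code, completes the truncated banner image URL, and re-inserts a `gr.Interface` block just above the Blocks UI.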
@@ -9,7 +9,7 @@ from nltk.tokenize import sent_tokenize
 import gradio as gr
 import warnings
 import torch
-from transformers import
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq # pipeline
 from pydub import AudioSegment
 import soundfile as sf
 import numpy as np
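Note: the removed line was a bare `from transformers import` with nothing after it, a SyntaxError that would account for the Space's "Build error" status; the replacement names the four classes the file actually uses and parks `pipeline` in a trailing comment, matching the deletion of the pipeline call in the next hunk.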
@@ -21,43 +21,16 @@ import spaces
 
 warnings.filterwarnings("ignore")
 
-HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
+# HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
 
-
-model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
 processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
+model.to(device)
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 #torch_dtype = torch.float32
 model.to(device)
 
-#asr = pipeline("automatic-speech-recognition", model=model, processor=processor.tokenizer, device=device, torch_dtype=torch.float32)
-pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch.float32)
-
-
-def transcribe_audio(audio_file):
-    with torch.no_grad():
-        output = pipe(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})
-    return output["text"]
-
-# Gradio UI
-iface = gr.Interface(
-    fn=transcribe_audio,
-    inputs=gr.Audio(type="filepath"),
-    outputs="text",
-    title="Audio Transcription App",
-    description="Upload an audio file to get the transcription",
-    theme="default",
-    live=False
-)
-
-# summarization model
-summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
-summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
-
-# t5-base to device
-summarization_model.to(device)
-
 def convert_to_wav(audio_file):
     audio = AudioSegment.from_file(audio_file, format="m4a")
     wav_file = "temp.wav"
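One ordering issue survives this hunk: in the new file, the added `model.to(device)` runs before `device` is assigned two lines later, so the module would raise a `NameError` on import (the pre-existing `model.to(device)` after the assignment is the one that can succeed). A minimal sketch of the intended order, assuming the same model id; this reordering is not part of the commit:

```python
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Assign the device first, then move the model onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model.to(device)  # safe: `device` exists by the time it is used
```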
@@ -65,47 +38,37 @@ def convert_to_wav(audio_file):
     return wav_file
 
 
-# Configure_is__not good enough
-#if processor.tokenizer.pad_token_id is None:
-#processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + 1
-
-# Sanity check
-#assert processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id, \
-#"pad_token_id and eos_token_id must be distinct..and they is not"
-
-
 @spaces.GPU(queue=True)
-
+
 def transcribe_audio(audio_file, batch_size=4):
     start_time = time.time()
+
     if audio_file.endswith(".m4a"):
         audio_file = convert_to_wav(audio_file)
-
+
     audio_input, sample_rate = sf.read(audio_file)
-    chunk_size = 16000 * 30
+    chunk_size = 16000 * 30
     chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
 
     transcription = ""
+
     for i in range(0, len(chunks), batch_size):
         batch_chunks = chunks[i:i + batch_size]
         inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
         inputs = inputs.to(device)
-
+
         attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
-
+
         with torch.no_grad():
             output = model.generate(
                 inputs.input_features,
-                max_length=2048,
+                max_length=2048,
                 num_beams=8,
-                task="transcribe",
                 attention_mask=attention_mask,
-
-
-                # pad_token_id=processor.tokenizer.pad_token_id,
-                # eos_token_id=processor.tokenizer.eos_token_id
+                pad_token_id=processor.tokenizer.pad_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
             )
-
+
         transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
 
     end_time = time.time()
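This hunk replaces the one-call `pipeline(...)` path with manual inference: audio is read with soundfile, sliced into 30-second windows at 16 kHz (`chunk_size = 16000 * 30`), padded into batches by the processor, and decoded with 8-beam `model.generate(...)`; the previously commented-out `pad_token_id`/`eos_token_id` arguments are now passed for real. A hedged smoke test, where the filename is illustrative rather than taken from the commit:

```python
# Hypothetical usage: assumes a 16 kHz mono file named sample.wav exists.
text, report = transcribe_audio("sample.wav", batch_size=2)
print(report)  # transcription plus elapsed time and word count
```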
@@ -113,18 +76,23 @@ def transcribe_audio(audio_file, batch_size=4):
     word_count = len(transcription.split())
 
     result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
-
+
     return transcription.strip(), result
 
 
+# summarization model
+summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
+summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
+# t5-base to device
+summarization_model.to(device)
 
 # Graph-based summarization|TextRank
 def summarize_text(text):
     sentences = sent_tokenize(text)
     if len(sentences) == 0:
         return ""
-
+
     tfidf_vectorizer = TfidfVectorizer()
     tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
     similarity_matrix = cosine_similarity(tfidf_matrix)
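The hunk stops at the similarity matrix, so the ranking step of `summarize_text` is not visible here. For orientation only, a typical TextRank completion looks like the sketch below; the `networkx` calls and the top-3 cutoff are assumptions, not code from this commit:

```python
# Hypothetical continuation of summarize_text, for illustration only.
import networkx as nx

nx_graph = nx.from_numpy_array(similarity_matrix)  # sentences as nodes, cosine similarity as edge weights
scores = nx.pagerank(nx_graph)                     # TextRank scores = PageRank over the sentence graph
ranked = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
return " ".join(s for _, s in ranked[:3])          # keep the three highest-scoring sentences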
@@ -142,7 +110,7 @@ def summarize_text(text):
 # HTML syntax for imagery
 image_html = """
 <div style="text-align: center;">
-    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/
+    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
 </div>
 """
 
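Note: the old banner markup was cut off mid-URL, leaving an unterminated `<img>` tag inside the triple-quoted string (valid Python, broken HTML); the new line completes the path to `picture.png` and adds `alt` and sizing attributes.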
@@ -150,29 +118,40 @@ def save_to_pdf(transcription, summary):
     pdf = FPDF()
     pdf.add_page()
     pdf.set_font("Arial", size=12)
-
+
     # include transcription
     pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
-
+
     # paragraph space
     pdf.ln(10)
-
+
     # include summary
     pdf.multi_cell(0, 10, "Summary:\n" + summary)
-
+
     pdf_output_path = "transcription_summary.pdf"
     pdf.output(pdf_output_path)
     return pdf_output_path
 
+# Gradio UI
+iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(type="filepath"),
+    outputs="text",
+    title="Audio Transcription App",
+    description="Upload an audio file to get the transcription",
+    theme="default",
+    live=False
+)
+
 # Gradio UI
 iface = gr.Blocks()
 
 with iface:
     gr.HTML(image_html)
     gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
-
+
     with gr.Tabs():
-
+
         # First Tab: Transcription
         with gr.TabItem("Transcription"):
             audio_input = gr.Audio(type="filepath")
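Two things stand out in the final hunk. First, the commit re-adds the old `gr.Interface(...)` block directly above `iface = gr.Blocks()`, so `iface` is immediately rebound and the Interface titled "Audio Transcription App" is constructed but never launched; dropping one of the two UIs would be the obvious follow-up. Second, the banner Markdown is Norwegian for roughly "We now have the ability to translate audio files into written Norwegian." The diff ends at the tab's audio input, so the event wiring is not shown; a hedged sketch of what it might look like inside the "Transcription" tab, with illustrative button and output names:

```python
# Hypothetical wiring, not shown in this diff: connect the tab's audio
# input to transcribe_audio, which returns two values.
transcribe_btn = gr.Button("Transcribe")
text_output = gr.Textbox(label="Transcription")
details_output = gr.Textbox(label="Details")
transcribe_btn.click(fn=transcribe_audio, inputs=audio_input,
                     outputs=[text_output, details_output])
```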