import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
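# On newer NLTK releases, sent_tokenize also needs the 'punkt_tab' tables
# (an assumption about the installed NLTK version); uncomment if you hit a LookupError:
# nltk.download('punkt_tab')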
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces
warnings.filterwarnings("ignore")
# HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

# pick the GPU when available, then move the model once
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# torch_dtype = torch.float32
model.to(device)
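
# Optional sketch (an addition, not part of the original app): on CUDA the model
# can be loaded in float16 to roughly halve memory; it is left commented out
# because many CPU ops do not support fp16.
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     "NbAiLabBeta/nb-whisper-large-semantic", torch_dtype=torch_dtype
# ).to(device)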
def convert_to_wav(audio_file):
    # pydub relies on ffmpeg for m4a decoding
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
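
# Hedged helper (an addition, not in the original app): transcribe_audio below
# assumes mono 16 kHz input, but sf.read returns whatever the file contains.
# This sketch downmixes stereo and resamples with scipy; scipy is an assumed
# extra dependency, and the helper is not wired into the pipeline.
def ensure_mono_16k(audio, sample_rate, target_rate=16000):
    from scipy.signal import resample_poly
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # downmix stereo to mono
    if sample_rate != target_rate:
        audio = resample_poly(audio, target_rate, sample_rate)
    return audio, target_rate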
@spaces.GPU(queue=True)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    # assumes mono 16 kHz audio, the rate Whisper's feature extractor expects
    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    # decode the chunks in batches to bound memory use
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder supports at most 448 target positions
                num_beams=8,
                attention_mask=attention_mask,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
# summarization model (loaded here, though the summary below uses TextRank instead)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# t5-base to device
summarization_model.to(device)
# Graph-based summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    # rank sentences by PageRank over their TF-IDF cosine-similarity graph
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Select top N sentences (e.g., 3 sentences for the summary)
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
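
# Illustrative only (hypothetical input): with top_n=3, the three highest-ranked
# sentences come back as the summary, e.g.:
# print(summarize_text("Sentence one states the topic. Sentence two adds detail. "
#                      "Sentence three repeats the topic. Sentence four is filler."))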
# HTML for the banner image
image_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
</div>
"""
def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # paragraph space
    pdf.ln(10)
    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
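
# Caveat (an addition, not in the original app): classic fpdf's core Arial font is
# latin-1 only, so characters outside latin-1 (e.g. curly quotes) raise
# UnicodeEncodeError; Norwegian æ/ø/å are within latin-1 and are fine.
# A minimal guard, assuming lossy replacement is acceptable:
def to_latin1(text):
    # hypothetical helper, not wired into save_to_pdf above
    return text.encode("latin-1", "replace").decode("latin-1")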
# Gradio UI
iface = gr.Blocks()

with iface:
    gr.HTML(image_html)
    # Norwegian: "We now have the ability to transcribe audio files into Norwegian text."
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
    with gr.Tabs():
        # First Tab: Transcription
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            # default of 4 matches transcribe_audio's batch_size
            batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
            transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")

            def transcribe(audio_file, batch_size):
                transcription, result = transcribe_audio(audio_file, batch_size)
                return transcription, result

            transcribe_button.click(
                fn=transcribe,
                inputs=[audio_input, batch_size_input],
                outputs=[transcription_output, result_output]
            )

        # Second Tab: Summary
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
            summarize_button = gr.Button("Summarize")

            def summarize(transcription):
                if not transcription:
                    return "Warning: no transcription available; run the Transcription tab first."
                summary = summarize_text(transcription)
                return summary

            summarize_button.click(
                fn=summarize,
                inputs=[transcription_output],  # use the transcription from the first tab
                outputs=summary_output
            )

        # Third Tab: PDF Download Options
        with gr.TabItem("Download PDF"):
            pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")
            pdf_output_transcription_only = gr.File(label="Download PDF")
            pdf_output_summary_only = gr.File(label="Download PDF")
            pdf_output_both = gr.File(label="Download PDF")

            def generate_pdf_transcription_only(transcription):
                return save_to_pdf(transcription, "")

            def generate_pdf_summary_only(summary):
                return save_to_pdf("", summary)

            def generate_pdf_both(transcription, summary):
                return save_to_pdf(transcription, summary)

            pdf_transcription_only.click(
                fn=generate_pdf_transcription_only,
                inputs=[transcription_output],
                outputs=[pdf_output_transcription_only]
            )
            pdf_summary_only.click(
                fn=generate_pdf_summary_only,
                inputs=[summary_output],
                outputs=[pdf_output_summary_only]
            )
            pdf_both.click(
                fn=generate_pdf_both,
                inputs=[transcription_output, summary_output],
                outputs=[pdf_output_both]
            )

# run (note: on Hugging Face Spaces, share=True is ignored; it only matters locally)
iface.launch(share=True, debug=True)