import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # newer NLTK releases split the punkt data; sent_tokenize needs this too
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq # pipeline
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces
warnings.filterwarnings("ignore")
# HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#torch_dtype = torch.float32

processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
model.to(device)
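# Optional sketch (not part of the original setup): on a CUDA device the model
# could be loaded in float16 to roughly halve GPU memory. Kept commented out
# because the pipeline below feeds float32 features.
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     "NbAiLabBeta/nb-whisper-large-semantic", torch_dtype=torch.float16
# ).to(device)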
def convert_to_wav(audio_file):
    # pydub decodes the .m4a container and re-exports it as WAV for soundfile
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
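# A more defensive variant (a sketch; not wired into the app): Whisper-family
# models expect 16 kHz mono input, and pydub can enforce both during the
# conversion regardless of the source container.
def convert_to_wav_16k(audio_file):
    audio = AudioSegment.from_file(audio_file)  # container format inferred from the file
    audio = audio.set_frame_rate(16000).set_channels(1)  # resample + downmix
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file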
@spaces.GPU(queue=True)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    # the chunking below assumes 16 kHz input (see the convert_to_wav_16k sketch above)
    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)  # downmix stereo to mono
    chunk_size = 16000 * 30  # 30-second chunks at 16 kHz
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=2048,
                num_beams=8,
                attention_mask=attention_mask,
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
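# Example usage outside the UI (hypothetical file path):
#   text, stats = transcribe_audio("recording.m4a", batch_size=4)
#   print(stats)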
# summarization model (t5-base is loaded and moved to the device here, but the
# summary below comes from the graph-based TextRank routine, which does not use it)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarization_model.to(device)
# Graph-based summarization | TextRank
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    # TextRank: TF-IDF vectors -> cosine-similarity graph -> PageRank centrality
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Select top N sentences (e.g., 3 sentences for the summary)
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
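# Illustrative call (any multi-sentence string works):
#   summarize_text(
#       "Oslo is the capital of Norway. It lies on the Oslofjord. "
#       "The city hosts the annual Nobel Peace Prize ceremony."
#   )
# returns the sentences ranked most central by PageRank.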
# HTML for the banner image
image_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
</div>
"""
def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # paragraph space
    pdf.ln(10)
    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
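# NOTE (assumption about the installed FPDF variant): the built-in "Arial"
# font is latin-1 only, which covers Norwegian æ/ø/å; for full Unicode a TTF
# font would have to be registered first, e.g. with pyfpdf:
#   pdf.add_font("DejaVu", "", "DejaVuSans.ttf", uni=True)
#   pdf.set_font("DejaVu", size=12)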
# Gradio UI
iface = gr.Blocks()
with iface:
    gr.HTML(image_html)
    gr.Markdown("# We now have the ability to turn audio files into Norwegian text.")

    with gr.Tabs():
        # First tab: transcription
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            batch_size_input = gr.Slider(minimum=7, maximum=16, step=1, label="Batch Size")
            transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
            result_output = gr.Textbox(label="Time taken and number of words")
            transcribe_button = gr.Button("Transcribe")

            # the button can call transcribe_audio directly; no wrapper needed
            transcribe_button.click(
                fn=transcribe_audio,
                inputs=[audio_input, batch_size_input],
                outputs=[transcription_output, result_output]
            )
        # Second tab: summary
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
            summarize_button = gr.Button("Summarize")

            def summarize(transcription):
                if not transcription:
                    return "Warning: a transcription must be available."
                return summarize_text(transcription)

            summarize_button.click(
                fn=summarize,
                inputs=[transcription_output],  # uses the transcription from the first tab
                outputs=summary_output
            )
        # Third tab: PDF download options
        with gr.TabItem("Download PDF"):
            pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")
            pdf_output_transcription_only = gr.File(label="Download PDF")
            pdf_output_summary_only = gr.File(label="Download PDF")
            pdf_output_both = gr.File(label="Download PDF")

            def generate_pdf_transcription_only(transcription):
                return save_to_pdf(transcription, "")

            def generate_pdf_summary_only(summary):
                return save_to_pdf("", summary)

            def generate_pdf_both(transcription, summary):
                return save_to_pdf(transcription, summary)

            pdf_transcription_only.click(
                fn=generate_pdf_transcription_only,
                inputs=[transcription_output],
                outputs=[pdf_output_transcription_only]
            )
            pdf_summary_only.click(
                fn=generate_pdf_summary_only,
                inputs=[summary_output],
                outputs=[pdf_output_summary_only]
            )
            pdf_both.click(
                fn=generate_pdf_both,
                inputs=[transcription_output, summary_output],
                outputs=[pdf_output_both]
            )
# run
iface.launch(share=True, debug=True)