import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
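# Assumption about the runtime NLTK version: recent releases (3.9+) ship the
# sentence tokenizer as 'punkt_tab', and downloading it as well avoids a
# LookupError there. On older releases this extra download simply fails
# without raising.
nltk.download('punkt_tab')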
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
from fpdf import FPDF
from PIL import Image  # imported but not used below
import time
import os  # imported but not used below
# import spaces
warnings.filterwarnings("ignore")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
# ASR pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model="NbAiLabBeta/nb-whisper-large-semantic",
    device=device,
    torch_dtype=torch_dtype,
)
# Convert m4a to wav (pydub relies on ffmpeg being available for m4a input)
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
# @spaces.GPU(queue=True)
# Transcription function built on the ASR pipeline
def transcribe_audio(audio_file):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    start_time = time.time()
    with torch.no_grad():
        # 30 s chunking lets the pipeline handle long-form audio
        output = pipe(
            audio_file,
            chunk_length_s=30,
            generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
        )
    transcription = output["text"]
    end_time = time.time()
    output_time = end_time - start_time
    word_count = len(transcription.split())
    result = (
        f"Transcription: {transcription.strip()}\n\n"
        f"Time taken: {output_time:.2f} seconds\n"
        f"Number of words: {word_count}"
    )
    return transcription.strip(), result
# t5-base tokenizer and model, moved to the target device
# (note: the Summary tab below uses the TextRank function instead)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarization_model.to(device)
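# The t5-base weights above are never called by the TextRank summarizer below.
# A minimal abstractive alternative using them could look like this sketch;
# the "summarize: " prefix and the generation settings are illustrative
# choices, not taken from this app.
def summarize_text_t5(text, max_summary_length=150):
    inputs = summarization_tokenizer(
        "summarize: " + text, return_tensors="pt", max_length=512, truncation=True
    ).to(device)
    with torch.no_grad():
        summary_ids = summarization_model.generate(
            **inputs, max_length=max_summary_length, num_beams=4, early_stopping=True
        )
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)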
# Graph-based summarization (TextRank method): rank sentences by PageRank
# over their TF-IDF cosine-similarity graph
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Select the top N sentences (here, 3) for the summary
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
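# Design note: the joined summary keeps TextRank's rank order rather than the
# sentences' original reading order. One possible variant that restores
# document order for the selected sentences:
#   top = ranked_sentences[:top_n]
#   summary = " ".join(s for _, s in sorted(top, key=lambda p: sentences.index(p[1])))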
# HTML for the banner image
image_html = """
<div style="text-align: center;">
    <img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
</div>
"""
# Save the transcription and/or summary to a PDF
def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    # Core PDF fonts are latin-1 only; Norwegian æ/ø/å are covered, but other
    # Unicode characters would require registering a TTF font instead.
    pdf.set_font("Arial", size=12)
    if transcription:
        pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # paragraph space
    pdf.ln(10)
    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
# Gradio UI
iface = gr.Blocks()
with iface:
    gr.HTML(image_html)
    # "We now have the ability to convert audio files into Norwegian text."
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
    with gr.Tabs():
        # Transcription tab
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
            result_output = gr.Textbox(label="Time taken and number of words")
            transcribe_button = gr.Button("Transcribe")

            transcribe_button.click(
                fn=transcribe_audio,
                inputs=[audio_input],
                outputs=[transcription_output, result_output],
            )
        # Summary tab
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
            summarize_button = gr.Button("Summarize")

            def summarize(transcription):
                if not transcription:
                    return "Warning: a transcription must be available."
                return summarize_text(transcription)

            summarize_button.click(
                fn=summarize,
                inputs=[transcription_output],
                outputs=summary_output,
            )
        # PDF download tab
        with gr.TabItem("Download PDF"):
            pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")
            pdf_output_transcription_only = gr.File(label="Download PDF")
            pdf_output_summary_only = gr.File(label="Download PDF")
            pdf_output_both = gr.File(label="Download PDF")

            def generate_pdf_transcription_only(transcription):
                return save_to_pdf(transcription, "")

            def generate_pdf_summary_only(summary):
                return save_to_pdf("", summary)

            def generate_pdf_both(transcription, summary):
                return save_to_pdf(transcription, summary)

            pdf_transcription_only.click(
                fn=generate_pdf_transcription_only,
                inputs=[transcription_output],
                outputs=[pdf_output_transcription_only],
            )
            pdf_summary_only.click(
                fn=generate_pdf_summary_only,
                inputs=[summary_output],
                outputs=[pdf_output_summary_only],
            )
            pdf_both.click(
                fn=generate_pdf_both,
                inputs=[transcription_output, summary_output],
                outputs=[pdf_output_both],
            )
# run
iface.launch(share=True, debug=True)