import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces
warnings.filterwarnings("ignore")
HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
# Load the NB-Whisper model and processor for Norwegian speech recognition
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#torch_dtype = torch.float32
model.to(device)
#asr = pipeline("automatic-speech-recognition", model=model, processor=processor.tokenizer, device=device, torch_dtype=torch.float32)
# High-level ASR pipeline (used by the simple single-file transcription function below)
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic", device=device, torch_dtype=torch.float32)
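# A minimal sketch of calling the pipeline directly (assumes a local audio file;
# the file name "clip.wav" is only an example, not part of the app flow):
# result = pipe("clip.wav", chunk_length_s=28, generate_kwargs={"task": "transcribe", "language": "no"})
# print(result["text"])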
def transcribe_audio(audio_file):
    with torch.no_grad():
        output = pipe(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"})
    return output["text"]
# Gradio interface (simple single-function version; superseded by the Blocks UI defined below)
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False
)
# Summarization model (T5); note that the Summary tab below uses the graph-based
# TextRank summarizer instead, so this model is loaded but not wired into the UI
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# Move summarization model to device
summarization_model.to(device)
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
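# Note: pydub needs an ffmpeg (or libav) binary on the system to decode .m4a input.
# Hypothetical usage sketch (the file name is only an example):
# wav_path = convert_to_wav("recording.m4a")  # writes and returns "temp.wav"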
# @spaces.GPU(duration=120, queue=False)
# Configure the tokenizer to have distinct pad_token_id and eos_token_id
if processor.tokenizer.pad_token_id is None:
    # Set pad_token_id explicitly to ensure distinction from eos_token_id
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + 1
# Sanity check to confirm distinct pad and eos tokens
assert processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id, \
    "pad_token_id and eos_token_id must be distinct!"
# Batched transcription used by the Blocks UI below (replaces the simpler pipeline-based version above)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    audio_input, sample_rate = sf.read(audio_file)
    # Split the signal into 30-second chunks (the model expects 16 kHz audio)
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=2048,
                num_beams=8,
                task="transcribe",
                attention_mask=attention_mask,
                language="no",
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
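# Quick sanity check outside the UI (hypothetical path, not part of the app flow):
# text, stats = transcribe_audio("sample_meeting.m4a", batch_size=4)
# print(stats)  # transcription, elapsed time, and word count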
# Graph-based summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    # Build a sentence-similarity graph from TF-IDF cosine similarities and rank with PageRank
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Select the top N sentences (e.g., 3) for the summary
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
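# Minimal usage sketch on made-up input (the sentences are illustrative only):
# sample = ("The board discussed next year's budget. The project is two weeks behind schedule. "
#           "It was decided to hire a new developer. The meeting ended at noon.")
# print(summarize_text(sample))  # joins the three highest-scoring sentences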
# HTML for the banner image
image_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.png" alt="Banner" width="85%" height="auto">
</div>
"""
def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # Paragraph spacing
    pdf.ln(10)
    # Include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
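# Hypothetical usage: writes "transcription_summary.pdf" to the working directory and returns its path.
# pdf_path = save_to_pdf("transcribed text ...", "summary ...")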
# Gradio UI
iface = gr.Blocks()
with iface:
    gr.HTML(image_html)
    # "We now have the ability to convert audio files into Norwegian text."
    gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.")
    with gr.Tabs():
        # First tab: transcription
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            batch_size_input = gr.Slider(minimum=7, maximum=16, step=1, label="Batch Size")
            transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic")
            result_output = gr.Textbox(label="Time taken and Number of words")
            transcribe_button = gr.Button("Transcribe")

            def transcribe(audio_file, batch_size):
                transcription, result = transcribe_audio(audio_file, batch_size)
                return transcription, result

            transcribe_button.click(
                fn=transcribe,
                inputs=[audio_input, batch_size_input],
                outputs=[transcription_output, result_output]
            )
        # Second tab: summary
        with gr.TabItem("Summary"):
            summary_output = gr.Textbox(label="Summary | TextRank, graph-based")
            summarize_button = gr.Button("Summarize")

            def summarize(transcription):
                if not transcription:
                    return "Warning: a transcription must be available."
                summary = summarize_text(transcription)
                return summary

            summarize_button.click(
                fn=summarize,
                inputs=[transcription_output],  # use the transcription from the first tab
                outputs=summary_output
            )
        # Third tab: PDF download options
        with gr.TabItem("Download PDF"):
            pdf_transcription_only = gr.Button("Download PDF with Transcription Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")
            pdf_output_transcription_only = gr.File(label="Download PDF")
            pdf_output_summary_only = gr.File(label="Download PDF")
            pdf_output_both = gr.File(label="Download PDF")

            def generate_pdf_transcription_only(transcription):
                return save_to_pdf(transcription, "")

            def generate_pdf_summary_only(summary):
                return save_to_pdf("", summary)

            def generate_pdf_both(transcription, summary):
                return save_to_pdf(transcription, summary)

            pdf_transcription_only.click(
                fn=generate_pdf_transcription_only,
                inputs=[transcription_output],
                outputs=[pdf_output_transcription_only]
            )
            pdf_summary_only.click(
                fn=generate_pdf_summary_only,
                inputs=[summary_output],
                outputs=[pdf_output_summary_only]
            )
            pdf_both.click(
                fn=generate_pdf_both,
                inputs=[transcription_output, summary_output],
                outputs=[pdf_output_both]
            )

# Run the app
iface.launch(share=True, debug=True)