import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor, AutoTokenizer, AutoModelForSeq2SeqLM
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces
warnings.filterwarnings("ignore")
HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')
tokenizer = WhisperTokenizer.from_pretrained("NbAiLabBeta/nb-whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLabBeta/nb-whisper-medium")
processor = WhisperProcessor.from_pretrained("NbAiLabBeta/nb-whisper-medium")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
asr = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor, device=device, torch_dtype=torch.float32)
def transcribe_audio(audio_file):
    # Run the ASR pipeline on 28-second chunks with beam search, forcing Norwegian transcription
    with torch.no_grad():
        output = asr(audio_file, chunk_length_s=28, generate_kwargs={"num_beams": 5, "task": "transcribe", "language": "no"})
    return output["text"]
# Simple Gradio interface (superseded by the Blocks UI defined further down)
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False
)
# Load T5 summarization model (note: the TextRank summarizer below is what the UI actually uses)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# Move summarization model to device
summarization_model.to(device)
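# The t5-base model above is loaded but never called in the original flow. The function
# below is a minimal, hedged sketch of how it could be used for abstractive summarization;
# the "summarize: " prefix and the length/beam settings are assumptions, not values from
# this app, and t5-base is an English model, so quality on Norwegian transcripts is untested.
def summarize_text_t5(text, max_input_tokens=512, max_summary_tokens=150):
    inputs = summarization_tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)
    with torch.no_grad():
        summary_ids = summarization_model.generate(
            inputs.input_ids,
            max_length=max_summary_tokens,
            num_beams=4,
            early_stopping=True,
        )
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)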
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    # Resample to 16 kHz mono, the sample rate the Whisper processor below assumes
    audio = audio.set_frame_rate(16000).set_channels(1)
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
@spaces.GPU(duration=120, queue=False)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    # Convert .m4a uploads to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)
    audio_input, sample_rate = sf.read(audio_file)
    # Collapse stereo to mono; the Whisper feature extractor expects a 1-D waveform
    if audio_input.ndim > 1:
        audio_input = audio_input.mean(axis=1)
    # Split into 30-second chunks (assuming 16 kHz audio) and transcribe in batches
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]
    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=2048,  # increase max_length for longer outputs
                num_beams=7,
                task="transcribe",
                attention_mask=attention_mask,
                language="no"
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "
    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())
    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result
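# Example usage for a local sanity check (hypothetical file name, not part of the app flow):
#   text, report = transcribe_audio("sample_meeting.m4a", batch_size=2)
#   print(report)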
# Graph-based extractive summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    # Build a sentence-similarity graph from TF-IDF vectors, then rank sentences with PageRank
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Select the top N sentences (3 here) for the summary
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
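# Illustration only (hypothetical input): with more than three sentences, the three
# highest-ranked ones are returned, joined in rank order rather than document order:
#   summarize_text("Sentence one. Sentence two. Sentence three. Sentence four.")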
# HTML syntax for imagery
image_html = """
<div style="text-align: center;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/Olas%20AudioSwitch%20Shop.png" alt="Banner" width="87%" height="auto">
</div>
<div style="text-align: center; margin-top: 20px;">
<img src="https://huggingface.co/spaces/camparchimedes/ola_s-audioshop/raw/main/picture.jpg" alt="Additional Image" width="50%" height="auto">
</div>
"""
# Gradio UI (Blocks): banner images, audio upload, batch-size slider, and two text outputs
iface = gr.Blocks()
with iface:
    gr.HTML(image_html)
    gr.Markdown("# Switch Work Audio Transcription App\nUpload an audio file to get the transcription")
    audio_input = gr.Audio(type="filepath")
    batch_size_input = gr.Slider(minimum=1, maximum=16, step=1, value=4, label="Batch Size")
    transcription_output = gr.Textbox(label="Transcription")
    summary_output = gr.Textbox(label="Summary")
    transcribe_button = gr.Button("Transcribe and Summarize")

    def transcribe_and_summarize(audio_file, batch_size):
        transcription, result = transcribe_audio(audio_file, batch_size)
        summary = summarize_text(transcription)
        return result, summary

    transcribe_button.click(fn=transcribe_and_summarize, inputs=[audio_input, batch_size_input], outputs=[transcription_output, summary_output])
def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # include transcription
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # paragraph space
    pdf.ln(10)
    # include summary
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
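# save_to_pdf is defined but not wired into the UI above. A minimal sketch of how it could
# be exposed (assumes the installed Gradio version supports re-entering a Blocks context;
# left commented out so it does not change the app's behaviour):
#
# with iface:
#     pdf_file = gr.File(label="Download PDF")
#     pdf_button = gr.Button("Save as PDF")
#     pdf_button.click(fn=save_to_pdf, inputs=[transcription_output, summary_output], outputs=pdf_file)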
# run
iface.launch(share=True, debug=True)