import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces

warnings.filterwarnings("ignore")

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# ASR model and processor (Norwegian Whisper)
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch_dtype = torch.float32
model.to(device)

# asr = pipeline("automatic-speech-recognition", model=model, processor=processor.tokenizer, device=device, torch_dtype=torch.float32)
pipe = pipeline("automatic-speech-recognition", model="NbAiLabBeta/nb-whisper-large-semantic")


def transcribe_audio(audio_file):
    with torch.no_grad():
        output = pipe(
            audio_file,
            chunk_length_s=28,
            generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
        )
    return output["text"]


# Gradio interface
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False,
)

# Summarization model
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Move summarization model to device
summarization_model.to(device)


def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file


@spaces.GPU(duration=21600, queue=False)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()

    # Convert .m4a to .wav
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    audio_input, sample_rate = sf.read(audio_file)

    # Split the audio into 30-second chunks (assuming 16 kHz audio)
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=2048,
                num_beams=8,
                task="transcribe",
                attention_mask=attention_mask,
                language="no",
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = f"Transcription: {transcription.strip()}\n\nTime taken: {transcription_time:.2f} seconds\nNumber of words: {word_count}"
    return transcription.strip(), result


# Graph-based summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""

    # Build a sentence-similarity graph from TF-IDF vectors and rank sentences with PageRank
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Select top N sentences (e.g., 3 sentences for the summary)
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary


# HTML syntax for imagery
image_html = """