import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces

warnings.filterwarnings("ignore")

# Optional Hugging Face auth token (the models below are public, so this may be unset)
HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# ASR model and processor
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Summarization model (loaded here, although the Summary tab below uses the
# graph-based TextRank summarizer rather than T5)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarization_model.to(device)

def convert_to_wav(audio_file):
    # pydub needs ffmpeg available to decode m4a
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file
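# The transcription path below assumes 16 kHz mono input. As a minimal sketch
# (our own addition, not part of the original app flow; the name
# resample_to_16k is hypothetical), arbitrary uploads could be normalized
# with pydub before transcription:
def resample_to_16k(audio_file):
    # Decode any ffmpeg-supported format, downmix to mono, resample to 16 kHz
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_channels(1).set_frame_rate(16000)
    out_file = "resampled.wav"
    audio.export(out_file, format="wav")
    return out_file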
@spaces.GPU()
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    # Assumes 16 kHz mono audio; split into 30-second chunks
    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 30
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None
        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                attention_mask=attention_mask,
                max_length=2048,
                num_beams=8,
                task="transcribe",
                language="no",
                pad_token_id=processor.tokenizer.pad_token_id,
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = (
        f"Transcription: {transcription.strip()}\n\n"
        f"Time taken: {transcription_time:.2f} seconds\n"
        f"Number of words: {word_count}"
    )
    return transcription.strip(), result

# Graph-based summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""
    # Score sentences by PageRank over their pairwise TF-IDF cosine similarities
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Keep the top N sentences (e.g., 3) as the summary
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
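# Minimal sanity check for summarize_text (our own illustrative addition; the
# sample text is invented and the DEBUG_SUMMARY gate is hypothetical, so this
# block never runs inside the app unless that variable is set):
if __name__ == "__main__" and os.getenv("DEBUG_SUMMARY"):
    sample = (
        "Oslo er hovedstaden i Norge. "
        "Byen ligger innerst i Oslofjorden. "
        "Mange turister besøker Oslo hver sommer. "
        "Regjeringen og Stortinget holder til i Oslo."
    )
    print(summarize_text(sample))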
# HTML banner shown at the top of the UI. The original <img> markup was lost
# in this revision; only the alt text "Banner" survives, so the image source
# is left for the author to restore.
image_html = """
Banner
"""

def save_to_pdf(transcription, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    # Transcription section
    pdf.multi_cell(0, 10, "Transcription:\n" + transcription)
    # Paragraph spacing
    pdf.ln(10)
    # Summary section
    pdf.multi_cell(0, 10, "Summary:\n" + summary)
    pdf_output_path = "transcription_summary.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path
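# Illustrative call (our own example; the strings are invented). Note that
# FPDF's built-in Arial font is latin-1 only, which covers Norwegian æ/ø/å
# but would raise an error for text outside that encoding:
# save_to_pdf("Dette er en transkripsjon.", "Dette er et sammendrag.")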
""" def save_to_pdf(transcription, summary): pdf = FPDF() pdf.add_page() pdf.set_font("Arial", size=12) # include transcription pdf.multi_cell(0, 10, "Transcription:\n" + transcription) # paragraph space pdf.ln(10) # include summary pdf.multi_cell(0, 10, "Summary:\n" + summary) pdf_output_path = "transcription_summary.pdf" pdf.output(pdf_output_path) return pdf_output_path # Gradio UI iface = gr.Blocks() with iface: gr.HTML(image_html) gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.") with gr.Tabs(): # First Tab: Transcription with gr.TabItem("Transcription"): audio_input = gr.Audio(type="filepath") batch_size_input = gr.Slider(minimum=7, maximum=16, step=1, label="Batch Size") transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic") result_output = gr.Textbox(label="Time taken and Number of words") transcribe_button = gr.Button("Transcribe") def transcribe(audio_file, batch_size): transcription, result = transcribe_audio(audio_file, batch_size) return transcription, result transcribe_button.click( fn=transcribe, inputs=[audio_input, batch_size_input], outputs=[transcription_output, result_output] ) # Second Tab: Summary with gr.TabItem("Summary"): summary_output = gr.Textbox(label="Summary | TextRank, graph-based") summarize_button = gr.Button("Summarize") def summarize(transcription): if not transcription: return "Warning: a transcription must be available." summary = summarize_text(transcription) return summary summarize_button.click( fn=summarize, inputs=[transcription_output], # Use the transcription from the first tab outputs=summary_output ) # Third Tab: PDF Download Options with gr.TabItem("Download PDF"): pdf_transcription_only = gr.Button("Download PDF with Transcription Only") pdf_summary_only = gr.Button("Download PDF with Summary Only") pdf_both = gr.Button("Download PDF with Both") pdf_output_transcription_only = gr.File(label="Download PDF") pdf_output_summary_only = gr.File(label="Download PDF") pdf_output_both = gr.File(label="Download PDF") def generate_pdf_transcription_only(transcription): return save_to_pdf(transcription, "") def generate_pdf_summary_only(summary): return save_to_pdf("", summary) def generate_pdf_both(transcription, summary): return save_to_pdf(transcription, summary) pdf_transcription_only.click( fn=generate_pdf_transcription_only, inputs=[transcription_output], outputs=[pdf_output_transcription_only] ) pdf_summary_only.click( fn=generate_pdf_summary_only, inputs=[summary_output], outputs=[pdf_output_summary_only] ) pdf_both.click( fn=generate_pdf_both, inputs=[transcription_output, summary_output], outputs=[pdf_output_both] ) # run iface.launch(share=True, debug=True)