import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
nltk.download('punkt')  # sentence-tokenizer data used by summarize_text below
from nltk.tokenize import sent_tokenize
import gradio as gr
import warnings
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, AutoProcessor, AutoModelForSpeechSeq2Seq
from pydub import AudioSegment
import soundfile as sf
import numpy as np
from fpdf import FPDF
from PIL import Image
import time
import os
import spaces

warnings.filterwarnings("ignore")

HF_AUTH_TOKEN = os.getenv('HF_AUTH_TOKEN')

# Load the Norwegian Whisper model and its processor
model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")
processor = AutoProcessor.from_pretrained("NbAiLabBeta/nb-whisper-large-semantic")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# High-level ASR pipeline used by the simple gr.Interface below; the batched
# transcribe_audio() defined later does the actual work in the Blocks UI
pipe = pipeline(
    "automatic-speech-recognition",
    model="NbAiLabBeta/nb-whisper-large-semantic",
    device=device,
    torch_dtype=torch.float32,
)

def transcribe_audio(audio_file):
    with torch.no_grad():
        output = pipe(
            audio_file,
            chunk_length_s=28,
            generate_kwargs={"num_beams": 8, "task": "transcribe", "language": "no"},
        )
    return output["text"]

# Simple Gradio interface (superseded by the gr.Blocks UI defined further down)
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audio Transcription App",
    description="Upload an audio file to get the transcription",
    theme="default",
    live=False,
)

# Summarization model (loaded here; the UI's Summary tab calls the TextRank
# function defined below instead)
summarization_tokenizer = AutoTokenizer.from_pretrained("t5-base")
summarization_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Move summarization model to device
summarization_model.to(device)

def convert_to_wav(audio_file):
    """Convert an .m4a file to a temporary .wav file and return its path."""
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

# @spaces.GPU(duration=120, queue=False)

# Configure the tokenizer to have distinct pad_token_id and eos_token_id
if processor.tokenizer.pad_token_id is None:
    # Set pad_token_id explicitly so it differs from eos_token_id
    processor.tokenizer.pad_token_id = processor.tokenizer.eos_token_id + 1

# Sanity check to confirm distinct pad and eos tokens
assert processor.tokenizer.pad_token_id != processor.tokenizer.eos_token_id, \
    "pad_token_id and eos_token_id must be distinct!"
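# --- Optional helper (a sketch, not part of the original app) ---------------
# The batched transcribe_audio() below slices audio into 30 s chunks assuming
# 16 kHz mono input, but sf.read() returns whatever rate and channel layout
# the file was saved with. A normalizer along these lines (the function name
# and scipy dependency are assumptions, not from the original) could be
# applied to the sf.read() output before chunking:
from scipy.signal import resample_poly  # scipy is already a scikit-learn dependency

def ensure_16k_mono(audio, sample_rate, target_rate=16000):
    """Downmix to mono and resample to target_rate so 16 kHz chunking holds."""
    if audio.ndim > 1:                  # (frames, channels) -> mono
        audio = audio.mean(axis=1)
    if sample_rate != target_rate:      # polyphase resampling with integer rates
        audio = resample_poly(audio, target_rate, sample_rate)
    return audio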
# Batched transcription function used by the Blocks UI (assumes 16 kHz mono
# input; see the chunk_size comment below)
def transcribe_audio(audio_file, batch_size=4):
    start_time = time.time()

    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    audio_input, sample_rate = sf.read(audio_file)
    chunk_size = 16000 * 30  # 30-second chunks at the assumed 16 kHz rate
    chunks = [audio_input[i:i + chunk_size] for i in range(0, len(audio_input), chunk_size)]

    transcription = ""
    for i in range(0, len(chunks), batch_size):
        batch_chunks = chunks[i:i + batch_size]
        inputs = processor(batch_chunks, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = inputs.to(device)
        attention_mask = inputs.attention_mask.to(device) if 'attention_mask' in inputs else None

        with torch.no_grad():
            output = model.generate(
                inputs.input_features,
                max_length=448,  # Whisper's decoder is capped at 448 target positions
                num_beams=8,
                task="transcribe",
                attention_mask=attention_mask,
                language="no",
                pad_token_id=processor.tokenizer.pad_token_id,
                eos_token_id=processor.tokenizer.eos_token_id,
            )
        transcription += " ".join(processor.batch_decode(output, skip_special_tokens=True)) + " "

    end_time = time.time()
    transcription_time = end_time - start_time
    word_count = len(transcription.split())

    result = (
        f"Transcription: {transcription.strip()}\n\n"
        f"Time taken: {transcription_time:.2f} seconds\n"
        f"Number of words: {word_count}"
    )
    return transcription.strip(), result

# Graph-based summarization (TextRank)
def summarize_text(text):
    sentences = sent_tokenize(text)
    if len(sentences) == 0:
        return ""

    # Score sentences by PageRank over their pairwise TF-IDF cosine similarities
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # Select the top N sentences (here, 3) for the summary
    top_n = 3
    summary = " ".join([s for _, s in ranked_sentences[:top_n]])
    return summary
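# --- Sketch: abstractive alternative using the loaded t5-base model ---------
# summarization_tokenizer and summarization_model are loaded above, but the UI
# only ever calls the TextRank function. A minimal way to put them to use (the
# function name and generation settings here are illustrative assumptions, not
# part of the original app) could look like:
def summarize_text_t5(text, max_input_length=512, max_summary_length=150):
    """Abstractive summary via t5-base, which expects the 'summarize: ' prefix."""
    inputs = summarization_tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(device)
    with torch.no_grad():
        summary_ids = summarization_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_summary_length,
            num_beams=4,
            early_stopping=True,
        )
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)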
# HTML banner shown at the top of the app. The original image URL did not
# survive extraction; the src below is a placeholder (an assumption) to swap
# for the real asset.
image_html = """
<img src="banner.png" alt="Banner" style="width: 100%;">
"""
""" def save_to_pdf(transcription, summary): pdf = FPDF() pdf.add_page() pdf.set_font("Arial", size=12) # include transcription pdf.multi_cell(0, 10, "Transcription:\n" + transcription) # paragraph space pdf.ln(10) # include summary pdf.multi_cell(0, 10, "Summary:\n" + summary) pdf_output_path = "transcription_summary.pdf" pdf.output(pdf_output_path) return pdf_output_path # Gradio UI iface = gr.Blocks() with iface: gr.HTML(image_html) gr.Markdown("# Vi har nå muligheten til å oversette lydfiler til norsk skrift.") with gr.Tabs(): # First Tab: Transcription with gr.TabItem("Transcription"): audio_input = gr.Audio(type="filepath") batch_size_input = gr.Slider(minimum=7, maximum=16, step=1, label="Batch Size") transcription_output = gr.Textbox(label="Transcription | nb-whisper-large-semantic") result_output = gr.Textbox(label="Time taken and Number of words") transcribe_button = gr.Button("Transcribe") def transcribe(audio_file, batch_size): transcription, result = transcribe_audio(audio_file, batch_size) return transcription, result transcribe_button.click( fn=transcribe, inputs=[audio_input, batch_size_input], outputs=[transcription_output, result_output] ) # Second Tab: Summary with gr.TabItem("Summary"): summary_output = gr.Textbox(label="Summary | TextRank, graph-based") summarize_button = gr.Button("Summarize") def summarize(transcription): if not transcription: return "Warning: a transcription must be available." summary = summarize_text(transcription) return summary summarize_button.click( fn=summarize, inputs=[transcription_output], # Use the transcription from the first tab outputs=summary_output ) # Third Tab: PDF Download Options with gr.TabItem("Download PDF"): pdf_transcription_only = gr.Button("Download PDF with Transcription Only") pdf_summary_only = gr.Button("Download PDF with Summary Only") pdf_both = gr.Button("Download PDF with Both") pdf_output_transcription_only = gr.File(label="Download PDF") pdf_output_summary_only = gr.File(label="Download PDF") pdf_output_both = gr.File(label="Download PDF") def generate_pdf_transcription_only(transcription): return save_to_pdf(transcription, "") def generate_pdf_summary_only(summary): return save_to_pdf("", summary) def generate_pdf_both(transcription, summary): return save_to_pdf(transcription, summary) pdf_transcription_only.click( fn=generate_pdf_transcription_only, inputs=[transcription_output], outputs=[pdf_output_transcription_only] ) pdf_summary_only.click( fn=generate_pdf_summary_only, inputs=[summary_output], outputs=[pdf_output_summary_only] ) pdf_both.click( fn=generate_pdf_both, inputs=[transcription_output, summary_output], outputs=[pdf_output_both] ) # run iface.launch(share=True, debug=True)