"""
Version: 5th_pruned_optimized_transcription_app.py (alias HF_modded_nb-whisper_T4)

Description: webapp, transkribering (norsk), NbAiLab/nb-whisper-large, oppsummering, pdf-download.
"""

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
import os
import warnings
from pydub import AudioSegment
import torch
import torchaudio
import torchaudio.transforms as transforms
from transformers import WhisperTokenizer, WhisperForConditionalGeneration, WhisperProcessor
from huggingface_hub import model_info
import spacy
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import gradio as gr
from fpdf import FPDF
from PIL import Image

# Suppress warnings
warnings.filterwarnings("ignore")

# Convert m4a audio to wav format
def convert_to_wav(audio_file):
    audio = AudioSegment.from_file(audio_file, format="m4a")
    wav_file = "temp.wav"
    audio.export(wav_file, format="wav")
    return wav_file

# Initialize device for torch
device = 0 if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float32

# Load tokenizer and model
tokenizer = WhisperTokenizer.from_pretrained("NbAiLab/nb-whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("NbAiLab/nb-whisper-large")
processor = WhisperProcessor.from_pretrained("NbAiLab/nb-whisper-large")


# Model script does not support JIT compilation
#model = model.to(device)
#model = torch.jit.script(model)

# Generation kwargs
generate_kwargs = {
    "num_beams": 5,
    "task": "transcribe",
    "language": "no",
    "forced_decoder_ids": None
}

# Transcribe
def transcribe_audio(audio_file, chunk_length_s=30):
    if audio_file.endswith(".m4a"):
        audio_file = convert_to_wav(audio_file)

    start_time = time.time()

    # Load the audio waveform using torchaudio
    waveform, sample_rate = torchaudio.load(audio_file)

    # Convert to mono if the audio has more than one channel
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample the audio to 16000 Hz if it’s not already
    if sample_rate != 16000:
        resampler = transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    # Calculate the number of chunks
    chunk_size = chunk_length_s * sample_rate
    num_chunks = waveform.shape[1] // chunk_size + int(waveform.shape[1] % chunk_size != 0)

    # Initialize an empty list to store the transcribed text from each chunk
    full_text = []

    for i in range(num_chunks):
        start = i * chunk_size
        end = min((i + 1) * chunk_size, waveform.shape[1])
        chunk_waveform = waveform[:, start:end]

        # Ensure the chunk waveform is properly shaped
        if chunk_waveform.shape[0] > 1:
            chunk_waveform = torch.mean(chunk_waveform, dim=0, keepdim=True)

        # Process the chunk
        audio_input = processor(chunk_waveform.squeeze(0).numpy(), sampling_rate=sample_rate, return_tensors="pt")

        # Generate attention mask
        input_features = audio_input.input_features
        attention_mask = torch.ones(input_features.shape, dtype=torch.long)

        # ASR model inference on the chunk
        with torch.no_grad():
            generated_ids = model.generate(
                input_features=input_features.to(device),
                attention_mask=attention_mask.to(device),
                **generate_kwargs
            )
            chunk_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            full_text.append(chunk_text)

    # Combine the transcribed text from all chunks
    text = " ".join(full_text)

    output_time = time.time() - start_time

    # Audio duration (in seconds)
    audio_duration = waveform.shape[1] / sample_rate

    # Real-time Factor (RTF)
    rtf = output_time / audio_duration

    # Format of the result
    result = (
        f"Time taken: {output_time:.2f} seconds\n"
        f"Audio duration: {audio_duration / 60:.2f} minutes ({audio_duration:.2f} seconds)\n"
        f"Real-time Factor (RTF): {rtf:.2f}\n"
        f"Number of words: {len(text.split())}\n\n"
        "Real-time Factor (RTF) is a measure used to evaluate the speed of speech recognition systems. "
        "It is the ratio of transcription time to the duration of the audio.\n\n"
        "An RTF of less than 1 means the transcription process is faster than real-time (expected)."
    )

    return text, result


# Clean and preprocess/@summarization
def clean_text(text):
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

nlp = spacy.blank("nb")  # 'nb' ==> codename = Norwegian Bokmål
spacy_stop_words = spacy.lang.nb.stop_words.STOP_WORDS

def preprocess_text(text):
    # Process the text with SpaCy
    doc = nlp(text)
    # SpaCy's stop top wrds direct
    stop_words = spacy_stop_words
    # Filter out stop words
    words = [token.text for token in doc if token.text.lower() not in stop_words]
    return ' '.join(words)

# Summarize w/T5 model
def summarize_text(text):
    preprocessed_text = preprocess_text(text)
    inputs = summarization_tokenizer(preprocessed_text, max_length=1024, return_tensors="pt", truncation=True)
    inputs = inputs.to(device)
    summary_ids = summarization_model.generate(inputs.input_ids, num_beams=5, max_length=150, early_stopping=True)
    return summarization_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Builds similarity matrix
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = nx.Graph()
    for i, tokens_a in enumerate(sentences):
        for j, tokens_b in enumerate(sentences):
            if i != j:
                common_words = set(tokens_a) & set(tokens_b)
                similarity_matrix.add_edge(i, j, weight=len(common_words))
    return similarity_matrix

# "Graph-based summarization" =====> 
def graph_based_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return sentences

    sentence_tokens = [nlp(sent) for sent in sentences]
    stop_words = spacy_stop_words
    filtered_tokens = [[token.text for token in tokens if token.text.lower() not in stop_words] for tokens in sentence_tokens]
    similarity_matrix = build_similarity_matrix(filtered_tokens, stop_words)

    scores = nx.pagerank(similarity_matrix)
    ranked_sentences = sorted(((scores[i], sent) for i, sent in enumerate(sentences)), reverse=True)
    return ' '.join([sent for _, sent in ranked_sentences[:num_paragraphs]])

# LexRank
def lex_rank_summary(text, num_paragraphs=3, threshold=0.1):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return sentences

    stop_words = spacy_stop_words
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)

    # Apply threshold@similarity matrix
    similarity_matrix[similarity_matrix < threshold] = 0
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])

# TextRank
def text_rank_summary(text, num_paragraphs=3):
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    if len(sentences) < num_paragraphs:
        return sentences

    stop_words = spacy_stop_words
    vectorizer = TfidfVectorizer(stop_words=list(stop_words))
    X = vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(X, X)

    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return ' '.join([ranked_sentences[i][1] for i in range(num_paragraphs)])


# Save text+summary/PDF
def save_to_pdf(text, summary):
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    if text:
        pdf.multi_cell(0, 10, "Text:\n" + text)

    pdf.ln(10)  # Paragraph space

    if summary:
        pdf.multi_cell(0, 10, "Summary:\n" + summary)

    pdf_output_path = "transcription.pdf"
    pdf.output(pdf_output_path)
    return pdf_output_path

iface = gr.Blocks()

with iface:
    gr.HTML('<img src="https://huggingface.co/spaces/camparchimedes/transcription_app/resolve/main/banner_trans.png" alt="Banner Image" />')
    gr.Markdown("*Switch Work's JoJo-versjon som webapp for transkribering av lydfiler til norsk skrift. Språkmodell: NbAiLab/nb-whisper-large, Ekstra: oppsummering, pdf-download*")

    with gr.Tabs():
        with gr.TabItem("Transcription"):
            audio_input = gr.Audio(type="filepath")
            text_output = gr.Textbox(label="Text")
            result_output = gr.Textbox(label="Transcription Details")
            transcribe_button = gr.Button("Transcribe")

            transcribe_button.click(fn=transcribe_audio, inputs=[audio_input], outputs=[text_output, result_output])

        with gr.TabItem("Summary | Graph-based"):
            summary_output = gr.Textbox(label="Summary | Graph-based")
            summarize_button = gr.Button("Summarize")

            summarize_button.click(fn=lambda text: graph_based_summary(text), inputs=[text_output], outputs=[summary_output])

        with gr.TabItem("Summary | LexRank"):
            summary_output = gr.Textbox(label="Summary | LexRank")
            summarize_button = gr.Button("Summarize")

            summarize_button.click(fn=lambda text: lex_rank_summary(text), inputs=[text_output], outputs=[summary_output])

        with gr.TabItem("Summary | TextRank"):
            summary_output = gr.Textbox(label="Summary | TextRank")
            summarize_button = gr.Button("Summarize")

            summarize_button.click(fn=lambda text: text_rank_summary(text), inputs=[text_output], outputs=[summary_output])

        with gr.TabItem("Download PDF"):
            pdf_text_only = gr.Button("Download PDF with Text Only")
            pdf_summary_only = gr.Button("Download PDF with Summary Only")
            pdf_both = gr.Button("Download PDF with Both")

            pdf_output = gr.File(label="Download PDF")

            pdf_text_only.click(fn=lambda text: save_to_pdf(text, ""), inputs=[text_output], outputs=[pdf_output])
            pdf_summary_only.click(fn=lambda summary: save_to_pdf("", summary), inputs=[summary_output], outputs=[pdf_output])
            pdf_both.click(fn=lambda text, summary: save_to_pdf(text, summary), inputs=[text_output, summary_output], outputs=[pdf_output])

iface.launch(share=True, debug=True)