Spaces:

hideosnes
/

T2T-CPU-summarization

Running

App Files Files Community

T2T-CPU-summarization / app.py

hideosnes

Update app.py

cf271ff verified 2 months ago

raw

history blame

6.26 kB

	import gradio as gr
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
	import torch
	import fitz
	from langdetect import detect
	import matplotlib.pyplot as plt
	from collections import Counter
	import re
	import os


	MODEL_PATH = "hideosnes/Bart-T2T-Distill_GildaBot"

	def load_model():
	    """Load the tokenizer and the seq2seq model stored under MODEL_PATH.

	    The model weights are requested as float16 when CUDA is available and
	    as float32 otherwise.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    tok = AutoTokenizer.from_pretrained(MODEL_PATH)

	    if torch.cuda.is_available():
	        weights_dtype = torch.float16
	    else:
	        weights_dtype = torch.float32

	    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH, torch_dtype=weights_dtype)
	    return tok, seq2seq

	# Loaded once when the module is imported; summarize() reads these two
	# module-level names on every call instead of reloading the weights.
	tokenizer, model = load_model()

	# ### st.title("PDF Summarizer")
	# ### st.markdown("CPU-optimized model for text-to-text transformation (T2T), facilitating efficient and accurate language processing. Multi-lingual but target language is English. Please be gentle, it runs on CPU!")

	# (max_length, min_length) token limits handed to model.generate() per length choice.
	# "Middle" is the label offered by the length selector in this file; "Medium"
	# is kept as an alias so both spellings select the same limits.
	_LENGTH_LIMITS = {
	    "Short": (100, 85),
	    "Medium": (200, 185),
	    "Middle": (200, 185),
	    "Long": (300, 285),
	}
	# Any other value (including None when nothing is selected) uses the long limits.
	_DEFAULT_LENGTH_LIMITS = (300, 285)

	# Instruction prefixes, keyed by detected language code and then by style.
	# Languages without an entry fall back to the "en" prompts.
	_PROMPTS = {
	    "en": {
	        "Precise": "In English, distill the following text into a concise summary, utilizing formal and academic language to convey the essential information:",
	        "Sloppy": "In English, provide a brief and informal summary of the following text, using straightforward language to facilitate easy comprehension:",
	        "Keywords": "In English, condense the following text into a list of keywords, highlighting key points and main ideas in a clear and objective manner:",
	    },
	    # Further languages can be added here as "<lang code>": {"Precise": ..., ...}.
	}
	_DEFAULT_STYLE = "Precise"


	def _read_uploaded_text(file):
	    """Return the text of an uploaded PDF or plain-text file ("" if unreadable).

	    ``file`` may be a filesystem path (``str`` / ``os.PathLike``), an object
	    exposing the path as ``.name``, or an object that only offers ``.read()``.
	    """
	    if isinstance(file, (str, os.PathLike)):
	        path = os.fspath(file)
	    else:
	        path = getattr(file, "name", None)
	    is_pdf = isinstance(path, str) and path.lower().endswith(".pdf")

	    # Prefer reading from disk: this works whether or not the upload object
	    # has a read() method, and it avoids summarizing the path string itself.
	    if isinstance(path, str) and os.path.isfile(path):
	        if is_pdf:
	            with fitz.open(path) as doc:
	                return " ".join(page.get_text() for page in doc)
	        with open(path, encoding="utf-8", errors="replace") as handle:
	            return handle.read()

	    # No usable path: fall back to the in-memory stream if there is one.
	    if hasattr(file, "read"):
	        data = file.read()
	        if is_pdf:
	            with fitz.open(stream=data, filetype="pdf") as doc:
	                return " ".join(page.get_text() for page in doc)
	        if isinstance(data, bytes):
	            return data.decode("utf-8", errors="replace")
	        return str(data)

	    return ""


	def _top_keywords(text, limit=5, min_chars=4):
	    """Return up to ``limit`` most frequent words of at least ``min_chars`` characters."""
	    words = re.findall(r'\w+', text.lower())
	    # Filter before ranking so short words do not use up the top slots.
	    counts = Counter(word for word in words if len(word) >= min_chars)
	    return [word for word, _ in counts.most_common(limit)]


	def summarize(file, text, style, length):
	    """Summarize an uploaded file or pasted text and report basic statistics.

	    Args:
	        file: Uploaded PDF/TXT file (path-like or file-like), or None.
	        text: Pasted text; used when no file is given or the file yields no text.
	        style: "Precise", "Sloppy" or "Keywords"; anything else uses "Precise".
	        length: "Short", "Medium"/"Middle" or "Long"; anything else uses the
	            long limits.

	    Returns:
	        A 6-tuple ``(summary, keywords, original_len, summary_len, reduction, fig)``:
	        the summary text, a comma-separated keyword string, the word counts of
	        the input and of the summary, the reduction formatted as a percentage
	        string, and a matplotlib figure comparing both word counts. When there
	        is no input text, the first element is a hint for the user and the
	        remaining five are None, so the number of outputs stays constant.
	    """
	    text_input = ""
	    if file is not None:
	        text_input = _read_uploaded_text(file)
	    # Fall back to the pasted text when there is no file or it contained no text.
	    if not text_input.strip() and text:
	        text_input = text
	    if not text_input.strip():
	        return "Maybe try uploading a file or typing some text?", None, None, None, None, None

	    # Language detection is best-effort: any failure falls back to English.
	    try:
	        lang_code = detect(text_input)
	    except Exception:
	        lang_code = "en"

	    max_token, min_token = _LENGTH_LIMITS.get(length, _DEFAULT_LENGTH_LIMITS)

	    # Build the prompt: instruction for the language/style, then the text.
	    prompts = _PROMPTS.get(lang_code, _PROMPTS["en"])
	    instruction = prompts.get(style) or prompts[_DEFAULT_STYLE]
	    prompt = instruction + " " + text_input

	    # The prompt is truncated to 1024 tokens (typically about 750-800 English
	    # words, depending on tokenizer and language); longer input is cut off.
	    # return_tensors must be one of 'pt', 'tf', 'np', 'jax', 'mlx'.
	    inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    model.to(device)
	    inputs = inputs.to(device)

	    # generate() returns token IDs; decode turns the first sequence into text.
	    summary_ids = model.generate(
	        inputs,
	        max_length=max_token,
	        min_length=min_token,
	        length_penalty=2.0,
	        num_beams=4,
	        early_stopping=True,
	        no_repeat_ngram_size=3,
	    )
	    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

	    # Word counts and relative reduction. original_len is at least 1 here
	    # because text_input is known to contain non-whitespace characters.
	    original_len = len(text_input.split())
	    summary_len = len(summary.split())
	    reduction = 100 - (summary_len / original_len * 100)

	    keywords = _top_keywords(summary)

	    # Bar chart comparing the two word counts.
	    fig, ax = plt.subplots()
	    ax.bar(
	        ["Original", "Summary"],
	        [original_len, summary_len],
	        color=["coral", "purple"],
	    )
	    ax.set_ylabel("Word Count")
	    # Drop the figure from pyplot's global registry so figures do not pile up
	    # across requests; the Figure object itself stays usable for rendering.
	    plt.close(fig)

	    return summary, ", ".join(keywords), original_len, summary_len, f"{reduction:.2f}%", fig

	# Gradio UI: input widgets, output widgets, and a button that runs summarize().
	with gr.Blocks() as demo:
	gr.Markdown("Summarizer (T2T)")

	with gr.Row(): #left column
	with gr.Column():
	# Input widgets; their values are passed to summarize() in the order
	# given by btn.click(inputs=...) below.
	file = gr.File(label="Upload a PDF or a TXT file")
	text = gr.Textbox(label="Paste text from clipboard.", lines=10)

	with gr.Row(): # for inline horizontal layout
	# The selected choice strings are handed to summarize() verbatim as
	# its `style` and `length` arguments.
	style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
	length = gr.Radio(["Short", "Middle", "Long"], label="Length")
	btn = gr.Button("Transform")

	with gr.Column(): #right column
	# Output widgets, one per element of the tuple returned by summarize().
	summary = gr.Textbox(label="Summary")
	keywords = gr.Textbox(label="Important Keywords")
	original_len = gr.Number(label="Original Text Length")
	summary_len = gr.Number(label="Summary Length")
	reduction = gr.Textbox(label="Summary Efficiency")
	plot = gr.Plot(label="Summary Statistics")

	# Clicking the button calls summarize(file, text, style, length) and writes
	# its six return values into the six output widgets, in this order.
	btn.click(
	summarize,
	inputs=[file, text, style, length],
	outputs=[summary, keywords, original_len, summary_len, reduction, plot]
	)

	# Start the app.
	demo.launch()