hideosnes committed on
Commit
3a361ef
·
verified ·
1 Parent(s): 7e44b3c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -0
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
3
+ import torch
4
+ import fitz
5
+ from langdetect import detect
6
+ import matplotlib.pyplot as plt
7
+ from collections import Counter
8
+ import re
9
+ import os
10
+
11
+
12
+ MODEL_PATH = "hideosnes/Bart-T2T-Distill_GildaBot"
13
+
14
def load_model():
    """Load the tokenizer and seq2seq model identified by MODEL_PATH.

    The weights are requested in float16 when CUDA is available and in
    float32 otherwise.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    weights_dtype = torch.float32
    if torch.cuda.is_available():
        weights_dtype = torch.float16

    loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    loaded_model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_PATH,
        torch_dtype=weights_dtype,
    )
    return loaded_tokenizer, loaded_model
21
+
22
+ tokenizer, model = load_model()
23
+
24
+ # ### st.title("PDF Summarizer")
25
+ # ### st.markdown("CPU-optimized model for text-to-text transformation (T2T), facilitating efficient and accurate language processing. Multi-lingual but target language is English. Please be gentle, it runs on CPU!")
26
+
27
+ def summarize(file, text, style, length):
28
+ text_input = ""
29
+ if file is not None:
30
+ if file.name.endswith(".pdf"):
31
+ with fitz.open(stream=file.read(), filetype="pdf") as doc:
32
+ text_input = " ".join([page.get_text() for page in doc])
33
+ elif file.name.endswith(".txt"):
34
+ text_input = file.read().decode("utf-8")
35
+ elif text:
36
+ text_input = text
37
+ # If the input text is empty or contains only whitespace,
38
+ # return early with a user message and placeholder values.
39
+ if not text_input.strip():
40
+ # Gradio expects the summarize() function to always return the same number of outputs,
41
+ # so we return a message for the first output (the summary box) and None for the rest.
42
+ # This ensures the UI remains consistent and doesn't break if the input is empty.
43
+ return "Maybe try uploading a file or typing some text?", None, None, None, None, None
44
+
45
+ # Language detection
46
+ try:
47
+ lang_code = detect(text_input)
48
+ except:
49
+ lang_code = "en"
50
+
51
+ # Length
52
+ max_token, min_token = (
53
+ (100, 85) if length == "Short" else
54
+ (200, 185) if length == "Medium" else
55
+ (300, 285)
56
+ )
57
+
58
+ # System prompt based on language and style
59
+ prompt_map = {
60
+ "en": {
61
+ "Precise": "In English, distill the following text into a concise summary, utilizing formal and academic language to convey the essential information:",
62
+ "Sloppy": "In English, provide a brief and informal summary of the following text, using straightforward language to facilitate easy comprehension:",
63
+ "Keywords": "In English, condense the following text into a list of keywords, highlighting key points and main ideas in a clear and objective manner:",
64
+ }#, <-- don't forget the comma!!!!!
65
+ #"foo": { "precise": "another language or prompt map could go here"}
66
+ }
67
+ prompt = prompt_map.get(lang_code, prompt_map["en"])+[style] + " " + text_input
68
+
69
+ # Summarization
70
+ # Custom tokenizer: create a class with encode/decode methods following the HuggingFace
71
+ # tokenizer interface, or use the PreTrainedTokenizerFast class with your own
72
+ # vocab and pre-tokenization rules.
73
+
74
+ # Note: 1024 tokens typically correspond to about 750–800 English words,
75
+ # depending on the tokenizer and language. ---------------------------------------------- (!)
76
+ # Make sure to display this token/word information to the user in the app UI for clarity.
77
+ inputs = tokenizer.encode(prompt, return_tensors="pyTorchTensor", truncation=True, max_length=1024)
78
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
79
+ model.to(device)
80
+ inputs = inputs.to(device)
81
+ # the generated summary is not text yet but a tensor-array of token IDs
82
+ summary_ids = model.generate(
83
+ inputs,
84
+ max_length=max_token,
85
+ min_length=min_token,
86
+ length_penalty=2.0,
87
+ num_beams=4,
88
+ early_stopping=True,
89
+ no_repeat_ngram_size=3
90
+ )
91
+ summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
92
+
93
+ # These lines calculate and store the word count of the original text,
94
+ # the word count of the summary, and the percentage reduction in length after summarization.
95
+ # Note: len() is a built-in Python function that returns the number of items in an object.
96
+ original_len = len(text_input.split())
97
+ summary_len = len(summary.split())
98
+ reduction = 100 - (summary_len / original_len * 100)
99
+
100
+ # Extracting the 5 most frequent words (longer than 3 characters)
101
+ # from the summary, treating them as keywords.
102
+ words = re.findall(r'\w+', summary.lower())
103
+ keyword_counts = Counter(words).most_common(5)
104
+ keywords = [kw for kw, _ in keyword_counts if len(kw) > 3]
105
+
106
+ # Plot
107
+ fig, ax = plt.subplots()
108
+ ax.bar(
109
+ ["Original", "Summary"],
110
+ [original_len, summary_len],
111
+ color=["coral", "purple"]
112
+ )
113
+ ax.set_ylabel("Word Count")
114
+
115
+ return summary, ", ".join(keywords), original_len, summary_len, f"{reduction:.2f}%", fig
116
+
117
# Gradio UI: a file upload and a text box as alternative inputs, style and
# length selectors, and six output widgets filled by summarize().
with gr.Blocks() as app:
    gr.Markdown("Summarizer (T2T)")
    file = gr.File(label="Upload a PDF or a TXT file")
    text = gr.Textbox(label="Paste text from clipboard.", lines=10)
    # Defaults are preselected so the callback never receives None for these.
    style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], value="Precise", label="Style")
    # The label must be "Medium": that is the value summarize() checks for.
    length = gr.Radio(["Short", "Medium", "Long"], value="Medium", label="Length")
    btn = gr.Button("Transform")
    summary = gr.Textbox(label="Summary")
    keywords = gr.Textbox(label="Important Keywords")
    original_len = gr.Number(label="Original Text Length")
    summary_len = gr.Number(label="Summary Length")
    reduction = gr.Textbox(label="Summary Efficiency")
    plot = gr.Plot(label="Summary Statistics")

    # The order of `outputs` must match the order of summarize()'s return tuple.
    btn.click(
        summarize,
        inputs=[file, text, style, length],
        outputs=[summary, keywords, original_len, summary_len, reduction, plot],
    )

app.launch()