Spaces:
Running
Running
Update app.py
Browse files
Fixed a bug where the prompt and the file name of the PDF were analysed.
Added info for max tokens
app.py
CHANGED
@@ -27,12 +27,15 @@ tokenizer, model = load_model()
|
|
27 |
def summarize(file, text, style, length):
|
28 |
text_input = ""
|
29 |
if file is not None:
|
30 |
-
if file.name.endswith(".pdf") and hasattr(file, 'read'):
|
31 |
-
|
|
|
32 |
text_input = " ".join([page.get_text() for page in doc])
|
33 |
-
|
34 |
-
|
35 |
-
|
|
|
|
|
36 |
text_input = str(file)
|
37 |
elif text:
|
38 |
text_input = text
|
@@ -43,7 +46,7 @@ def summarize(file, text, style, length):
|
|
43 |
# so we return a message for the first output (the summary box) and None for the rest.
|
44 |
# This ensures the UI remains consistent and doesn't break if the input is empty.
|
45 |
return "Maybe try uploading a file or typing some text?", None, None, None, None, None
|
46 |
-
|
47 |
# Language detection
|
48 |
try:
|
49 |
lang_code = detect(text_input)
|
@@ -76,7 +79,7 @@ def summarize(file, text, style, length):
|
|
76 |
# Note: 1024 tokens typically correspond to about 750–800 English words,
|
77 |
# depending on the tokenizer and language. ---------------------------------------------- (!)
|
78 |
# Make sure to display this token/word information to the user in the app UI for clarity.
|
79 |
-
|
80 |
# Note: "pyTorchTensor" is not a valid TensorType, use one of ['pt', 'tf', 'np', 'jax', 'mlx']
|
81 |
inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
|
82 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
@@ -129,6 +132,7 @@ with gr.Blocks() as demo:
|
|
129 |
with gr.Row(): # for inline horizontal layout
|
130 |
style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
|
131 |
length = gr.Radio(["Short", "Middle", "Long"], label="Length")
|
|
|
132 |
btn = gr.Button("Transform")
|
133 |
|
134 |
with gr.Column(): #right column
|
|
|
27 |
def summarize(file, text, style, length):
|
28 |
text_input = ""
|
29 |
if file is not None:
|
30 |
+
if file.name.endswith(".pdf"): # and hasattr(file, 'read'):
|
31 |
+
try:
|
32 |
+
with fitz.open(file.name) as doc: #with fitz.open(stream=file.read(), filetype="pdf") as doc:
|
33 |
text_input = " ".join([page.get_text() for page in doc])
|
34 |
+
except:
|
35 |
+
# Fallback if fitz can't read the file
|
36 |
+
text_input = "Error: Could not read PDF file"
|
37 |
+
elif file.name.endswith(".txt"):
|
38 |
+
# For TXT files, use str(file) to get contents
|
39 |
text_input = str(file)
|
40 |
elif text:
|
41 |
text_input = text
|
|
|
46 |
# so we return a message for the first output (the summary box) and None for the rest.
|
47 |
# This ensures the UI remains consistent and doesn't break if the input is empty.
|
48 |
return "Maybe try uploading a file or typing some text?", None, None, None, None, None
|
49 |
+
|
50 |
# Language detection
|
51 |
try:
|
52 |
lang_code = detect(text_input)
|
|
|
79 |
# Note: 1024 tokens typically correspond to about 750–800 English words,
|
80 |
# depending on the tokenizer and language. ---------------------------------------------- (!)
|
81 |
# Make sure to display this token/word information to the user in the app UI for clarity.
|
82 |
+
|
83 |
# Note: "pyTorchTensor" is not a valid TensorType, use one of ['pt', 'tf', 'np', 'jax', 'mlx']
|
84 |
inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
|
85 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
132 |
with gr.Row(): # for inline horizontal layout
|
133 |
style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
|
134 |
length = gr.Radio(["Short", "Middle", "Long"], label="Length")
|
135 |
+
token_info = gr.Text(label="Max. Tokens:", value="1024 tokens ~ 750–800 words", interactive=False)
|
136 |
btn = gr.Button("Transform")
|
137 |
|
138 |
with gr.Column(): #right column
|