Spaces:

hideosnes
/

T2T-CPU-summarization

Running

App Files Files Community

hideosnes committed on Jun 8

Commit

2224978

verified ·

1 Parent(s): cf271ff

Update app.py

Browse files

Fixed a bug where the prompt and the file name of the PDF were analysed.
Added info about the max. token limit.

Files changed (1) hide show

app.py +11 -7

app.py CHANGED Viewed

@@ -27,12 +27,15 @@ tokenizer, model = load_model()
 def summarize(file, text, style, length):
     text_input = ""
     if file is not None:
-        if file.name.endswith(".pdf") and hasattr(file, 'read'):
-                with fitz.open(stream=file.read(), filetype="pdf") as doc:
                     text_input = " ".join([page.get_text() for page in doc])
-        else:
-            # if file doesn't end with .pdf AND hasn't attribute 'read',
-            # then handle all other cases (TXT files, PDFs without .read(), etc.)
             text_input = str(file)
     elif text:
         text_input = text
@@ -43,7 +46,7 @@ def summarize(file, text, style, length):
     # so we return a message for the first output (the summary box) and None for the rest.
     # This ensures the UI remains consistent and doesn't break if the input is empty.
         return "Maybe try uploading a file or typing some text?", None, None, None, None, None
     # Language detection
     try:
         lang_code = detect(text_input)
@@ -76,7 +79,7 @@ def summarize(file, text, style, length):
     # Note: 1024 tokens typically correspond to about 750–800 English words,
     # depending on the tokenizer and language. ---------------------------------------------- (!)
     # Make sure to display this token/word information to the user in the app UI for clarity.
     # Note: "pyTorchTensor" is not a valid TensorType, use one of ['pt', 'tf', 'np', 'jax', 'mlx']
     inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -129,6 +132,7 @@ with gr.Blocks() as demo:
             with gr.Row(): # for inline horizontal layout
                 style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
                 length = gr.Radio(["Short", "Middle", "Long"], label="Length")
                 btn = gr.Button("Transform")
         with gr.Column(): #right column

 def summarize(file, text, style, length):
     text_input = ""
     if file is not None:
+        if file.name.endswith(".pdf"): # and hasattr(file, 'read'):
+            try:
+                with fitz.open(file.name) as doc: #with fitz.open(stream=file.read(), filetype="pdf") as doc:
                     text_input = " ".join([page.get_text() for page in doc])
+            except:
+                # Fallback if fitz can't read the file
+                text_input = "Error: Could not read PDF file"
+        elif file.name.endswith(".txt"):
+            # For TXT files, use str(file) to get contents
             text_input = str(file)
     elif text:
         text_input = text
     # so we return a message for the first output (the summary box) and None for the rest.
     # This ensures the UI remains consistent and doesn't break if the input is empty.
         return "Maybe try uploading a file or typing some text?", None, None, None, None, None
     # Language detection
     try:
         lang_code = detect(text_input)
     # Note: 1024 tokens typically correspond to about 750–800 English words,
     # depending on the tokenizer and language. ---------------------------------------------- (!)
     # Make sure to display this token/word information to the user in the app UI for clarity.
     # Note: "pyTorchTensor" is not a valid TensorType, use one of ['pt', 'tf', 'np', 'jax', 'mlx']
     inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
             with gr.Row(): # for inline horizontal layout
                 style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
                 length = gr.Radio(["Short", "Middle", "Long"], label="Length")
+                token_info = gr.Text(label="Max. Tokens:", value="1024 tokens ~ 750–800 words", interactive=False)
                 btn = gr.Button("Transform")
         with gr.Column(): #right column