hideosnes committed on
Commit
2224978
·
verified ·
1 Parent(s): cf271ff

Update app.py

Browse files

Fixed a bug where the prompt and the file name of the PDF were analysed.
Added a read-only info field to the UI showing the maximum input size (1024 tokens, roughly 750–800 words).

Files changed (1) hide show
  1. app.py +11 -7
app.py CHANGED
@@ -27,12 +27,15 @@ tokenizer, model = load_model()
27
  def summarize(file, text, style, length):
28
  text_input = ""
29
  if file is not None:
30
- if file.name.endswith(".pdf") and hasattr(file, 'read'):
31
- with fitz.open(stream=file.read(), filetype="pdf") as doc:
 
32
  text_input = " ".join([page.get_text() for page in doc])
33
- else:
34
- # if file doesn't end with .pdf AND hasn't attribute 'read',
35
- # then handle all other cases (TXT files, PDFs without .read(), etc.)
 
 
36
  text_input = str(file)
37
  elif text:
38
  text_input = text
@@ -43,7 +46,7 @@ def summarize(file, text, style, length):
43
  # so we return a message for the first output (the summary box) and None for the rest.
44
  # This ensures the UI remains consistent and doesn't break if the input is empty.
45
  return "Maybe try uploading a file or typing some text?", None, None, None, None, None
46
-
47
  # Language detection
48
  try:
49
  lang_code = detect(text_input)
@@ -76,7 +79,7 @@ def summarize(file, text, style, length):
76
  # Note: 1024 tokens typically correspond to about 750–800 English words,
77
  # depending on the tokenizer and language. ---------------------------------------------- (!)
78
  # Make sure to display this token/word information to the user in the app UI for clarity.
79
-
80
  # Note: "pyTorchTensor" is not a valid TensorType, use one of ['pt', 'tf', 'np', 'jax', 'mlx']
81
  inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
82
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -129,6 +132,7 @@ with gr.Blocks() as demo:
129
  with gr.Row(): # for inline horizontal layout
130
  style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
131
  length = gr.Radio(["Short", "Middle", "Long"], label="Length")
 
132
  btn = gr.Button("Transform")
133
 
134
  with gr.Column(): #right column
 
27
  def summarize(file, text, style, length):
28
  text_input = ""
29
  if file is not None:
30
+ if file.name.endswith(".pdf"): # and hasattr(file, 'read'):
31
+ try:
32
+ with fitz.open(file.name) as doc: #with fitz.open(stream=file.read(), filetype="pdf") as doc:
33
  text_input = " ".join([page.get_text() for page in doc])
34
+ except:
35
+ # Fallback if fitz can't read the file
36
+ text_input = "Error: Could not read PDF file"
37
+ elif file.name.endswith(".txt"):
38
+ # For TXT files, use str(file) to get contents
39
  text_input = str(file)
40
  elif text:
41
  text_input = text
 
46
  # so we return a message for the first output (the summary box) and None for the rest.
47
  # This ensures the UI remains consistent and doesn't break if the input is empty.
48
  return "Maybe try uploading a file or typing some text?", None, None, None, None, None
49
+
50
  # Language detection
51
  try:
52
  lang_code = detect(text_input)
 
79
  # Note: 1024 tokens typically correspond to about 750–800 English words,
80
  # depending on the tokenizer and language. ---------------------------------------------- (!)
81
  # Make sure to display this token/word information to the user in the app UI for clarity.
82
+
83
  # Note: "pyTorchTensor" is not a valid TensorType, use one of ['pt', 'tf', 'np', 'jax', 'mlx']
84
  inputs = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=1024)
85
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
132
  with gr.Row(): # for inline horizontal layout
133
  style = gr.Dropdown(["Precise", "Sloppy", "Keywords"], label="Style")
134
  length = gr.Radio(["Short", "Middle", "Long"], label="Length")
135
+ token_info = gr.Text(label="Max. Tokens:", value="1024 tokens ~ 750–800 words", interactive=False)
136
  btn = gr.Button("Transform")
137
 
138
  with gr.Column(): #right column