Daemontatox committed
Commit 5b73cc5 · verified · 1 Parent(s): 488a981

Update app.py

Files changed (1)
  1. app.py +66 -44
app.py CHANGED
@@ -7,55 +7,75 @@ import gradio as gr
 from gradio import FileData
 import time
 import spaces
-ckpt ="Daemontatox/DocumentCogito"
+from pdf2image import convert_from_path
+import os
+from PyPDF2 import PdfReader
+import tempfile
+
+ckpt = "Daemontatox/DocumentCogito"
 model = MllamaForConditionalGeneration.from_pretrained(ckpt,
     torch_dtype=torch.bfloat16).to("cuda")
 processor = AutoProcessor.from_pretrained(ckpt)
 
+def process_pdf(pdf_path):
+    """Convert PDF pages to images and extract text."""
+    images = convert_from_path(pdf_path)
+    pdf_reader = PdfReader(pdf_path)
+    text = ""
+    for page in pdf_reader.pages:
+        text += page.extract_text() + "\n"
+    return images, text
+
+def is_pdf(file_path):
+    """Check if the file is a PDF."""
+    return file_path.lower().endswith('.pdf')
 
 @spaces.GPU()
 def bot_streaming(message, history, max_new_tokens=2048):
-
     txt = message["text"]
     ext_buffer = f"{txt}"
 
-    messages= []
+    messages = []
     images = []
 
-
-    for i, msg in enumerate(history):
+    # Process history
+    for i, msg in enumerate(history):
         if isinstance(msg[0], tuple):
-            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+            messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "text", "text": history[i+1][1]}]})
             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
             images.append(Image.open(msg[0][0]).convert("RGB"))
         elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-            # messages are already handled
             pass
-        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str): # text only turn
+        elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
-    # add current message
+    # Process current message
    if len(message["files"]) == 1:
+        file_path = message["files"][0]["path"] if isinstance(message["files"][0], dict) else message["files"][0]
 
-        if isinstance(message["files"][0], str): # examples
-            image = Image.open(message["files"][0]).convert("RGB")
-        else: # regular input
-            image = Image.open(message["files"][0]["path"]).convert("RGB")
-        images.append(image)
+        if is_pdf(file_path):
+            # Handle PDF
+            pdf_images, pdf_text = process_pdf(file_path)
+            images.extend(pdf_images)
+            txt = f"{txt}\nExtracted text from PDF:\n{pdf_text}"
+        else:
+            # Handle regular image
+            image = Image.open(file_path).convert("RGB")
+            images.append(image)
+
         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
     else:
         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
 
-
     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
 
-    if images == []:
+    if not images:
         inputs = processor(text=texts, return_tensors="pt").to("cuda")
     else:
         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
+
     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
-
     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
     generated_text = ""
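For reference, the new helpers in this hunk can be exercised on their own. Below is a minimal standalone sketch, illustrative only: the sample.pdf path is hypothetical, and pdf2image shells out to the poppler utilities, which must be present on the host (on a Hugging Face Space, typically by listing poppler-utils in packages.txt).

# Illustrative only: exercise the PDF helpers outside the app.
# "sample.pdf" is a hypothetical input file.
from pdf2image import convert_from_path  # requires poppler on the host
from PyPDF2 import PdfReader

pdf_path = "sample.pdf"
page_images = convert_from_path(pdf_path)  # one PIL.Image per page
text = "\n".join((page.extract_text() or "") for page in PdfReader(pdf_path).pages)
print(f"{len(page_images)} page image(s), {len(text)} characters of extracted text")

Since the handler extends images with one image per page and appends the extracted text to the user turn, a long PDF turns into a correspondingly long multimodal prompt.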
 
@@ -69,32 +89,34 @@ def bot_streaming(message, history, max_new_tokens=2048):
         time.sleep(0.01)
         yield buffer
 
-
-demo = gr.ChatInterface(fn=bot_streaming, title="Document Analyzer", examples=[
-    [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]},
-     200],
-    [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]},
-     250],
-    [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]},
-     250],
-    [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]},
-     250],
-    [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]},
-     250],
+demo = gr.ChatInterface(
+    fn=bot_streaming,
+    title="Document Analyzer",
+    examples=[
+        [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
+        [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
+        [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
+        [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]}, 250],
+        [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
     ],
-    textbox=gr.MultimodalTextbox(),
-    additional_inputs = [gr.Slider(
-        minimum=10,
-        maximum=500,
-        value=2048,
-        step=10,
-        label="Maximum number of new tokens to generate",
-    )
-    ],
-    cache_examples=False,
-    description="MllM ",
-    stop_btn="Stop Generation",
-    fill_height=True,
-    multimodal=True)
-
+    textbox=gr.MultimodalTextbox(),
+    additional_inputs=[
+        gr.Slider(
+            minimum=10,
+            maximum=500,
+            value=2048,
+            step=10,
+            label="Maximum number of new tokens to generate",
+        )
+    ],
+    cache_examples=False,
+    description="MllM Document and PDF Analyzer",
+    stop_btn="Stop Generation",
+    fill_height=True,
+    multimodal=True
+)
+
+# Update file types to include PDFs
+demo.textbox.file_types = ["image", "pdf"]
+
 demo.launch(debug=True)
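The revision enables PDF uploads by assigning demo.textbox.file_types after the interface is built. On recent Gradio releases the same restriction can usually be declared at construction time; a minimal sketch, assuming the installed Gradio version's MultimodalTextbox accepts a file_types argument:

import gradio as gr

# Sketch: declare accepted upload types up front rather than mutating
# demo.textbox afterwards (assumes MultimodalTextbox(file_types=...)
# is available in the installed Gradio version).
textbox = gr.MultimodalTextbox(file_types=["image", ".pdf"])
demo = gr.ChatInterface(fn=bot_streaming, textbox=textbox, multimodal=True)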
 
7
  from gradio import FileData
8
  import time
9
  import spaces
10
+ from pdf2image import convert_from_path
11
+ import os
12
+ from PyPDF2 import PdfReader
13
+ import tempfile
14
+
15
+ ckpt = "Daemontatox/DocumentCogito"
16
  model = MllamaForConditionalGeneration.from_pretrained(ckpt,
17
  torch_dtype=torch.bfloat16).to("cuda")
18
  processor = AutoProcessor.from_pretrained(ckpt)
19
 
20
+ def process_pdf(pdf_path):
21
+ """Convert PDF pages to images and extract text."""
22
+ images = convert_from_path(pdf_path)
23
+ pdf_reader = PdfReader(pdf_path)
24
+ text = ""
25
+ for page in pdf_reader.pages:
26
+ text += page.extract_text() + "\n"
27
+ return images, text
28
+
29
+ def is_pdf(file_path):
30
+ """Check if the file is a PDF."""
31
+ return file_path.lower().endswith('.pdf')
32
 
33
  @spaces.GPU()
34
  def bot_streaming(message, history, max_new_tokens=2048):
 
35
  txt = message["text"]
36
  ext_buffer = f"{txt}"
37
 
38
+ messages = []
39
  images = []
40
 
41
+ # Process history
42
+ for i, msg in enumerate(history):
43
  if isinstance(msg[0], tuple):
44
+ messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "text", "text": history[i+1][1]}]})
45
  messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
46
  images.append(Image.open(msg[0][0]).convert("RGB"))
47
  elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
 
48
  pass
49
+ elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
50
  messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
51
  messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
52
 
53
+ # Process current message
54
  if len(message["files"]) == 1:
55
+ file_path = message["files"][0]["path"] if isinstance(message["files"][0], dict) else message["files"][0]
56
 
57
+ if is_pdf(file_path):
58
+ # Handle PDF
59
+ pdf_images, pdf_text = process_pdf(file_path)
60
+ images.extend(pdf_images)
61
+ txt = f"{txt}\nExtracted text from PDF:\n{pdf_text}"
62
+ else:
63
+ # Handle regular image
64
+ image = Image.open(file_path).convert("RGB")
65
+ images.append(image)
66
+
67
  messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
68
  else:
69
  messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
70
 
 
71
  texts = processor.apply_chat_template(messages, add_generation_prompt=True)
72
 
73
+ if not images:
74
  inputs = processor(text=texts, return_tensors="pt").to("cuda")
75
  else:
76
  inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")
77
+
78
  streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
 
79
  generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
80
  generated_text = ""
81
 
 
89
  time.sleep(0.01)
90
  yield buffer
91
 
92
+ demo = gr.ChatInterface(
93
+ fn=bot_streaming,
94
+ title="Document Analyzer",
95
+ examples=[
96
+ [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
97
+ [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
98
+ [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
99
+ [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]}, 250],
100
+ [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
 
 
 
101
  ],
102
+ textbox=gr.MultimodalTextbox(),
103
+ additional_inputs=[
104
+ gr.Slider(
105
+ minimum=10,
106
+ maximum=500,
107
+ value=2048,
108
+ step=10,
109
+ label="Maximum number of new tokens to generate",
110
+ )
111
+ ],
112
+ cache_examples=False,
113
+ description="MllM Document and PDF Analyzer",
114
+ stop_btn="Stop Generation",
115
+ fill_height=True,
116
+ multimodal=True
117
+ )
118
+
119
+ # Update file types to include PDFs
120
+ demo.textbox.file_types = ["image", "pdf"]
121
+
122
  demo.launch(debug=True)
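Because bot_streaming is an ordinary generator, the handler can be smoke-tested without launching the UI. A minimal sketch: the prompt is made up, the file path reuses one of the repo's bundled examples, and a CUDA device with the model already loaded is assumed.

# Hypothetical smoke test of the streaming handler (prompt is illustrative).
message = {"text": "Summarize this document.", "files": ["./examples/invoice.png"]}
answer = ""
for partial in bot_streaming(message, history=[], max_new_tokens=64):
    answer = partial  # keep the latest yielded buffer
print(answer)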