Daemontatox committed on
Commit
cd3a11d
·
verified ·
1 Parent(s): 2653b40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -87
app.py CHANGED
@@ -10,13 +10,17 @@ import spaces
10
  import fitz # PyMuPDF
11
  import io
12
  import numpy as np
 
 
 
 
 
13
 
14
  # Load model and processor
15
  ckpt = "Daemontatox/DocumentCogito"
16
  model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
17
  processor = AutoProcessor.from_pretrained(ckpt)
18
 
19
- # Document state to track uploaded files
20
  class DocumentState:
21
  def __init__(self):
22
  self.current_doc_images = []
@@ -30,100 +34,172 @@ class DocumentState:
30
 
31
  doc_state = DocumentState()
32
 
33
- # Function to convert PDF to images and extract text
34
  def process_pdf_file(file_path):
35
- """Convert PDF to images and extract text using PyMuPDF."""
36
- doc = fitz.open(file_path)
37
- images = []
38
- text = ""
39
-
40
- # Process each page
41
- for page_num in range(doc.page_count):
42
- page = doc[page_num]
43
- text += f"Page {page_num + 1} content:\n{page.get_text()}\n"
44
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
45
- img_data = pix.tobytes("png")
46
- img = Image.open(io.BytesIO(img_data))
47
- images.append(img.convert("RGB"))
48
-
49
- doc.close()
50
- return images, text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # Function to process uploaded files (PDF or image)
53
  def process_file(file):
54
- """Process either PDF or image file and update document state."""
55
- doc_state.clear()
56
-
57
- if isinstance(file, dict):
58
- file_path = file["path"]
59
- else:
60
- file_path = file
61
 
62
- if file_path.lower().endswith('pdf'):
63
- doc_state.doc_type = 'pdf'
64
- doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
65
- return f"PDF processed. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
66
- else:
67
- doc_state.doc_type = 'image'
68
- doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
69
- return "Image loaded successfully. You can now ask questions about the content."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- # Function to handle streaming responses from the model
72
  @spaces.GPU()
73
  def bot_streaming(message, history, max_new_tokens=8192):
74
- txt = message["text"]
75
- messages = []
76
-
77
- # Process new file if provided
78
- if message.get("files") and len(message["files"]) > 0:
79
- process_file(message["files"][0])
80
-
81
- # Process history
82
- for i, msg in enumerate(history):
83
- if isinstance(msg[0], dict): # Multimodal message (text + files)
84
- user_content = [{"type": "text", "text": msg[0]["text"]}]
85
- if "files" in msg[0] and len(msg[0]["files"]) > 0:
86
- user_content.append({"type": "image"})
87
- messages.append({"role": "user", "content": user_content})
88
- messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
89
- elif isinstance(msg[0], str): # Text-only message
90
- messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
91
- messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
 
 
 
 
 
 
 
92
 
93
- # Include document context in the current message
94
- if doc_state.current_doc_images:
95
- context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
96
- current_msg = f"{txt}{context}"
97
- messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
98
- else:
99
- messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
100
 
101
- # Apply chat template to messages
102
- texts = processor.apply_chat_template(messages, add_generation_prompt=True)
103
-
104
- # Process inputs based on whether we have images
105
- if doc_state.current_doc_images:
106
- inputs = processor(
107
- text=texts,
108
- images=doc_state.current_doc_images[0:1], # Only use first image
109
- return_tensors="pt"
110
- ).to("cuda")
111
- else:
112
- inputs = processor(text=texts, return_tensors="pt").to("cuda")
113
-
114
- streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
115
- generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
116
-
117
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
118
- thread.start()
119
- buffer = ""
120
-
121
- for new_text in streamer:
122
- buffer += new_text
123
- time.sleep(0.01)
124
- yield buffer
 
 
 
 
 
 
 
 
125
 
126
- # Function to clear document context
127
  def clear_context():
128
  """Clear the current document context."""
129
  doc_state.clear()
@@ -163,8 +239,7 @@ with gr.Blocks() as demo:
163
  clear_btn = gr.Button("Clear Document Context")
164
  clear_btn.click(fn=clear_context)
165
 
166
- # Update accepted file types
167
- chatbot.textbox.file_types = ["image", "pdf","text"]
168
 
169
  # Launch the interface
170
  demo.launch(debug=True)
 
10
  import fitz # PyMuPDF
11
  import io
12
  import numpy as np
13
+ import logging
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
 
19
  # Load model and processor
20
  ckpt = "Daemontatox/DocumentCogito"
21
  model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
22
  processor = AutoProcessor.from_pretrained(ckpt)
23
 
 
24
  class DocumentState:
25
  def __init__(self):
26
  self.current_doc_images = []
 
34
 
35
  doc_state = DocumentState()
36
 
 
37
def process_pdf_file(file_path):
    """
    Convert a PDF into per-page RGB images plus concatenated page text.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        (images, text): a list of PIL.Image pages (RGB, downscaled to at
        most 1600 px on the longest side) and the extracted text with
        "Page N:" headers for every non-empty page.

    Raises:
        ValueError: if no page image could be extracted at all.
        Exception: any fitz error opening the document is logged and
            re-raised for the caller to report.
    """
    try:
        doc = fitz.open(file_path)
    except Exception as e:
        logger.error(f"Error processing PDF file: {str(e)}")
        raise

    images = []
    text = ""
    try:
        for page_num in range(doc.page_count):
            try:
                page = doc[page_num]

                # Extract text; pages with no text are skipped entirely.
                page_text = page.get_text("text")
                if page_text.strip():
                    text += f"Page {page_num + 1}:\n{page_text}\n\n"

                # Render at 2x zoom for better resolution than the default 72 dpi.
                zoom = 2
                mat = fitz.Matrix(zoom, zoom)
                pix = page.get_pixmap(matrix=mat, alpha=False)

                # Decode the rendered PNG bytes into a PIL image in RGB mode.
                img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

                # Downscale oversized pages (keeping aspect ratio) so the
                # downstream vision-model input stays a manageable size.
                max_size = 1600
                if max(img.size) > max_size:
                    ratio = max_size / max(img.size)
                    new_size = tuple(int(dim * ratio) for dim in img.size)
                    img = img.resize(new_size, Image.Resampling.LANCZOS)

                images.append(img)

            except Exception as e:
                # Best-effort: log the failure and continue with other pages.
                logger.error(f"Error processing page {page_num}: {str(e)}")
                continue
    finally:
        # Always release the document handle, even on unexpected errors
        # (the original only closed it on the fully-successful path).
        doc.close()

    if not images:
        raise ValueError("No valid images could be extracted from the PDF")

    return images, text
97
 
 
98
def process_file(file):
    """
    Process an uploaded PDF or image file and load it into doc_state.

    Args:
        file: Either a filesystem path string or a Gradio file dict
            carrying a "path" key.

    Returns:
        A human-readable status message. Failure messages always contain
        the word "Error" — bot_streaming string-matches on that marker,
        so keep it in any error message added here.
    """
    try:
        doc_state.clear()

        file_path = file["path"] if isinstance(file, dict) else file

        # Match the ".pdf" extension explicitly (case-insensitive) so a file
        # whose name merely ends in the letters "pdf" is not misrouted to the
        # PDF branch (the previous check used endswith('pdf') without the dot).
        if file_path.lower().endswith('.pdf'):
            doc_state.doc_type = 'pdf'
            try:
                doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
            except Exception as e:
                return f"Error processing PDF: {str(e)}. Please try a different PDF file or check if the file is corrupted."
        else:
            doc_state.doc_type = 'image'
            try:
                img = Image.open(file_path).convert("RGB")
                # Downscale oversized images (keeping aspect ratio) to match
                # the size cap used for PDF pages.
                max_size = 1600
                if max(img.size) > max_size:
                    ratio = max_size / max(img.size)
                    new_size = tuple(int(dim * ratio) for dim in img.size)
                    img = img.resize(new_size, Image.Resampling.LANCZOS)
                doc_state.current_doc_images = [img]
                return "Image loaded successfully. You can now ask questions about the content."
            except Exception as e:
                return f"Error processing image: {str(e)}. Please try a different image file."
    except Exception as e:
        logger.error(f"Error in process_file: {str(e)}")
        return "An error occurred while processing the file. Please try again."
132
 
 
133
@spaces.GPU()
def bot_streaming(message, history, max_new_tokens=8192):
    """Stream a model response for a Gradio multimodal chat turn.

    Builds a chat-template message list from the conversation history plus
    the current message (optionally with document context from doc_state),
    runs model.generate on a background thread, and yields the growing
    response text as it streams.

    Args:
        message: dict with "text" and optionally "files" (uploaded paths).
        history: list of (user, assistant) pairs; user entries may be dicts
            (multimodal) or plain strings. -- assumes Gradio "tuples"
            history format; TODO confirm against the ChatInterface config.
        max_new_tokens: cap on generated tokens.

    Yields:
        The cumulative response buffer, or a user-facing error message.
    """
    try:
        txt = message["text"]
        messages = []

        # Process new file if provided
        if message.get("files") and len(message["files"]) > 0:
            result = process_file(message["files"][0])
            # NOTE(review): relies on process_file failure messages
            # containing the substring "Error".
            if "Error" in result:
                yield result
                return

        # Process history with better error handling; malformed turns are
        # logged and skipped rather than aborting the whole request.
        for i, msg in enumerate(history):
            try:
                if isinstance(msg[0], dict):
                    # Multimodal turn: text plus an image placeholder if
                    # the turn carried uploaded files.
                    user_content = [{"type": "text", "text": msg[0]["text"]}]
                    if "files" in msg[0] and len(msg[0]["files"]) > 0:
                        user_content.append({"type": "image"})
                    messages.append({"role": "user", "content": user_content})
                    messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
                elif isinstance(msg[0], str):
                    # Plain text turn.
                    messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
                    messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
            except Exception as e:
                logger.error(f"Error processing history message {i}: {str(e)}")
                continue

        # Include document context: append extracted PDF text (if any) to
        # the user's message and add an image placeholder for the processor.
        if doc_state.current_doc_images:
            context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
            current_msg = f"{txt}{context}"
            messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
        else:
            messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

        # Process inputs
        texts = processor.apply_chat_template(messages, add_generation_prompt=True)

        try:
            if doc_state.current_doc_images:
                # Only the first document image is fed to the model, even
                # for multi-page PDFs.
                inputs = processor(
                    text=texts,
                    images=doc_state.current_doc_images[0:1],
                    return_tensors="pt"
                ).to("cuda")
            else:
                inputs = processor(text=texts, return_tensors="pt").to("cuda")

            streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
            generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

            # Generate on a background thread so this generator can yield
            # partial text while the model is still producing tokens.
            thread = Thread(target=model.generate, kwargs=generation_kwargs)
            thread.start()

            buffer = ""
            for new_text in streamer:
                buffer += new_text
                # Small pause to smooth the streaming UI updates.
                time.sleep(0.01)
                yield buffer

        except Exception as e:
            logger.error(f"Error in model processing: {str(e)}")
            yield "An error occurred while processing your request. Please try again."

    except Exception as e:
        logger.error(f"Error in bot_streaming: {str(e)}")
        yield "An error occurred. Please try again."
202
 
 
203
  def clear_context():
204
  """Clear the current document context."""
205
  doc_state.clear()
 
239
  clear_btn = gr.Button("Clear Document Context")
240
  clear_btn.click(fn=clear_context)
241
 
242
+ chatbot.textbox.file_types = ["image", "pdf", "text"]
 
243
 
244
  # Launch the interface
245
  demo.launch(debug=True)