Daemontatox committed on
Commit
cd3a11d
·
verified ·
1 Parent(s): 2653b40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -87
app.py CHANGED
@@ -10,13 +10,17 @@ import spaces
10
  import fitz # PyMuPDF
11
  import io
12
  import numpy as np
 
 
 
 
 
13
 
14
  # Load model and processor
15
  ckpt = "Daemontatox/DocumentCogito"
16
  model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
17
  processor = AutoProcessor.from_pretrained(ckpt)
18
 
19
- # Document state to track uploaded files
20
  class DocumentState:
21
  def __init__(self):
22
  self.current_doc_images = []
@@ -30,100 +34,172 @@ class DocumentState:
30
 
31
  doc_state = DocumentState()
32
 
33
- # Function to convert PDF to images and extract text
34
  def process_pdf_file(file_path):
35
- """Convert PDF to images and extract text using PyMuPDF."""
36
- doc = fitz.open(file_path)
37
- images = []
38
- text = ""
39
-
40
- # Process each page
41
- for page_num in range(doc.page_count):
42
- page = doc[page_num]
43
- text += f"Page {page_num + 1} content:\n{page.get_text()}\n"
44
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
45
- img_data = pix.tobytes("png")
46
- img = Image.open(io.BytesIO(img_data))
47
- images.append(img.convert("RGB"))
48
-
49
- doc.close()
50
- return images, text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # Function to process uploaded files (PDF or image)
53
  def process_file(file):
54
- """Process either PDF or image file and update document state."""
55
- doc_state.clear()
56
-
57
- if isinstance(file, dict):
58
- file_path = file["path"]
59
- else:
60
- file_path = file
61
 
62
- if file_path.lower().endswith('pdf'):
63
- doc_state.doc_type = 'pdf'
64
- doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
65
- return f"PDF processed. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
66
- else:
67
- doc_state.doc_type = 'image'
68
- doc_state.current_doc_images = [Image.open(file_path).convert("RGB")]
69
- return "Image loaded successfully. You can now ask questions about the content."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- # Function to handle streaming responses from the model
72
  @spaces.GPU()
73
  def bot_streaming(message, history, max_new_tokens=8192):
74
- txt = message["text"]
75
- messages = []
76
-
77
- # Process new file if provided
78
- if message.get("files") and len(message["files"]) > 0:
79
- process_file(message["files"][0])
80
-
81
- # Process history
82
- for i, msg in enumerate(history):
83
- if isinstance(msg[0], dict): # Multimodal message (text + files)
84
- user_content = [{"type": "text", "text": msg[0]["text"]}]
85
- if "files" in msg[0] and len(msg[0]["files"]) > 0:
86
- user_content.append({"type": "image"})
87
- messages.append({"role": "user", "content": user_content})
88
- messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
89
- elif isinstance(msg[0], str): # Text-only message
90
- messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
91
- messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
 
 
 
 
 
 
 
 
92
 
93
- # Include document context in the current message
94
- if doc_state.current_doc_images:
95
- context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
96
- current_msg = f"{txt}{context}"
97
- messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
98
- else:
99
- messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
100
 
101
- # Apply chat template to messages
102
- texts = processor.apply_chat_template(messages, add_generation_prompt=True)
103
-
104
- # Process inputs based on whether we have images
105
- if doc_state.current_doc_images:
106
- inputs = processor(
107
- text=texts,
108
- images=doc_state.current_doc_images[0:1], # Only use first image
109
- return_tensors="pt"
110
- ).to("cuda")
111
- else:
112
- inputs = processor(text=texts, return_tensors="pt").to("cuda")
113
-
114
- streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
115
- generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
116
-
117
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
118
- thread.start()
119
- buffer = ""
120
-
121
- for new_text in streamer:
122
- buffer += new_text
123
- time.sleep(0.01)
124
- yield buffer
 
 
 
 
 
 
 
 
125
 
126
- # Function to clear document context
127
  def clear_context():
128
  """Clear the current document context."""
129
  doc_state.clear()
@@ -163,8 +239,7 @@ with gr.Blocks() as demo:
163
  clear_btn = gr.Button("Clear Document Context")
164
  clear_btn.click(fn=clear_context)
165
 
166
- # Update accepted file types
167
- chatbot.textbox.file_types = ["image", "pdf","text"]
168
 
169
  # Launch the interface
170
  demo.launch(debug=True)
 
10
  import fitz # PyMuPDF
11
  import io
12
  import numpy as np
13
+ import logging
14
+
15
+ # Set up logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
 
19
  # Load model and processor
20
  ckpt = "Daemontatox/DocumentCogito"
21
  model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
22
  processor = AutoProcessor.from_pretrained(ckpt)
23
 
 
24
  class DocumentState:
25
  def __init__(self):
26
  self.current_doc_images = []
 
34
 
35
  doc_state = DocumentState()
36
 
 
37
def process_pdf_file(file_path):
    """
    Convert a PDF into per-page RGB images plus concatenated page text.

    Args:
        file_path: Path to the PDF file on disk.

    Returns:
        (images, text): a list of PIL.Image pages (RGB, downscaled to at
        most 1600 px on the longest side) and the extracted text with
        "Page N:" headers for every non-empty page.

    Raises:
        ValueError: if no page image could be extracted at all.
        Exception: any fitz error opening the document is logged and
            re-raised for the caller to report.
    """
    try:
        doc = fitz.open(file_path)
    except Exception as e:
        logger.error(f"Error processing PDF file: {str(e)}")
        raise

    images = []
    text = ""
    try:
        for page_num in range(doc.page_count):
            try:
                page = doc[page_num]

                # Extract text; pages with no text are skipped entirely.
                page_text = page.get_text("text")
                if page_text.strip():
                    text += f"Page {page_num + 1}:\n{page_text}\n\n"

                # Render at 2x zoom for better resolution than the default 72 dpi.
                zoom = 2
                mat = fitz.Matrix(zoom, zoom)
                pix = page.get_pixmap(matrix=mat, alpha=False)

                # Decode the rendered PNG bytes into a PIL image in RGB mode.
                img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

                # Downscale oversized pages (keeping aspect ratio) so the
                # downstream vision-model input stays a manageable size.
                max_size = 1600
                if max(img.size) > max_size:
                    ratio = max_size / max(img.size)
                    new_size = tuple(int(dim * ratio) for dim in img.size)
                    img = img.resize(new_size, Image.Resampling.LANCZOS)

                images.append(img)

            except Exception as e:
                # Best-effort: log the failure and continue with other pages.
                logger.error(f"Error processing page {page_num}: {str(e)}")
                continue
    finally:
        # Always release the document handle, even on unexpected errors
        # (the original only closed it on the fully-successful path).
        doc.close()

    if not images:
        raise ValueError("No valid images could be extracted from the PDF")

    return images, text
97
 
 
98
def process_file(file):
    """
    Process an uploaded PDF or image file and load it into doc_state.

    Args:
        file: Either a filesystem path string or a Gradio file dict
            carrying a "path" key.

    Returns:
        A human-readable status message. Failure messages always contain
        the word "Error" — bot_streaming string-matches on that marker,
        so keep it in any error message added here.
    """
    try:
        doc_state.clear()

        file_path = file["path"] if isinstance(file, dict) else file

        # Match the ".pdf" extension explicitly (case-insensitive) so a file
        # whose name merely ends in the letters "pdf" is not misrouted to the
        # PDF branch (the previous check used endswith('pdf') without the dot).
        if file_path.lower().endswith('.pdf'):
            doc_state.doc_type = 'pdf'
            try:
                doc_state.current_doc_images, doc_state.current_doc_text = process_pdf_file(file_path)
                return f"PDF processed successfully. Total pages: {len(doc_state.current_doc_images)}. You can now ask questions about the content."
            except Exception as e:
                return f"Error processing PDF: {str(e)}. Please try a different PDF file or check if the file is corrupted."
        else:
            doc_state.doc_type = 'image'
            try:
                img = Image.open(file_path).convert("RGB")
                # Downscale oversized images (keeping aspect ratio) to match
                # the size cap used for PDF pages.
                max_size = 1600
                if max(img.size) > max_size:
                    ratio = max_size / max(img.size)
                    new_size = tuple(int(dim * ratio) for dim in img.size)
                    img = img.resize(new_size, Image.Resampling.LANCZOS)
                doc_state.current_doc_images = [img]
                return "Image loaded successfully. You can now ask questions about the content."
            except Exception as e:
                return f"Error processing image: {str(e)}. Please try a different image file."
    except Exception as e:
        logger.error(f"Error in process_file: {str(e)}")
        return "An error occurred while processing the file. Please try again."
132
 
 
133
@spaces.GPU()
def bot_streaming(message, history, max_new_tokens=8192):
    """Stream a model response for a Gradio multimodal chat turn.

    Builds a chat-template message list from the conversation history plus
    the current message (optionally with document context from doc_state),
    runs model.generate on a background thread, and yields the growing
    response text as it streams.

    Args:
        message: dict with "text" and optionally "files" (uploaded paths).
        history: list of (user, assistant) pairs; user entries may be dicts
            (multimodal) or plain strings. -- assumes Gradio "tuples"
            history format; TODO confirm against the ChatInterface config.
        max_new_tokens: cap on generated tokens.

    Yields:
        The cumulative response buffer, or a user-facing error message.
    """
    try:
        txt = message["text"]
        messages = []

        # Process new file if provided
        if message.get("files") and len(message["files"]) > 0:
            result = process_file(message["files"][0])
            # NOTE(review): relies on process_file failure messages
            # containing the substring "Error".
            if "Error" in result:
                yield result
                return

        # Process history with better error handling; malformed turns are
        # logged and skipped rather than aborting the whole request.
        for i, msg in enumerate(history):
            try:
                if isinstance(msg[0], dict):
                    # Multimodal turn: text plus an image placeholder if
                    # the turn carried uploaded files.
                    user_content = [{"type": "text", "text": msg[0]["text"]}]
                    if "files" in msg[0] and len(msg[0]["files"]) > 0:
                        user_content.append({"type": "image"})
                    messages.append({"role": "user", "content": user_content})
                    messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
                elif isinstance(msg[0], str):
                    # Plain text turn.
                    messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
                    messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
            except Exception as e:
                logger.error(f"Error processing history message {i}: {str(e)}")
                continue

        # Include document context: append extracted PDF text (if any) to
        # the user's message and add an image placeholder for the processor.
        if doc_state.current_doc_images:
            context = f"\nDocument context:\n{doc_state.current_doc_text}" if doc_state.current_doc_text else ""
            current_msg = f"{txt}{context}"
            messages.append({"role": "user", "content": [{"type": "text", "text": current_msg}, {"type": "image"}]})
        else:
            messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})

        # Process inputs
        texts = processor.apply_chat_template(messages, add_generation_prompt=True)

        try:
            if doc_state.current_doc_images:
                # Only the first document image is fed to the model, even
                # for multi-page PDFs.
                inputs = processor(
                    text=texts,
                    images=doc_state.current_doc_images[0:1],
                    return_tensors="pt"
                ).to("cuda")
            else:
                inputs = processor(text=texts, return_tensors="pt").to("cuda")

            streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
            generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

            # Generate on a background thread so this generator can yield
            # partial text while the model is still producing tokens.
            thread = Thread(target=model.generate, kwargs=generation_kwargs)
            thread.start()

            buffer = ""
            for new_text in streamer:
                buffer += new_text
                # Small pause to smooth the streaming UI updates.
                time.sleep(0.01)
                yield buffer

        except Exception as e:
            logger.error(f"Error in model processing: {str(e)}")
            yield "An error occurred while processing your request. Please try again."

    except Exception as e:
        logger.error(f"Error in bot_streaming: {str(e)}")
        yield "An error occurred. Please try again."
202
 
 
203
  def clear_context():
204
  """Clear the current document context."""
205
  doc_state.clear()
 
239
  clear_btn = gr.Button("Clear Document Context")
240
  clear_btn.click(fn=clear_context)
241
 
242
+ chatbot.textbox.file_types = ["image", "pdf", "text"]
 
243
 
244
  # Launch the interface
245
  demo.launch(debug=True)