import os
import re
import io

import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes

# Prompt sent with a full page image: asks Gemini for normalized
# [xmin, ymin, xmax, ymax] bounding boxes around grouped text regions.
DETECTION_PROMPT = """\
Analyze this document image and identify text regions following these rules:

1. GROUP RELATED CONTENT:
- Full tables as SINGLE regions (including headers and all rows)
- Paragraphs as SINGLE rectangular blocks (multiple lines as one box)
- Keep text columns intact
- Treat list items as single region if visually grouped

2. TEXT REGION REQUIREMENTS:
- Boundaries must tightly wrap text content
- Include 2% padding around text clusters
- Exclude isolated decorative elements
- Merge adjacent text fragments with ≤1% spacing

3. COORDINATE FORMAT:
Python list of lists [[xmin, ymin, xmax, ymax]]
- Normalized 0-1 with 3 decimal places
- Ordered top-to-bottom, left-to-right
- Table example: [[0.12, 0.35, 0.88, 0.65]] for full table

4. SPECIAL CASES:
- Table cells should NOT have individual boxes
- Page headers/footers as separate regions
- Text wrapped around images as distinct regions

Example response for table + 2 paragraphs:
[[0.07, 0.12, 0.93, 0.28], # Header
[0.12, 0.35, 0.88, 0.65], # Full table
[0.10, 0.70, 0.90, 0.85], # First paragraph
[0.10, 0.88, 0.90, 0.95]] # Second paragraph

ONLY RETURN THE PYTHON LIST! No explanations.
"""

# Prompt sent with each cropped region: plain OCR-style transcription.
TEXT_EXTRACTION_PROMPT = "Extract the text in this image. Return only the exact text, nothing else."
def parse_list_boxes(text):
    """Parse the model's raw response into a list of bounding boxes.

    Args:
        text: Model output, expected to be a Python-style list of
            [xmin, ymin, xmax, ymax] lists with normalized 0-1 floats.

    Returns:
        A list of 4-element coordinate lists. When the response is not a
        clean Python literal (markdown fences, trailing commentary, ...),
        every numeric quadruple found by regex is returned instead.
    """
    # Safe literal parsing instead of eval(): the text comes from an
    # external model and must never be executed as code.
    import ast

    try:
        parsed = ast.literal_eval(text)
        # Guard against a valid literal that is not actually a list.
        if isinstance(parsed, list):
            return parsed
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        pass
    # Fallback: pull every [a, b, c, d] numeric quadruple out of the text.
    matches = re.findall(
        r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]', text or ""
    )
    return [[float(x) for x in m] for m in matches]


def draw_bounding_boxes(image, boxes):
    """Draw numbered green rectangles for each normalized box on *image*.

    Args:
        image: PIL.Image.Image to annotate (modified in place and returned).
        boxes: Iterable of [xmin, ymin, xmax, ymax] normalized 0-1 coords.

    Returns:
        The same image object, annotated. Boxes that fail to draw are
        reported via st.error and skipped.
    """
    if not boxes:
        return image
    draw = ImageDraw.Draw(image)
    width, height = image.size
    for i, box in enumerate(boxes):
        try:
            # Clamp to [0, 1] before scaling so malformed model output
            # cannot place a rectangle outside the canvas.
            xmin = max(0.0, min(1.0, box[0])) * width
            ymin = max(0.0, min(1.0, box[1])) * height
            xmax = max(0.0, min(1.0, box[2])) * width
            ymax = max(0.0, min(1.0, box[3])) * height
            draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
            # 1-based label so on-image numbers match the extracted-text list.
            label = str(i + 1)
            draw.text((xmin + 5, ymin + 5), label, fill="red")
        except Exception as e:
            st.error(f"Error drawing box: {str(e)}")
    return image


def extract_text_from_region(client, image, box):
    """Crop *box* out of *image* and ask Gemini to transcribe it.

    Args:
        client: google-genai Client used for the API call.
        image: Full-page PIL.Image.Image.
        box: [xmin, ymin, xmax, ymax] normalized 0-1 coordinates.

    Returns:
        The extracted text (stripped), or "" for degenerate boxes or on
        API errors (errors are surfaced in the Streamlit UI).
    """
    try:
        width, height = image.size
        # Convert normalized coordinates to clamped pixel values.
        xmin = int(max(0.0, min(1.0, box[0])) * width)
        ymin = int(max(0.0, min(1.0, box[1])) * height)
        xmax = int(max(0.0, min(1.0, box[2])) * width)
        ymax = int(max(0.0, min(1.0, box[3])) * height)
        # Zero-area or inverted boxes cannot be cropped meaningfully.
        if xmin >= xmax or ymin >= ymax:
            return ""
        cropped = image.crop((xmin, ymin, xmax, ymax))
        img_byte_arr = io.BytesIO()
        cropped.save(img_byte_arr, format='PNG')
        response = client.models.generate_content(
            model="gemini-2.5-pro-exp-03-25",
            contents=[
                TEXT_EXTRACTION_PROMPT,
                types.Part.from_bytes(
                    data=img_byte_arr.getvalue(),
                    mime_type="image/png"
                )
            ]
        )
        # response.text can be None when the response is blocked or empty.
        return (response.text or "").strip()
    except Exception as e:
        st.error(f"Text extraction error: {str(e)}")
        return ""


# --- Streamlit UI -----------------------------------------------------------
st.title("PDF Text Detection")

uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

if uploaded_file and st.button("Analyze"):
    with st.spinner("Processing..."):
        try:
            # Render every PDF page to an image at 300 dpi.
            images = convert_from_bytes(uploaded_file.read(), dpi=300)
            client = genai.Client(api_key=os.getenv("KEY"))
            tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
            for tab, image in zip(tabs, images):
                with tab:
                    col1, col2 = st.columns(2)
                    with col1:
                        st.image(image, caption="Original", use_container_width=True)
                    with col2:
                        # Detect text regions on the full page.
                        img_byte_arr = io.BytesIO()
                        image.save(img_byte_arr, format='PNG')
                        response = client.models.generate_content(
                            model="gemini-2.0-flash-exp",
                            contents=[
                                DETECTION_PROMPT,
                                types.Part.from_bytes(
                                    data=img_byte_arr.getvalue(),
                                    mime_type="image/png"
                                )
                            ]
                        )
                        # Guard: blocked/empty responses have text == None.
                        raw_text = response.text or ""
                        boxes = parse_list_boxes(raw_text)
                        # One extraction call per detected region.
                        texts = [
                            extract_text_from_region(client, image, box)
                            for box in boxes
                        ]
                        # Draw on a copy so the "Original" column stays clean.
                        annotated = draw_bounding_boxes(image.copy(), boxes)
                        st.image(
                            annotated,
                            caption=f"Detected {len(boxes)} text regions",
                            use_container_width=True,
                        )
                        if any(texts):
                            st.subheader("Extracted Texts:")
                            for i, text in enumerate(texts, 1):
                                st.write(f"{i}. {text if text else 'No text detected'}")
                        with st.expander("Debug Details"):
                            st.write("**Raw API Response:**")
                            st.code(raw_text)
                            st.write("**Parsed Boxes:**")
                            st.write(boxes)
        except Exception as e:
            st.error(f"Error: {str(e)}")