Spaces:
Running
Running
File size: 6,518 Bytes
6459986 6c79114 d1dce8a 6c79114 d2aded5 6c79114 1d8d466 7e4f227 5f554b3 6018547 7c8a4dc 316d102 d2aded5 6c79114 62c24e3 6c64ea6 62c24e3 5f554b3 cdb1e78 d2aded5 62c24e3 cdb1e78 d2aded5 62c24e3 d2aded5 62c24e3 d2aded5 62c24e3 d2aded5 62c24e3 6c79114 d1dce8a d2aded5 b96fa16 d2aded5 6c79114 62c24e3 6c64ea6 70da8e5 62c24e3 6c64ea6 62c24e3 6c64ea6 c45c762 62c24e3 6c64ea6 d2aded5 6c64ea6 d2aded5 6c64ea6 d2aded5 6c64ea6 c45c762 6c64ea6 d2aded5 6c64ea6 62c24e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import os
import re
import io
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes
DETECTION_PROMPT = """\
Analyze this document image and identify text regions following these rules:
1. GROUP RELATED CONTENT:
- Full tables as SINGLE regions (including headers and all rows)
- Paragraphs as SINGLE rectangular blocks (multiple lines as one box)
- Keep text columns intact
- Treat list items as single region if visually grouped
2. TEXT REGION REQUIREMENTS:
- Boundaries must tightly wrap text content
- Include 2% padding around text clusters
- Exclude isolated decorative elements
- Merge adjacent text fragments with ≤1% spacing
3. COORDINATE FORMAT:
Python list of lists [[xmin, ymin, xmax, ymax]]
- Normalized 0-1 with 3 decimal places
- Ordered top-to-bottom, left-to-right
- Table example: [[0.12, 0.35, 0.88, 0.65]] for full table
4. SPECIAL CASES:
- Table cells should NOT have individual boxes
- Page headers/footers as separate regions
- Text wrapped around images as distinct regions
Example response for table + 2 paragraphs:
[[0.07, 0.12, 0.93, 0.28], # Header
[0.12, 0.35, 0.88, 0.65], # Full table
[0.10, 0.70, 0.90, 0.85], # First paragraph
[0.10, 0.88, 0.90, 0.95]] # Second paragraph
ONLY RETURN THE PYTHON LIST! No explanations.
"""
TEXT_EXTRACTION_PROMPT = "Extract the text in this image. Return only the exact text, nothing else."
def parse_list_boxes(text):
"""Improved parsing with better error handling"""
try:
return eval(text)
except:
matches = re.findall(r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]', text)
return [[float(x) for x in m] for m in matches]
def draw_bounding_boxes(image, boxes):
"""Enhanced drawing with numbering"""
if not boxes:
return image
draw = ImageDraw.Draw(image)
width, height = image.size
for i, box in enumerate(boxes):
try:
# Convert normalized coordinates to pixel values
xmin = max(0.0, min(1.0, box[0])) * width
ymin = max(0.0, min(1.0, box[1])) * height
xmax = max(0.0, min(1.0, box[2])) * width
ymax = max(0.0, min(1.0, box[3])) * height
# Draw bounding box
draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
# Draw number label
label = str(i+1)
draw.text((xmin + 5, ymin + 5), label, fill="red")
except Exception as e:
st.error(f"Error drawing box: {str(e)}")
return image
def extract_text_from_region(client, image, box):
"""Extract text from a specific region using Gemini"""
try:
width, height = image.size
# Convert normalized coordinates to pixel values
xmin = int(max(0.0, min(1.0, box[0])) * width)
ymin = int(max(0.0, min(1.0, box[1])) * height)
xmax = int(max(0.0, min(1.0, box[2])) * width)
ymax = int(max(0.0, min(1.0, box[3])) * height)
if xmin >= xmax or ymin >= ymax:
return ""
# Crop and convert to bytes
cropped = image.crop((xmin, ymin, xmax, ymax))
img_byte_arr = io.BytesIO()
cropped.save(img_byte_arr, format='PNG')
# Call Gemini API
response = client.models.generate_content(
model="gemini-2.5-pro-exp-03-25",
contents=[
TEXT_EXTRACTION_PROMPT,
types.Part.from_bytes(
data=img_byte_arr.getvalue(),
mime_type="image/png"
)
]
)
return response.text.strip()
except Exception as e:
st.error(f"Text extraction error: {str(e)}")
return ""
# Streamlit UI
st.title("PDF Text Detection")
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
if uploaded_file and st.button("Analyze"):
with st.spinner("Processing..."):
try:
images = convert_from_bytes(uploaded_file.read(), dpi=300)
client = genai.Client(api_key=os.getenv("KEY"))
tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
for idx, (tab, image) in enumerate(zip(tabs, images)):
with tab:
col1, col2 = st.columns(2)
with col1:
st.image(image, caption="Original", use_container_width=True)
with col2:
# Get bounding boxes
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format='PNG')
response = client.models.generate_content(
model="gemini-2.0-flash-exp",
contents=[
DETECTION_PROMPT,
types.Part.from_bytes(
data=img_byte_arr.getvalue(),
mime_type="image/png"
)
]
)
boxes = parse_list_boxes(response.text)
texts = [extract_text_from_region(client, image, box) for box in boxes]
# Draw annotated image
annotated = draw_bounding_boxes(image.copy(), boxes)
st.image(annotated,
caption=f"Detected {len(boxes)} text regions",
use_container_width=True)
# Display extracted texts
if any(texts):
st.subheader("Extracted Texts:")
for i, text in enumerate(texts, 1):
st.write(f"{i}. {text if text else 'No text detected'}")
# Debug section
debug_expander = st.expander("Debug Details")
with debug_expander:
st.write("**Raw API Response:**")
st.code(response.text)
st.write("**Parsed Boxes:**")
st.write(boxes)
except Exception as e:
st.error(f"Error: {str(e)}") |