# pdf_gemini / app.py
import os
import re
import ast
import io
import streamlit as st
from PIL import Image, ImageDraw, ImageFont
from google import genai
from google.genai import types
from pdf2image import convert_from_bytes
DETECTION_PROMPT = """\
Analyze this document image and identify text regions following these rules:
1. GROUP RELATED CONTENT:
- Full tables as SINGLE regions (including headers and all rows)
- Paragraphs as SINGLE rectangular blocks (multiple lines as one box)
- Keep text columns intact
- Treat list items as a single region if visually grouped
2. TEXT REGION REQUIREMENTS:
- Boundaries must tightly wrap text content
- Include 2% padding around text clusters
- Exclude isolated decorative elements
- Merge adjacent text fragments with ≤1% spacing
3. COORDINATE FORMAT:
Python list of lists [[xmin, ymin, xmax, ymax]]
- Normalized 0-1 with 3 decimal places
- Ordered top-to-bottom, left-to-right
- Table example: [[0.12, 0.35, 0.88, 0.65]] for full table
4. SPECIAL CASES:
- Table cells should NOT have individual boxes
- Page headers/footers as separate regions
- Text wrapped around images as distinct regions
Example response for table + 2 paragraphs:
[[0.07, 0.12, 0.93, 0.28], # Header
[0.12, 0.35, 0.88, 0.65], # Full table
[0.10, 0.70, 0.90, 0.85], # First paragraph
[0.10, 0.88, 0.90, 0.95]] # Second paragraph
ONLY RETURN THE PYTHON LIST! No explanations.
"""
TEXT_EXTRACTION_PROMPT = "Extract the text in this image. Return only the exact text, nothing else."

def parse_list_boxes(text):
    """Parse the model response into a list of [xmin, ymin, xmax, ymax] boxes."""
    # Strip any markdown code fences the model may wrap the list in
    cleaned = re.sub(r"```(?:python)?", "", text).strip()
    try:
        return ast.literal_eval(cleaned)
    except (ValueError, SyntaxError):
        # Fallback: pull out anything that looks like [x, y, x, y]
        matches = re.findall(r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]', text)
        return [[float(x) for x in m] for m in matches]
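
# Illustrative calls (not used in the app itself): a clean literal parses directly,
# and the regex fallback still recovers boxes when the model adds stray text around
# the list or wraps it in a code fence.
#   parse_list_boxes("[[0.1, 0.2, 0.9, 0.4]]")                 -> [[0.1, 0.2, 0.9, 0.4]]
#   parse_list_boxes("Here you go: [[0.1, 0.2, 0.9, 0.4]]")    -> [[0.1, 0.2, 0.9, 0.4]]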

def draw_bounding_boxes(image, boxes):
    """Draw numbered bounding boxes for each detected text region."""
    if not boxes:
        return image
    draw = ImageDraw.Draw(image)
    width, height = image.size
    for i, box in enumerate(boxes):
        try:
            # Convert normalized coordinates to pixel values
            xmin = max(0.0, min(1.0, box[0])) * width
            ymin = max(0.0, min(1.0, box[1])) * height
            xmax = max(0.0, min(1.0, box[2])) * width
            ymax = max(0.0, min(1.0, box[3])) * height
            # Draw bounding box
            draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
            # Draw number label
            label = str(i + 1)
            draw.text((xmin + 5, ymin + 5), label, fill="red")
        except Exception as e:
            st.error(f"Error drawing box: {str(e)}")
    return image

def extract_text_from_region(client, image, box):
    """Extract text from a specific region using Gemini"""
    try:
        width, height = image.size
        # Convert normalized coordinates to pixel values
        xmin = int(max(0.0, min(1.0, box[0])) * width)
        ymin = int(max(0.0, min(1.0, box[1])) * height)
        xmax = int(max(0.0, min(1.0, box[2])) * width)
        ymax = int(max(0.0, min(1.0, box[3])) * height)
        if xmin >= xmax or ymin >= ymax:
            return ""
        # Crop and convert to bytes
        cropped = image.crop((xmin, ymin, xmax, ymax))
        img_byte_arr = io.BytesIO()
        cropped.save(img_byte_arr, format='PNG')
        # Call Gemini API
        response = client.models.generate_content(
            model="gemini-2.5-pro-exp-03-25",
            contents=[
                TEXT_EXTRACTION_PROMPT,
                types.Part.from_bytes(
                    data=img_byte_arr.getvalue(),
                    mime_type="image/png"
                )
            ]
        )
        return response.text.strip()
    except Exception as e:
        st.error(f"Text extraction error: {str(e)}")
        return ""

# Streamlit UI
st.title("PDF Text Detection")
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

if uploaded_file and st.button("Analyze"):
    with st.spinner("Processing..."):
        try:
            images = convert_from_bytes(uploaded_file.read(), dpi=300)
            client = genai.Client(api_key=os.getenv("KEY"))
            tabs = st.tabs([f"Page {i+1}" for i in range(len(images))])
            for idx, (tab, image) in enumerate(zip(tabs, images)):
                with tab:
                    col1, col2 = st.columns(2)
                    with col1:
                        st.image(image, caption="Original", use_container_width=True)
                    with col2:
                        # Get bounding boxes
                        img_byte_arr = io.BytesIO()
                        image.save(img_byte_arr, format='PNG')
                        response = client.models.generate_content(
                            model="gemini-2.0-flash-exp",
                            contents=[
                                DETECTION_PROMPT,
                                types.Part.from_bytes(
                                    data=img_byte_arr.getvalue(),
                                    mime_type="image/png"
                                )
                            ]
                        )
                        boxes = parse_list_boxes(response.text)
                        texts = [extract_text_from_region(client, image, box) for box in boxes]
                        # Draw annotated image
                        annotated = draw_bounding_boxes(image.copy(), boxes)
                        st.image(annotated,
                                 caption=f"Detected {len(boxes)} text regions",
                                 use_container_width=True)
                        # Display extracted texts
                        if any(texts):
                            st.subheader("Extracted Texts:")
                            for i, text in enumerate(texts, 1):
                                st.write(f"{i}. {text if text else 'No text detected'}")
                        # Debug section
                        debug_expander = st.expander("Debug Details")
                        with debug_expander:
                            st.write("**Raw API Response:**")
                            st.code(response.text)
                            st.write("**Parsed Boxes:**")
                            st.write(boxes)
        except Exception as e:
            st.error(f"Error: {str(e)}")