Sebbe33 committed on
Commit
62c24e3
·
verified ·
1 Parent(s): 6018547

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +60 -93
app.py CHANGED
@@ -9,108 +9,75 @@ from pdf2image import convert_from_bytes
9
 
10
  # Constants
11
  DETECTION_PROMPT = """\
12
- Identify all text regions in this document. Provide bounding boxes in the format [xmin, ymin, xmax, ymax]
13
- as percentages of the image dimensions. Return only a Python-style list of lists without any additional text.
14
- Example: [[0.1, 0.2, 0.4, 0.5], [0.6, 0.7, 0.8, 0.9]]
 
15
  """
16
 
17
- # Helper functions
18
  def parse_list_boxes(text):
19
- """Extracts bounding boxes from response text"""
20
- pattern = r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]'
21
- matches = re.findall(pattern, text)
22
- return [[float(m) for m in match] for match in matches]
 
 
23
 
24
  def draw_bounding_boxes(image, boxes):
25
- """Draws bounding boxes on the image using [xmin, ymin, xmax, ymax] format"""
 
 
 
26
  draw = ImageDraw.Draw(image)
27
  width, height = image.size
28
 
29
  for box in boxes:
30
- xmin = max(0.0, min(1.0, box[0]))
31
- ymin = max(0.0, min(1.0, box[1]))
32
- xmax = max(0.0, min(1.0, box[2]))
33
- ymax = max(0.0, min(1.0, box[3]))
34
-
35
- draw.rectangle([
36
- xmin * width,
37
- ymin * height,
38
- xmax * width,
39
- ymax * height
40
- ], outline="#00FF00", width=2)
41
  return image
42
 
43
  # Streamlit UI
44
- st.title("PDF Text Region Detection with Gemini")
45
- col1, col2 = st.columns(2)
46
-
47
- with col1:
48
- uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
49
-
50
- if uploaded_file:
51
- if st.button("Analyze Document"):
52
- with st.spinner("Analyzing PDF..."):
53
- try:
54
- # Convert PDF to images
55
- pdf_bytes = uploaded_file.read()
56
- images = convert_from_bytes(pdf_bytes)
57
- results = []
58
-
59
- # Initialize client
60
- client = genai.Client(api_key=os.getenv("KEY"))
61
-
62
- for page_num, image in enumerate(images):
63
- # Prepare image
64
- img_byte_arr = io.BytesIO()
65
- image.save(img_byte_arr, format='PNG')
66
-
67
- image_part = types.Part.from_bytes(
68
- data=img_byte_arr.getvalue(),
69
- mime_type="image/png"
70
- )
71
-
72
- # Get all text boxes
73
- box_response = client.models.generate_content(
74
- model="gemini-2.0-flash-exp",
75
- contents=[DETECTION_PROMPT, image_part]
76
- )
77
-
78
- # Get description
79
- desc_response = client.models.generate_content(
80
- model="gemini-2.0-flash-exp",
81
- contents=["Describe this document section in detail.", image_part]
82
- )
83
-
84
- # Process boxes
85
- try:
86
- boxes = parse_list_boxes(box_response.text)
87
- except Exception as e:
88
- st.error(f"Error on page {page_num+1}: {str(e)}")
89
- boxes = []
90
-
91
- # Draw boxes
92
- annotated_image = image.copy()
93
- if boxes:
94
- annotated_image = draw_bounding_boxes(annotated_image, boxes)
95
-
96
- results.append({
97
- "page": page_num + 1,
98
- "image": annotated_image,
99
- "description": desc_response.text,
100
- "boxes": len(boxes)
101
- })
102
-
103
- # Display results
104
- with col2:
105
- st.write(f"## Results ({len(results)} pages)")
106
- tabs = st.tabs([f"Page {res['page']}" for res in results])
107
-
108
- for tab, res in zip(tabs, results):
109
- with tab:
110
- st.image(res["image"],
111
- caption=f"Page {res['page']} - {res['boxes']} text regions detected",
112
- use_container_width=True)
113
- st.write("**Description:**", res["description"])
114
-
115
- except Exception as e:
116
- st.error(f"Error: {str(e)}")
 
9
 
10
  # Constants
11
  DETECTION_PROMPT = """\
12
+ Identify ALL text regions in this document. Return bounding boxes as a Python list of lists
13
+ in format [[xmin, ymin, xmax, ymax]] where coordinates are normalized between 0-1.
14
+ Only return the list, nothing else. Example:
15
+ [[0.05, 0.12, 0.25, 0.18], [0.30, 0.40, 0.50, 0.55]]
16
  """
17
 
 
18
  def parse_list_boxes(text):
19
+ """Improved parsing with better error handling"""
20
+ try:
21
+ return eval(text) # Safer alternative: Use ast.literal_eval
22
+ except:
23
+ matches = re.findall(r'\[([\d\.]+),\s*([\d\.]+),\s*([\d\.]+),\s*([\d\.]+)\]', text)
24
+ return [[float(x) for x in m] for m in matches]
25
 
26
  def draw_bounding_boxes(image, boxes):
27
+ """Enhanced drawing with diagnostics"""
28
+ if not boxes:
29
+ return image
30
+
31
  draw = ImageDraw.Draw(image)
32
  width, height = image.size
33
 
34
  for box in boxes:
35
+ try:
36
+ xmin = max(0.0, min(1.0, box[0])) * width
37
+ ymin = max(0.0, min(1.0, box[1])) * height
38
+ xmax = max(0.0, min(1.0, box[2])) * width
39
+ ymax = max(0.0, min(1.0, box[3])) * height
40
+
41
+ draw.rectangle([xmin, ymin, xmax, ymax], outline="#00FF00", width=3)
42
+ except Exception as e:
43
+ st.error(f"Error drawing box: {str(e)}")
 
 
44
  return image
45
 
46
  # Streamlit UI
47
+ st.title("PDF Text Detection")
48
+ uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
49
+
50
+ if uploaded_file and st.button("Analyze"):
51
+ with st.spinner("Processing..."):
52
+ try:
53
+ images = convert_from_bytes(uploaded_file.read(), dpi=300) # Increased DPI
54
+ client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY")) # Verify env var name
55
+
56
+ for idx, image in enumerate(images):
57
+ with st.expander(f"Page {idx+1}", expanded=True):
58
+ img_byte_arr = io.BytesIO()
59
+ image.save(img_byte_arr, format='PNG')
60
+
61
+ # Get bounding boxes
62
+ response = client.models.generate_content(
63
+ model="gemini-1.5-pro-latest", # Try newer model
64
+ contents=[DETECTION_PROMPT, types.Part.from_bytes(img_byte_arr.getvalue(), "image/png")]
65
+ )
66
+
67
+ # Debug output
68
+ with st.expander("Raw API Response"):
69
+ st.code(response.text)
70
+
71
+ # Parse and draw
72
+ boxes = parse_list_boxes(response.text)
73
+ annotated = draw_bounding_boxes(image.copy(), boxes)
74
+
75
+ # Display
76
+ cols = st.columns(2)
77
+ cols[0].image(image, caption="Original", use_column_width=True)
78
+ cols[1].image(annotated,
79
+ caption=f"Detected {len(boxes)} text regions",
80
+ use_column_width=True)
81
+
82
+ except Exception as e:
83
+ st.error(f"Error: {str(e)}")