Biifruu committed on
Commit
3c39da9
·
verified ·
1 Parent(s): dd8d861

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -6
app.py CHANGED
@@ -21,13 +21,13 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
21
  image_counter = 1
22
  elements = []
23
 
24
- page = doc[0] # solo una página en cada llamada
25
 
26
  blocks = page.get_text("dict")["blocks"]
27
 
28
  for b in blocks:
29
  y = b["bbox"][1]
30
- if b["type"] == 0: # Texto
31
  for line in b["lines"]:
32
  line_y = line["bbox"][1]
33
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
@@ -35,7 +35,6 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
35
  if line_text:
36
  elements.append((line_y, line_text, max_font_size))
37
 
38
- # Extraer imágenes únicas (por xref)
39
  images_on_page = page.get_images(full=True)
40
  for img_index, img in enumerate(images_on_page):
41
  xref = img[0]
@@ -95,7 +94,6 @@ def convert(pdf_file):
95
 
96
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
97
 
98
- # OCR
99
  try:
100
  ocr_text = pytesseract.image_to_string(img)
101
  except pytesseract.TesseractError:
@@ -105,16 +103,20 @@ def convert(pdf_file):
105
  if ocr_text.strip():
106
  markdown_output += ocr_text + "\n"
107
 
108
- # Detección de imágenes dentro de la imagen completa (por contornos)
109
  try:
110
  img_cv = np.array(img)
111
  gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
112
  _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
113
  contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
114
 
 
 
 
115
  for i, cnt in enumerate(contours):
116
  x, y, w, h = cv2.boundingRect(cnt)
117
- if w > 50 and h > 50:
 
118
  region = img_cv[y:y+h, x:x+w]
119
  detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
120
  Image.fromarray(region).save(detected_path)
 
21
  image_counter = 1
22
  elements = []
23
 
24
+ page = doc[0]
25
 
26
  blocks = page.get_text("dict")["blocks"]
27
 
28
  for b in blocks:
29
  y = b["bbox"][1]
30
+ if b["type"] == 0:
31
  for line in b["lines"]:
32
  line_y = line["bbox"][1]
33
  line_text = " ".join([span["text"] for span in line["spans"]]).strip()
 
35
  if line_text:
36
  elements.append((line_y, line_text, max_font_size))
37
 
 
38
  images_on_page = page.get_images(full=True)
39
  for img_index, img in enumerate(images_on_page):
40
  xref = img[0]
 
94
 
95
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
96
 
 
97
  try:
98
  ocr_text = pytesseract.image_to_string(img)
99
  except pytesseract.TesseractError:
 
103
  if ocr_text.strip():
104
  markdown_output += ocr_text + "\n"
105
 
106
+ # Detección limitada de imágenes embebidas (hasta 5 contornos grandes)
107
  try:
108
  img_cv = np.array(img)
109
  gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
110
  _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
111
  contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
112
 
113
+ # Ordenar por área y limitar a 5 regiones grandes
114
+ contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
115
+
116
  for i, cnt in enumerate(contours):
117
  x, y, w, h = cv2.boundingRect(cnt)
118
+ area = w * h
119
+ if area > 5000:
120
  region = img_cv[y:y+h, x:x+w]
121
  detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
122
  Image.fromarray(region).save(detected_path)