Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -21,13 +21,13 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
|
|
21 |
image_counter = 1
|
22 |
elements = []
|
23 |
|
24 |
-
page = doc[0]
|
25 |
|
26 |
blocks = page.get_text("dict")["blocks"]
|
27 |
|
28 |
for b in blocks:
|
29 |
y = b["bbox"][1]
|
30 |
-
if b["type"] == 0:
|
31 |
for line in b["lines"]:
|
32 |
line_y = line["bbox"][1]
|
33 |
line_text = " ".join([span["text"] for span in line["spans"]]).strip()
|
@@ -35,7 +35,6 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
|
|
35 |
if line_text:
|
36 |
elements.append((line_y, line_text, max_font_size))
|
37 |
|
38 |
-
# Extraer imágenes únicas (por xref)
|
39 |
images_on_page = page.get_images(full=True)
|
40 |
for img_index, img in enumerate(images_on_page):
|
41 |
xref = img[0]
|
@@ -95,7 +94,6 @@ def convert(pdf_file):
|
|
95 |
|
96 |
markdown_output += f"\n"
|
97 |
|
98 |
-
# OCR
|
99 |
try:
|
100 |
ocr_text = pytesseract.image_to_string(img)
|
101 |
except pytesseract.TesseractError:
|
@@ -105,16 +103,20 @@ def convert(pdf_file):
|
|
105 |
if ocr_text.strip():
|
106 |
markdown_output += ocr_text + "\n"
|
107 |
|
108 |
-
# Detección de imágenes
|
109 |
try:
|
110 |
img_cv = np.array(img)
|
111 |
gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
|
112 |
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
|
113 |
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
114 |
|
|
|
|
|
|
|
115 |
for i, cnt in enumerate(contours):
|
116 |
x, y, w, h = cv2.boundingRect(cnt)
|
117 |
-
|
|
|
118 |
region = img_cv[y:y+h, x:x+w]
|
119 |
detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
|
120 |
Image.fromarray(region).save(detected_path)
|
|
|
21 |
image_counter = 1
|
22 |
elements = []
|
23 |
|
24 |
+
page = doc[0]
|
25 |
|
26 |
blocks = page.get_text("dict")["blocks"]
|
27 |
|
28 |
for b in blocks:
|
29 |
y = b["bbox"][1]
|
30 |
+
if b["type"] == 0:
|
31 |
for line in b["lines"]:
|
32 |
line_y = line["bbox"][1]
|
33 |
line_text = " ".join([span["text"] for span in line["spans"]]).strip()
|
|
|
35 |
if line_text:
|
36 |
elements.append((line_y, line_text, max_font_size))
|
37 |
|
|
|
38 |
images_on_page = page.get_images(full=True)
|
39 |
for img_index, img in enumerate(images_on_page):
|
40 |
xref = img[0]
|
|
|
94 |
|
95 |
markdown_output += f"\n"
|
96 |
|
|
|
97 |
try:
|
98 |
ocr_text = pytesseract.image_to_string(img)
|
99 |
except pytesseract.TesseractError:
|
|
|
103 |
if ocr_text.strip():
|
104 |
markdown_output += ocr_text + "\n"
|
105 |
|
106 |
+
# ✅ Detección limitada de imágenes embebidas (hasta 5 contornos grandes)
|
107 |
try:
|
108 |
img_cv = np.array(img)
|
109 |
gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
|
110 |
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
|
111 |
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
112 |
|
113 |
+
# Ordenar por área y limitar a 5 regiones grandes
|
114 |
+
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
|
115 |
+
|
116 |
for i, cnt in enumerate(contours):
|
117 |
x, y, w, h = cv2.boundingRect(cnt)
|
118 |
+
area = w * h
|
119 |
+
if area > 5000:
|
120 |
region = img_cv[y:y+h, x:x+w]
|
121 |
detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
|
122 |
Image.fromarray(region).save(detected_path)
|