Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,8 @@ import fitz # PyMuPDF
|
|
4 |
from PIL import Image
|
5 |
import pytesseract
|
6 |
import os
|
|
|
|
|
7 |
|
8 |
def clean_ocr_text(text):
|
9 |
lines = text.splitlines()
|
@@ -33,12 +35,12 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
|
|
33 |
if line_text:
|
34 |
elements.append((line_y, line_text, max_font_size))
|
35 |
|
36 |
-
# Extraer imágenes únicas (por xref
|
37 |
images_on_page = page.get_images(full=True)
|
38 |
for img_index, img in enumerate(images_on_page):
|
39 |
xref = img[0]
|
40 |
if xref in seen_xrefs:
|
41 |
-
continue
|
42 |
seen_xrefs.add(xref)
|
43 |
try:
|
44 |
base_image = page.parent.extract_image(xref)
|
@@ -53,7 +55,6 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
|
|
53 |
except Exception as e:
|
54 |
elements.append((float("inf"), f"[Error imagen: {e}]", 10))
|
55 |
|
56 |
-
# Ordenar por posición
|
57 |
elements.sort(key=lambda x: x[0])
|
58 |
previous_y = None
|
59 |
|
@@ -75,7 +76,7 @@ def convert(pdf_file):
|
|
75 |
doc = fitz.open(pdf_file)
|
76 |
markdown_output = ""
|
77 |
image_paths = []
|
78 |
-
seen_xrefs = set()
|
79 |
|
80 |
for page_num in range(len(doc)):
|
81 |
page = doc[page_num]
|
@@ -94,18 +95,36 @@ def convert(pdf_file):
|
|
94 |
|
95 |
markdown_output += f"\n"
|
96 |
|
|
|
97 |
try:
|
98 |
-
ocr_text = pytesseract.image_to_string(img, lang="spa")
|
99 |
-
except pytesseract.TesseractError:
|
100 |
ocr_text = pytesseract.image_to_string(img)
|
|
|
|
|
101 |
|
102 |
ocr_text = clean_ocr_text(ocr_text)
|
103 |
if ocr_text.strip():
|
104 |
markdown_output += ocr_text + "\n"
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
markdown_output += "\n---\n\n"
|
107 |
|
108 |
-
# Guardar como archivo .md
|
109 |
markdown_path = "/tmp/resultado.md"
|
110 |
with open(markdown_path, "w", encoding="utf-8") as f:
|
111 |
f.write(markdown_output)
|
|
|
4 |
from PIL import Image
|
5 |
import pytesseract
|
6 |
import os
|
7 |
+
import numpy as np
|
8 |
+
import cv2
|
9 |
|
10 |
def clean_ocr_text(text):
|
11 |
lines = text.splitlines()
|
|
|
35 |
if line_text:
|
36 |
elements.append((line_y, line_text, max_font_size))
|
37 |
|
38 |
+
# Extraer imágenes únicas (por xref)
|
39 |
images_on_page = page.get_images(full=True)
|
40 |
for img_index, img in enumerate(images_on_page):
|
41 |
xref = img[0]
|
42 |
if xref in seen_xrefs:
|
43 |
+
continue
|
44 |
seen_xrefs.add(xref)
|
45 |
try:
|
46 |
base_image = page.parent.extract_image(xref)
|
|
|
55 |
except Exception as e:
|
56 |
elements.append((float("inf"), f"[Error imagen: {e}]", 10))
|
57 |
|
|
|
58 |
elements.sort(key=lambda x: x[0])
|
59 |
previous_y = None
|
60 |
|
|
|
76 |
doc = fitz.open(pdf_file)
|
77 |
markdown_output = ""
|
78 |
image_paths = []
|
79 |
+
seen_xrefs = set()
|
80 |
|
81 |
for page_num in range(len(doc)):
|
82 |
page = doc[page_num]
|
|
|
95 |
|
96 |
markdown_output += f"\n"
|
97 |
|
98 |
+
# OCR
|
99 |
try:
|
|
|
|
|
100 |
ocr_text = pytesseract.image_to_string(img)
|
101 |
+
except pytesseract.TesseractError:
|
102 |
+
ocr_text = ""
|
103 |
|
104 |
ocr_text = clean_ocr_text(ocr_text)
|
105 |
if ocr_text.strip():
|
106 |
markdown_output += ocr_text + "\n"
|
107 |
|
108 |
+
# Detección de imágenes dentro de la imagen completa (por contornos)
|
109 |
+
try:
|
110 |
+
img_cv = np.array(img)
|
111 |
+
gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
|
112 |
+
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
|
113 |
+
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
114 |
+
|
115 |
+
for i, cnt in enumerate(contours):
|
116 |
+
x, y, w, h = cv2.boundingRect(cnt)
|
117 |
+
if w > 50 and h > 50:
|
118 |
+
region = img_cv[y:y+h, x:x+w]
|
119 |
+
detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
|
120 |
+
Image.fromarray(region).save(detected_path)
|
121 |
+
image_paths.append(detected_path)
|
122 |
+
markdown_output += f"\n\n\n"
|
123 |
+
except Exception as e:
|
124 |
+
markdown_output += f"\n\n[Error al detectar imágenes embebidas: {e}]\n"
|
125 |
+
|
126 |
markdown_output += "\n---\n\n"
|
127 |
|
|
|
128 |
markdown_path = "/tmp/resultado.md"
|
129 |
with open(markdown_path, "w", encoding="utf-8") as f:
|
130 |
f.write(markdown_output)
|