Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,7 +15,7 @@ def convert(pdf_file):
|
|
15 |
blocks = page.get_text("dict")["blocks"]
|
16 |
elements = []
|
17 |
|
18 |
-
# Extraemos
|
19 |
image_list = page.get_images(full=True)
|
20 |
xref_to_image_path = {}
|
21 |
|
@@ -24,7 +24,7 @@ def convert(pdf_file):
|
|
24 |
pix = fitz.Pixmap(doc, xref)
|
25 |
img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
|
26 |
|
27 |
-
if pix.n > 4: #
|
28 |
pix = fitz.Pixmap(fitz.csRGB, pix)
|
29 |
pix.save(img_path)
|
30 |
pix = None
|
@@ -32,7 +32,7 @@ def convert(pdf_file):
|
|
32 |
xref_to_image_path[xref] = img_path
|
33 |
image_counter += 1
|
34 |
|
35 |
-
#
|
36 |
for b in blocks:
|
37 |
if b["type"] == 0: # Texto
|
38 |
for line in b["lines"]:
|
@@ -42,13 +42,11 @@ def convert(pdf_file):
|
|
42 |
elements.append((y, text.strip()))
|
43 |
elif b["type"] == 1: # Imagen
|
44 |
y = b["bbox"][1]
|
45 |
-
|
46 |
-
xref = b.get("image", {}).get("xref", None)
|
47 |
if xref and xref in xref_to_image_path:
|
48 |
img_path = xref_to_image_path[xref]
|
49 |
elements.append((y, f""))
|
50 |
else:
|
51 |
-
# Si no encontramos la imagen, dejamos marcador vacío
|
52 |
elements.append((y, "[imagen]()"))
|
53 |
|
54 |
elements.sort(key=lambda x: x[0])
|
|
|
15 |
blocks = page.get_text("dict")["blocks"]
|
16 |
elements = []
|
17 |
|
18 |
+
# Extraemos lista de imágenes con sus xrefs
|
19 |
image_list = page.get_images(full=True)
|
20 |
xref_to_image_path = {}
|
21 |
|
|
|
24 |
pix = fitz.Pixmap(doc, xref)
|
25 |
img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
|
26 |
|
27 |
+
if pix.n > 4: # CMYK -> RGB
|
28 |
pix = fitz.Pixmap(fitz.csRGB, pix)
|
29 |
pix.save(img_path)
|
30 |
pix = None
|
|
|
32 |
xref_to_image_path[xref] = img_path
|
33 |
image_counter += 1
|
34 |
|
35 |
+
# Procesar bloques y ordenar por coordenada vertical
|
36 |
for b in blocks:
|
37 |
if b["type"] == 0: # Texto
|
38 |
for line in b["lines"]:
|
|
|
42 |
elements.append((y, text.strip()))
|
43 |
elif b["type"] == 1: # Imagen
|
44 |
y = b["bbox"][1]
|
45 |
+
xref = b.get("image", None)
|
|
|
46 |
if xref and xref in xref_to_image_path:
|
47 |
img_path = xref_to_image_path[xref]
|
48 |
elements.append((y, f""))
|
49 |
else:
|
|
|
50 |
elements.append((y, "[imagen]()"))
|
51 |
|
52 |
elements.sort(key=lambda x: x[0])
|