Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,24 @@ def convert(pdf_file):
|
|
15 |
blocks = page.get_text("dict")["blocks"]
|
16 |
elements = []
|
17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
for b in blocks:
|
19 |
if b["type"] == 0: # Texto
|
20 |
for line in b["lines"]:
|
@@ -24,21 +42,15 @@ def convert(pdf_file):
|
|
24 |
elements.append((y, text.strip()))
|
25 |
elif b["type"] == 1: # Imagen
|
26 |
y = b["bbox"][1]
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
|
32 |
-
|
33 |
-
if pix.n > 4: # CMYK
|
34 |
-
pix = fitz.Pixmap(fitz.csRGB, pix)
|
35 |
-
pix.save(img_path)
|
36 |
-
pix = None
|
37 |
-
|
38 |
elements.append((y, f""))
|
39 |
-
|
|
|
|
|
40 |
|
41 |
-
# Ordenar por posición vertical (y)
|
42 |
elements.sort(key=lambda x: x[0])
|
43 |
|
44 |
for _, content in elements:
|
@@ -48,11 +60,6 @@ def convert(pdf_file):
|
|
48 |
|
49 |
gr.Interface(
|
50 |
convert,
|
51 |
-
inputs=[
|
52 |
-
|
53 |
-
],
|
54 |
-
outputs=[
|
55 |
-
gr.Text(label="Markdown"),
|
56 |
-
gr.JSON(label="Metadata"),
|
57 |
-
],
|
58 |
).launch()
|
|
|
15 |
blocks = page.get_text("dict")["blocks"]
|
16 |
elements = []
|
17 |
|
18 |
+
# Extraemos la lista de imágenes en esta página, con sus xrefs
|
19 |
+
image_list = page.get_images(full=True)
|
20 |
+
xref_to_image_path = {}
|
21 |
+
|
22 |
+
for img in image_list:
|
23 |
+
xref = img[0]
|
24 |
+
pix = fitz.Pixmap(doc, xref)
|
25 |
+
img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
|
26 |
+
|
27 |
+
if pix.n > 4: # si es CMYK, convertir a RGB
|
28 |
+
pix = fitz.Pixmap(fitz.csRGB, pix)
|
29 |
+
pix.save(img_path)
|
30 |
+
pix = None
|
31 |
+
|
32 |
+
xref_to_image_path[xref] = img_path
|
33 |
+
image_counter += 1
|
34 |
+
|
35 |
+
# Recorremos bloques y reconstruimos texto+imagenes en orden vertical
|
36 |
for b in blocks:
|
37 |
if b["type"] == 0: # Texto
|
38 |
for line in b["lines"]:
|
|
|
42 |
elements.append((y, text.strip()))
|
43 |
elif b["type"] == 1: # Imagen
|
44 |
y = b["bbox"][1]
|
45 |
+
# El bloque de imagen tiene su xref
|
46 |
+
xref = b.get("image", {}).get("xref", None)
|
47 |
+
if xref and xref in xref_to_image_path:
|
48 |
+
img_path = xref_to_image_path[xref]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
elements.append((y, f""))
|
50 |
+
else:
|
51 |
+
# Si no encontramos la imagen, dejamos marcador vacío
|
52 |
+
elements.append((y, "[imagen]()"))
|
53 |
|
|
|
54 |
elements.sort(key=lambda x: x[0])
|
55 |
|
56 |
for _, content in elements:
|
|
|
60 |
|
61 |
gr.Interface(
|
62 |
convert,
|
63 |
+
inputs=[gr.File(label="Upload PDF", type="filepath")],
|
64 |
+
outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
|
|
|
|
|
|
|
|
|
|
|
65 |
).launch()
|