pdf-to-markdown

Running

App Files Community

Biifruu committed 17 days ago

Commit

dd21256

verified ·

1 Parent(s): a97d32a

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -20

app.py CHANGED Viewed

@@ -15,6 +15,24 @@ def convert(pdf_file):
         blocks = page.get_text("dict")["blocks"]
         elements = []
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
@@ -24,21 +42,15 @@ def convert(pdf_file):
                         elements.append((y, text.strip()))
             elif b["type"] == 1:  # Imagen
                 y = b["bbox"][1]
-                img = page.get_image_list(full=True)
-                if img:
-                    xref = img[0][0]
-                    pix = fitz.Pixmap(doc, xref)
-                    img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
-                    if pix.n > 4:  # CMYK
-                        pix = fitz.Pixmap(fitz.csRGB, pix)
-                    pix.save(img_path)
-                    pix = None
                     elements.append((y, f"![imagen]({img_path})"))
-                    image_counter += 1
-        # Ordenar por posición vertical (y)
         elements.sort(key=lambda x: x[0])
         for _, content in elements:
@@ -48,11 +60,6 @@ def convert(pdf_file):
 gr.Interface(
     convert,
-    inputs=[
-        gr.File(label="Upload PDF", type="filepath"),
-    ],
-    outputs=[
-        gr.Text(label="Markdown"),
-        gr.JSON(label="Metadata"),
-    ],
 ).launch()

         blocks = page.get_text("dict")["blocks"]
         elements = []
+        # Extraemos la lista de imágenes en esta página, con sus xrefs
+        image_list = page.get_images(full=True)
+        xref_to_image_path = {}
+        for img in image_list:
+            xref = img[0]
+            pix = fitz.Pixmap(doc, xref)
+            img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
+            if pix.n > 4:  # si es CMYK, convertir a RGB
+                pix = fitz.Pixmap(fitz.csRGB, pix)
+            pix.save(img_path)
+            pix = None
+            xref_to_image_path[xref] = img_path
+            image_counter += 1
+        # Recorremos bloques y reconstruimos texto+imagenes en orden vertical
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                         elements.append((y, text.strip()))
             elif b["type"] == 1:  # Imagen
                 y = b["bbox"][1]
+                # El bloque de imagen tiene su xref
+                xref = b.get("image", {}).get("xref", None)
+                if xref and xref in xref_to_image_path:
+                    img_path = xref_to_image_path[xref]
                     elements.append((y, f"![imagen]({img_path})"))
+                else:
+                    # Si no encontramos la imagen, dejamos marcador vacío
+                    elements.append((y, "[imagen]()"))
         elements.sort(key=lambda x: x[0])
         for _, content in elements:
 gr.Interface(
     convert,
+    inputs=[gr.File(label="Upload PDF", type="filepath")],
+    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
 ).launch()