pdf-to-markdown

Running

App Files Community

Biifruu committed 17 days ago

Commit

f3b7c90

verified ·

1 Parent(s): dd21256

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -6

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ def convert(pdf_file):
         blocks = page.get_text("dict")["blocks"]
         elements = []
-        # Extraemos la lista de imágenes en esta página, con sus xrefs
         image_list = page.get_images(full=True)
         xref_to_image_path = {}
@@ -24,7 +24,7 @@ def convert(pdf_file):
             pix = fitz.Pixmap(doc, xref)
             img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
-            if pix.n > 4:  # si es CMYK, convertir a RGB
                 pix = fitz.Pixmap(fitz.csRGB, pix)
             pix.save(img_path)
             pix = None
@@ -32,7 +32,7 @@ def convert(pdf_file):
             xref_to_image_path[xref] = img_path
             image_counter += 1
-        # Recorremos bloques y reconstruimos texto+imagenes en orden vertical
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
@@ -42,13 +42,11 @@ def convert(pdf_file):
                         elements.append((y, text.strip()))
             elif b["type"] == 1:  # Imagen
                 y = b["bbox"][1]
-                # El bloque de imagen tiene su xref
-                xref = b.get("image", {}).get("xref", None)
                 if xref and xref in xref_to_image_path:
                     img_path = xref_to_image_path[xref]
                     elements.append((y, f"![imagen]({img_path})"))
                 else:
-                    # Si no encontramos la imagen, dejamos marcador vacío
                     elements.append((y, "[imagen]()"))
         elements.sort(key=lambda x: x[0])

         blocks = page.get_text("dict")["blocks"]
         elements = []
+        # Extraemos lista de imágenes con sus xrefs
         image_list = page.get_images(full=True)
         xref_to_image_path = {}
             pix = fitz.Pixmap(doc, xref)
             img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
+            if pix.n > 4:  # CMYK -> RGB
                 pix = fitz.Pixmap(fitz.csRGB, pix)
             pix.save(img_path)
             pix = None
             xref_to_image_path[xref] = img_path
             image_counter += 1
+        # Procesar bloques y ordenar por coordenada vertical
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
                         elements.append((y, text.strip()))
             elif b["type"] == 1:  # Imagen
                 y = b["bbox"][1]
+                xref = b.get("image", None)
                 if xref and xref in xref_to_image_path:
                     img_path = xref_to_image_path[xref]
                     elements.append((y, f"![imagen]({img_path})"))
                 else:
                     elements.append((y, "[imagen]()"))
         elements.sort(key=lambda x: x[0])