pdf-to-markdown

Running

App Files Files Community

Biifruu committed on 17 days ago

Commit

beb65ba

verified ·

1 Parent(s): d4b4544

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -8

app.py CHANGED Viewed

@@ -1,30 +1,48 @@
 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
 @spaces.GPU
 def convert(pdf_file):
     # Removed (pre-commit) side of the diff: turns a PDF into plain Markdown
     # paragraphs and emits a fixed placeholder string for every image block.
     # pdf_file is passed straight to fitz.open.
     # Returns a tuple: (stripped Markdown string, empty dict).
     doc = fitz.open(pdf_file)
     markdown_output = ""
-    for page in doc:
         blocks = page.get_text("dict")["blocks"]
         # (y, content) pairs collected for the current page.
         elements = []
         for b in blocks:
-            if b["type"] == 0:  # text
                 for line in b["lines"]:
                     for span in line["spans"]:
-                        elements.append((span["bbox"][1], span["text"]))  # y, text
-            elif b["type"] == 1:  # image
-                y_pos = b["bbox"][1]
-                elements.append((y_pos, "[imagen]()"))
-        # Sort by vertical position
         elements.sort(key=lambda x: x[0])
         for _, content in elements:
-            markdown_output += content.strip() + "\n\n"
     return markdown_output.strip(), {}

 import spaces
 import gradio as gr
 import fitz  # PyMuPDF
+import os
 def _save_block_image(block, img_path):
     """Write the image carried by a PyMuPDF image block to *img_path*.

     The picture is built from the block's own ``image`` bytes, so each image
     block yields its own picture instead of the first image listed for the
     page, and inline images (which have no xref) are handled as well.
     """
     pix = fitz.Pixmap(block["image"])
     # ``n`` counts every component including alpha. More than three colour
     # components means CMYK, which PNG cannot store, so convert to RGB first.
     if pix.n - pix.alpha > 3:
         pix = fitz.Pixmap(fitz.csRGB, pix)
     pix.save(img_path)


 @spaces.GPU
 def convert(pdf_file):
     """Convert a PDF into Markdown, extracting embedded images to disk.

     Text spans and image blocks of every page are ordered by the top ``y``
     coordinate of their bounding box and emitted as paragraphs separated by
     blank lines. Images are saved as ``extracted_images/imagen_<n>.png``
     (numbered across the whole document) and referenced with Markdown image
     syntax. Spans that contain only whitespace are skipped so they do not
     produce empty paragraphs.

     Args:
         pdf_file: Path of the PDF; passed straight to ``fitz.open``.

     Returns:
         A ``(markdown, {})`` tuple: the generated Markdown string and an
         empty dict.
     """
     image_dir = "extracted_images"
     os.makedirs(image_dir, exist_ok=True)
     image_counter = 0
     paragraphs = []

     # The context manager closes the document even if extraction fails.
     with fitz.open(pdf_file) as doc:
         for page in doc:
             elements = []  # (y, markdown) pairs for the current page
             for b in page.get_text("dict")["blocks"]:
                 if b["type"] == 0:  # text block
                     for line in b["lines"]:
                         for span in line["spans"]:
                             text = span["text"].strip()
                             if text:
                                 elements.append((span["bbox"][1], text))
                 elif b["type"] == 1:  # image block
                     img_path = os.path.join(
                         image_dir, f"imagen_{image_counter}.png"
                     )
                     _save_block_image(b, img_path)
                     elements.append((b["bbox"][1], f"![imagen]({img_path})"))
                     image_counter += 1
             # Sort by vertical position; the sort is stable, so spans that
             # share a baseline keep their reading order.
             elements.sort(key=lambda x: x[0])
             paragraphs.extend(content for _, content in elements)

     return "\n\n".join(paragraphs), {}