pdf-to-markdown

Running

App Files Files Community

Biifruu committed 14 days ago

Commit

4337f3a

verified ·

1 Parent(s): f3b7c90

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -6

app.py CHANGED Viewed

@@ -2,10 +2,38 @@ import spaces
 import gradio as gr
 import fitz  # PyMuPDF
 import os
 @spaces.GPU
 def convert(pdf_file):
     doc = fitz.open(pdf_file)
     markdown_output = ""
     image_dir = "extracted_images"
     os.makedirs(image_dir, exist_ok=True)
@@ -15,7 +43,7 @@ def convert(pdf_file):
         blocks = page.get_text("dict")["blocks"]
         elements = []
-        # Extraemos lista de imágenes con sus xrefs
         image_list = page.get_images(full=True)
         xref_to_image_path = {}
@@ -24,7 +52,7 @@ def convert(pdf_file):
             pix = fitz.Pixmap(doc, xref)
             img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
-            if pix.n > 4:  # CMYK -> RGB
                 pix = fitz.Pixmap(fitz.csRGB, pix)
             pix.save(img_path)
             pix = None
@@ -32,7 +60,7 @@ def convert(pdf_file):
             xref_to_image_path[xref] = img_path
             image_counter += 1
-        # Procesar bloques y ordenar por coordenada vertical
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
@@ -43,9 +71,10 @@ def convert(pdf_file):
             elif b["type"] == 1:  # Imagen
                 y = b["bbox"][1]
                 xref = b.get("image", None)
                 if xref and xref in xref_to_image_path:
-                    img_path = xref_to_image_path[xref]
-                    elements.append((y, f"![imagen]({img_path})"))
                 else:
                     elements.append((y, "[imagen]()"))
@@ -54,7 +83,10 @@ def convert(pdf_file):
         for _, content in elements:
             markdown_output += content + "\n\n"
-    return markdown_output.strip(), {}
 gr.Interface(
     convert,

 import gradio as gr
 import fitz  # PyMuPDF
 import os
+import tempfile
+import ocrmypdf
+def extract_text_from_pdf(doc):
+    """Return the text of every page in *doc* as one stripped string.
+
+    Iterates over ``doc``, calls ``get_text()`` on each page, and appends
+    each non-empty page text followed by a blank line. Leading and trailing
+    whitespace is stripped from the combined result before it is returned,
+    so a document with no extractable text yields an empty string.
+    """
+    full_text = ""
+    for page in doc:
+        text = page.get_text()
+        # Pages with no text are skipped so they add no blank separators.
+        if text:
+            full_text += text + "\n\n"
+    # Strip removes the trailing separator left after the last page.
+    return full_text.strip()
 @spaces.GPU
 def convert(pdf_file):
+    # Abrimos el PDF original
     doc = fitz.open(pdf_file)
+    # Extraemos texto
+    full_text = extract_text_from_pdf(doc)
+    # Si texto es muy corto, aplicamos OCR
+    if len(full_text) < 100:
+        # Creamos archivo temporal para PDF OCR
+        temp_ocr_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
+        temp_ocr_pdf.close()
+        # Aplicar OCR (forzamos OCR en todas las páginas)
+        ocrmypdf.ocr(pdf_file, temp_ocr_pdf.name, force_ocr=True)
+        # Abrimos PDF OCR
+        doc = fitz.open(temp_ocr_pdf.name)
+        full_text = extract_text_from_pdf(doc)
     markdown_output = ""
     image_dir = "extracted_images"
     os.makedirs(image_dir, exist_ok=True)
         blocks = page.get_text("dict")["blocks"]
         elements = []
+        # Extraemos todas las imágenes con sus xrefs
         image_list = page.get_images(full=True)
         xref_to_image_path = {}
             pix = fitz.Pixmap(doc, xref)
             img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
+            if pix.n > 4:
                 pix = fitz.Pixmap(fitz.csRGB, pix)
             pix.save(img_path)
             pix = None
             xref_to_image_path[xref] = img_path
             image_counter += 1
+        # Procesamos bloques en orden vertical (y)
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
             elif b["type"] == 1:  # Imagen
                 y = b["bbox"][1]
                 xref = b.get("image", None)
+                # Insertamos link vacío en markdown para la imagen
                 if xref and xref in xref_to_image_path:
+                    # Aquí ponemos link vacío (sin destino) como pide
+                    elements.append((y, f"![imagen]()"))
                 else:
                     elements.append((y, "[imagen]()"))
         for _, content in elements:
             markdown_output += content + "\n\n"
+    # Metadata vacío o puedes agregar si quieres
+    metadata = {}
+    return markdown_output.strip(), metadata
 gr.Interface(
     convert,