pdf-to-markdown

Sleeping

App Files Files Community

Biifruu committed on May 30

Commit

e62d9f5

verified ·

1 Parent(s): 4337f3a

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -51

app.py CHANGED Viewed

@@ -5,47 +5,19 @@ import os
 import tempfile
 import ocrmypdf
-def extract_text_from_pdf(doc):
-    full_text = ""
-    for page in doc:
-        text = page.get_text()
-        if text:
-            full_text += text + "\n\n"
-    return full_text.strip()
-@spaces.GPU
-def convert(pdf_file):
-    # Abrimos el PDF original
-    doc = fitz.open(pdf_file)
-    # Extraemos texto
-    full_text = extract_text_from_pdf(doc)
-    # Si texto es muy corto, aplicamos OCR
-    if len(full_text) < 100:
-        # Creamos archivo temporal para PDF OCR
-        temp_ocr_pdf = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
-        temp_ocr_pdf.close()
-        # Aplicar OCR (forzamos OCR en todas las páginas)
-        ocrmypdf.ocr(pdf_file, temp_ocr_pdf.name, force_ocr=True)
-        # Abrimos PDF OCR
-        doc = fitz.open(temp_ocr_pdf.name)
-        full_text = extract_text_from_pdf(doc)
     markdown_output = ""
     image_dir = "extracted_images"
     os.makedirs(image_dir, exist_ok=True)
     image_counter = 0
-    for page_number, page in enumerate(doc):
         blocks = page.get_text("dict")["blocks"]
         elements = []
-        # Extraemos todas las imágenes con sus xrefs
         image_list = page.get_images(full=True)
-        xref_to_image_path = {}
         for img in image_list:
             xref = img[0]
@@ -57,39 +29,49 @@ def convert(pdf_file):
             pix.save(img_path)
             pix = None
-            xref_to_image_path[xref] = img_path
             image_counter += 1
-        # Procesamos bloques en orden vertical (y)
         for b in blocks:
             if b["type"] == 0:  # Texto
                 for line in b["lines"]:
-                    for span in line["spans"]:
-                        y = span["bbox"][1]
-                        text = span["text"]
-                        elements.append((y, text.strip()))
             elif b["type"] == 1:  # Imagen
-                y = b["bbox"][1]
-                xref = b.get("image", None)
-                # Insertamos link vacío en markdown para la imagen
-                if xref and xref in xref_to_image_path:
-                    # Aquí ponemos link vacío (sin destino) como pide
-                    elements.append((y, f"![imagen]()"))
-                else:
-                    elements.append((y, "[imagen]()"))
         elements.sort(key=lambda x: x[0])
         for _, content in elements:
             markdown_output += content + "\n\n"
-    # Metadata vacío o puedes agregar si quieres
-    metadata = {}
-    return markdown_output.strip(), metadata
 gr.Interface(
-    convert,
     inputs=[gr.File(label="Upload PDF", type="filepath")],
-    outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
 ).launch()

 import tempfile
 import ocrmypdf
+def extract_text_markdown(doc):
     markdown_output = ""
     image_dir = "extracted_images"
     os.makedirs(image_dir, exist_ok=True)
     image_counter = 0
+    for page in doc:
         blocks = page.get_text("dict")["blocks"]
         elements = []
+        # Extraer imágenes y guardar para asignar link
         image_list = page.get_images(full=True)
+        xref_to_placeholder = {}
         for img in image_list:
             xref = img[0]
             pix.save(img_path)
             pix = None
+            xref_to_placeholder[xref] = f"![imagen]()"
             image_counter += 1
         for b in blocks:
+            y = b["bbox"][1]
             if b["type"] == 0:  # Texto
+                paragraph = ""
                 for line in b["lines"]:
+                    line_text = " ".join([span["text"].strip() for span in line["spans"]])
+                    paragraph += line_text + " "
+                paragraph = paragraph.strip()
+                if paragraph:
+                    elements.append((y, paragraph))
             elif b["type"] == 1:  # Imagen
+                xref = b.get("image")
+                elements.append((y, "![imagen]()"))
         elements.sort(key=lambda x: x[0])
         for _, content in elements:
             markdown_output += content + "\n\n"
+    return markdown_output.strip()
+@spaces.GPU
+def convert(pdf_file):
+    original_doc = fitz.open(pdf_file)
+    plain_text = "\n".join([page.get_text() for page in original_doc])
+    # Si es imagen escaneada sin texto, aplicamos OCR
+    if len(plain_text.strip()) < 100:
+        ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
+        ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
+        doc = fitz.open(ocr_temp_path)
+    else:
+        doc = original_doc
+    markdown = extract_text_markdown(doc)
+    metadata = {}  # Puedes agregar metadatos si quieres
+    return markdown, metadata
 gr.Interface(
+    fn=convert,
     inputs=[gr.File(label="Upload PDF", type="filepath")],
+    outputs=[gr.Text(label="Markdown crudo"), gr.JSON(label="Metadata")],
 ).launch()