# pdf-to-markdown / app.py
# Hugging Face page header (not part of the program):
# Biifruu · "Update app.py" · commit e62d9f5 (verified) · 2.38 kB
import spaces
import gradio as gr
import fitz # PyMuPDF
import os
import tempfile
import ocrmypdf
def extract_text_markdown(doc):
    """Convert the pages of a PDF document into a Markdown string.

    For every page, each image listed by ``page.get_images(full=True)`` is
    written as a PNG file into the ``extracted_images`` directory (created in
    the current working directory if it does not exist). Text blocks and
    image blocks are then emitted in top-to-bottom order, separated by blank
    lines.

    Args:
        doc: An iterable of pages (in this file, a PyMuPDF document). Each
            page must provide ``get_text("dict")`` and
            ``get_images(full=True)``.

    Returns:
        The Markdown text with surrounding whitespace removed. Image blocks
        appear as the placeholder ``![imagen]()``; the link target is left
        empty, so the saved PNG files are not referenced from the output.
    """
    image_dir = "extracted_images"
    os.makedirs(image_dir, exist_ok=True)
    image_counter = 0
    chunks = []

    for page in doc:
        # Save every image listed for this page as a numbered PNG file.
        for img in page.get_images(full=True):
            xref = img[0]
            pix = fitz.Pixmap(doc, xref)
            # PNG can only hold grayscale or RGB data. Convert anything with
            # more than three colour components (e.g. CMYK), whether or not
            # an alpha channel is present.
            if pix.n - pix.alpha > 3:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save(os.path.join(image_dir, f"imagen_{image_counter}.png"))
            pix = None  # drop the reference so the pixmap can be freed
            image_counter += 1

        # Collect (top coordinate, markdown) pairs for the page's blocks.
        elements = []
        for block in page.get_text("dict")["blocks"]:
            top = block["bbox"][1]
            if block["type"] == 0:  # text block
                line_texts = [
                    " ".join(span["text"].strip() for span in line["spans"])
                    for line in block["lines"]
                ]
                paragraph = " ".join(line_texts).strip()
                if paragraph:
                    elements.append((top, paragraph))
            elif block["type"] == 1:  # image block
                elements.append((top, "![imagen]()"))

        # Order by vertical position only; the sort is stable, so blocks
        # sharing the same top coordinate keep their extraction order.
        elements.sort(key=lambda item: item[0])
        chunks.extend(content for _, content in elements)

    return "\n\n".join(chunks).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF into Markdown, running OCR when needed.

    The PDF's plain text is extracted first. If fewer than 100
    non-whitespace-trimmed characters are found, the file is treated as a
    scanned document: it is run through ``ocrmypdf`` with ``force_ocr=True``
    into a temporary PDF, and the Markdown is extracted from that result
    instead.

    Args:
        pdf_file: Path to the PDF file (the Gradio input uses
            ``type="filepath"``). A missing upload (``None`` or empty path)
            yields empty output.

    Returns:
        A ``(markdown, metadata)`` tuple, where ``metadata`` is currently an
        empty dict.
    """
    if not pdf_file:
        return "", {}

    metadata = {}  # Metadata can be added here if desired.

    # Close the source document as soon as we are done with it.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        if len(plain_text.strip()) >= 100:
            return extract_text_markdown(original_doc), metadata

    # Scanned image without a usable text layer: apply OCR.
    # Reserve a path only; close the handle so ocrmypdf can write to it.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    tmp.close()
    ocr_temp_path = tmp.name
    try:
        ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
        with fitz.open(ocr_temp_path) as ocr_doc:
            markdown = extract_text_markdown(ocr_doc)
    finally:
        # The temp file was created with delete=False, so remove it here
        # whether or not OCR succeeded.
        if os.path.exists(ocr_temp_path):
            os.remove(ocr_temp_path)

    return markdown, metadata
# Build the Gradio UI: one PDF upload in, raw Markdown text and a JSON
# metadata panel out. The interface is launched as soon as the module runs.
demo = gr.Interface(
    fn=convert,
    inputs=[gr.File(label="Upload PDF", type="filepath")],
    outputs=[
        gr.Text(label="Markdown crudo"),
        gr.JSON(label="Metadata"),
    ],
)
demo.launch()