pdf-to-markdown / app.py
Biifruu's picture
Update app.py
7064c41 verified
raw
history blame
2.93 kB
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
import os
def extract_text_markdown(doc):
    """Render every page of *doc* as a rough Markdown string.

    For each page, the blocks returned by ``page.get_text("dict")`` are
    scanned. Text lines made of several spans are treated as rows of a
    rudimentary table (one cell per span); lines with a single span and
    image blocks are emitted as stand-alone elements ordered by their top
    ``y`` coordinate. Each page is terminated by a ``---`` separator.

    Args:
        doc: An iterable of page objects exposing ``get_text("dict")`` and
            returning a mapping with a ``"blocks"`` list.

    Returns:
        The Markdown text for the whole document, stripped of leading and
        trailing whitespace (an empty string for a document with no pages).
    """
    parts = []
    image_counter = 1  # images are numbered across the whole document

    for page in doc:
        table_rows = []  # (y, [(x, text), ...]) for lines with several spans
        elements = []  # (y, markdown) for stand-alone lines and images

        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    cells = [
                        (span["bbox"][0], span["text"])
                        for span in line["spans"]
                    ]
                    line_y = line["bbox"][1]
                    if len(cells) > 1:
                        table_rows.append((line_y, cells))
                    else:
                        line_text = " ".join(text for _, text in cells).strip()
                        if line_text:
                            elements.append((line_y, line_text))
            elif block["type"] == 1:  # image block
                elements.append(
                    (block["bbox"][1], f"[imagen_{image_counter}]()")
                )
                image_counter += 1

        if len(table_rows) >= 2:
            # Rudimentary table detection: the topmost multi-span line
            # becomes the header, the remaining ones become body rows.
            table_rows.sort(key=lambda row: row[0])
            for index, (_, row) in enumerate(table_rows):
                # Escape literal pipes so they cannot split a cell in two.
                rendered = [
                    text.strip().replace("|", "\\|") for _, text in row
                ]
                parts.append("| " + " | ".join(rendered) + " |\n")
                if index == 0:
                    parts.append(
                        "| " + " | ".join(["---"] * len(rendered)) + " |\n"
                    )
            parts.append("\n")
        else:
            # A lone multi-span line is not a table; keep its text as a
            # normal line instead of silently dropping it.
            for line_y, row in table_rows:
                line_text = " ".join(text for _, text in row).strip()
                if line_text:
                    elements.append((line_y, line_text))

        # Emit stand-alone elements top to bottom, inserting a blank line
        # whenever the vertical gap to the previous element exceeds 10 units.
        elements.sort(key=lambda item: item[0])
        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > 10:
                parts.append("\n")
            parts.append(content + "\n")
            previous_y = y

        parts.append("\n---\n\n")  # page separator

    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert the PDF at *pdf_file* to Markdown, running OCR when needed.

    The embedded text layer is read first. If it holds fewer than 100
    non-whitespace-trimmed characters, the file is assumed to be scanned:
    it is OCRed into a temporary PDF and that result is converted instead.

    Args:
        pdf_file: Filesystem path of the uploaded PDF.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always
        an empty dict.
    """
    metadata = {}  # placeholder: document metadata can be added here

    # Context managers make sure every opened document is closed again.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        if len(plain_text.strip()) >= 100:
            return extract_text_markdown(original_doc), metadata

    # Too little embedded text: OCR into a scratch directory that is removed
    # afterwards, so no temporary PDF is left behind on disk.
    with tempfile.TemporaryDirectory() as ocr_dir:
        ocr_path = os.path.join(ocr_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        with fitz.open(ocr_path) as ocr_doc:
            markdown = extract_text_markdown(ocr_doc)

    return markdown, metadata
# Build the web UI: one PDF upload as input, the generated Markdown and a
# metadata JSON view as outputs, then start the Gradio server.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
markdown_view = gr.Text(label="Markdown estructurado")
metadata_view = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_view, metadata_view],
)
demo.launch()