# pdf-to-markdown / app.py
# Copied from the Hugging Face file view: commit "Update app.py" (14d9ac1, verified),
# shown next to Biifruu's picture; file size 3.7 kB.
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
import os
def _group_rows_by_cell_count(rows):
    """Split ``(y, cells)`` rows into runs of consecutive rows with equal cell counts."""
    runs = []
    for row in rows:
        if runs and len(runs[-1][-1][1]) == len(row[1]):
            runs[-1].append(row)
        else:
            runs.append([row])
    return runs


def _collect_page_items(page, image_counter):
    """Gather one page's output items as ``(top, bottom, text, is_table)`` tuples.

    Returns the items (unsorted) together with the updated image counter.
    """
    items = []
    candidate_rows = []  # Lines with several non-empty spans: possible table rows.
    for block in page.get_text("dict")["blocks"]:
        if block["type"] == 0:  # Text block
            for line in block["lines"]:
                top = line["bbox"][1]
                spans = line["spans"]
                cells = [
                    (span["bbox"][0], span["text"].strip())
                    for span in spans
                    if span["text"].strip()
                ]
                if len(cells) > 1:
                    candidate_rows.append((top, cells))
                    continue
                line_text = " ".join(span["text"] for span in spans).strip()
                if line_text:
                    items.append((top, top, line_text, False))
        elif block["type"] == 1:  # Image block
            top = block["bbox"][1]
            items.append((top, top, f"[imagen_{image_counter}]()", False))
            image_counter += 1

    for run in _group_rows_by_cell_count(candidate_rows):
        if len(run) >= 3:  # Only treat it as a table with enough similar rows.
            table = convert_table_block_to_markdown(run).rstrip("\n")
            items.append((run[0][0], run[-1][0], table, True))
        else:
            # Too short to be a table: keep each row as ordinary text, placed
            # at its own vertical position.
            items.extend(
                (top, top, " ".join(cell[1] for cell in cells), False)
                for top, cells in run
            )
    return items, image_counter


def extract_text_markdown(doc):
    """Convert every page of a PyMuPDF document into Markdown text.

    Text lines are read from ``page.get_text("dict")``. A line made of several
    non-empty spans is a candidate table row: three or more consecutive
    candidates with the same number of cells are rendered through
    ``convert_table_block_to_markdown``; shorter runs are emitted as plain
    text. Image blocks become ``[imagen_N]()`` placeholders, numbered across
    the whole document.

    All items of a page are emitted sorted by their top ``y`` coordinate, so
    tables and multi-span lines stay in reading order rather than being
    written ahead of the rest of the page. A blank line is inserted when two
    consecutive items are more than 10 units apart vertically, and around
    tables. Every page is followed by a ``---`` rule.

    Args:
        doc: Iterable of pages exposing ``get_text("dict")`` (for example an
            open ``fitz`` document).

    Returns:
        The Markdown for the whole document, stripped of leading and trailing
        whitespace.
    """
    chunks = []
    image_counter = 1
    for page in doc:
        items, image_counter = _collect_page_items(page, image_counter)
        # Stable sort: items sharing the same top keep their collection order.
        items.sort(key=lambda item: item[0])

        previous = None
        for top, bottom, text, is_table in items:
            if previous is not None:
                previous_bottom, previous_is_table = previous
                if is_table or previous_is_table or abs(top - previous_bottom) > 10:
                    chunks.append("\n")
            chunks.append(text + "\n")
            previous = (bottom, is_table)

        chunks.append("\n---\n\n")
    return "".join(chunks).strip()
def convert_table_block_to_markdown(block):
    """Render a block of tabular rows as a Markdown pipe table.

    Args:
        block: Sequence of ``(y, cells)`` rows, where ``cells`` is a sequence
            of ``(x, text)`` pairs. Only the text of each cell is used.

    Returns:
        The table as a string ending in a newline. When the block has more
        than one row, the first row is treated as the header and a ``---``
        separator row is inserted after it. An empty block yields ``"\\n"``.
    """
    # Escape literal pipes so cell text cannot be mistaken for a column
    # delimiter.
    rows = [[cell[1].replace("|", "\\|") for cell in cells] for _, cells in block]
    lines = ["| " + " | ".join(row) + " |" for row in rows]
    if len(lines) > 1:
        # Size the separator from the header's cells, not from the pipes in
        # the rendered text, which may include escaped pipes.
        num_cols = max(len(rows[0]), 1)
        separator = "| " + " | ".join(["---"] * num_cols) + " |"
        lines.insert(1, separator)
    return "\n".join(lines) + "\n"
@spaces.GPU
def convert(pdf_file):
    """Convert a PDF file to Markdown, running OCR first when it has little text.

    The PDF is opened with PyMuPDF and its plain text is measured. If fewer
    than 100 non-whitespace-trimmed characters are found, the file is passed
    through ``ocrmypdf.ocr`` (with ``force_ocr=True``) and the OCR result is
    used instead. The OCR output lives in a temporary directory that is
    removed before returning, and every opened document is closed.

    Args:
        pdf_file: Path of the PDF to convert (the Gradio input below is
            declared with ``type="filepath"``).

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently an empty
        dict.
    """
    metadata = {}  # Placeholder: document metadata can be added here if needed.

    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        # Apply OCR only when the PDF has (almost) no extractable text.
        if len(plain_text.strip()) >= 100:
            return extract_text_markdown(original_doc), metadata

    with tempfile.TemporaryDirectory() as work_dir:
        ocr_path = os.path.join(work_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        # Close the OCR document before the directory is cleaned up.
        with fitz.open(ocr_path) as ocr_doc:
            markdown = extract_text_markdown(ocr_doc)
    return markdown, metadata
# Gradio UI: one PDF upload as input; the Markdown text and a JSON metadata
# panel as outputs. The app is launched as soon as this module runs.
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
markdown_box = gr.Text(label="Markdown estructurado")
metadata_box = gr.JSON(label="Metadata")

demo = gr.Interface(
    fn=convert,
    inputs=[pdf_input],
    outputs=[markdown_box, metadata_box],
)
demo.launch()