pdf-to-markdown / app.py
Biifruu's picture
Update app.py
0c78c99 verified
raw
history blame
2.92 kB
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
def extract_text_markdown(doc, *, paragraph_gap=10):
    """Render each page of a document as plain Markdown-style text.

    For every page, text lines and image blocks are collected together with
    the top y-coordinate of their bounding box, sorted top to bottom, and
    written one per line. A blank line is inserted whenever two consecutive
    elements are more than ``paragraph_gap`` units apart vertically, and a
    ``---`` rule is written after every page.

    Args:
        doc: Iterable of page objects exposing ``get_text("dict")`` that
            returns a mapping with a ``"blocks"`` list (the PyMuPDF page
            dictionary layout: ``type``, ``bbox``, ``lines``, ``spans``).
        paragraph_gap: Minimum vertical distance between consecutive
            elements that triggers a blank separator line.

    Returns:
        The generated text with leading and trailing whitespace stripped.
        An empty document yields an empty string.
    """
    chunks = []
    # Image placeholders are numbered across the whole document, not per page.
    image_counter = 1

    for page in doc:
        elements = []  # (top y-coordinate, rendered text) pairs for this page

        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    # Drop blank spans first: otherwise a line made only of
                    # whitespace spans would render as a stray " | " and
                    # slip past the emptiness check.
                    texts = [span["text"].strip() for span in line["spans"]]
                    texts = [text for text in texts if text]
                    if texts:
                        # Several spans on one line are shown as columns.
                        elements.append((line["bbox"][1], " | ".join(texts)))
            elif block["type"] == 1:  # image block
                elements.append(
                    (block["bbox"][1], f"[imagen_{image_counter}]()")
                )
                image_counter += 1

        # Stable sort on the y-coordinate only, so elements sharing the same
        # height keep their extraction order.
        elements.sort(key=lambda item: item[0])

        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > paragraph_gap:
                chunks.append("\n")
            chunks.append(content + "\n")
            previous_y = y

        chunks.append("\n---\n\n")

    return "".join(chunks).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF into Markdown text, running OCR when needed.

    The PDF's embedded text is read first. If fewer than 100 non-whitespace-
    trimmed characters are found, the file is treated as scanned: it is run
    through ``ocrmypdf`` with ``force_ocr=True`` and the OCR result is used
    for extraction instead.

    Args:
        pdf_file: Filesystem path of the uploaded PDF. A falsy value (no
            file uploaded) produces empty outputs instead of an error.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always an
        empty dict.
    """
    import os

    if not pdf_file:
        # Nothing was uploaded; return empty outputs for both components.
        return "", {}

    min_text_chars = 100  # below this the PDF is assumed to lack a text layer

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        if len(plain_text.strip()) >= min_text_chars:
            return extract_text_markdown(original_doc), {}

    # Write the OCR output into a throwaway directory so the intermediate
    # PDF is removed once extraction is done instead of piling up on disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ocr_path = os.path.join(tmp_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        # Close the OCR document before the directory is cleaned up.
        with fitz.open(ocr_path) as ocr_doc:
            markdown = extract_text_markdown(ocr_doc)

    metadata = {}  # placeholder; fill in if document metadata is needed
    return markdown, metadata
# Gradio Blocks UI: upload a PDF, show the generated Markdown and metadata,
# and offer a button that copies the Markdown to the clipboard.
with gr.Blocks(title="Extractor PDF a Markdown") as demo:
    gr.Markdown("### PDF → Markdown con imágenes como enlaces y botón de copiar")

    pdf_input = gr.File(label="Sube tu PDF", type="filepath")
    markdown_output = gr.Textbox(label="Markdown generado", lines=25)
    metadata_output = gr.JSON(label="Metadata")

    with gr.Row():
        convert_btn = gr.Button("Convertir PDF")
        copy_btn = gr.Button("📋 Copiar Markdown")

    convert_btn.click(
        fn=convert,
        inputs=pdf_input,
        outputs=[markdown_output, metadata_output],
    )

    # Copy on the client only: the textbox value is handed to the JS function
    # as its argument, so no server round trip or lookup of textareas by DOM
    # position is needed. The alert is shown once the asynchronous clipboard
    # write has actually settled.
    # NOTE(review): `_js` is the keyword the original code used; newer Gradio
    # releases name this parameter `js` — confirm against the pinned version.
    copy_btn.click(
        None,
        inputs=markdown_output,
        outputs=None,
        _js="""
        (text) => {
            navigator.clipboard.writeText(text).then(
                () => alert("¡Markdown copiado al portapapeles!"),
                (err) => alert("No se pudo copiar el Markdown: " + err)
            );
        }
        """,
    )

demo.launch()