pdf-to-markdown / app.py
Biifruu's picture
Update app.py
f824c8d verified
raw
history blame
2.73 kB
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
def extract_text_markdown(doc, gap_threshold=10):
    """Render the pages of a document as a single Markdown string.

    Every page is read through ``page.get_text("dict")``. Text lines and image
    blocks are ordered by the top coordinate of their bounding box, a blank
    line is inserted wherever two consecutive elements are further apart than
    ``gap_threshold``, and each page is terminated by a ``---`` rule.

    Args:
        doc: Iterable of page objects whose ``get_text("dict")`` returns a
            mapping with a ``"blocks"`` list. Text blocks (``type == 0``) carry
            ``"lines"`` made of ``"spans"``; image blocks (``type == 1``) only
            need a ``"bbox"``.
        gap_threshold: Minimum vertical distance between two consecutive
            elements that produces a blank line. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        The generated Markdown with surrounding whitespace stripped, or an
        empty string when ``doc`` has no pages.
    """
    parts = []
    image_counter = 1  # images are numbered across the whole document

    for page in doc:
        elements = []  # (top y coordinate, markdown fragment)
        for block in page.get_text("dict")["blocks"]:
            if block["type"] == 0:  # text block
                for line in block["lines"]:
                    fragments = [span["text"].strip() for span in line["spans"]]
                    # Drop empty spans before joining; otherwise a line made
                    # only of empty spans becomes " " and is emitted as a
                    # whitespace-only line.
                    line_text = " ".join(text for text in fragments if text)
                    if line_text:
                        elements.append((line["bbox"][1], line_text))
            elif block["type"] == 1:  # image block
                elements.append((block["bbox"][1], f"[imagen_{image_counter}]()"))
                image_counter += 1

        # list.sort is stable, so elements sharing a y keep their block order.
        elements.sort(key=lambda element: element[0])

        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > gap_threshold:
                parts.append("\n")
            parts.append(content + "\n")
            previous_y = y

        parts.append("\n---\n\n")  # page separator

    return "".join(parts).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF into Markdown, running OCR when it has little text.

    The PDF is opened with PyMuPDF and its plain text is measured. When fewer
    than 100 non-whitespace-trimmed characters are found, the file is passed
    through ``ocrmypdf.ocr(..., force_ocr=True)`` and the OCR output is used
    instead of the original document.

    Args:
        pdf_file: Path of the PDF to convert (the Gradio file input is
            configured with ``type="filepath"``).

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always an
        empty dict.

    Raises:
        gr.Error: If no file was provided.
    """
    # Function-scope import so this block does not rely on a new top-level import.
    import os

    if not pdf_file:
        # Without this guard the failure surfaces later as an obscure error.
        raise gr.Error("Sube un PDF antes de convertir.")

    min_text_chars = 100  # below this the PDF is treated as needing OCR
    metadata = {}  # placeholder: metadata can be added here if wanted

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        if len(plain_text.strip()) >= min_text_chars:
            return extract_text_markdown(original_doc), metadata

    # The OCR output lives in a temporary directory that is removed on exit,
    # so no stray PDF is left behind after each conversion.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ocr_path = os.path.join(tmp_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        with fitz.open(ocr_path) as ocr_doc:
            return extract_text_markdown(ocr_doc), metadata
# Gradio interface
with gr.Blocks(title="PDF → Markdown") as demo:
    gr.Markdown("### PDF → Markdown con enlaces de imagen y botón copiar")
    pdf_input = gr.File(label="Sube tu PDF", type="filepath")
    markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="markdown-textbox")
    metadata_output = gr.JSON(label="Metadata")
    convert_btn = gr.Button("Convertir PDF")
    copy_btn = gr.Button("📋 Copiar Markdown")

    convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])

    # Copy button: the JavaScript is attached through the event listener's
    # `js` argument. A <script> tag placed inside gr.HTML is not executed, so
    # a handler defined there would never exist when the button is clicked.
    copy_btn.click(
        fn=None,
        inputs=None,
        outputs=None,
        js="""
        () => {
            const textarea = document.querySelector('#markdown-textbox textarea');
            if (textarea) {
                textarea.select();
                document.execCommand('copy');
            } else {
                alert('No se pudo copiar el texto');
            }
        }
        """,
    )

demo.launch()