Spaces:
Running
Running
File size: 2,921 Bytes
145d936 d4b4544 4337f3a 7b1bb08 4337f3a e62d9f5 3920f3b 564947a c1d7645 e62d9f5 d4b4544 145d936 d4b4544 e62d9f5 beb65ba d4b4544 3e3d3c7 58c62dc 564947a 58c62dc 564947a 58c62dc 564947a 3e3d3c7 beb65ba 891d450 beb65ba d4b4544 3e3d3c7 7b1bb08 3e3d3c7 b3fecd4 d4b4544 e62d9f5 4337f3a e62d9f5 0c78c99 e62d9f5 145d936 0c78c99 58c62dc 564947a 58c62dc 0c78c99 58c62dc 564947a 0c78c99 58c62dc 0c78c99 58c62dc 0c78c99 58c62dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
import spaces
import gradio as gr
import fitz # PyMuPDF
import ocrmypdf
import tempfile
def extract_text_markdown(doc, *, line_gap=10):
    """Render the pages of *doc* as a Markdown-like text dump.

    Each page is read through ``page.get_text("dict")["blocks"]``. Text
    blocks (``type == 0``) contribute one output line per layout line;
    when a line has several non-empty spans they are joined with ``" | "``.
    Image blocks (``type == 1``) contribute a numbered placeholder link
    ``[imagen_N]()``; the counter runs across the whole document.

    Elements of a page are ordered by the top ``y`` coordinate of their
    bounding box, and a blank line is inserted whenever two consecutive
    elements are more than *line_gap* units apart vertically. Every page
    is terminated by a ``---`` separator.

    Args:
        doc: Iterable of page objects exposing ``get_text("dict")``
            (presumably a PyMuPDF ``Document`` -- confirm against callers).
        line_gap: Vertical distance above which a blank line is inserted
            between consecutive elements. Defaults to 10, the value that
            was previously hard-coded.

    Returns:
        The generated text with leading and trailing whitespace stripped.
    """
    pieces = []
    image_counter = 1

    for page in doc:
        elements = []
        for block in page.get_text("dict")["blocks"]:
            block_type = block["type"]
            if block_type == 0:  # text block
                for line in block["lines"]:
                    # Drop whitespace-only spans so they cannot produce
                    # dangling " | " separators or separator-only lines.
                    texts = [span["text"].strip() for span in line["spans"]]
                    texts = [text for text in texts if text]
                    if texts:
                        elements.append((line["bbox"][1], " | ".join(texts)))
            elif block_type == 1:  # image block
                placeholder = f"[imagen_{image_counter}]()"
                elements.append((block["bbox"][1], placeholder))
                image_counter += 1

        # Stable sort: elements sharing the same y keep extraction order.
        elements.sort(key=lambda item: item[0])

        previous_y = None
        for y, content in elements:
            if previous_y is not None and abs(y - previous_y) > line_gap:
                pieces.append("\n")
            pieces.append(content + "\n")
            previous_y = y

        pieces.append("\n---\n\n")

    return "".join(pieces).strip()
@spaces.GPU
def convert(pdf_file):
    """Convert an uploaded PDF to Markdown, running OCR when it has little text.

    The PDF is opened with ``fitz`` and the plain text of all pages is
    collected. If that text is shorter than 100 characters (after
    stripping), the file is run through ``ocrmypdf.ocr(..., force_ocr=True)``
    and the OCR'd copy is used for extraction instead.

    Args:
        pdf_file: Filesystem path of the PDF to convert. A falsy value
            (no file supplied) yields an empty result instead of an error.

    Returns:
        A ``(markdown, metadata)`` tuple; ``metadata`` is currently always
        an empty dict.
    """
    import os  # function-scope import: only needed to build the temp path

    min_text_chars = 100
    metadata = {}  # placeholder: fill in if metadata is ever needed

    if not pdf_file:
        return "", metadata

    # Context managers make sure every opened document is closed again.
    with fitz.open(pdf_file) as original_doc:
        plain_text = "\n".join(page.get_text() for page in original_doc)
        if len(plain_text.strip()) >= min_text_chars:
            return extract_text_markdown(original_doc), metadata

    # Too little embedded text: OCR into a temporary directory that is
    # removed afterwards, so no stray PDF is left behind per conversion.
    with tempfile.TemporaryDirectory() as tmp_dir:
        ocr_path = os.path.join(tmp_dir, "ocr.pdf")
        ocrmypdf.ocr(pdf_file, ocr_path, force_ocr=True)
        with fitz.open(ocr_path) as ocr_doc:
            markdown = extract_text_markdown(ocr_doc)

    return markdown, metadata
# Gradio Blocks UI: upload a PDF, convert it, and copy the resulting Markdown.
with gr.Blocks(title="Extractor PDF a Markdown") as demo:
    gr.Markdown("### PDF → Markdown con imágenes como enlaces y botón de copiar")

    pdf_input = gr.File(label="Sube tu PDF", type="filepath")
    markdown_output = gr.Textbox(label="Markdown generado", lines=25)
    metadata_output = gr.JSON(label="Metadata")

    with gr.Row():
        convert_btn = gr.Button("Convertir PDF")
        copy_btn = gr.Button("📋 Copiar Markdown")

    convert_btn.click(
        fn=convert,
        inputs=pdf_input,
        outputs=[markdown_output, metadata_output],
    )

    # Copy runs purely in the browser: the textbox value is handed to the JS
    # handler as its argument, so it does not depend on the position of any
    # <textarea> in the DOM and needs no server round-trip.
    copy_btn.click(
        None,
        inputs=markdown_output,
        outputs=None,
        _js="""
        (text) => {
            navigator.clipboard.writeText(text);
            alert("¡Markdown copiado al portapapeles!");
        }
        """,
    )

demo.launch()
|