Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -17,13 +17,12 @@ def extract_text_markdown(doc):
|
|
17 |
if b["type"] == 0: # Texto
|
18 |
for line in b["lines"]:
|
19 |
line_y = line["bbox"][1]
|
20 |
-
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
line_text = " | ".join([span["text"].strip() for span in line_spans])
|
25 |
else:
|
26 |
-
line_text = " ".join(
|
27 |
|
28 |
if line_text:
|
29 |
elements.append((line_y, line_text))
|
@@ -31,10 +30,8 @@ def extract_text_markdown(doc):
|
|
31 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
32 |
image_counter += 1
|
33 |
|
34 |
-
# Ordenar por posición vertical
|
35 |
elements.sort(key=lambda x: x[0])
|
36 |
|
37 |
-
# Reconstrucción
|
38 |
previous_y = None
|
39 |
for y, content in elements:
|
40 |
if previous_y is not None and abs(y - previous_y) > 10:
|
@@ -59,20 +56,23 @@ def convert(pdf_file):
|
|
59 |
doc = original_doc
|
60 |
|
61 |
markdown = extract_text_markdown(doc)
|
62 |
-
metadata = {} #
|
63 |
return markdown, metadata
|
64 |
|
65 |
-
#
|
66 |
-
|
67 |
-
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
""
|
72 |
|
73 |
-
gr.
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
|
17 |
if b["type"] == 0: # Texto
|
18 |
for line in b["lines"]:
|
19 |
line_y = line["bbox"][1]
|
20 |
+
spans = line["spans"]
|
21 |
|
22 |
+
if len(spans) > 1:
|
23 |
+
line_text = " | ".join(span["text"].strip() for span in spans)
|
|
|
24 |
else:
|
25 |
+
line_text = " ".join(span["text"].strip() for span in spans)
|
26 |
|
27 |
if line_text:
|
28 |
elements.append((line_y, line_text))
|
|
|
30 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
31 |
image_counter += 1
|
32 |
|
|
|
33 |
elements.sort(key=lambda x: x[0])
|
34 |
|
|
|
35 |
previous_y = None
|
36 |
for y, content in elements:
|
37 |
if previous_y is not None and abs(y - previous_y) > 10:
|
|
|
56 |
doc = original_doc
|
57 |
|
58 |
markdown = extract_text_markdown(doc)
|
59 |
+
metadata = {} # Añade metadatos si quieres
|
60 |
return markdown, metadata
|
61 |
|
62 |
+
# Gradio Interface
|
63 |
+
with gr.Blocks(title="Extractor PDF a Markdown") as demo:
|
64 |
+
gr.Markdown("### PDF → Markdown con imágenes como enlaces y botón de copiar")
|
65 |
|
66 |
+
pdf_input = gr.File(label="Sube tu PDF", type="filepath")
|
67 |
+
markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="md_output")
|
68 |
+
metadata_output = gr.JSON(label="Metadata")
|
69 |
|
70 |
+
with gr.Row():
|
71 |
+
convert_btn = gr.Button("Convertir PDF")
|
72 |
+
copy_btn = gr.HTML("""
|
73 |
+
<button onclick="navigator.clipboard.writeText(document.getElementById('md_output').value)">📋 Copiar Markdown</button>
|
74 |
+
""")
|
75 |
+
|
76 |
+
convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])
|
77 |
+
|
78 |
+
demo.launch()
|