Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -3,10 +3,11 @@ import gradio as gr
|
|
3 |
import fitz # PyMuPDF
|
4 |
import ocrmypdf
|
5 |
import tempfile
|
|
|
6 |
|
7 |
def extract_text_markdown(doc):
|
8 |
markdown_output = ""
|
9 |
-
image_counter = 1
|
10 |
|
11 |
for page in doc:
|
12 |
blocks = page.get_text("dict")["blocks"]
|
@@ -17,16 +18,18 @@ def extract_text_markdown(doc):
|
|
17 |
if b["type"] == 0: # Texto
|
18 |
for line in b["lines"]:
|
19 |
line_y = line["bbox"][1]
|
20 |
-
|
21 |
-
line_text = " ".join(span["text"].strip() for span in spans)
|
22 |
if line_text:
|
23 |
elements.append((line_y, line_text))
|
24 |
elif b["type"] == 1: # Imagen
|
|
|
25 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
26 |
image_counter += 1
|
27 |
|
|
|
28 |
elements.sort(key=lambda x: x[0])
|
29 |
|
|
|
30 |
previous_y = None
|
31 |
for y, content in elements:
|
32 |
if previous_y is not None and abs(y - previous_y) > 10:
|
@@ -43,6 +46,7 @@ def convert(pdf_file):
|
|
43 |
original_doc = fitz.open(pdf_file)
|
44 |
plain_text = "\n".join([page.get_text() for page in original_doc])
|
45 |
|
|
|
46 |
if len(plain_text.strip()) < 100:
|
47 |
ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
|
48 |
ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
|
@@ -51,35 +55,11 @@ def convert(pdf_file):
|
|
51 |
doc = original_doc
|
52 |
|
53 |
markdown = extract_text_markdown(doc)
|
54 |
-
metadata = {} #
|
55 |
return markdown, metadata
|
56 |
|
57 |
-
|
58 |
-
|
59 |
-
gr.
|
60 |
-
|
61 |
-
|
62 |
-
markdown_output = gr.Textbox(label="Markdown generado", lines=25, elem_id="markdown-textbox")
|
63 |
-
metadata_output = gr.JSON(label="Metadata")
|
64 |
-
|
65 |
-
convert_btn = gr.Button("Convertir PDF")
|
66 |
-
|
67 |
-
# Botón copiar usando JS válido y accediendo al DOM real
|
68 |
-
gr.HTML("""
|
69 |
-
<button onclick="copyMarkdown()">📋 Copiar Markdown</button>
|
70 |
-
<script>
|
71 |
-
function copyMarkdown() {
|
72 |
-
const textarea = document.querySelector('#markdown-textbox textarea');
|
73 |
-
if (textarea) {
|
74 |
-
textarea.select();
|
75 |
-
document.execCommand('copy');
|
76 |
-
} else {
|
77 |
-
alert('No se pudo copiar el texto');
|
78 |
-
}
|
79 |
-
}
|
80 |
-
</script>
|
81 |
-
""")
|
82 |
-
|
83 |
-
convert_btn.click(fn=convert, inputs=pdf_input, outputs=[markdown_output, metadata_output])
|
84 |
-
|
85 |
-
demo.launch()
|
|
|
3 |
import fitz # PyMuPDF
|
4 |
import ocrmypdf
|
5 |
import tempfile
|
6 |
+
import os
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
+
image_counter = 1 # Contador de imágenes
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
|
|
18 |
if b["type"] == 0: # Texto
|
19 |
for line in b["lines"]:
|
20 |
line_y = line["bbox"][1]
|
21 |
+
line_text = " ".join([span["text"] for span in line["spans"]]).strip()
|
|
|
22 |
if line_text:
|
23 |
elements.append((line_y, line_text))
|
24 |
elif b["type"] == 1: # Imagen
|
25 |
+
# Añade un enlace con nombre único
|
26 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
27 |
image_counter += 1
|
28 |
|
29 |
+
# Ordenar por posición vertical
|
30 |
elements.sort(key=lambda x: x[0])
|
31 |
|
32 |
+
# Reconstrucción con saltos lógicos
|
33 |
previous_y = None
|
34 |
for y, content in elements:
|
35 |
if previous_y is not None and abs(y - previous_y) > 10:
|
|
|
46 |
original_doc = fitz.open(pdf_file)
|
47 |
plain_text = "\n".join([page.get_text() for page in original_doc])
|
48 |
|
49 |
+
# Aplicar OCR solo si el PDF no tiene texto
|
50 |
if len(plain_text.strip()) < 100:
|
51 |
ocr_temp_path = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False).name
|
52 |
ocrmypdf.ocr(pdf_file, ocr_temp_path, force_ocr=True)
|
|
|
55 |
doc = original_doc
|
56 |
|
57 |
markdown = extract_text_markdown(doc)
|
58 |
+
metadata = {} # Si necesitas metadatos, se pueden agregar aquí
|
59 |
return markdown, metadata
|
60 |
|
61 |
+
gr.Interface(
|
62 |
+
fn=convert,
|
63 |
+
inputs=[gr.File(label="Sube tu PDF", type="filepath")],
|
64 |
+
outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
|
65 |
+
).launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|