Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -22,6 +22,7 @@ def extract_text_markdown(doc, image_paths):
|
|
22 |
blocks = page.get_text("dict")["blocks"]
|
23 |
elements = []
|
24 |
|
|
|
25 |
for b in blocks:
|
26 |
y = b["bbox"][1]
|
27 |
|
@@ -33,31 +34,38 @@ def extract_text_markdown(doc, image_paths):
|
|
33 |
if line_text:
|
34 |
elements.append((line_y, line_text, max_font_size))
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
elements.sort(key=lambda x: x[0])
|
48 |
previous_y = None
|
49 |
|
50 |
for y, text, font_size in elements:
|
51 |
is_header = font_size >= 14
|
52 |
-
|
53 |
if previous_y is not None and abs(y - previous_y) > 10:
|
54 |
markdown_output += "\n"
|
55 |
-
|
56 |
if is_header:
|
57 |
markdown_output += f"\n### {text.strip()}\n"
|
58 |
else:
|
59 |
markdown_output += text.strip() + "\n"
|
60 |
-
|
61 |
previous_y = y
|
62 |
|
63 |
markdown_output += "\n---\n\n"
|
@@ -75,14 +83,11 @@ def convert(pdf_file):
|
|
75 |
text = page.get_text("text").strip()
|
76 |
|
77 |
if len(text) > 30:
|
78 |
-
# Página con texto normal
|
79 |
markdown_output += extract_text_markdown([page], image_paths) + "\n"
|
80 |
else:
|
81 |
-
# Página vacía o con imagen: hacer OCR
|
82 |
pix = page.get_pixmap(dpi=300)
|
83 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
84 |
|
85 |
-
# Guardar imagen completa
|
86 |
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
87 |
img.save(image_path)
|
88 |
image_paths.append(image_path)
|
@@ -92,7 +97,7 @@ def convert(pdf_file):
|
|
92 |
try:
|
93 |
ocr_text = pytesseract.image_to_string(img, lang="spa")
|
94 |
except pytesseract.TesseractError:
|
95 |
-
ocr_text = pytesseract.image_to_string(img)
|
96 |
|
97 |
ocr_text = clean_ocr_text(ocr_text)
|
98 |
if ocr_text.strip():
|
@@ -100,7 +105,12 @@ def convert(pdf_file):
|
|
100 |
|
101 |
markdown_output += "\n---\n\n"
|
102 |
|
103 |
-
|
|
|
|
|
|
|
|
|
|
|
104 |
|
105 |
gr.Interface(
|
106 |
fn=convert,
|
@@ -108,6 +118,7 @@ gr.Interface(
|
|
108 |
outputs=[
|
109 |
gr.Markdown(label="Markdown estructurado"),
|
110 |
gr.JSON(label="Metadata"),
|
111 |
-
gr.Gallery(label="Imágenes extraídas", type="file")
|
|
|
112 |
],
|
113 |
).launch()
|
|
|
22 |
blocks = page.get_text("dict")["blocks"]
|
23 |
elements = []
|
24 |
|
25 |
+
# 🔁 Añadir texto normal (bloques)
|
26 |
for b in blocks:
|
27 |
y = b["bbox"][1]
|
28 |
|
|
|
34 |
if line_text:
|
35 |
elements.append((line_y, line_text, max_font_size))
|
36 |
|
37 |
+
# 🖼️ Extraer imágenes reales de la página (xref)
|
38 |
+
images_on_page = page.get_images(full=True)
|
39 |
+
for img_index, img in enumerate(images_on_page):
|
40 |
+
xref = img[0]
|
41 |
+
try:
|
42 |
+
base_image = doc.extract_image(xref)
|
43 |
+
image_bytes = base_image["image"]
|
44 |
+
ext = base_image["ext"]
|
45 |
+
image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
|
46 |
+
|
47 |
+
with open(image_path, "wb") as f:
|
48 |
+
f.write(image_bytes)
|
49 |
+
|
50 |
+
image_paths.append(image_path)
|
51 |
+
y_pos = 50 + img_index * 10 # Posición estimada para ordenar
|
52 |
+
elements.append((y_pos, f"", 10))
|
53 |
+
image_counter += 1
|
54 |
+
except Exception as e:
|
55 |
+
elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
|
56 |
+
|
57 |
+
# Ordenar y construir Markdown
|
58 |
elements.sort(key=lambda x: x[0])
|
59 |
previous_y = None
|
60 |
|
61 |
for y, text, font_size in elements:
|
62 |
is_header = font_size >= 14
|
|
|
63 |
if previous_y is not None and abs(y - previous_y) > 10:
|
64 |
markdown_output += "\n"
|
|
|
65 |
if is_header:
|
66 |
markdown_output += f"\n### {text.strip()}\n"
|
67 |
else:
|
68 |
markdown_output += text.strip() + "\n"
|
|
|
69 |
previous_y = y
|
70 |
|
71 |
markdown_output += "\n---\n\n"
|
|
|
83 |
text = page.get_text("text").strip()
|
84 |
|
85 |
if len(text) > 30:
|
|
|
86 |
markdown_output += extract_text_markdown([page], image_paths) + "\n"
|
87 |
else:
|
|
|
88 |
pix = page.get_pixmap(dpi=300)
|
89 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
90 |
|
|
|
91 |
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
92 |
img.save(image_path)
|
93 |
image_paths.append(image_path)
|
|
|
97 |
try:
|
98 |
ocr_text = pytesseract.image_to_string(img, lang="spa")
|
99 |
except pytesseract.TesseractError:
|
100 |
+
ocr_text = pytesseract.image_to_string(img)
|
101 |
|
102 |
ocr_text = clean_ocr_text(ocr_text)
|
103 |
if ocr_text.strip():
|
|
|
105 |
|
106 |
markdown_output += "\n---\n\n"
|
107 |
|
108 |
+
# Guardar como archivo .md
|
109 |
+
markdown_path = "/tmp/resultado.md"
|
110 |
+
with open(markdown_path, "w", encoding="utf-8") as f:
|
111 |
+
f.write(markdown_output)
|
112 |
+
|
113 |
+
return markdown_output.strip(), {}, image_paths, markdown_path
|
114 |
|
115 |
gr.Interface(
|
116 |
fn=convert,
|
|
|
118 |
outputs=[
|
119 |
gr.Markdown(label="Markdown estructurado"),
|
120 |
gr.JSON(label="Metadata"),
|
121 |
+
gr.Gallery(label="Imágenes extraídas", type="file"),
|
122 |
+
gr.File(label="Descargar .md")
|
123 |
],
|
124 |
).launch()
|