Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,29 +7,49 @@ import os
|
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
-
image_counter = 1
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
|
|
14 |
elements = []
|
15 |
|
16 |
for b in blocks:
|
17 |
y = b["bbox"][1]
|
|
|
18 |
if b["type"] == 0: # Texto
|
19 |
for line in b["lines"]:
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
elif b["type"] == 1: # Imagen
|
25 |
-
# Añade un enlace con nombre único
|
26 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
27 |
image_counter += 1
|
28 |
|
29 |
-
#
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
previous_y = None
|
34 |
for y, content in elements:
|
35 |
if previous_y is not None and abs(y - previous_y) > 10:
|
|
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
+
image_counter = 1
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
14 |
+
lines_group = []
|
15 |
elements = []
|
16 |
|
17 |
for b in blocks:
|
18 |
y = b["bbox"][1]
|
19 |
+
|
20 |
if b["type"] == 0: # Texto
|
21 |
for line in b["lines"]:
|
22 |
+
spans = line["spans"]
|
23 |
+
cells = []
|
24 |
+
for span in spans:
|
25 |
+
cells.append((span["bbox"][0], span["text"])) # (x, texto)
|
26 |
+
if len(cells) > 1:
|
27 |
+
lines_group.append((line["bbox"][1], cells)) # (y, columnas)
|
28 |
+
else:
|
29 |
+
line_text = " ".join(span["text"] for span in spans).strip()
|
30 |
+
if line_text:
|
31 |
+
elements.append((line["bbox"][1], line_text))
|
32 |
+
|
33 |
elif b["type"] == 1: # Imagen
|
|
|
34 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
35 |
image_counter += 1
|
36 |
|
37 |
+
# Detectar tablas rudimentarias
|
38 |
+
if len(lines_group) >= 2:
|
39 |
+
# Ordenar por coordenada vertical
|
40 |
+
lines_group.sort(key=lambda x: x[0])
|
41 |
+
header_cells = [cell[1].strip() for cell in lines_group[0][1]]
|
42 |
+
markdown_output += "| " + " | ".join(header_cells) + " |\n"
|
43 |
+
markdown_output += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
|
44 |
|
45 |
+
for y, row in lines_group[1:]:
|
46 |
+
row_cells = [cell[1].strip() for cell in row]
|
47 |
+
markdown_output += "| " + " | ".join(row_cells) + " |\n"
|
48 |
+
|
49 |
+
markdown_output += "\n"
|
50 |
+
|
51 |
+
# Agregar líneas de texto sueltas
|
52 |
+
elements.sort(key=lambda x: x[0])
|
53 |
previous_y = None
|
54 |
for y, content in elements:
|
55 |
if previous_y is not None and abs(y - previous_y) > 10:
|