Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,44 +11,47 @@ def extract_text_markdown(doc):
|
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
14 |
-
|
15 |
elements = []
|
16 |
|
17 |
for b in blocks:
|
18 |
y = b["bbox"][1]
|
19 |
-
|
20 |
if b["type"] == 0: # Texto
|
21 |
for line in b["lines"]:
|
22 |
spans = line["spans"]
|
23 |
-
cells = []
|
24 |
-
for span in spans:
|
25 |
-
cells.append((span["bbox"][0], span["text"])) # (x, texto)
|
26 |
if len(cells) > 1:
|
27 |
-
|
28 |
else:
|
29 |
line_text = " ".join(span["text"] for span in spans).strip()
|
30 |
if line_text:
|
31 |
elements.append((line["bbox"][1], line_text))
|
32 |
-
|
33 |
elif b["type"] == 1: # Imagen
|
34 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
35 |
image_counter += 1
|
36 |
|
37 |
-
#
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
markdown_output += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
|
|
|
50 |
|
51 |
-
#
|
52 |
elements.sort(key=lambda x: x[0])
|
53 |
previous_y = None
|
54 |
for y, content in elements:
|
@@ -61,6 +64,22 @@ def extract_text_markdown(doc):
|
|
61 |
|
62 |
return markdown_output.strip()
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
@spaces.GPU
|
65 |
def convert(pdf_file):
|
66 |
original_doc = fitz.open(pdf_file)
|
|
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
14 |
+
raw_lines = []
|
15 |
elements = []
|
16 |
|
17 |
for b in blocks:
|
18 |
y = b["bbox"][1]
|
|
|
19 |
if b["type"] == 0: # Texto
|
20 |
for line in b["lines"]:
|
21 |
spans = line["spans"]
|
22 |
+
cells = [(span["bbox"][0], span["text"].strip()) for span in spans if span["text"].strip()]
|
|
|
|
|
23 |
if len(cells) > 1:
|
24 |
+
raw_lines.append((line["bbox"][1], cells)) # posibles líneas de tabla
|
25 |
else:
|
26 |
line_text = " ".join(span["text"] for span in spans).strip()
|
27 |
if line_text:
|
28 |
elements.append((line["bbox"][1], line_text))
|
|
|
29 |
elif b["type"] == 1: # Imagen
|
30 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
31 |
image_counter += 1
|
32 |
|
33 |
+
# Procesar posibles tablas
|
34 |
+
table_block = []
|
35 |
+
previous_cell_count = None
|
36 |
+
|
37 |
+
for y, cells in raw_lines + [(None, None)]: # Agregar None para forzar el cierre al final
|
38 |
+
cell_count = len(cells) if cells else None
|
|
|
39 |
|
40 |
+
if cell_count == previous_cell_count:
|
41 |
+
table_block.append((y, cells))
|
42 |
+
else:
|
43 |
+
if len(table_block) >= 3: # Sólo si hay suficientes filas similares
|
44 |
+
markdown_output += convert_table_block_to_markdown(table_block)
|
45 |
+
markdown_output += "\n"
|
46 |
+
else:
|
47 |
+
# Agregar como texto normal si no cumple
|
48 |
+
for _, row_cells in table_block:
|
49 |
+
markdown_output += " ".join([c[1] for c in row_cells]) + "\n"
|
50 |
|
51 |
+
table_block = [(y, cells)] if cells else []
|
52 |
+
previous_cell_count = cell_count
|
53 |
|
54 |
+
# Procesar elementos normales
|
55 |
elements.sort(key=lambda x: x[0])
|
56 |
previous_y = None
|
57 |
for y, content in elements:
|
|
|
64 |
|
65 |
return markdown_output.strip()
|
66 |
|
67 |
+
def convert_table_block_to_markdown(block):
    """Render a block of tabular rows as a Markdown table.

    Args:
        block: Iterable of ``(y, cells)`` pairs, where ``cells`` is a
            sequence of ``(x, text)`` tuples. Only the ``text`` element of
            each cell is used; the coordinates are ignored here.

    Returns:
        The rendered rows joined by newlines, followed by a trailing
        newline. When there is more than one row, the first row is treated
        as the header and a ``---`` separator row is inserted right after it.
    """
    # Escape literal pipes so cell text cannot be mistaken for a column
    # delimiter by the Markdown renderer.
    rows = [
        [cell[1].replace("|", "\\|") for cell in cells]
        for _, cells in block
    ]
    lines = ["| " + " | ".join(row) + " |" for row in rows]

    if len(lines) > 1:
        # Count columns from the header cells themselves rather than from the
        # number of "|" characters in the rendered header, which would be
        # inflated by pipes inside the cell text. An empty header row still
        # renders as a single empty cell, hence the lower bound of 1.
        num_cols = max(len(rows[0]), 1)
        separator = "| " + " | ".join(["---"] * num_cols) + " |"
        lines.insert(1, separator)

    return "\n".join(lines) + "\n"
|
82 |
+
|
83 |
@spaces.GPU
|
84 |
def convert(pdf_file):
|
85 |
original_doc = fitz.open(pdf_file)
|