Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,52 +7,29 @@ import os
|
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
-
image_counter = 1
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
14 |
-
raw_lines = []
|
15 |
elements = []
|
16 |
|
17 |
for b in blocks:
|
18 |
y = b["bbox"][1]
|
19 |
if b["type"] == 0: # Texto
|
20 |
for line in b["lines"]:
|
21 |
-
|
22 |
-
|
23 |
-
if
|
24 |
-
|
25 |
-
else:
|
26 |
-
line_text = " ".join(span["text"] for span in spans).strip()
|
27 |
-
if line_text:
|
28 |
-
elements.append((line["bbox"][1], line_text))
|
29 |
elif b["type"] == 1: # Imagen
|
|
|
30 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
31 |
image_counter += 1
|
32 |
|
33 |
-
#
|
34 |
-
table_block = []
|
35 |
-
previous_cell_count = None
|
36 |
-
|
37 |
-
for y, cells in raw_lines + [(None, None)]: # Agregar None para forzar el cierre al final
|
38 |
-
cell_count = len(cells) if cells else None
|
39 |
-
|
40 |
-
if cell_count == previous_cell_count:
|
41 |
-
table_block.append((y, cells))
|
42 |
-
else:
|
43 |
-
if len(table_block) >= 3: # Sólo si hay suficientes filas similares
|
44 |
-
markdown_output += convert_table_block_to_markdown(table_block)
|
45 |
-
markdown_output += "\n"
|
46 |
-
else:
|
47 |
-
# Agregar como texto normal si no cumple
|
48 |
-
for _, row_cells in table_block:
|
49 |
-
markdown_output += " ".join([c[1] for c in row_cells]) + "\n"
|
50 |
-
|
51 |
-
table_block = [(y, cells)] if cells else []
|
52 |
-
previous_cell_count = cell_count
|
53 |
-
|
54 |
-
# Procesar elementos normales
|
55 |
elements.sort(key=lambda x: x[0])
|
|
|
|
|
56 |
previous_y = None
|
57 |
for y, content in elements:
|
58 |
if previous_y is not None and abs(y - previous_y) > 10:
|
@@ -64,22 +41,6 @@ def extract_text_markdown(doc):
|
|
64 |
|
65 |
return markdown_output.strip()
|
66 |
|
67 |
-
def convert_table_block_to_markdown(block):
|
68 |
-
"""Convierte un bloque de filas con estructura tabular a Markdown"""
|
69 |
-
lines = []
|
70 |
-
for _, cells in block:
|
71 |
-
row = [c[1] for c in cells]
|
72 |
-
lines.append("| " + " | ".join(row) + " |")
|
73 |
-
|
74 |
-
if len(lines) > 1:
|
75 |
-
# Insertar línea de encabezado
|
76 |
-
header = lines[0]
|
77 |
-
num_cols = header.count("|") - 1
|
78 |
-
separator = "| " + " | ".join(["---"] * num_cols) + " |"
|
79 |
-
lines.insert(1, separator)
|
80 |
-
|
81 |
-
return "\n".join(lines) + "\n"
|
82 |
-
|
83 |
@spaces.GPU
|
84 |
def convert(pdf_file):
|
85 |
original_doc = fitz.open(pdf_file)
|
|
|
7 |
|
8 |
def extract_text_markdown(doc):
|
9 |
markdown_output = ""
|
10 |
+
image_counter = 1 # Contador de imágenes
|
11 |
|
12 |
for page in doc:
|
13 |
blocks = page.get_text("dict")["blocks"]
|
|
|
14 |
elements = []
|
15 |
|
16 |
for b in blocks:
|
17 |
y = b["bbox"][1]
|
18 |
if b["type"] == 0: # Texto
|
19 |
for line in b["lines"]:
|
20 |
+
line_y = line["bbox"][1]
|
21 |
+
line_text = " ".join([span["text"] for span in line["spans"]]).strip()
|
22 |
+
if line_text:
|
23 |
+
elements.append((line_y, line_text))
|
|
|
|
|
|
|
|
|
24 |
elif b["type"] == 1: # Imagen
|
25 |
+
# Añade un enlace con nombre único
|
26 |
elements.append((y, f"[imagen_{image_counter}]()"))
|
27 |
image_counter += 1
|
28 |
|
29 |
+
# Ordenar por posición vertical
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
elements.sort(key=lambda x: x[0])
|
31 |
+
|
32 |
+
# Reconstrucción con saltos lógicos
|
33 |
previous_y = None
|
34 |
for y, content in elements:
|
35 |
if previous_y is not None and abs(y - previous_y) > 10:
|
|
|
41 |
|
42 |
return markdown_output.strip()
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
@spaces.GPU
|
45 |
def convert(pdf_file):
|
46 |
original_doc = fitz.open(pdf_file)
|