Biifruu committed on
Commit
14d9ac1
verified
1 Parent(s): 7064c41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -19
app.py CHANGED
@@ -11,44 +11,47 @@ def extract_text_markdown(doc):
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
14
- lines_group = []
15
  elements = []
16
 
17
  for b in blocks:
18
  y = b["bbox"][1]
19
-
20
  if b["type"] == 0: # Texto
21
  for line in b["lines"]:
22
  spans = line["spans"]
23
- cells = []
24
- for span in spans:
25
- cells.append((span["bbox"][0], span["text"])) # (x, texto)
26
  if len(cells) > 1:
27
- lines_group.append((line["bbox"][1], cells)) # (y, columnas)
28
  else:
29
  line_text = " ".join(span["text"] for span in spans).strip()
30
  if line_text:
31
  elements.append((line["bbox"][1], line_text))
32
-
33
  elif b["type"] == 1: # Imagen
34
  elements.append((y, f"[imagen_{image_counter}]()"))
35
  image_counter += 1
36
 
37
- # Detectar tablas rudimentarias
38
- if len(lines_group) >= 2:
39
- # Ordenar por coordenada vertical
40
- lines_group.sort(key=lambda x: x[0])
41
- header_cells = [cell[1].strip() for cell in lines_group[0][1]]
42
- markdown_output += "| " + " | ".join(header_cells) + " |\n"
43
- markdown_output += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
44
 
45
- for y, row in lines_group[1:]:
46
- row_cells = [cell[1].strip() for cell in row]
47
- markdown_output += "| " + " | ".join(row_cells) + " |\n"
 
 
 
 
 
 
 
48
 
49
- markdown_output += "\n"
 
50
 
51
- # Agregar líneas de texto sueltas
52
  elements.sort(key=lambda x: x[0])
53
  previous_y = None
54
  for y, content in elements:
@@ -61,6 +64,22 @@ def extract_text_markdown(doc):
61
 
62
  return markdown_output.strip()
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  @spaces.GPU
65
  def convert(pdf_file):
66
  original_doc = fitz.open(pdf_file)
 
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
14
+ raw_lines = []
15
  elements = []
16
 
17
  for b in blocks:
18
  y = b["bbox"][1]
 
19
  if b["type"] == 0: # Texto
20
  for line in b["lines"]:
21
  spans = line["spans"]
22
+ cells = [(span["bbox"][0], span["text"].strip()) for span in spans if span["text"].strip()]
 
 
23
  if len(cells) > 1:
24
+ raw_lines.append((line["bbox"][1], cells)) # posibles líneas de tabla
25
  else:
26
  line_text = " ".join(span["text"] for span in spans).strip()
27
  if line_text:
28
  elements.append((line["bbox"][1], line_text))
 
29
  elif b["type"] == 1: # Imagen
30
  elements.append((y, f"[imagen_{image_counter}]()"))
31
  image_counter += 1
32
 
33
+ # Procesar posibles tablas
34
+ table_block = []
35
+ previous_cell_count = None
36
+
37
+ for y, cells in raw_lines + [(None, None)]: # Agregar None para forzar el cierre al final
38
+ cell_count = len(cells) if cells else None
 
39
 
40
+ if cell_count == previous_cell_count:
41
+ table_block.append((y, cells))
42
+ else:
43
+ if len(table_block) >= 3: # Sólo si hay suficientes filas similares
44
+ markdown_output += convert_table_block_to_markdown(table_block)
45
+ markdown_output += "\n"
46
+ else:
47
+ # Agregar como texto normal si no cumple
48
+ for _, row_cells in table_block:
49
+ markdown_output += " ".join([c[1] for c in row_cells]) + "\n"
50
 
51
+ table_block = [(y, cells)] if cells else []
52
+ previous_cell_count = cell_count
53
 
54
+ # Procesar elementos normales
55
  elements.sort(key=lambda x: x[0])
56
  previous_y = None
57
  for y, content in elements:
 
64
 
65
  return markdown_output.strip()
66
 
67
def convert_table_block_to_markdown(block):
    """Convert a block of rows with tabular structure to a Markdown table.

    Args:
        block: Sequence of ``(y, cells)`` pairs. ``cells`` is a sequence of
            tuples whose second item is the cell text; the first item of
            each pair and of each cell tuple is ignored here.

    Returns:
        The rows rendered as ``| a | b |`` lines joined by newlines, with a
        trailing newline. When the block has more than one row, the first
        row is treated as the header and a ``| --- | ... |`` separator row
        is inserted right after it.
    """
    # A literal "|" inside a cell would be read as a column delimiter by
    # Markdown renderers, so escape it before assembling each row.
    rows = [
        [cell[1].replace("|", "\\|") for cell in cells]
        for _, cells in block
    ]
    lines = ["| " + " | ".join(row) + " |" for row in rows]

    if len(lines) > 1:
        # Size the separator from the header's actual cell count rather than
        # by counting "|" characters in the rendered header, which over-counts
        # when the cell text itself contains pipes. An empty header row still
        # renders as a single empty cell, hence the floor of 1.
        num_cols = max(len(rows[0]), 1)
        separator = "| " + " | ".join(["---"] * num_cols) + " |"
        lines.insert(1, separator)

    return "\n".join(lines) + "\n"
82
+
83
  @spaces.GPU
84
  def convert(pdf_file):
85
  original_doc = fitz.open(pdf_file)