Biifruu committed on
Commit
7064c41
verified
1 Parent(s): dd29269

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -9
app.py CHANGED
@@ -7,29 +7,49 @@ import os
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
- image_counter = 1 # Contador de imágenes
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
 
14
  elements = []
15
 
16
  for b in blocks:
17
  y = b["bbox"][1]
 
18
  if b["type"] == 0: # Texto
19
  for line in b["lines"]:
20
- line_y = line["bbox"][1]
21
- line_text = " ".join([span["text"] for span in line["spans"]]).strip()
22
- if line_text:
23
- elements.append((line_y, line_text))
 
 
 
 
 
 
 
24
  elif b["type"] == 1: # Imagen
25
- # Añade un enlace con nombre único
26
  elements.append((y, f"[imagen_{image_counter}]()"))
27
  image_counter += 1
28
 
29
- # Ordenar por posición vertical
30
- elements.sort(key=lambda x: x[0])
 
 
 
 
 
31
 
32
- # Reconstrucción con saltos lógicos
 
 
 
 
 
 
 
33
  previous_y = None
34
  for y, content in elements:
35
  if previous_y is not None and abs(y - previous_y) > 10:
 
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
+ image_counter = 1
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
14
+ lines_group = []
15
  elements = []
16
 
17
  for b in blocks:
18
  y = b["bbox"][1]
19
+
20
  if b["type"] == 0: # Texto
21
  for line in b["lines"]:
22
+ spans = line["spans"]
23
+ cells = []
24
+ for span in spans:
25
+ cells.append((span["bbox"][0], span["text"])) # (x, texto)
26
+ if len(cells) > 1:
27
+ lines_group.append((line["bbox"][1], cells)) # (y, columnas)
28
+ else:
29
+ line_text = " ".join(span["text"] for span in spans).strip()
30
+ if line_text:
31
+ elements.append((line["bbox"][1], line_text))
32
+
33
  elif b["type"] == 1: # Imagen
 
34
  elements.append((y, f"[imagen_{image_counter}]()"))
35
  image_counter += 1
36
 
37
+ # Detectar tablas rudimentarias
38
+ if len(lines_group) >= 2:
39
+ # Ordenar por coordenada vertical
40
+ lines_group.sort(key=lambda x: x[0])
41
+ header_cells = [cell[1].strip() for cell in lines_group[0][1]]
42
+ markdown_output += "| " + " | ".join(header_cells) + " |\n"
43
+ markdown_output += "| " + " | ".join(["---"] * len(header_cells)) + " |\n"
44
 
45
+ for y, row in lines_group[1:]:
46
+ row_cells = [cell[1].strip() for cell in row]
47
+ markdown_output += "| " + " | ".join(row_cells) + " |\n"
48
+
49
+ markdown_output += "\n"
50
+
51
+ # Agregar líneas de texto sueltas
52
+ elements.sort(key=lambda x: x[0])
53
  previous_y = None
54
  for y, content in elements:
55
  if previous_y is not None and abs(y - previous_y) > 10: