Biifruu committed on
Commit
479e852
verified
1 Parent(s): 14d9ac1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -48
app.py CHANGED
@@ -7,52 +7,29 @@ import os
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
- image_counter = 1
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
14
- raw_lines = []
15
  elements = []
16
 
17
  for b in blocks:
18
  y = b["bbox"][1]
19
  if b["type"] == 0: # Texto
20
  for line in b["lines"]:
21
- spans = line["spans"]
22
- cells = [(span["bbox"][0], span["text"].strip()) for span in spans if span["text"].strip()]
23
- if len(cells) > 1:
24
- raw_lines.append((line["bbox"][1], cells)) # posibles líneas de tabla
25
- else:
26
- line_text = " ".join(span["text"] for span in spans).strip()
27
- if line_text:
28
- elements.append((line["bbox"][1], line_text))
29
  elif b["type"] == 1: # Imagen
 
30
  elements.append((y, f"[imagen_{image_counter}]()"))
31
  image_counter += 1
32
 
33
- # Procesar posibles tablas
34
- table_block = []
35
- previous_cell_count = None
36
-
37
- for y, cells in raw_lines + [(None, None)]: # Agregar None para forzar el cierre al final
38
- cell_count = len(cells) if cells else None
39
-
40
- if cell_count == previous_cell_count:
41
- table_block.append((y, cells))
42
- else:
43
- if len(table_block) >= 3: # Sólo si hay suficientes filas similares
44
- markdown_output += convert_table_block_to_markdown(table_block)
45
- markdown_output += "\n"
46
- else:
47
- # Agregar como texto normal si no cumple
48
- for _, row_cells in table_block:
49
- markdown_output += " ".join([c[1] for c in row_cells]) + "\n"
50
-
51
- table_block = [(y, cells)] if cells else []
52
- previous_cell_count = cell_count
53
-
54
- # Procesar elementos normales
55
  elements.sort(key=lambda x: x[0])
 
 
56
  previous_y = None
57
  for y, content in elements:
58
  if previous_y is not None and abs(y - previous_y) > 10:
@@ -64,22 +41,6 @@ def extract_text_markdown(doc):
64
 
65
  return markdown_output.strip()
66
 
67
- def convert_table_block_to_markdown(block):
68
- """Convierte un bloque de filas con estructura tabular a Markdown"""
69
- lines = []
70
- for _, cells in block:
71
- row = [c[1] for c in cells]
72
- lines.append("| " + " | ".join(row) + " |")
73
-
74
- if len(lines) > 1:
75
- # Insertar línea de encabezado
76
- header = lines[0]
77
- num_cols = header.count("|") - 1
78
- separator = "| " + " | ".join(["---"] * num_cols) + " |"
79
- lines.insert(1, separator)
80
-
81
- return "\n".join(lines) + "\n"
82
-
83
  @spaces.GPU
84
  def convert(pdf_file):
85
  original_doc = fitz.open(pdf_file)
 
7
 
8
  def extract_text_markdown(doc):
9
  markdown_output = ""
10
+ image_counter = 1 # Contador de imágenes
11
 
12
  for page in doc:
13
  blocks = page.get_text("dict")["blocks"]
 
14
  elements = []
15
 
16
  for b in blocks:
17
  y = b["bbox"][1]
18
  if b["type"] == 0: # Texto
19
  for line in b["lines"]:
20
+ line_y = line["bbox"][1]
21
+ line_text = " ".join([span["text"] for span in line["spans"]]).strip()
22
+ if line_text:
23
+ elements.append((line_y, line_text))
 
 
 
 
24
  elif b["type"] == 1: # Imagen
25
+ # Añade un enlace con nombre único
26
  elements.append((y, f"[imagen_{image_counter}]()"))
27
  image_counter += 1
28
 
29
+ # Ordenar por posición vertical
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  elements.sort(key=lambda x: x[0])
31
+
32
+ # Reconstrucción con saltos lógicos
33
  previous_y = None
34
  for y, content in elements:
35
  if previous_y is not None and abs(y - previous_y) > 10:
 
41
 
42
  return markdown_output.strip()
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  @spaces.GPU
45
  def convert(pdf_file):
46
  original_doc = fitz.open(pdf_file)