Biifruu committed on
Commit
f3b7c90
·
verified ·
1 Parent(s): dd21256

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -6
app.py CHANGED
@@ -15,7 +15,7 @@ def convert(pdf_file):
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
18
- # Extraemos la lista de imágenes en esta página, con sus xrefs
19
  image_list = page.get_images(full=True)
20
  xref_to_image_path = {}
21
 
@@ -24,7 +24,7 @@ def convert(pdf_file):
24
  pix = fitz.Pixmap(doc, xref)
25
  img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
26
 
27
- if pix.n > 4: # si es CMYK, convertir a RGB
28
  pix = fitz.Pixmap(fitz.csRGB, pix)
29
  pix.save(img_path)
30
  pix = None
@@ -32,7 +32,7 @@ def convert(pdf_file):
32
  xref_to_image_path[xref] = img_path
33
  image_counter += 1
34
 
35
- # Recorremos bloques y reconstruimos texto+imagenes en orden vertical
36
  for b in blocks:
37
  if b["type"] == 0: # Texto
38
  for line in b["lines"]:
@@ -42,13 +42,11 @@ def convert(pdf_file):
42
  elements.append((y, text.strip()))
43
  elif b["type"] == 1: # Imagen
44
  y = b["bbox"][1]
45
- # El bloque de imagen tiene su xref
46
- xref = b.get("image", {}).get("xref", None)
47
  if xref and xref in xref_to_image_path:
48
  img_path = xref_to_image_path[xref]
49
  elements.append((y, f"![imagen]({img_path})"))
50
  else:
51
- # Si no encontramos la imagen, dejamos marcador vacío
52
  elements.append((y, "[imagen]()"))
53
 
54
  elements.sort(key=lambda x: x[0])
 
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
18
+ # Extraemos lista de imágenes con sus xrefs
19
  image_list = page.get_images(full=True)
20
  xref_to_image_path = {}
21
 
 
24
  pix = fitz.Pixmap(doc, xref)
25
  img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
26
 
27
+ if pix.n > 4: # CMYK -> RGB
28
  pix = fitz.Pixmap(fitz.csRGB, pix)
29
  pix.save(img_path)
30
  pix = None
 
32
  xref_to_image_path[xref] = img_path
33
  image_counter += 1
34
 
35
+ # Procesar bloques y ordenar por coordenada vertical
36
  for b in blocks:
37
  if b["type"] == 0: # Texto
38
  for line in b["lines"]:
 
42
  elements.append((y, text.strip()))
43
  elif b["type"] == 1: # Imagen
44
  y = b["bbox"][1]
45
+ xref = b.get("image", None)
 
46
  if xref and xref in xref_to_image_path:
47
  img_path = xref_to_image_path[xref]
48
  elements.append((y, f"![imagen]({img_path})"))
49
  else:
 
50
  elements.append((y, "[imagen]()"))
51
 
52
  elements.sort(key=lambda x: x[0])