Biifruu committed on
Commit
dd21256
verified
1 Parent(s): a97d32a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -20
app.py CHANGED
@@ -15,6 +15,24 @@ def convert(pdf_file):
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  for b in blocks:
19
  if b["type"] == 0: # Texto
20
  for line in b["lines"]:
@@ -24,21 +42,15 @@ def convert(pdf_file):
24
  elements.append((y, text.strip()))
25
  elif b["type"] == 1: # Imagen
26
  y = b["bbox"][1]
27
- img = page.get_image_list(full=True)
28
- if img:
29
- xref = img[0][0]
30
- pix = fitz.Pixmap(doc, xref)
31
- img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
32
-
33
- if pix.n > 4: # CMYK
34
- pix = fitz.Pixmap(fitz.csRGB, pix)
35
- pix.save(img_path)
36
- pix = None
37
-
38
  elements.append((y, f"![imagen]({img_path})"))
39
- image_counter += 1
 
 
40
 
41
- # Ordenar por posición vertical (y)
42
  elements.sort(key=lambda x: x[0])
43
 
44
  for _, content in elements:
@@ -48,11 +60,6 @@ def convert(pdf_file):
48
 
49
  gr.Interface(
50
  convert,
51
- inputs=[
52
- gr.File(label="Upload PDF", type="filepath"),
53
- ],
54
- outputs=[
55
- gr.Text(label="Markdown"),
56
- gr.JSON(label="Metadata"),
57
- ],
58
  ).launch()
 
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
18
+ # Extraemos la lista de imágenes en esta página, con sus xrefs
19
+ image_list = page.get_images(full=True)
20
+ xref_to_image_path = {}
21
+
22
+ for img in image_list:
23
+ xref = img[0]
24
+ pix = fitz.Pixmap(doc, xref)
25
+ img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
26
+
27
+ if pix.n > 4: # si es CMYK, convertir a RGB
28
+ pix = fitz.Pixmap(fitz.csRGB, pix)
29
+ pix.save(img_path)
30
+ pix = None
31
+
32
+ xref_to_image_path[xref] = img_path
33
+ image_counter += 1
34
+
35
+ # Recorremos bloques y reconstruimos texto+imagenes en orden vertical
36
  for b in blocks:
37
  if b["type"] == 0: # Texto
38
  for line in b["lines"]:
 
42
  elements.append((y, text.strip()))
43
  elif b["type"] == 1: # Imagen
44
  y = b["bbox"][1]
45
+ # El bloque de imagen tiene su xref
46
+ xref = b.get("image", {}).get("xref", None)
47
+ if xref and xref in xref_to_image_path:
48
+ img_path = xref_to_image_path[xref]
 
 
 
 
 
 
 
49
  elements.append((y, f"![imagen]({img_path})"))
50
+ else:
51
+ # Si no encontramos la imagen, dejamos marcador vacío
52
+ elements.append((y, "[imagen]()"))
53
 
 
54
  elements.sort(key=lambda x: x[0])
55
 
56
  for _, content in elements:
 
60
 
61
  gr.Interface(
62
  convert,
63
+ inputs=[gr.File(label="Upload PDF", type="filepath")],
64
+ outputs=[gr.Text(label="Markdown"), gr.JSON(label="Metadata")],
 
 
 
 
 
65
  ).launch()