Biifruu committed on
Commit
3e3d3c7
verified
1 Parent(s): 03cef85

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -26
app.py CHANGED
@@ -15,41 +15,29 @@ def extract_text_markdown(doc):
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
18
- # Extraer imágenes y guardar para asignar link
19
- image_list = page.get_images(full=True)
20
- xref_to_placeholder = {}
21
-
22
- for img in image_list:
23
- xref = img[0]
24
- pix = fitz.Pixmap(doc, xref)
25
- img_path = os.path.join(image_dir, f"imagen_{image_counter}.png")
26
-
27
- if pix.n > 4:
28
- pix = fitz.Pixmap(fitz.csRGB, pix)
29
- pix.save(img_path)
30
- pix = None
31
-
32
- xref_to_placeholder[xref] = f"![imagen]()"
33
- image_counter += 1
34
-
35
  for b in blocks:
36
  y = b["bbox"][1]
37
  if b["type"] == 0: # Texto
38
- paragraph = ""
39
  for line in b["lines"]:
40
- line_text = " ".join([span["text"].strip() for span in line["spans"]])
41
- paragraph += line_text + " "
42
- paragraph = paragraph.strip()
43
- if paragraph:
44
- elements.append((y, paragraph))
45
  elif b["type"] == 1: # Imagen
46
- xref = b.get("image")
47
  elements.append((y, "![imagen]()"))
48
 
 
49
  elements.sort(key=lambda x: x[0])
50
 
51
- for _, content in elements:
52
- markdown_output += content + "\n\n"
 
 
 
 
 
 
 
53
 
54
  return markdown_output.strip()
55
 
 
15
  blocks = page.get_text("dict")["blocks"]
16
  elements = []
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  for b in blocks:
19
  y = b["bbox"][1]
20
  if b["type"] == 0: # Texto
 
21
  for line in b["lines"]:
22
+ line_y = line["bbox"][1]
23
+ line_text = " ".join([span["text"] for span in line["spans"]]).strip()
24
+ if line_text:
25
+ elements.append((line_y, line_text))
 
26
  elif b["type"] == 1: # Imagen
 
27
  elements.append((y, "![imagen]()"))
28
 
29
+ # Ordenar elementos por posición vertical
30
  elements.sort(key=lambda x: x[0])
31
 
32
+ # Reconstruir markdown respetando el espaciado visual
33
+ previous_y = None
34
+ for y, content in elements:
35
+ if previous_y is not None and abs(y - previous_y) > 10: # si hay espacio entre líneas, añadir salto
36
+ markdown_output += "\n"
37
+ markdown_output += content + "\n"
38
+ previous_y = y
39
+
40
+ markdown_output += "\n---\n\n" # separador entre páginas
41
 
42
  return markdown_output.strip()
43