Biifruu committed on
Commit
650dd50
·
verified ·
1 Parent(s): 0ef1375

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -8
app.py CHANGED
@@ -17,15 +17,14 @@ def clean_ocr_text(text):
17
  def extract_text_markdown(doc, image_paths):
18
  markdown_output = ""
19
  image_counter = 1
 
20
 
21
  for page_num, page in enumerate(doc):
22
  blocks = page.get_text("dict")["blocks"]
23
  elements = []
24
 
25
- # 🔁 Añadir texto normal (bloques)
26
  for b in blocks:
27
  y = b["bbox"][1]
28
-
29
  if b["type"] == 0: # Texto
30
  for line in b["lines"]:
31
  line_y = line["bbox"][1]
@@ -34,30 +33,29 @@ def extract_text_markdown(doc, image_paths):
34
  if line_text:
35
  elements.append((line_y, line_text, max_font_size))
36
 
37
- # 🖼️ Extraer imágenes reales de la página (xref)
38
  images_on_page = page.get_images(full=True)
39
  for img_index, img in enumerate(images_on_page):
40
  xref = img[0]
 
 
 
41
  try:
42
  base_image = page.parent.extract_image(xref)
43
  image_bytes = base_image["image"]
44
  ext = base_image["ext"]
45
  image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
46
-
47
  with open(image_path, "wb") as f:
48
  f.write(image_bytes)
49
-
50
  image_paths.append(image_path)
51
- y_pos = 50 + img_index * 10 # Posición estimada para ordenar
52
  elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
53
  image_counter += 1
54
  except Exception as e:
55
  elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
56
 
57
- # Ordenar y construir Markdown
58
  elements.sort(key=lambda x: x[0])
59
  previous_y = None
60
-
61
  for y, text, font_size in elements:
62
  is_header = font_size >= 14
63
  if previous_y is not None and abs(y - previous_y) > 10:
@@ -72,6 +70,7 @@ def extract_text_markdown(doc, image_paths):
72
 
73
  return markdown_output.strip()
74
 
 
75
  @spaces.GPU
76
  def convert(pdf_file):
77
  doc = fitz.open(pdf_file)
 
17
  def extract_text_markdown(doc, image_paths):
18
  markdown_output = ""
19
  image_counter = 1
20
+ seen_xrefs = set()
21
 
22
  for page_num, page in enumerate(doc):
23
  blocks = page.get_text("dict")["blocks"]
24
  elements = []
25
 
 
26
  for b in blocks:
27
  y = b["bbox"][1]
 
28
  if b["type"] == 0: # Texto
29
  for line in b["lines"]:
30
  line_y = line["bbox"][1]
 
33
  if line_text:
34
  elements.append((line_y, line_text, max_font_size))
35
 
36
+ # Extraer imágenes únicas por xref
37
  images_on_page = page.get_images(full=True)
38
  for img_index, img in enumerate(images_on_page):
39
  xref = img[0]
40
+ if xref in seen_xrefs:
41
+ continue # Saltar si ya se extrajo
42
+ seen_xrefs.add(xref)
43
  try:
44
  base_image = page.parent.extract_image(xref)
45
  image_bytes = base_image["image"]
46
  ext = base_image["ext"]
47
  image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
 
48
  with open(image_path, "wb") as f:
49
  f.write(image_bytes)
 
50
  image_paths.append(image_path)
51
+ y_pos = 50 + img_index * 10
52
  elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
53
  image_counter += 1
54
  except Exception as e:
55
  elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
56
 
 
57
  elements.sort(key=lambda x: x[0])
58
  previous_y = None
 
59
  for y, text, font_size in elements:
60
  is_header = font_size >= 14
61
  if previous_y is not None and abs(y - previous_y) > 10:
 
70
 
71
  return markdown_output.strip()
72
 
73
+
74
  @spaces.GPU
75
  def convert(pdf_file):
76
  doc = fitz.open(pdf_file)