Biifruu committed on
Commit
8ff21c1
·
verified ·
1 Parent(s): 0a2dbbc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -7
app.py CHANGED
@@ -14,13 +14,12 @@ def clean_ocr_text(text):
14
  cleaned_lines.append(line)
15
  return "\n".join(cleaned_lines)
16
 
17
- def extract_text_markdown(doc, image_paths, page_index):
18
  markdown_output = f"\n## Página {page_index + 1}\n\n"
19
  image_counter = 1
20
- seen_xrefs = set()
21
  elements = []
22
 
23
- page = doc[0] # Solo se procesa una página cada vez
24
 
25
  blocks = page.get_text("dict")["blocks"]
26
 
@@ -34,12 +33,12 @@ def extract_text_markdown(doc, image_paths, page_index):
34
  if line_text:
35
  elements.append((line_y, line_text, max_font_size))
36
 
37
- # Extraer imágenes únicas
38
  images_on_page = page.get_images(full=True)
39
  for img_index, img in enumerate(images_on_page):
40
  xref = img[0]
41
  if xref in seen_xrefs:
42
- continue
43
  seen_xrefs.add(xref)
44
  try:
45
  base_image = page.parent.extract_image(xref)
@@ -49,7 +48,6 @@ def extract_text_markdown(doc, image_paths, page_index):
49
  with open(image_path, "wb") as f:
50
  f.write(image_bytes)
51
  image_paths.append(image_path)
52
- # Usar posición alta para insertar al final del Markdown
53
  elements.append((float("inf") - img_index, f"\n\n![imagen_{image_counter}]({image_path})\n", 10))
54
  image_counter += 1
55
  except Exception as e:
@@ -77,13 +75,14 @@ def convert(pdf_file):
77
  doc = fitz.open(pdf_file)
78
  markdown_output = ""
79
  image_paths = []
 
80
 
81
  for page_num in range(len(doc)):
82
  page = doc[page_num]
83
  text = page.get_text("text").strip()
84
 
85
  if len(text) > 30:
86
- markdown_output += extract_text_markdown([page], image_paths, page_num) + "\n"
87
  else:
88
  markdown_output += f"\n## Página {page_num + 1}\n\n"
89
  pix = page.get_pixmap(dpi=300)
 
14
  cleaned_lines.append(line)
15
  return "\n".join(cleaned_lines)
16
 
17
+ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
18
  markdown_output = f"\n## Página {page_index + 1}\n\n"
19
  image_counter = 1
 
20
  elements = []
21
 
22
+ page = doc[0] # solo una página en cada llamada
23
 
24
  blocks = page.get_text("dict")["blocks"]
25
 
 
33
  if line_text:
34
  elements.append((line_y, line_text, max_font_size))
35
 
36
+ # Extraer imágenes únicas (por xref, global)
37
  images_on_page = page.get_images(full=True)
38
  for img_index, img in enumerate(images_on_page):
39
  xref = img[0]
40
  if xref in seen_xrefs:
41
+ continue # ya extraída
42
  seen_xrefs.add(xref)
43
  try:
44
  base_image = page.parent.extract_image(xref)
 
48
  with open(image_path, "wb") as f:
49
  f.write(image_bytes)
50
  image_paths.append(image_path)
 
51
  elements.append((float("inf") - img_index, f"\n\n![imagen_{image_counter}]({image_path})\n", 10))
52
  image_counter += 1
53
  except Exception as e:
 
75
  doc = fitz.open(pdf_file)
76
  markdown_output = ""
77
  image_paths = []
78
+ seen_xrefs = set() # <<-- GLOBAL para todo el PDF
79
 
80
  for page_num in range(len(doc)):
81
  page = doc[page_num]
82
  text = page.get_text("text").strip()
83
 
84
  if len(text) > 30:
85
+ markdown_output += extract_text_markdown([page], image_paths, page_num, seen_xrefs) + "\n"
86
  else:
87
  markdown_output += f"\n## Página {page_num + 1}\n\n"
88
  pix = page.get_pixmap(dpi=300)