Biifruu committed on
Commit
0a2dbbc
·
verified ·
1 Parent(s): 650dd50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -54
app.py CHANGED
@@ -14,63 +14,64 @@ def clean_ocr_text(text):
14
  cleaned_lines.append(line)
15
  return "\n".join(cleaned_lines)
16
 
17
- def extract_text_markdown(doc, image_paths):
18
- markdown_output = ""
19
  image_counter = 1
20
  seen_xrefs = set()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- for page_num, page in enumerate(doc):
23
- blocks = page.get_text("dict")["blocks"]
24
- elements = []
25
-
26
- for b in blocks:
27
- y = b["bbox"][1]
28
- if b["type"] == 0: # Texto
29
- for line in b["lines"]:
30
- line_y = line["bbox"][1]
31
- line_text = " ".join([span["text"] for span in line["spans"]]).strip()
32
- max_font_size = max([span.get("size", 10) for span in line["spans"]])
33
- if line_text:
34
- elements.append((line_y, line_text, max_font_size))
35
-
36
- # Extraer imágenes únicas por xref
37
- images_on_page = page.get_images(full=True)
38
- for img_index, img in enumerate(images_on_page):
39
- xref = img[0]
40
- if xref in seen_xrefs:
41
- continue # Saltar si ya se extrajo
42
- seen_xrefs.add(xref)
43
- try:
44
- base_image = page.parent.extract_image(xref)
45
- image_bytes = base_image["image"]
46
- ext = base_image["ext"]
47
- image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
48
- with open(image_path, "wb") as f:
49
- f.write(image_bytes)
50
- image_paths.append(image_path)
51
- y_pos = 50 + img_index * 10
52
- elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
53
- image_counter += 1
54
- except Exception as e:
55
- elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
56
-
57
- elements.sort(key=lambda x: x[0])
58
- previous_y = None
59
- for y, text, font_size in elements:
60
- is_header = font_size >= 14
61
- if previous_y is not None and abs(y - previous_y) > 10:
62
- markdown_output += "\n"
63
- if is_header:
64
- markdown_output += f"\n### {text.strip()}\n"
65
- else:
66
- markdown_output += text.strip() + "\n"
67
- previous_y = y
68
-
69
- markdown_output += "\n---\n\n"
70
-
71
  return markdown_output.strip()
72
 
73
-
74
  @spaces.GPU
75
  def convert(pdf_file):
76
  doc = fitz.open(pdf_file)
@@ -82,8 +83,9 @@ def convert(pdf_file):
82
  text = page.get_text("text").strip()
83
 
84
  if len(text) > 30:
85
- markdown_output += extract_text_markdown([page], image_paths) + "\n"
86
  else:
 
87
  pix = page.get_pixmap(dpi=300)
88
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
89
 
@@ -102,7 +104,7 @@ def convert(pdf_file):
102
  if ocr_text.strip():
103
  markdown_output += ocr_text + "\n"
104
 
105
- markdown_output += "\n---\n\n"
106
 
107
  # Guardar como archivo .md
108
  markdown_path = "/tmp/resultado.md"
 
14
  cleaned_lines.append(line)
15
  return "\n".join(cleaned_lines)
16
 
17
def extract_text_markdown(doc, image_paths, page_index, *, header_min_size=14, paragraph_gap=10):
    """Render one PDF page as Markdown: its text lines first, then its images.

    Args:
        doc: Indexable sequence whose first item is the page to render. Only
            ``doc[0]`` is read. The page must provide ``get_text("dict")``,
            ``get_images(full=True)`` and ``parent.extract_image(xref)``.
        image_paths: List that receives the path of every image file written
            by this call (mutated in place).
        page_index: Zero-based page number; used for the ``## Página N``
            heading and for the image file names.
        header_min_size: A text line whose largest span size is at least this
            value is emitted as a ``###`` heading.
        paragraph_gap: A blank line is inserted between two consecutive text
            lines whose top coordinates differ by more than this value.

    Returns:
        The Markdown for the page, ending with a ``---`` separator, with
        leading and trailing whitespace stripped.

    Side effects:
        Each distinct image xref on the page is written to
        ``/tmp/imagen_p<page>_<index>.<ext>``. A failure while extracting or
        writing an image is reported inline as ``[Error imagen: ...]`` instead
        of being raised.
    """
    page = doc[0]  # callers hand over a one-page sequence; only that page is rendered

    # Collect (top y, text, largest font size) for every non-empty text line.
    text_lines = []
    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:  # type 0 is a text block; other blocks carry no lines
            continue
        for line in block.get("lines", []):
            spans = line.get("spans", [])
            line_text = " ".join(span["text"] for span in spans).strip()
            if not line_text:
                # Also covers a line without spans, where max() would raise.
                continue
            largest_size = max(span.get("size", 10) for span in spans)
            text_lines.append((line["bbox"][1], line_text, largest_size))

    # Reading order is top-to-bottom; the sort is stable for equal y values.
    text_lines.sort(key=lambda item: item[0])

    # Images are kept in their own list rather than being given a sentinel
    # position, so they always follow the text in discovery order.
    trailing_entries = []
    seen_xrefs = set()
    image_counter = 1
    for img_index, img in enumerate(page.get_images(full=True)):
        xref = img[0]
        if xref in seen_xrefs:
            continue  # the same image object may be listed more than once
        seen_xrefs.add(xref)
        try:
            base_image = page.parent.extract_image(xref)
            image_bytes = base_image["image"]
            ext = base_image["ext"]
            image_path = f"/tmp/imagen_p{page_index + 1}_{img_index + 1}.{ext}"
            with open(image_path, "wb") as f:
                f.write(image_bytes)
            image_paths.append(image_path)
            trailing_entries.append(f"![imagen_{image_counter}]({image_path})")
            image_counter += 1
        except Exception as e:
            # Best effort: one broken image must not abort the whole page.
            trailing_entries.append(f"[Error imagen: {e}]")

    parts = [f"\n## Página {page_index + 1}\n\n"]
    previous_y = None
    for y, text, font_size in text_lines:
        if previous_y is not None and abs(y - previous_y) > paragraph_gap:
            parts.append("\n")
        if font_size >= header_min_size:
            parts.append(f"\n### {text}\n")
        else:
            parts.append(text + "\n")
        previous_y = y

    if trailing_entries:
        if text_lines:
            parts.append("\n")  # one blank line between the text and the image list
        parts.extend(entry + "\n" for entry in trailing_entries)

    parts.append("\n---\n\n")
    return "".join(parts).strip()
74
 
 
75
  @spaces.GPU
76
  def convert(pdf_file):
77
  doc = fitz.open(pdf_file)
 
83
  text = page.get_text("text").strip()
84
 
85
  if len(text) > 30:
86
+ markdown_output += extract_text_markdown([page], image_paths, page_num) + "\n"
87
  else:
88
+ markdown_output += f"\n## Página {page_num + 1}\n\n"
89
  pix = page.get_pixmap(dpi=300)
90
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
91
 
 
104
  if ocr_text.strip():
105
  markdown_output += ocr_text + "\n"
106
 
107
+ markdown_output += "\n---\n\n"
108
 
109
  # Guardar como archivo .md
110
  markdown_path = "/tmp/resultado.md"