Biifruu committed on
Commit
6e5a37b
·
verified ·
1 Parent(s): 7d786f3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -9
app.py CHANGED
@@ -21,12 +21,13 @@ def extract_text_markdown(doc):
21
  markdown_output = ""
22
  image_counter = 1
23
 
24
- for page in doc:
25
  blocks = page.get_text("dict")["blocks"]
26
  elements = []
27
 
28
  for b in blocks:
29
  y = b["bbox"][1]
 
30
  if b["type"] == 0: # Texto
31
  for line in b["lines"]:
32
  line_y = line["bbox"][1]
@@ -34,14 +35,19 @@ def extract_text_markdown(doc):
34
  max_font_size = max([span.get("size", 10) for span in line["spans"]])
35
  if line_text:
36
  elements.append((line_y, line_text, max_font_size))
 
37
  elif b["type"] == 1: # Imagen
38
- elements.append((y, f"![imagen_{image_counter}](#)", 10))
39
- image_counter += 1
 
 
 
 
 
 
40
 
41
  elements.sort(key=lambda x: x[0])
42
-
43
  previous_y = None
44
- previous_font = None
45
 
46
  for y, text, font_size in elements:
47
  is_header = font_size >= 14
@@ -55,7 +61,6 @@ def extract_text_markdown(doc):
55
  markdown_output += text.strip() + "\n"
56
 
57
  previous_y = y
58
- previous_font = font_size
59
 
60
  markdown_output += "\n---\n\n"
61
 
@@ -75,11 +80,23 @@ def convert(pdf_file):
75
  # Página con texto normal
76
  markdown_output += extract_text_markdown([page]) + "\n"
77
  else:
78
- # Página sin texto: usar OCR
79
  pix = page.get_pixmap(dpi=300)
80
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
81
- ocr_text = pytesseract.image_to_string(img, lang="spa")
82
- markdown_output += clean_ocr_text(ocr_text) + "\n"
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  markdown_output += "\n---\n\n"
85
 
@@ -90,3 +107,4 @@ gr.Interface(
90
  inputs=[gr.File(label="Sube tu PDF", type="filepath")],
91
  outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
92
  ).launch()
 
 
21
  markdown_output = ""
22
  image_counter = 1
23
 
24
+ for page_num, page in enumerate(doc):
25
  blocks = page.get_text("dict")["blocks"]
26
  elements = []
27
 
28
  for b in blocks:
29
  y = b["bbox"][1]
30
+
31
  if b["type"] == 0: # Texto
32
  for line in b["lines"]:
33
  line_y = line["bbox"][1]
 
35
  max_font_size = max([span.get("size", 10) for span in line["spans"]])
36
  if line_text:
37
  elements.append((line_y, line_text, max_font_size))
38
+
39
  elif b["type"] == 1: # Imagen
40
+ try:
41
+ image = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=b["bbox"])
42
+ image_path = f"/tmp/imagen_embebida_{page_num + 1}_{image_counter}.jpg"
43
+ image.save(image_path)
44
+ elements.append((y, f"![imagen_{image_counter}]({image_path})", 10))
45
+ image_counter += 1
46
+ except Exception as e:
47
+ elements.append((y, f"[Error al procesar imagen: {e}]", 10))
48
 
49
  elements.sort(key=lambda x: x[0])
 
50
  previous_y = None
 
51
 
52
  for y, text, font_size in elements:
53
  is_header = font_size >= 14
 
61
  markdown_output += text.strip() + "\n"
62
 
63
  previous_y = y
 
64
 
65
  markdown_output += "\n---\n\n"
66
 
 
80
  # Página con texto normal
81
  markdown_output += extract_text_markdown([page]) + "\n"
82
  else:
83
+ # Página vacía o imagen: hacer OCR
84
  pix = page.get_pixmap(dpi=300)
85
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
86
+
87
+ # Guardar imagen escaneada completa
88
+ image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
89
+ img.save(image_path)
90
+ markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
91
+
92
+ try:
93
+ ocr_text = pytesseract.image_to_string(img, lang="spa")
94
+ except pytesseract.TesseractError:
95
+ ocr_text = pytesseract.image_to_string(img) # fallback sin lang
96
+
97
+ ocr_text = clean_ocr_text(ocr_text)
98
+ if ocr_text.strip():
99
+ markdown_output += ocr_text + "\n"
100
 
101
  markdown_output += "\n---\n\n"
102
 
 
107
  inputs=[gr.File(label="Sube tu PDF", type="filepath")],
108
  outputs=[gr.Text(label="Markdown estructurado"), gr.JSON(label="Metadata")],
109
  ).launch()
110
+