Biifruu committed on
Commit
4f878aa
·
verified ·
1 Parent(s): 584dc82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -20
app.py CHANGED
@@ -22,6 +22,7 @@ def extract_text_markdown(doc, image_paths):
22
  blocks = page.get_text("dict")["blocks"]
23
  elements = []
24
 
 
25
  for b in blocks:
26
  y = b["bbox"][1]
27
 
@@ -33,31 +34,38 @@ def extract_text_markdown(doc, image_paths):
33
  if line_text:
34
  elements.append((line_y, line_text, max_font_size))
35
 
36
- elif b["type"] == 1: # Imagen
37
- try:
38
- image = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=b["bbox"])
39
- image_path = f"/tmp/imagen_embebida_{page_num + 1}_{image_counter}.jpg"
40
- image.save(image_path)
41
- image_paths.append(image_path)
42
- elements.append((y, f"![imagen_{image_counter}]({image_path})", 10))
43
- image_counter += 1
44
- except Exception as e:
45
- elements.append((y, f"[Error al procesar imagen: {e}]", 10))
46
-
 
 
 
 
 
 
 
 
 
 
47
  elements.sort(key=lambda x: x[0])
48
  previous_y = None
49
 
50
  for y, text, font_size in elements:
51
  is_header = font_size >= 14
52
-
53
  if previous_y is not None and abs(y - previous_y) > 10:
54
  markdown_output += "\n"
55
-
56
  if is_header:
57
  markdown_output += f"\n### {text.strip()}\n"
58
  else:
59
  markdown_output += text.strip() + "\n"
60
-
61
  previous_y = y
62
 
63
  markdown_output += "\n---\n\n"
@@ -75,14 +83,11 @@ def convert(pdf_file):
75
  text = page.get_text("text").strip()
76
 
77
  if len(text) > 30:
78
- # Página con texto normal
79
  markdown_output += extract_text_markdown([page], image_paths) + "\n"
80
  else:
81
- # Página vacía o con imagen: hacer OCR
82
  pix = page.get_pixmap(dpi=300)
83
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
84
 
85
- # Guardar imagen completa
86
  image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
87
  img.save(image_path)
88
  image_paths.append(image_path)
@@ -92,7 +97,7 @@ def convert(pdf_file):
92
  try:
93
  ocr_text = pytesseract.image_to_string(img, lang="spa")
94
  except pytesseract.TesseractError:
95
- ocr_text = pytesseract.image_to_string(img) # fallback sin lang
96
 
97
  ocr_text = clean_ocr_text(ocr_text)
98
  if ocr_text.strip():
@@ -100,7 +105,12 @@ def convert(pdf_file):
100
 
101
  markdown_output += "\n---\n\n"
102
 
103
- return markdown_output.strip(), {}, image_paths
 
 
 
 
 
104
 
105
  gr.Interface(
106
  fn=convert,
@@ -108,6 +118,7 @@ gr.Interface(
108
  outputs=[
109
  gr.Markdown(label="Markdown estructurado"),
110
  gr.JSON(label="Metadata"),
111
- gr.Gallery(label="Imágenes extraídas", type="file")
 
112
  ],
113
  ).launch()
 
22
  blocks = page.get_text("dict")["blocks"]
23
  elements = []
24
 
25
+ # 🔁 Añadir texto normal (bloques)
26
  for b in blocks:
27
  y = b["bbox"][1]
28
 
 
34
  if line_text:
35
  elements.append((line_y, line_text, max_font_size))
36
 
37
+ # 🖼️ Extraer imágenes reales de la página (xref)
38
+ images_on_page = page.get_images(full=True)
39
+ for img_index, img in enumerate(images_on_page):
40
+ xref = img[0]
41
+ try:
42
+ base_image = doc.extract_image(xref)
43
+ image_bytes = base_image["image"]
44
+ ext = base_image["ext"]
45
+ image_path = f"/tmp/imagen_embebida_{page_num + 1}_{img_index + 1}.{ext}"
46
+
47
+ with open(image_path, "wb") as f:
48
+ f.write(image_bytes)
49
+
50
+ image_paths.append(image_path)
51
+ y_pos = 50 + img_index * 10 # Posición estimada para ordenar
52
+ elements.append((y_pos, f"![imagen_{image_counter}]({image_path})", 10))
53
+ image_counter += 1
54
+ except Exception as e:
55
+ elements.append((50 + img_index * 10, f"[Error imagen: {e}]", 10))
56
+
57
+ # Ordenar y construir Markdown
58
  elements.sort(key=lambda x: x[0])
59
  previous_y = None
60
 
61
  for y, text, font_size in elements:
62
  is_header = font_size >= 14
 
63
  if previous_y is not None and abs(y - previous_y) > 10:
64
  markdown_output += "\n"
 
65
  if is_header:
66
  markdown_output += f"\n### {text.strip()}\n"
67
  else:
68
  markdown_output += text.strip() + "\n"
 
69
  previous_y = y
70
 
71
  markdown_output += "\n---\n\n"
 
83
  text = page.get_text("text").strip()
84
 
85
  if len(text) > 30:
 
86
  markdown_output += extract_text_markdown([page], image_paths) + "\n"
87
  else:
 
88
  pix = page.get_pixmap(dpi=300)
89
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
90
 
 
91
  image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
92
  img.save(image_path)
93
  image_paths.append(image_path)
 
97
  try:
98
  ocr_text = pytesseract.image_to_string(img, lang="spa")
99
  except pytesseract.TesseractError:
100
+ ocr_text = pytesseract.image_to_string(img)
101
 
102
  ocr_text = clean_ocr_text(ocr_text)
103
  if ocr_text.strip():
 
105
 
106
  markdown_output += "\n---\n\n"
107
 
108
+ # Guardar como archivo .md
109
+ markdown_path = "/tmp/resultado.md"
110
+ with open(markdown_path, "w", encoding="utf-8") as f:
111
+ f.write(markdown_output)
112
+
113
+ return markdown_output.strip(), {}, image_paths, markdown_path
114
 
115
  gr.Interface(
116
  fn=convert,
 
118
  outputs=[
119
  gr.Markdown(label="Markdown estructurado"),
120
  gr.JSON(label="Metadata"),
121
+ gr.Gallery(label="Imágenes extraídas", type="file"),
122
+ gr.File(label="Descargar .md")
123
  ],
124
  ).launch()