Biifruu committed on
Commit
dd8d861
·
verified ·
1 Parent(s): 095781d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -7
app.py CHANGED
@@ -4,6 +4,8 @@ import fitz # PyMuPDF
4
  from PIL import Image
5
  import pytesseract
6
  import os
 
 
7
 
8
  def clean_ocr_text(text):
9
  lines = text.splitlines()
@@ -33,12 +35,12 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
33
  if line_text:
34
  elements.append((line_y, line_text, max_font_size))
35
 
36
- # Extraer imágenes únicas (por xref, global)
37
  images_on_page = page.get_images(full=True)
38
  for img_index, img in enumerate(images_on_page):
39
  xref = img[0]
40
  if xref in seen_xrefs:
41
- continue # ya extraída
42
  seen_xrefs.add(xref)
43
  try:
44
  base_image = page.parent.extract_image(xref)
@@ -53,7 +55,6 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
53
  except Exception as e:
54
  elements.append((float("inf"), f"[Error imagen: {e}]", 10))
55
 
56
- # Ordenar por posición
57
  elements.sort(key=lambda x: x[0])
58
  previous_y = None
59
 
@@ -75,7 +76,7 @@ def convert(pdf_file):
75
  doc = fitz.open(pdf_file)
76
  markdown_output = ""
77
  image_paths = []
78
- seen_xrefs = set() # <<-- GLOBAL para todo el PDF
79
 
80
  for page_num in range(len(doc)):
81
  page = doc[page_num]
@@ -94,18 +95,36 @@ def convert(pdf_file):
94
 
95
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
96
 
 
97
  try:
98
- ocr_text = pytesseract.image_to_string(img, lang="spa")
99
- except pytesseract.TesseractError:
100
  ocr_text = pytesseract.image_to_string(img)
 
 
101
 
102
  ocr_text = clean_ocr_text(ocr_text)
103
  if ocr_text.strip():
104
  markdown_output += ocr_text + "\n"
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  markdown_output += "\n---\n\n"
107
 
108
- # Guardar como archivo .md
109
  markdown_path = "/tmp/resultado.md"
110
  with open(markdown_path, "w", encoding="utf-8") as f:
111
  f.write(markdown_output)
 
4
  from PIL import Image
5
  import pytesseract
6
  import os
7
+ import numpy as np
8
+ import cv2
9
 
10
  def clean_ocr_text(text):
11
  lines = text.splitlines()
 
35
  if line_text:
36
  elements.append((line_y, line_text, max_font_size))
37
 
38
+ # Extraer imágenes únicas (por xref)
39
  images_on_page = page.get_images(full=True)
40
  for img_index, img in enumerate(images_on_page):
41
  xref = img[0]
42
  if xref in seen_xrefs:
43
+ continue
44
  seen_xrefs.add(xref)
45
  try:
46
  base_image = page.parent.extract_image(xref)
 
55
  except Exception as e:
56
  elements.append((float("inf"), f"[Error imagen: {e}]", 10))
57
 
 
58
  elements.sort(key=lambda x: x[0])
59
  previous_y = None
60
 
 
76
  doc = fitz.open(pdf_file)
77
  markdown_output = ""
78
  image_paths = []
79
+ seen_xrefs = set()
80
 
81
  for page_num in range(len(doc)):
82
  page = doc[page_num]
 
95
 
96
  markdown_output += f"![imagen_pagina_{page_num + 1}]({image_path})\n"
97
 
98
+ # OCR
99
  try:
 
 
100
  ocr_text = pytesseract.image_to_string(img)
101
+ except pytesseract.TesseractError:
102
+ ocr_text = ""
103
 
104
  ocr_text = clean_ocr_text(ocr_text)
105
  if ocr_text.strip():
106
  markdown_output += ocr_text + "\n"
107
 
108
+ # Detección de imágenes dentro de la imagen completa (por contornos)
109
+ try:
110
+ img_cv = np.array(img)
111
+ gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
112
+ _, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
113
+ contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
114
+
115
+ for i, cnt in enumerate(contours):
116
+ x, y, w, h = cv2.boundingRect(cnt)
117
+ if w > 50 and h > 50:
118
+ region = img_cv[y:y+h, x:x+w]
119
+ detected_path = f"/tmp/img_detectada_p{page_num + 1}_{i + 1}.jpg"
120
+ Image.fromarray(region).save(detected_path)
121
+ image_paths.append(detected_path)
122
+ markdown_output += f"\n\n![imagen_detectada]({detected_path})\n"
123
+ except Exception as e:
124
+ markdown_output += f"\n\n[Error al detectar imágenes embebidas: {e}]\n"
125
+
126
  markdown_output += "\n---\n\n"
127
 
 
128
  markdown_path = "/tmp/resultado.md"
129
  with open(markdown_path, "w", encoding="utf-8") as f:
130
  f.write(markdown_output)