Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,6 +6,7 @@ import pytesseract
|
|
6 |
import os
|
7 |
import numpy as np
|
8 |
import cv2
|
|
|
9 |
|
10 |
def clean_ocr_text(text):
|
11 |
lines = text.splitlines()
|
@@ -67,6 +68,16 @@ def extract_text_markdown(doc, image_paths, page_index, seen_xrefs):
|
|
67 |
|
68 |
@spaces.GPU
|
69 |
def convert(pdf_bytes):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
temp_pdf_path = "/tmp/uploaded_file.pdf"
|
71 |
with open(temp_pdf_path, "wb") as f:
|
72 |
f.write(pdf_bytes)
|
@@ -86,18 +97,15 @@ def convert(pdf_bytes):
|
|
86 |
markdown_output += f"\n## Página {page_num + 1}\n\n"
|
87 |
pix = page.get_pixmap(dpi=300)
|
88 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
89 |
-
|
90 |
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
91 |
img.save(image_path)
|
92 |
image_paths.append(image_path)
|
93 |
-
|
94 |
markdown_output += f"\n"
|
95 |
|
96 |
try:
|
97 |
ocr_text = pytesseract.image_to_string(img)
|
98 |
except pytesseract.TesseractError:
|
99 |
ocr_text = ""
|
100 |
-
|
101 |
ocr_text = clean_ocr_text(ocr_text)
|
102 |
if ocr_text.strip():
|
103 |
markdown_output += ocr_text + "\n"
|
@@ -107,9 +115,7 @@ def convert(pdf_bytes):
|
|
107 |
gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
|
108 |
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
|
109 |
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
110 |
-
|
111 |
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
|
112 |
-
|
113 |
for i, cnt in enumerate(contours):
|
114 |
x, y, w, h = cv2.boundingRect(cnt)
|
115 |
area = w * h
|
|
|
6 |
import os
|
7 |
import numpy as np
|
8 |
import cv2
|
9 |
+
import base64
|
10 |
|
11 |
def clean_ocr_text(text):
|
12 |
lines = text.splitlines()
|
|
|
68 |
|
69 |
@spaces.GPU
|
70 |
def convert(pdf_bytes):
|
71 |
+
# Si no se recibe ningún PDF, usamos uno por defecto.
|
72 |
+
if not pdf_bytes:
|
73 |
+
default_pdf_base64 = (
|
74 |
+
"JVBERi0xLjQKMSAwIG9iago8PC9UeXBlIC9DYXRhbG9nIC9QYWdlcyAyIDAgUgovT3V0cHV0cyA8PC9Qcm9jU2V0Wy9QREZdPj4+"
|
75 |
+
"CmVuZG9iagoKMiAwIG9iago8PC9UeXBlIC9QYWdlcyAvS2lkcyBbMyAwIFJdIC9Db3VudCAxPj4KZW5kb2JqCgozIDAgb2JqCjw8"
|
76 |
+
"L1R5cGUgL1BhZ2UgL1BhZ2VzIDIgMCBSIC9NZWRpYUJveCBbMCAwIDMwMCAxNDNdIC9Db250ZW50cyA0IDAgUiA+PgplbmRvYmoK"
|
77 |
+
"CgoKZW5kb2JqCnhyZWYKMC45NgolJUVPRgo="
|
78 |
+
)
|
79 |
+
pdf_bytes = base64.b64decode(default_pdf_base64)
|
80 |
+
|
81 |
temp_pdf_path = "/tmp/uploaded_file.pdf"
|
82 |
with open(temp_pdf_path, "wb") as f:
|
83 |
f.write(pdf_bytes)
|
|
|
97 |
markdown_output += f"\n## Página {page_num + 1}\n\n"
|
98 |
pix = page.get_pixmap(dpi=300)
|
99 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
|
|
100 |
image_path = f"/tmp/ocr_page_{page_num + 1}.jpg"
|
101 |
img.save(image_path)
|
102 |
image_paths.append(image_path)
|
|
|
103 |
markdown_output += f"\n"
|
104 |
|
105 |
try:
|
106 |
ocr_text = pytesseract.image_to_string(img)
|
107 |
except pytesseract.TesseractError:
|
108 |
ocr_text = ""
|
|
|
109 |
ocr_text = clean_ocr_text(ocr_text)
|
110 |
if ocr_text.strip():
|
111 |
markdown_output += ocr_text + "\n"
|
|
|
115 |
gray = cv2.cvtColor(img_cv, cv2.COLOR_RGB2GRAY)
|
116 |
_, thresh = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
|
117 |
contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
|
|
118 |
contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]
|
|
|
119 |
for i, cnt in enumerate(contours):
|
120 |
x, y, w, h = cv2.boundingRect(cnt)
|
121 |
area = w * h
|