Spaces:
Running
Running
push from home
Browse files
app.py
CHANGED
@@ -15,6 +15,7 @@ import requests
|
|
15 |
import shutil
|
16 |
from concurrent.futures import ThreadPoolExecutor
|
17 |
import time
|
|
|
18 |
|
19 |
# Constants
|
20 |
DEFAULT_LANGUAGE = "English"
|
@@ -68,46 +69,64 @@ class OCRProcessor:
|
|
68 |
|
69 |
@staticmethod
|
70 |
def _encode_image(image_path: str) -> Optional[str]:
|
71 |
-
|
72 |
-
|
|
|
|
|
|
|
|
|
73 |
|
74 |
@staticmethod
|
75 |
def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
|
90 |
@staticmethod
|
91 |
def _pdf_to_images(pdf_path: str) -> List[str]:
|
92 |
-
|
93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
pdf_document.close()
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
lambda i: OCRProcessor._convert_page(pdf_path, i),
|
100 |
-
range(pdf_document.page_count)
|
101 |
-
))
|
102 |
-
pdf_document.close()
|
103 |
-
return [path for path in image_paths if path]
|
104 |
|
105 |
@staticmethod
|
106 |
def _convert_page(pdf_path: str, page_num: int) -> Optional[str]:
|
107 |
try:
|
108 |
pdf_document = fitz.open(pdf_path)
|
109 |
page = pdf_document[page_num]
|
110 |
-
pix = page.get_pixmap(dpi=150)
|
111 |
image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
|
112 |
pix.save(image_path)
|
113 |
pdf_document.close()
|
@@ -134,12 +153,18 @@ class OCRProcessor:
|
|
134 |
try:
|
135 |
self._check_file_size(pdf_file)
|
136 |
pdf_path = self._save_uploaded_file(pdf_file, file_name)
|
|
|
|
|
|
|
|
|
|
|
137 |
image_paths = self._pdf_to_images(pdf_path)
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
|
|
143 |
signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=TEMP_FILE_EXPIRY)
|
144 |
response = self._call_ocr_api(DocumentURLChunk(document_url=signed_url.url))
|
145 |
return self._get_combined_markdown(response), image_paths
|
@@ -153,6 +178,8 @@ class OCRProcessor:
|
|
153 |
self._check_file_size(image_file)
|
154 |
image_path = self._save_uploaded_file(image_file, file_name)
|
155 |
encoded_image = self._encode_image(image_path)
|
|
|
|
|
156 |
base64_url = f"data:image/jpeg;base64,{encoded_image}"
|
157 |
response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
|
158 |
return self._get_combined_markdown(response), image_path
|
@@ -180,6 +207,8 @@ class OCRProcessor:
|
|
180 |
self._check_file_size(image_file)
|
181 |
image_path = self._save_uploaded_file(image_file, file_name)
|
182 |
encoded_image = self._encode_image(image_path)
|
|
|
|
|
183 |
base64_url = f"data:image/jpeg;base64,{encoded_image}"
|
184 |
|
185 |
ocr_response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
|
|
|
15 |
import shutil
|
16 |
from concurrent.futures import ThreadPoolExecutor
|
17 |
import time
|
18 |
+
import fitz # PyMuPDF
|
19 |
|
20 |
# Constants
|
21 |
DEFAULT_LANGUAGE = "English"
|
|
|
69 |
|
70 |
@staticmethod
def _encode_image(image_path: str) -> Optional[str]:
    """Read an image file and return its contents as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file bytes decoded as UTF-8, or
        ``None`` if reading or encoding raised; the error is logged
        rather than propagated.
    """
    try:
        with open(image_path, "rb") as handle:
            raw_bytes = handle.read()
        encoded = base64.b64encode(raw_bytes)
        return encoded.decode('utf-8')
    except Exception as e:
        # Best-effort helper: report the problem and signal it with None.
        logger.error(f"Error encoding image {image_path}: {str(e)}")
        return None
|
78 |
|
79 |
@staticmethod
def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
    """Store an upload inside ``UPLOAD_FOLDER`` and return the saved path.

    The destination name is the current Unix timestamp (whole seconds)
    followed by the base name of ``filename``.

    Args:
        file_input: One of
            * an ``http://`` / ``https://`` URL, which is downloaded;
            * a path to an existing local file, which is copied;
            * a readable file-like object, which is streamed to disk;
            * a bytes-like object, which is written as-is.
        filename: Original file name; only its base name is used.

    Returns:
        Path of the file written under ``UPLOAD_FOLDER``.

    Raises:
        FileNotFoundError: ``file_input`` is a string that is neither a
            URL nor an existing path, or the destination is missing
            after the write.
        Exception: Any download or I/O error is logged and re-raised.
    """
    clean_filename = os.path.basename(filename).replace(os.sep, "_")
    file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")

    try:
        if isinstance(file_input, str):
            # Require a real scheme so local names such as "http_scan.pdf"
            # are not mistaken for URLs.
            if file_input.startswith(("http://", "https://")):
                response = requests.get(file_input, timeout=10)
                response.raise_for_status()
                with open(file_path, 'wb') as f:
                    f.write(response.content)
            elif os.path.exists(file_input):
                shutil.copy2(file_input, file_path)
            else:
                # Fail before creating the destination; previously this
                # fell through to the bytes branch, left an empty file
                # behind and raised a misleading TypeError.
                raise FileNotFoundError(f"Input file not found: {file_input}")
        else:
            with open(file_path, 'wb') as f:
                if hasattr(file_input, 'read'):
                    shutil.copyfileobj(file_input, f)
                else:
                    f.write(file_input)
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Failed to save file at {file_path}")
        return file_path
    except Exception as e:
        logger.error(f"Error saving file {filename}: {str(e)}")
        raise
|
104 |
|
105 |
@staticmethod
def _pdf_to_images(pdf_path: str) -> List[str]:
    """Render every page of a PDF to an image file, one page per task.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        The non-empty image paths returned by
        ``OCRProcessor._convert_page``, in page order. An empty list is
        returned when anything fails, including when the document has
        more than ``MAX_PDF_PAGES`` pages; the error is logged, not
        raised.
    """
    try:
        # The document is opened only to read the page count. Workers
        # reopen the file themselves, so release this handle right away
        # and make sure it is closed even if reading the count fails.
        pdf_document = fitz.open(pdf_path)
        try:
            page_count = pdf_document.page_count
        finally:
            pdf_document.close()

        if page_count > MAX_PDF_PAGES:
            raise ValueError(f"PDF exceeds maximum page limit of {MAX_PDF_PAGES}")

        with ThreadPoolExecutor() as executor:
            image_paths = list(executor.map(
                lambda i: OCRProcessor._convert_page(pdf_path, i),
                range(page_count)
            ))
        # Drop pages whose conversion produced no path.
        return [path for path in image_paths if path]
    except Exception as e:
        logger.error(f"Error converting PDF to images: {str(e)}")
        return []
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
@staticmethod
|
125 |
def _convert_page(pdf_path: str, page_num: int) -> Optional[str]:
|
126 |
try:
|
127 |
pdf_document = fitz.open(pdf_path)
|
128 |
page = pdf_document[page_num]
|
129 |
+
pix = page.get_pixmap(dpi=150)
|
130 |
image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
|
131 |
pix.save(image_path)
|
132 |
pdf_document.close()
|
|
|
153 |
try:
|
154 |
self._check_file_size(pdf_file)
|
155 |
pdf_path = self._save_uploaded_file(pdf_file, file_name)
|
156 |
+
logger.info(f"Saved PDF to: {pdf_path}")
|
157 |
+
|
158 |
+
if not os.path.exists(pdf_path):
|
159 |
+
raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
|
160 |
+
|
161 |
image_paths = self._pdf_to_images(pdf_path)
|
162 |
|
163 |
+
with open(pdf_path, "rb") as f:
|
164 |
+
uploaded_file = self.client.files.upload(
|
165 |
+
file={"file_name": file_name, "content": f},
|
166 |
+
purpose="ocr"
|
167 |
+
)
|
168 |
signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=TEMP_FILE_EXPIRY)
|
169 |
response = self._call_ocr_api(DocumentURLChunk(document_url=signed_url.url))
|
170 |
return self._get_combined_markdown(response), image_paths
|
|
|
178 |
self._check_file_size(image_file)
|
179 |
image_path = self._save_uploaded_file(image_file, file_name)
|
180 |
encoded_image = self._encode_image(image_path)
|
181 |
+
if not encoded_image:
|
182 |
+
raise ValueError("Failed to encode image")
|
183 |
base64_url = f"data:image/jpeg;base64,{encoded_image}"
|
184 |
response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
|
185 |
return self._get_combined_markdown(response), image_path
|
|
|
207 |
self._check_file_size(image_file)
|
208 |
image_path = self._save_uploaded_file(image_file, file_name)
|
209 |
encoded_image = self._encode_image(image_path)
|
210 |
+
if not encoded_image:
|
211 |
+
raise ValueError("Failed to encode image")
|
212 |
base64_url = f"data:image/jpeg;base64,{encoded_image}"
|
213 |
|
214 |
ocr_response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
|