Svngoku committed on
Commit
274798e
·
verified ·
1 Parent(s): 5361f7d

push from home

Browse files
Files changed (1) hide show
  1. app.py +60 -31
app.py CHANGED
@@ -15,6 +15,7 @@ import requests
15
  import shutil
16
  from concurrent.futures import ThreadPoolExecutor
17
  import time
 
18
 
19
  # Constants
20
  DEFAULT_LANGUAGE = "English"
@@ -68,46 +69,64 @@ class OCRProcessor:
68
 
69
  @staticmethod
70
  def _encode_image(image_path: str) -> Optional[str]:
71
- with open(image_path, "rb") as image_file:
72
- return base64.b64encode(image_file.read()).decode('utf-8')
 
 
 
 
73
 
74
  @staticmethod
75
  def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
76
- file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{filename}")
77
- if isinstance(file_input, str) and file_input.startswith("http"):
78
- response = requests.get(file_input, timeout=10)
79
- response.raise_for_status()
80
- with open(file_path, 'wb') as f:
81
- f.write(response.content)
82
- else:
83
- with open(file_path, 'wb') as f:
84
- if hasattr(file_input, 'read'):
85
- shutil.copyfileobj(file_input, f)
86
- else:
87
- f.write(file_input)
88
- return file_path
 
 
 
 
 
 
 
 
 
 
89
 
90
  @staticmethod
91
  def _pdf_to_images(pdf_path: str) -> List[str]:
92
- pdf_document = fitz.open(pdf_path)
93
- if pdf_document.page_count > MAX_PDF_PAGES:
 
 
 
 
 
 
 
 
 
94
  pdf_document.close()
95
- raise ValueError(f"PDF exceeds maximum page limit of {MAX_PDF_PAGES}")
96
-
97
- with ThreadPoolExecutor() as executor:
98
- image_paths = list(executor.map(
99
- lambda i: OCRProcessor._convert_page(pdf_path, i),
100
- range(pdf_document.page_count)
101
- ))
102
- pdf_document.close()
103
- return [path for path in image_paths if path]
104
 
105
  @staticmethod
106
  def _convert_page(pdf_path: str, page_num: int) -> Optional[str]:
107
  try:
108
  pdf_document = fitz.open(pdf_path)
109
  page = pdf_document[page_num]
110
- pix = page.get_pixmap(dpi=150) # Improved resolution
111
  image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
112
  pix.save(image_path)
113
  pdf_document.close()
@@ -134,12 +153,18 @@ class OCRProcessor:
134
  try:
135
  self._check_file_size(pdf_file)
136
  pdf_path = self._save_uploaded_file(pdf_file, file_name)
 
 
 
 
 
137
  image_paths = self._pdf_to_images(pdf_path)
138
 
139
- uploaded_file = self.client.files.upload(
140
- file={"file_name": pdf_path, "content": open(pdf_path, "rb")},
141
- purpose="ocr"
142
- )
 
143
  signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=TEMP_FILE_EXPIRY)
144
  response = self._call_ocr_api(DocumentURLChunk(document_url=signed_url.url))
145
  return self._get_combined_markdown(response), image_paths
@@ -153,6 +178,8 @@ class OCRProcessor:
153
  self._check_file_size(image_file)
154
  image_path = self._save_uploaded_file(image_file, file_name)
155
  encoded_image = self._encode_image(image_path)
 
 
156
  base64_url = f"data:image/jpeg;base64,{encoded_image}"
157
  response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
158
  return self._get_combined_markdown(response), image_path
@@ -180,6 +207,8 @@ class OCRProcessor:
180
  self._check_file_size(image_file)
181
  image_path = self._save_uploaded_file(image_file, file_name)
182
  encoded_image = self._encode_image(image_path)
 
 
183
  base64_url = f"data:image/jpeg;base64,{encoded_image}"
184
 
185
  ocr_response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
 
15
  import shutil
16
  from concurrent.futures import ThreadPoolExecutor
17
  import time
18
+ import fitz # PyMuPDF
19
 
20
  # Constants
21
  DEFAULT_LANGUAGE = "English"
 
69
 
70
  @staticmethod
71
  def _encode_image(image_path: str) -> Optional[str]:
72
+ try:
73
+ with open(image_path, "rb") as image_file:
74
+ return base64.b64encode(image_file.read()).decode('utf-8')
75
+ except Exception as e:
76
+ logger.error(f"Error encoding image {image_path}: {str(e)}")
77
+ return None
78
 
79
  @staticmethod
80
  def _save_uploaded_file(file_input: Union[str, bytes], filename: str) -> str:
81
+ clean_filename = os.path.basename(filename).replace(os.sep, "_")
82
+ file_path = os.path.join(UPLOAD_FOLDER, f"{int(time.time())}_{clean_filename}")
83
+
84
+ try:
85
+ if isinstance(file_input, str) and file_input.startswith("http"):
86
+ response = requests.get(file_input, timeout=10)
87
+ response.raise_for_status()
88
+ with open(file_path, 'wb') as f:
89
+ f.write(response.content)
90
+ elif isinstance(file_input, str) and os.path.exists(file_input):
91
+ shutil.copy2(file_input, file_path)
92
+ else:
93
+ with open(file_path, 'wb') as f:
94
+ if hasattr(file_input, 'read'):
95
+ shutil.copyfileobj(file_input, f)
96
+ else:
97
+ f.write(file_input)
98
+ if not os.path.exists(file_path):
99
+ raise FileNotFoundError(f"Failed to save file at {file_path}")
100
+ return file_path
101
+ except Exception as e:
102
+ logger.error(f"Error saving file {filename}: {str(e)}")
103
+ raise
104
 
105
  @staticmethod
106
  def _pdf_to_images(pdf_path: str) -> List[str]:
107
+ try:
108
+ pdf_document = fitz.open(pdf_path)
109
+ if pdf_document.page_count > MAX_PDF_PAGES:
110
+ pdf_document.close()
111
+ raise ValueError(f"PDF exceeds maximum page limit of {MAX_PDF_PAGES}")
112
+
113
+ with ThreadPoolExecutor() as executor:
114
+ image_paths = list(executor.map(
115
+ lambda i: OCRProcessor._convert_page(pdf_path, i),
116
+ range(pdf_document.page_count)
117
+ ))
118
  pdf_document.close()
119
+ return [path for path in image_paths if path]
120
+ except Exception as e:
121
+ logger.error(f"Error converting PDF to images: {str(e)}")
122
+ return []
 
 
 
 
 
123
 
124
  @staticmethod
125
  def _convert_page(pdf_path: str, page_num: int) -> Optional[str]:
126
  try:
127
  pdf_document = fitz.open(pdf_path)
128
  page = pdf_document[page_num]
129
+ pix = page.get_pixmap(dpi=150)
130
  image_path = os.path.join(UPLOAD_FOLDER, f"page_{page_num + 1}_{int(time.time())}.png")
131
  pix.save(image_path)
132
  pdf_document.close()
 
153
  try:
154
  self._check_file_size(pdf_file)
155
  pdf_path = self._save_uploaded_file(pdf_file, file_name)
156
+ logger.info(f"Saved PDF to: {pdf_path}")
157
+
158
+ if not os.path.exists(pdf_path):
159
+ raise FileNotFoundError(f"Saved PDF not found at: {pdf_path}")
160
+
161
  image_paths = self._pdf_to_images(pdf_path)
162
 
163
+ with open(pdf_path, "rb") as f:
164
+ uploaded_file = self.client.files.upload(
165
+ file={"file_name": file_name, "content": f},
166
+ purpose="ocr"
167
+ )
168
  signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=TEMP_FILE_EXPIRY)
169
  response = self._call_ocr_api(DocumentURLChunk(document_url=signed_url.url))
170
  return self._get_combined_markdown(response), image_paths
 
178
  self._check_file_size(image_file)
179
  image_path = self._save_uploaded_file(image_file, file_name)
180
  encoded_image = self._encode_image(image_path)
181
+ if not encoded_image:
182
+ raise ValueError("Failed to encode image")
183
  base64_url = f"data:image/jpeg;base64,{encoded_image}"
184
  response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))
185
  return self._get_combined_markdown(response), image_path
 
207
  self._check_file_size(image_file)
208
  image_path = self._save_uploaded_file(image_file, file_name)
209
  encoded_image = self._encode_image(image_path)
210
+ if not encoded_image:
211
+ raise ValueError("Failed to encode image")
212
  base64_url = f"data:image/jpeg;base64,{encoded_image}"
213
 
214
  ocr_response = self._call_ocr_api(ImageURLChunk(image_url=base64_url))