Nihal2000 committed
Commit 16ca714 · verified · 1 Parent(s): 74d5794

Update services/ocr_service.py

Files changed (1)
  1. services/ocr_service.py +250 -286
services/ocr_service.py CHANGED
@@ -1,324 +1,288 @@
  import logging
- from typing import Optional, List, Dict, Any
  import asyncio
  from pathlib import Path
- import tempfile
  import os

- from PIL import Image
- import pytesseract
- import config

  logger = logging.getLogger(__name__)

  class OCRService:
      def __init__(self):
-         self.config = config.config
-
-         # Configure Tesseract path if specified
-         if self.config.TESSERACT_PATH:
-             pytesseract.pytesseract.tesseract_cmd = self.config.TESSERACT_PATH

-         self.language = self.config.OCR_LANGUAGE
-
-         # Test OCR availability
-         self._test_ocr_availability()
-
-     def _test_ocr_availability(self):
-         """Test if OCR is available and working"""
-         try:
-             # Create a simple test image
-             test_image = Image.new('RGB', (100, 30), color='white')
-             pytesseract.image_to_string(test_image)
-             logger.info("OCR service initialized successfully")
-         except Exception as e:
-             logger.warning(f"OCR may not be available: {str(e)}")
-
-     async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
-         """Extract text from an image file"""
          try:
-             # Use specified language or default
-             lang = language or self.language
-
-             # Load image
-             image = Image.open(image_path)
-
-             # Perform OCR in thread pool to avoid blocking
-             loop = asyncio.get_event_loop()
-             text = await loop.run_in_executor(
-                 None,
-                 self._extract_text_sync,
-                 image,
-                 lang
-             )
-
-             return text.strip()
-
          except Exception as e:
-             logger.error(f"Error extracting text from image {image_path}: {str(e)}")
              return ""
-
-     def _extract_text_sync(self, image: Image.Image, language: str) -> str:
-         """Synchronous text extraction"""
          try:
-             # Optimize image for OCR
-             processed_image = self._preprocess_image(image)
-
-             # Configure OCR
-             config_string = '--psm 6' # Assume a single uniform block of text

-             # Extract text
-             text = pytesseract.image_to_string(
-                 processed_image,
-                 lang=language,
-                 config=config_string
              )

-             return text
-         except Exception as e:
-             logger.error(f"Error in synchronous OCR: {str(e)}")
-             return ""
-
-     def _preprocess_image(self, image: Image.Image) -> Image.Image:
-         """Preprocess image to improve OCR accuracy"""
-         try:
-             # Convert to grayscale if not already
-             if image.mode != 'L':
-                 image = image.convert('L')
-
-             # Resize image if too small (OCR works better on larger images)
-             width, height = image.size
-             if width < 300 or height < 300:
-                 scale_factor = max(300 / width, 300 / height)
-                 new_width = int(width * scale_factor)
-                 new_height = int(height * scale_factor)
-                 image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-
-             return image
-         except Exception as e:
-             logger.error(f"Error preprocessing image: {str(e)}")
-             return image
-
-     async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
-         """Extract text from PDF by converting pages to images and running OCR"""
-         try:
-             import fitz # PyMuPDF
-
-             texts = []
-
-             # Open PDF
-             pdf_document = fitz.open(pdf_path)
-
-             for page_num in range(len(pdf_document)):
-                 try:
-                     # Get page
-                     page = pdf_document[page_num]
-
-                     # Convert page to image
-                     mat = fitz.Matrix(2.0, 2.0) # Scale factor for better quality
-                     pix = page.get_pixmap(matrix=mat)
-                     img_data = pix.tobytes("ppm")

-                     # Create PIL image from bytes
-                     with tempfile.NamedTemporaryFile(suffix='.ppm', delete=False) as tmp_file:
-                         tmp_file.write(img_data)
-                         tmp_file.flush()
-
-                         # Extract text from image
-                         page_text = await self.extract_text_from_image(tmp_file.name)
-                         texts.append(page_text)
-
-                     # Clean up temporary file
-                     os.unlink(tmp_file.name)

-                 except Exception as e:
-                     logger.warning(f"Error processing PDF page {page_num}: {str(e)}")
-                     texts.append("")
-
-             pdf_document.close()
-             return texts

-         except ImportError:
-             logger.error("PyMuPDF not available for PDF OCR")
-             return []
          except Exception as e:
-             logger.error(f"Error extracting text from PDF images: {str(e)}")
-             return []
-
      async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
-         """Extract text with confidence scores"""
-         try:
-             image = Image.open(image_path)
-
-             # Get detailed OCR data with confidence scores
-             loop = asyncio.get_event_loop()
-             ocr_data = await loop.run_in_executor(
-                 None,
-                 self._extract_detailed_data,
-                 image
-             )
-
-             # Filter by confidence
-             filtered_text = []
-             word_confidences = []
-
-             for i, confidence in enumerate(ocr_data.get('conf', [])):
-                 if confidence > min_confidence * 100: # Tesseract uses 0-100 scale
-                     text = ocr_data.get('text', [])[i]
-                     if text.strip():
-                         filtered_text.append(text)
-                         word_confidences.append(confidence / 100.0) # Convert to 0-1 scale
-
-             return {
-                 "text": " ".join(filtered_text),
-                 "confidence": sum(word_confidences) / len(word_confidences) if word_confidences else 0.0,
-                 "word_count": len(filtered_text),
-                 "raw_data": ocr_data
-             }
-
-         except Exception as e:
-             logger.error(f"Error extracting text with confidence: {str(e)}")
-             return {
-                 "text": "",
-                 "confidence": 0.0,
-                 "word_count": 0,
-                 "error": str(e)
-             }
-
-     def _extract_detailed_data(self, image: Image.Image) -> Dict[str, Any]:
-         """Extract detailed OCR data with positions and confidence"""
-         try:
-             processed_image = self._preprocess_image(image)
-
-             # Get detailed data
-             data = pytesseract.image_to_data(
-                 processed_image,
-                 lang=self.language,
-                 config='--psm 6',
-                 output_type=pytesseract.Output.DICT
-             )
-
-             return data
-         except Exception as e:
-             logger.error(f"Error extracting detailed OCR data: {str(e)}")
-             return {}
-
      async def detect_language(self, image_path: str) -> str:
-         """Detect the language of text in an image"""
-         try:
-             image = Image.open(image_path)
-
-             # Run language detection
-             loop = asyncio.get_event_loop()
-             languages = await loop.run_in_executor(
-                 None,
-                 pytesseract.image_to_osd,
-                 image
-             )
-
-             # Parse the output to get the language
-             for line in languages.split('\n'):
-                 if 'Script:' in line:
-                     script = line.split(':')[1].strip()
-                     # Map script to language code
-                     script_to_lang = {
-                         'Latin': 'eng',
-                         'Arabic': 'ara',
-                         'Chinese': 'chi_sim',
-                         'Japanese': 'jpn',
-                         'Korean': 'kor'
-                     }
-                     return script_to_lang.get(script, 'eng')
-
-             return 'eng' # Default to English
-
-         except Exception as e:
-             logger.error(f"Error detecting language: {str(e)}")
-             return 'eng'
-
      async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
-         """Extract table data from an image"""
-         try:
-             # This is a basic implementation
-             # For better table extraction, consider using specialized libraries like table-transformer
-
-             image = Image.open(image_path)
-
-             # Use specific PSM for tables
-             loop = asyncio.get_event_loop()
-             text = await loop.run_in_executor(
-                 None,
-                 lambda: pytesseract.image_to_string(
-                     image,
-                     lang=self.language,
-                     config='--psm 6 -c preserve_interword_spaces=1'
-                 )
-             )
-
-             # Simple table parsing (assumes space/tab separated)
-             lines = text.split('\n')
              table_data = []
-
              for line in lines:
-                 if line.strip():
-                     # Split by multiple spaces or tabs
-                     cells = [cell.strip() for cell in line.split() if cell.strip()]
-                     if cells:
                          table_data.append(cells)

              return table_data
-
-         except Exception as e:
-             logger.error(f"Error extracting tables from image: {str(e)}")
-             return []
-
      async def get_supported_languages(self) -> List[str]:
-         """Get list of supported OCR languages"""
-         try:
-             languages = pytesseract.get_languages()
-             return sorted(languages)
-         except Exception as e:
-             logger.error(f"Error getting supported languages: {str(e)}")
-             return ['eng'] # Default to English only
-
      async def validate_ocr_setup(self) -> Dict[str, Any]:
-         """Validate OCR setup and return status"""
          try:
-             # Test basic functionality
-             test_image = Image.new('RGB', (200, 50), color='white')
-
-             from PIL import ImageDraw, ImageFont
-             draw = ImageDraw.Draw(test_image)
-
-             try:
-                 # Try to use a default font
-                 draw.text((10, 10), "Test OCR", fill='black')
-             except:
-                 # Fall back to basic text without font
-                 draw.text((10, 10), "Test", fill='black')
-
-             # Test OCR
-             result = pytesseract.image_to_string(test_image)
-
-             # Get available languages
-             languages = await self.get_supported_languages()
-
              return {
                  "status": "operational",
-                 "tesseract_version": pytesseract.get_tesseract_version(),
-                 "available_languages": languages,
-                 "current_language": self.language,
-                 "test_result": result.strip(),
-                 "tesseract_path": pytesseract.pytesseract.tesseract_cmd
              }
-
          except Exception as e:
-             return {
-                 "status": "error",
-                 "error": str(e),
-                 "tesseract_path": pytesseract.pytesseract.tesseract_cmd
-             }

-     def extract_text(self, file_path):
-         # Dummy implementation for OCR
-         return "OCR functionality not implemented yet."
+
  import logging
  import asyncio
  from pathlib import Path
  import os
+ import base64 # For encoding files
+ from typing import Optional, List, Dict, Any
+ import json

+ from mistralai import Mistral
+ from mistralai.models import SDKError
+ # PIL (Pillow) for dummy image creation in main_example
+ from PIL import Image, ImageDraw, ImageFont

  logger = logging.getLogger(__name__)

  class OCRService:
      def __init__(self):
+         self.api_key = os.environ.get("MISTRAL_API_KEY")
+         if not self.api_key:
+             logger.error("MISTRAL_API_KEY environment variable not set.")
+             raise ValueError("MISTRAL_API_KEY not found in environment variables.")

+         self.client = Mistral(api_key=self.api_key)
+         self.ocr_model_name = "mistral-ocr-latest"
+         self.language = 'eng'
+         logger.info(f"OCRService (using Mistral AI model {self.ocr_model_name}) initialized.")
+
+     def _encode_file_to_base64(self, file_path: str) -> Optional[str]:
          try:
+             with open(file_path, "rb") as file_to_encode:
+                 return base64.b64encode(file_to_encode.read()).decode('utf-8')
+         except FileNotFoundError:
+             logger.error(f"Error: The file {file_path} was not found for Base64 encoding.")
+             return None
          except Exception as e:
+             logger.error(f"Error during Base64 encoding for {file_path}: {e}")
+             return None
+
+     # In OCRService class:
+
+     async def _process_file_with_mistral(self, file_path: str, mime_type: str) -> str:
+         file_name = Path(file_path).name
+         logger.info(f"Preparing to process file: {file_name} (MIME: {mime_type}) with Mistral OCR.")
+
+         base64_encoded_file = self._encode_file_to_base64(file_path)
+         if not base64_encoded_file:
+             logger.warning(f"Base64 encoding failed for {file_name}, cannot process.")
              return ""
+
+         document_type = "image_url" if mime_type.startswith("image/") else "document_url"
+         uri_key = "image_url" if document_type == "image_url" else "document_url"
+         data_uri = f"data:{mime_type};base64,{base64_encoded_file}"
+
+         document_payload = {
+             "type": document_type,
+             uri_key: data_uri
+         }
          try:
+             logger.info(f"Calling Mistral client.ocr.process for {file_name} with model {self.ocr_model_name}.")
+             loop = asyncio.get_event_loop()

+             ocr_response = await loop.run_in_executor(
+                 None,
+                 lambda: self.client.ocr.process(
+                     model=self.ocr_model_name,
+                     document=document_payload,
+                     include_image_base64=False
+                 )
              )

+             logger.info(f"Received OCR response for {file_name}. Type: {type(ocr_response)}")
+
+             extracted_markdown = ""
+             if hasattr(ocr_response, 'pages') and ocr_response.pages and isinstance(ocr_response.pages, list):
+                 all_pages_markdown = []
+                 for i, page in enumerate(ocr_response.pages):
+                     page_content = None
+                     if hasattr(page, 'markdown') and page.markdown: # Check for 'markdown' attribute
+                         page_content = page.markdown
+                         logger.debug(f"Extracted content from page {i} using 'page.markdown'.")
+                     elif hasattr(page, 'markdown_content') and page.markdown_content:
+                         page_content = page.markdown_content
+                         logger.debug(f"Extracted content from page {i} using 'page.markdown_content'.")
+                     elif hasattr(page, 'text') and page.text:
+                         page_content = page.text
+                         logger.debug(f"Extracted content from page {i} using 'page.text'.")

+                     if page_content:
+                         all_pages_markdown.append(page_content)
+                     else:
+                         page_details_for_log = str(page)[:200] # Default to string snippet
+                         if hasattr(page, '__dict__'):
+                             page_details_for_log = str(vars(page))[:200] # Log part of vars if it's an object
+                         logger.warning(f"Page {i} in OCR response for {file_name} has no 'markdown', 'markdown_content', or 'text'. Page details: {page_details_for_log}")

+                 if all_pages_markdown:
+                     extracted_markdown = "\n\n---\nPage Break (simulated)\n---\n\n".join(all_pages_markdown) # Simulate page breaks
+                 else:
+                     logger.warning(f"'pages' attribute found but no content extracted from any pages for {file_name}.")
+
+             # Fallbacks if ocr_response doesn't have 'pages' but might have direct text/markdown
+             elif hasattr(ocr_response, 'text') and ocr_response.text:
+                 extracted_markdown = ocr_response.text
+                 logger.info(f"Extracted content from 'ocr_response.text' (no pages structure) for {file_name}.")
+             elif hasattr(ocr_response, 'markdown') and ocr_response.markdown:
+                 extracted_markdown = ocr_response.markdown
+                 logger.info(f"Extracted content from 'ocr_response.markdown' (no pages structure) for {file_name}.")
+             elif isinstance(ocr_response, str) and ocr_response:
+                 extracted_markdown = ocr_response
+                 logger.info(f"OCR response is a direct non-empty string for {file_name}.")
+             else:
+                 logger.warning(f"Could not extract markdown from OCR response for {file_name} using known attributes (pages, text, markdown).")
+
+             if not extracted_markdown.strip():
+                 logger.warning(f"Extracted markdown is empty for {file_name} after all parsing attempts.")

+             return extracted_markdown.strip()
+
+         except SDKError as e:
+             logger.error(f"Mistral API Exception during client.ocr.process for {file_name}: {e.message}")
+             logger.exception("SDKError details:")
+             return ""
          except Exception as e:
+             logger.error(f"Generic Exception during Mistral client.ocr.process call for {file_name}: {e}")
+             logger.exception("Exception details:")
+             return ""
+
+     async def extract_text_from_image(self, image_path: str, language: Optional[str] = None) -> str:
+         if language:
+             logger.info(f"Language parameter '{language}' provided, but Mistral OCR is broadly multilingual.")
+
+         ext = Path(image_path).suffix.lower()
+         mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png',
+                     '.gif': 'image/gif', '.bmp': 'image/bmp', '.tiff': 'image/tiff', '.webp': 'image/webp',
+                     '.avif': 'image/avif'}
+         mime_type = mime_map.get(ext)
+         if not mime_type:
+             logger.warning(f"Unsupported image extension '{ext}' for path '{image_path}'. Attempting with 'application/octet-stream'.")
+             mime_type = 'application/octet-stream'
+
+         return await self._process_file_with_mistral(image_path, mime_type)
+
+     async def extract_text_from_pdf(self, pdf_path: str) -> str:
+         return await self._process_file_with_mistral(pdf_path, "application/pdf")
+
+     async def extract_text_from_pdf_images(self, pdf_path: str) -> List[str]:
+         logger.info("Mistral processes PDFs directly. This method will return the full Markdown content as a single list item.")
+         full_markdown = await self._process_file_with_mistral(pdf_path, "application/pdf")
+         if full_markdown:
+             return [full_markdown]
+         return [""]
+
      async def extract_text_with_confidence(self, image_path: str, min_confidence: float = 0.5) -> Dict[str, Any]:
+         logger.warning("Mistral Document AI API (ocr.process) typically returns structured text (Markdown). Word-level confidence scores are not standard. 'confidence' field is a placeholder.")
+
+         ext = Path(image_path).suffix.lower()
+         mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.avif': 'image/avif'}
+         mime_type = mime_map.get(ext)
+         if not mime_type:
+             logger.warning(f"Unsupported image extension '{ext}' in extract_text_with_confidence. Defaulting mime type.")
+             mime_type = 'application/octet-stream'
+
+         text_markdown = await self._process_file_with_mistral(image_path, mime_type)
+
+         return {
+             "text": text_markdown,
+             "confidence": 0.0,
+             "word_count": len(text_markdown.split()) if text_markdown else 0,
+             "raw_data": "Mistral ocr.process response contains structured data. See logs from _process_file_with_mistral for details."
+         }
+
      async def detect_language(self, image_path: str) -> str:
+         logger.warning("Mistral OCR is multilingual; explicit language detection is not part of client.ocr.process.")
+         return 'eng'
+
      async def extract_tables_from_image(self, image_path: str) -> List[List[str]]:
+         logger.info("Extracting text (Markdown) from image using Mistral. Mistral OCR preserves table structures in Markdown.")
+
+         ext = Path(image_path).suffix.lower()
+         mime_map = {'.jpeg': 'image/jpeg', '.jpg': 'image/jpeg', '.png': 'image/png', '.avif': 'image/avif'}
+         mime_type = mime_map.get(ext)
+         if not mime_type:
+             logger.warning(f"Unsupported image extension '{ext}' in extract_tables_from_image. Defaulting mime type.")
+             mime_type = 'application/octet-stream'
+
+         markdown_content = await self._process_file_with_mistral(image_path, mime_type)
+
+         if markdown_content:
+             logger.info("Attempting basic parsing of Markdown tables. For complex tables, a dedicated parser is recommended.")
              table_data = []
+             # Simplified parsing logic for example purposes - can be improved significantly.
+             lines = markdown_content.split('\n')
              for line in lines:
+                 stripped_line = line.strip()
+                 if stripped_line.startswith('|') and stripped_line.endswith('|') and "---" not in stripped_line:
+                     cells = [cell.strip() for cell in stripped_line.strip('|').split('|')]
+                     if any(cells):
                          table_data.append(cells)

+             if table_data:
+                 logger.info(f"Extracted {len(table_data)} lines potentially forming tables using basic parsing.")
+             else:
+                 logger.info("No distinct table structures found with basic parsing from extracted markdown.")
              return table_data
+         return []
+
      async def get_supported_languages(self) -> List[str]:
+         logger.info("Mistral OCR is multilingual. Refer to official Mistral AI documentation for details.")
+         return ['eng', 'multilingual (refer to Mistral documentation)']
+
      async def validate_ocr_setup(self) -> Dict[str, Any]:
          try:
+             models_response = await asyncio.to_thread(self.client.models.list)
+             model_ids = [model.id for model in models_response.data]
              return {
                  "status": "operational",
+                 "message": "Mistral client initialized. API key present. Model listing successful.",
+                 "mistral_available_models_sample": model_ids[:5],
+                 "configured_ocr_model": self.ocr_model_name,
              }
+         except SDKError as e:
+             logger.error(f"Mistral API Exception during setup validation: {e.message}")
+             return { "status": "error", "error": f"Mistral API Error: {e.message}"}
          except Exception as e:
+             logger.error(f"Generic error during Mistral OCR setup validation: {str(e)}")
+             return { "status": "error", "error": str(e) }

+     def extract_text(self, file_path: str) -> str:
+         logger.warning("`extract_text` is a synchronous method. Running async Mistral OCR in a blocking way.")
+         try:
+             ext = Path(file_path).suffix.lower()
+             if ext in ['.jpeg', '.jpg', '.png', '.gif', '.bmp', '.tiff', '.webp', '.avif']:
+                 result = asyncio.run(self.extract_text_from_image(file_path))
+             elif ext == '.pdf':
+                 result = asyncio.run(self.extract_text_from_pdf(file_path))
+             else:
+                 logger.error(f"Unsupported file type for sync extract_text: {file_path}")
+                 return "Unsupported file type."
+             return result
+         except Exception as e:
+             logger.error(f"Error in synchronous extract_text for {file_path}: {str(e)}")
+             return "Error during sync extraction."
+
+ # Example of how to use the OCRService (main execution part)
+ async def main_example():
+     logging.basicConfig(level=logging.DEBUG,
+                         format='%(asctime)s - %(levelname)s - %(name)s - %(funcName)s - %(message)s')
+
+     if not os.environ.get("MISTRAL_API_KEY"):
+         logger.error("MISTRAL_API_KEY environment variable is not set. Please set it: export MISTRAL_API_KEY='yourkey'")
+         return
+
+     ocr_service = OCRService()
+
+     logger.info("--- Validating OCR Service Setup ---")
+     validation_status = await ocr_service.validate_ocr_setup()
+     logger.info(f"OCR Service Validation: {validation_status}")
+     if validation_status.get("status") == "error":
+         logger.error("Halting due to validation error.")
+         return
+
+     # --- Test with a specific PDF file ---
+     pdf_path_to_test = r"C:\path\to\your\certificate.pdf"
+
+     if os.path.exists(pdf_path_to_test):
+         logger.info(f"\n--- Extracting text from specific PDF: {pdf_path_to_test} ---")
+         # Using the method that aligns with original `extract_text_from_pdf_images` signature
+         pdf_markdown_list = await ocr_service.extract_text_from_pdf_images(pdf_path_to_test)
+         if pdf_markdown_list and pdf_markdown_list[0]:
+             logger.info(f"Extracted Markdown from PDF ({pdf_path_to_test}):\n" + pdf_markdown_list[0])
+         else:
+             logger.warning(f"No text extracted from PDF {pdf_path_to_test} or an error occurred.")
+     else:
+         logger.warning(f"PDF file for specific test '{pdf_path_to_test}' not found. Skipping this test.")
+         logger.warning("Please update `pdf_path_to_test` in `main_example` to a valid PDF path.")
+
+     image_path = "dummy_test_image_ocr.png"
+     if os.path.exists(image_path):
+         logger.info(f"\n---Extracting text from image: {image_path} ---")
+         # ... image processing logic ...
+         pass
+     else:
+         logger.info(f"Dummy image {image_path} not created or found, skipping optional image test.")
+
+
+ if __name__ == '__main__':
+     asyncio.run(main_example())
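
For reference, the committed `extract_tables_from_image` describes its pipe-splitting as simplified parsing that can be improved. A minimal standalone sketch of a stricter approach is shown below; it is not part of the commit, `parse_markdown_tables` is a hypothetical helper name, and it assumes the OCR Markdown uses GitHub-style pipe tables with `|---|` separator rows:

# Sketch only: groups consecutive pipe-delimited rows into separate tables and
# skips the header/body separator row instead of relying on a "---" substring check.
from typing import List


def parse_markdown_tables(markdown: str) -> List[List[List[str]]]:
    """Return one list of rows per Markdown pipe table found in the text."""
    tables: List[List[List[str]]] = []
    current: List[List[str]] = []
    for raw_line in markdown.splitlines():
        line = raw_line.strip()
        if line.startswith('|') and line.endswith('|'):
            cells = [cell.strip() for cell in line.strip('|').split('|')]
            # Skip separator rows such as | --- | :---: |
            if all(cell and set(cell) <= set('-: ') for cell in cells):
                continue
            current.append(cells)
        elif current:
            # A non-table line ends the current table.
            tables.append(current)
            current = []
    if current:
        tables.append(current)
    return tables


# Example: two data rows are kept, the separator row is dropped.
sample = "| Name | Score |\n| --- | --- |\n| Ada | 9 |"
print(parse_markdown_tables(sample))  # [[['Name', 'Score'], ['Ada', '9']]]

Unlike the committed loop, this sketch keeps each table separate rather than flattening all pipe rows into a single list, which makes downstream handling of multi-table pages easier.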