Spaces:
Sleeping
Sleeping
File size: 4,766 Bytes
1af10cc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
"""
OCR (Optical Character Recognition) tools for TutorX.
"""
import base64
import io
import tempfile
from typing import Dict, Any, Optional, Tuple
# import fitz  # PyMuPDF
import pytesseract
from PIL import Image, ImageEnhance
import numpy as np
from mcp_server.mcp_instance import mcp
def preprocess_image(image: Image.Image) -> Image.Image:
    """
    Prepare an image for OCR.

    The image is converted to 8-bit grayscale, then its contrast and
    sharpness are each enhanced by a factor of 2.0.

    Args:
        image: Input PIL Image

    Returns:
        Preprocessed PIL Image
    """
    grayscale = image.convert('L')
    contrasted = ImageEnhance.Contrast(grayscale).enhance(2.0)
    return ImageEnhance.Sharpness(contrasted).enhance(2.0)
def extract_text_from_image(image: Image.Image) -> str:
    """
    Extract text from an image using Tesseract OCR.

    The image is run through preprocess_image first and then passed to
    Tesseract with the English language model.

    Args:
        image: PIL Image to process

    Returns:
        Extracted text with leading and trailing whitespace removed

    Raises:
        RuntimeError: If preprocessing or the Tesseract call fails. The
            underlying exception is attached as ``__cause__``.
    """
    try:
        processed_image = preprocess_image(image)
        text = pytesseract.image_to_string(processed_image, lang='eng')
    except Exception as e:
        # Chain the cause so the original traceback stays available
        raise RuntimeError(f"Error during OCR processing: {str(e)}") from e
    return text.strip()
def extract_text_from_pdf(pdf_data: bytes) -> Tuple[str, int]:
    """
    Extract text from a PDF file.

    Each page's embedded text layer is used when present. Pages whose text
    layer is empty (or whitespace only) are rendered to a PNG image and
    passed through extract_text_from_image instead.

    Args:
        pdf_data: PDF file content as bytes

    Returns:
        Tuple of (extracted_text, page_count), where extracted_text is the
        per-page text joined with blank lines

    Raises:
        RuntimeError: If PyMuPDF is not installed, or if opening, reading or
            OCR-ing the document fails. The underlying exception is attached
            as ``__cause__``.
    """
    # PyMuPDF is imported here because this module has no active
    # module-level import of fitz; without this the call below would fail
    # with a NameError.
    try:
        import fitz  # PyMuPDF
    except ImportError as e:
        raise RuntimeError(
            "Error processing PDF: PyMuPDF (fitz) is not installed"
        ) from e

    try:
        with fitz.open(stream=pdf_data, filetype="pdf") as doc:
            page_count = len(doc)
            extracted_text = []
            for page in doc:
                text = page.get_text()
                # If no text is found, fall back to OCR on the rendered page
                if not text.strip():
                    pix = page.get_pixmap()
                    with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                        text = extract_text_from_image(img)
                extracted_text.append(text)
        return "\n\n".join(extracted_text), page_count
    except Exception as e:
        raise RuntimeError(f"Error processing PDF: {str(e)}") from e
@mcp.tool()
async def pdf_ocr(request: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract text from a PDF file using OCR.

    Expected request format:
    {
        "pdf_data": "base64_encoded_pdf_data",
        "filename": "document.pdf"  # Optional
    }

    Returns:
        On success, a dictionary with "success", "filename", "page_count",
        "extracted_text", "character_count", "word_count" and
        "processing_time_ms" (elapsed wall-clock time for the call, in whole
        milliseconds). On failure, a dictionary with a single "error" key
        describing the problem; no exception is raised to the caller.
    """
    import time  # local import: only used to time this call

    started = time.perf_counter()
    try:
        # Get and validate input
        pdf_data_b64 = request.get("pdf_data")
        if not pdf_data_b64:
            return {"error": "Missing required field: pdf_data"}

        # Decode base64 data
        try:
            pdf_data = base64.b64decode(pdf_data_b64)
        except Exception as e:
            return {"error": f"Invalid base64 data: {str(e)}"}

        # Extract text from PDF
        extracted_text, page_count = extract_text_from_pdf(pdf_data)

        # Report the measured duration rather than a constant placeholder
        elapsed_ms = int((time.perf_counter() - started) * 1000)
        return {
            "success": True,
            "filename": request.get("filename", "document.pdf"),
            "page_count": page_count,
            "extracted_text": extracted_text,
            "character_count": len(extracted_text),
            "word_count": len(extracted_text.split()),
            "processing_time_ms": elapsed_ms,
        }
    except Exception as e:
        # Tool boundary: report failures as data instead of raising
        return {"error": f"Error processing PDF: {str(e)}"}
@mcp.tool()
async def image_to_text(image_data: str) -> Dict[str, Any]:
    """
    Extract text from an image using OCR.

    Args:
        image_data: Base64 encoded image data. A leading data-URL header
            (for example "data:image/png;base64,") is accepted and ignored.

    Returns:
        On success, a dictionary with "success", "extracted_text",
        "character_count", "word_count", "image_size" and "image_mode".
        On failure, a dictionary with a single "error" key describing the
        problem; no exception is raised to the caller.
    """
    try:
        if not image_data:
            return {"error": "Missing required field: image_data"}

        # b64decode silently drops non-alphabet characters, so a data-URL
        # header would otherwise be mixed into the payload and corrupt it.
        if isinstance(image_data, str) and image_data.startswith("data:"):
            image_data = image_data.partition(",")[2]

        # Decode base64 image data
        image_bytes = base64.b64decode(image_data)

        # Open the image and make sure it is closed once OCR is done
        with Image.open(io.BytesIO(image_bytes)) as image:
            text = extract_text_from_image(image)
            image_size = image.size
            image_mode = image.mode

        return {
            "success": True,
            "extracted_text": text,
            "character_count": len(text),
            "word_count": len(text.split()),
            "image_size": image_size,
            "image_mode": image_mode,
        }
    except Exception as e:
        # Tool boundary: report failures as data instead of raising
        return {"error": f"Error processing image: {str(e)}"}
|