Spaces:
Running
Running
File size: 2,970 Bytes
f3dd5d6 01dc73d f3dd5d6 01dc73d f3dd5d6 01dc73d f3dd5d6 01dc73d f3dd5d6 01dc73d f3dd5d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 |
from mistralai import Mistral
from mistralai.models import OCRResponse
# Model identifier passed as `model` to `client.ocr.process` in ocr_from_file.
OCR_MODEL = "mistral-ocr-latest"
# Model identifier passed as `model` to `client.chat.complete` in correct_text_with_ai.
CHAT_MODEL = "mistral-large-latest"
def ocr_from_file(file_path, api_key: str, mode="image"):
    """Upload a local file to Mistral and run OCR on it.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode
            and also sent as the upload's ``file_name``.
        api_key: Mistral API key used to construct the client.
        mode: ``"image"`` to submit the upload as an ``image_url`` document,
            or ``"pdf"`` to submit it as a ``document_url`` document.

    Returns:
        The response object returned by ``client.ocr.process``.

    Raises:
        ValueError: If ``api_key`` is empty, if ``mode`` is not one of the
            supported values, or if the client cannot be constructed.
    """
    if not api_key:
        raise ValueError("Mistral API Key is required.")

    # For both modes the document's "type" tag is also the name of the key
    # that carries the signed URL, so one lookup table covers both payloads.
    url_field_by_mode = {"image": "image_url", "pdf": "document_url"}
    # Reject an unknown mode before any network traffic; previously the file
    # was uploaded first and the function then failed with UnboundLocalError.
    if mode not in url_field_by_mode:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of "
            f"{sorted(url_field_by_mode)}."
        )

    try:
        client = Mistral(api_key=api_key)
    except Exception as e:
        # Keep the original cause attached for debugging.
        raise ValueError("API invalid.") from e

    # The context manager closes the handle once the upload call returns,
    # whether it succeeds or raises.
    with open(file_path, "rb") as file_handle:
        uploaded_file = client.files.upload(
            file={
                "file_name": file_path,
                "content": file_handle,
            },
            purpose="ocr",
        )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id)

    url_field = url_field_by_mode[mode]
    return client.ocr.process(
        model=OCR_MODEL,
        document={
            "type": url_field,
            url_field: signed_url.url,
        },
        include_image_base64=True,
    )
def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Join the markdown of every OCR page into one string.

    Pages are taken in the order they appear in ``ocr_response.pages`` and
    are separated from each other by a single blank line.
    """
    return "\n\n".join(page.markdown for page in ocr_response.pages)
def correct_text_with_ai(text: str, api_key: str) -> str:
    """Ask the Mistral chat model to proofread OCR-generated Markdown.

    Args:
        text: Markdown text sent as the user message.
        api_key: Mistral API key used to construct the client.

    Returns:
        The content of the first choice's message, or a string of the form
        ``"ERROR: <details>"`` when the client cannot be constructed.

    Raises:
        ValueError: If ``api_key`` is empty.
    """
    if not api_key:
        raise ValueError("Mistral API Key is required.")

    try:
        client = Mistral(api_key=api_key)
    except Exception as exc:
        # Construction failures are reported as text rather than raised.
        return f"ERROR: {exc!s}"

    # Continuation lines stay flush-left: any leading whitespace would become
    # part of the prompt that is sent to the model.
    system_prompt = """You are an expert proofreader specializing in Markdown formatting and OCR error correction. Your task is to meticulously review provided Markdown text that has been generated via OCR.
Your primary goal is to identify and correct **typographical errors, spelling mistakes, and redundant symbols** that are clearly a result of the OCR process.
Additionally, you must correct any illogical or jumbled line breaks to ensure proper Markdown paragraph formatting.
**Crucially, you must NOT alter the original meaning or content of the text.** Your corrections should be limited to:
* Obvious OCR-induced spelling errors
* Erroneous or redundant symbols
* Markdown formatting errors
* Jumbled or incorrect line breaks for proper paragraphing
After your thorough review, output the carefully corrected Markdown text. JUST the text."""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    completion = client.chat.complete(
        model=CHAT_MODEL,
        messages=conversation,
        temperature=0.1,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def perform_raw_ocr(input_file, api_key: str):
    """Return the text of an uploaded file, running OCR unless it is plain text.

    Args:
        input_file: The uploaded file, or ``None``. It must expose a ``name``
            attribute (used to read the extension) and is itself passed to
            ``open`` / ``ocr_from_file`` as the path.
            NOTE(review): this looks like a path-like string carrying a
            ``name`` attribute -- confirm against the caller.
        api_key: Mistral API key, forwarded to ``ocr_from_file``.

    Returns:
        ``"File/Text not found"`` when ``input_file`` is ``None``; the file's
        contents for ``.txt`` files; otherwise the combined markdown of the
        OCR response.
    """
    # Identity check: ``!= None`` would dispatch to the object's own __ne__.
    if input_file is None:
        return "File/Text not found"

    # Text after the last dot, lower-cased; a name with no dot yields the
    # whole name, which then falls through to the "image" case below.
    file_ext = input_file.name.rsplit(".", 1)[-1].lower()

    if file_ext == "txt":
        # Plain text needs no OCR: return it verbatim.
        with open(input_file, "r", encoding="utf-8") as f:
            return f.read()

    # Everything that is not a PDF is submitted as an image.
    file_type = "pdf" if file_ext == "pdf" else "image"
    response = ocr_from_file(input_file, api_key, file_type)
    return get_combined_markdown(response)
|