File size: 2,970 Bytes
f3dd5d6
 
 
 
 
 
 
 
01dc73d
f3dd5d6
 
 
01dc73d
 
 
 
 
 
f3dd5d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01dc73d
f3dd5d6
 
 
01dc73d
 
 
 
 
f3dd5d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
01dc73d
f3dd5d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from mistralai import Mistral
from mistralai.models import OCRResponse


# Model identifier passed as `model=` to `client.ocr.process` for OCR requests.
OCR_MODEL = "mistral-ocr-latest"
# Model identifier passed as `model=` to `client.chat.complete` for proofreading.
CHAT_MODEL = "mistral-large-latest"


def ocr_from_file(file_path, api_key: str, mode="image"):
	"""Upload a local file to Mistral and run OCR on the uploaded copy.

	Args:
		file_path: Path of the file to upload. It is opened in binary mode and
			is also sent as the upload's ``file_name``.
		api_key: Mistral API key used to construct the client.
		mode: ``"image"`` submits the upload as an ``image_url`` document,
			``"pdf"`` submits it as a ``document_url`` document.

	Returns:
		The response object returned by ``client.ocr.process``.

	Raises:
		ValueError: If ``api_key`` is empty, ``mode`` is neither ``"image"``
			nor ``"pdf"``, or the client cannot be constructed.
	"""
	if not api_key:
		raise ValueError("Mistral API Key is required.")

	# For both supported modes the document "type" and the key that carries
	# the URL share the same name, so one lookup yields both.
	url_field_by_mode = {"image": "image_url", "pdf": "document_url"}
	if mode not in url_field_by_mode:
		# Reject unknown modes before any upload happens; previously this fell
		# through to an unbound `ocr_response` after the file had been sent.
		raise ValueError(
			f"Unsupported mode {mode!r}; expected 'image' or 'pdf'."
		)
	url_field = url_field_by_mode[mode]

	try:
		client = Mistral(api_key=api_key)
	except Exception as exc:
		# Keep the original message but preserve the root cause for debugging.
		raise ValueError("API invalid.") from exc

	# The context manager guarantees the handle is closed even if the upload
	# raises.
	with open(file_path, "rb") as file_handle:
		uploaded_file = client.files.upload(
			file={
				"file_name": file_path,
				"content": file_handle,
			},
			purpose="ocr"
		)
	signed_url = client.files.get_signed_url(file_id=uploaded_file.id)

	return client.ocr.process(
		model=OCR_MODEL,
		document={
			"type": url_field,
			url_field: signed_url.url,
		},
		include_image_base64=True
	)


def get_combined_markdown(ocr_response: OCRResponse) -> str:
	"""Join the markdown of every page in ``ocr_response`` into one string.

	Pages are taken in the order of ``ocr_response.pages`` and consecutive
	pages are separated by a blank line. No pages yields an empty string.
	"""
	page_separator = "\n\n"
	return page_separator.join([page.markdown for page in ocr_response.pages])


def correct_text_with_ai(text: str, api_key: str) -> str:
	"""Send OCR-produced Markdown to the chat model for proofreading.

	Args:
		text: The Markdown text, sent as the user message.
		api_key: Mistral API key used to construct the client.

	Returns:
		The message content of the first choice in the chat completion, or a
		string starting with ``"ERROR: "`` if the client cannot be constructed.

	Raises:
		ValueError: If ``api_key`` is empty.
	"""
	if not api_key:
		raise ValueError("Mistral API Key is required.")

	try:
		mistral = Mistral(api_key=api_key)
	except Exception as exc:
		return "ERROR: " + str(exc)

	# NOTE: the leading tabs on the prompt's continuation lines are part of the
	# string literal and are sent to the model as-is.
	system_message = {
		"role": "system",
		"content": """You are an expert proofreader specializing in Markdown formatting and OCR error correction. Your task is to meticulously review provided Markdown text that has been generated via OCR.
					Your primary goal is to identify and correct **typographical errors, spelling mistakes, and redundant symbols** that are clearly a result of the OCR process.
					Additionally, you must correct any illogical or jumbled line breaks to ensure proper Markdown paragraph formatting.

					**Crucially, you must NOT alter the original meaning or content of the text.** Your corrections should be limited to:
					* Obvious OCR-induced spelling errors
					* Erroneous or redundant symbols
					* Markdown formatting errors
					* Jumbled or incorrect line breaks for proper paragraphing

					After your thorough review, output the carefully corrected Markdown text. JUST the text.""",
	}
	user_message = {"role": "user", "content": text}

	completion = mistral.chat.complete(
		model=CHAT_MODEL,
		messages=[system_message, user_message],
		temperature=0.1,
	)
	first_choice = completion.choices[0]
	return first_choice.message.content


def perform_raw_ocr(input_file, api_key: str):
	"""Return the text of an input file, running OCR when it is not plain text.

	Args:
		input_file: A filesystem path (``str`` or ``os.PathLike``), or an
			object exposing the path as its ``.name`` attribute. ``None``
			means no file was provided.
		api_key: Mistral API key forwarded to ``ocr_from_file``.

	Returns:
		``"File/Text not found"`` when ``input_file`` is ``None``; the UTF-8
		content of the file when its extension is ``txt``; otherwise the
		combined markdown of the OCR response (``pdf`` files use PDF mode,
		every other extension uses image mode).
	"""
	import os  # function-scope import keeps this block self-contained

	if input_file is None:
		return "File/Text not found"

	# Resolve the real path once and use it everywhere. The original took the
	# extension from `.name` but opened the object itself, which failed for
	# plain string paths and for file-like wrappers.
	if isinstance(input_file, (str, os.PathLike)):
		path = os.fspath(input_file)
	else:
		path = input_file.name

	# splitext only inspects the final path component, so dots in directory
	# names or extension-less files are not mistaken for an extension.
	extension = os.path.splitext(path)[1].lstrip(".").lower()

	if extension == "txt":
		with open(path, "r", encoding="utf-8") as text_file:
			return text_file.read()

	file_type = "pdf" if extension == "pdf" else "image"
	response = ocr_from_file(path, api_key, file_type)
	return get_combined_markdown(response)