SMART_KYC_OCR

Sleeping

App Files Files Community

SMART_KYC_OCR / utils.py

gopichandra

Update utils.py

b07dfbb verified about 1 month ago

raw

history blame

4.92 kB

	from paddleocr import PaddleOCR
	import re

	# Initialize the OCR engine once, at import time, using the English model.
	# The model download happens the first time it is used.
	# To support another language, change lang='en' to a different code
	# (for example 'hi' or 'mr'), or switch to a multi-language model.
	ocr = PaddleOCR(use_angle_cls=True, lang='en')

	def _extract_dob(lines):
	"""
	Try multiple formats:
	- dd/mm/yyyy \| dd-mm-yyyy \| dd.mm.yyyy
	- yyyy-mm-dd
	- Year of Birth lines (YOB / YEAR / BIRTH)
	"""
	# dd{sep}mm{sep}yyyy
	for line in lines:
	m = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
	if m:
	return m.group(0)

	# yyyy-mm-dd
	for line in lines:
	m = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
	if m:
	return m.group(0)

	# Year only if labeled as YOB/Year/Birth
	for line in lines:
	m = re.search(r'\b(19\|20)\d{2}\b', line)
	if m and any(lbl in line.upper() for lbl in ["YOB", "YEAR", "BIRTH"]):
	return m.group(0)

	return "Not found"

	def _ocr_text_lines(result):
	    """Flatten an OCR result into a list of non-empty, stripped text lines."""
	    texts = []
	    # The result may be None, or contain None pages, when no text was
	    # detected; skip those instead of failing while iterating.
	    for page in result or []:
	        if not page:
	            continue
	        for entry in page:
	            # Each entry is indexed as entry[1][0] to obtain its text.
	            text = entry[1][0].strip()
	            if text:
	                texts.append(text)
	    return texts


	def _extract_pan_name(lines):
	    """Return the first name-like line after the PAN card header, else "Not found"."""
	    for idx, line in enumerate(lines):
	        if "INCOME TAX DEPARTMENT" not in line.upper():
	            continue
	        for following in lines[idx + 1:]:
	            candidate = following.strip()
	            # Upper-case letters, whitespace and dots only (so no digits),
	            # at least 3 characters, and not another header line.
	            if (
	                len(candidate) >= 3
	                and re.match(r'^[A-Z\s.]+$', candidate)
	                and not any(x in candidate.upper() for x in ["INDIA", "GOVT", "DEPARTMENT"])
	            ):
	                return candidate
	        # Only the first header occurrence is considered.
	        break
	    return "Not found"


	def _extract_gender(lines):
	    """Return the gender label from the first line that mentions one, else "Not found"."""
	    for line in lines:
	        upper = line.upper()
	        # Order matters: "MALE" is a substring of "FEMALE".
	        for label in ("TRANSGENDER", "FEMALE", "MALE"):
	            if label in upper:
	                return label
	    return "Not found"


	def _is_aadhaar_name_candidate(text):
	    """Return True if *text* has no digits, at least two words and no label keyword."""
	    return (
	        not re.search(r'\d', text)
	        and len(text.split()) >= 2
	        and not any(x in text.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
	    )


	def _extract_aadhaar_name(lines):
	    """Return the holder name guessed from Aadhaar text lines, else "Not found"."""
	    # Preferred: the line immediately before a dd/mm/yyyy-style date line.
	    for idx, line in enumerate(lines):
	        if idx > 0 and re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line):
	            candidate = lines[idx - 1].strip()
	            if _is_aadhaar_name_candidate(candidate):
	                return candidate
	    # Fallback: the first line anywhere that looks like a name.
	    for line in lines:
	        candidate = line.strip()
	        if _is_aadhaar_name_candidate(candidate):
	            return candidate
	    return "Not found"


	def extract_kyc_fields(file_path: str) -> dict:
	    """Run OCR on a document image and extract PAN or Aadhaar fields.

	    The card type is decided from the recognised text: a PAN-number pattern
	    (five letters, four digits, one letter) takes precedence over a 12-digit
	    Aadhaar-number pattern.

	    Args:
	        file_path: path of the image passed to the module-level ``ocr`` engine.

	    Returns:
	        One of the following dicts:

	        - PAN: ``card_type``, ``pan_number``, ``dob``, ``name``
	        - Aadhaar: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``, ``name``
	        - unrecognised document: ``card_type`` = "UNKNOWN" and ``error``
	        - any exception during processing: ``error`` only

	        Fields that cannot be located hold the string "Not found".
	    """
	    try:
	        # OCR text extraction, flattened to plain text lines
	        result = ocr.ocr(file_path, cls=True)
	        lines = _ocr_text_lines(result)
	        full_text = "\n".join(lines)

	        # Detect card type by patterns
	        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
	        aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)

	        if pan_match:
	            return {
	                "card_type": "PAN",
	                "pan_number": pan_match.group(0),
	                "dob": _extract_dob(lines),
	                "name": _extract_pan_name(lines),
	            }

	        if aadhaar_match:
	            return {
	                "card_type": "AADHAAR",
	                "aadhaar_number": aadhaar_match.group(0),
	                "dob": _extract_dob(lines),
	                "gender": _extract_gender(lines),
	                "name": _extract_aadhaar_name(lines),
	            }

	        return {
	            "card_type": "UNKNOWN",
	            "error": "Could not identify document as PAN or Aadhaar."
	        }

	    except Exception as e:
	        # Best-effort boundary: any failure is reported to the caller as an
	        # error dict rather than raised.
	        return {"error": f"OCR processing failed: {e}"}