File size: 3,509 Bytes
ae2e698
c70099c
 
73f6d86
ae2e698
a8683a1
 
ae2e698
 
8324e53
 
 
 
 
 
ae2e698
8324e53
ae2e698
2c3e33d
a726fb2
 
 
 
 
 
 
 
 
 
 
 
2c3e33d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a726fb2
 
2c3e33d
a726fb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c3e33d
a726fb2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae2e698
a726fb2
a8683a1
 
8324e53
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from paddleocr import PaddleOCR
import re

# Module-level OCR engine, constructed once when this module is imported and
# reused by every extract_kyc_fields() call. Configured with
# use_angle_cls=True and lang='en'.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Patterns are compiled once at import so repeated calls reuse them.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')
_DOB_RE = re.compile(r'\d{2}[/-]\d{2}[/-]\d{4}')
# A single search reports the leftmost keyword, so "FEMALE" is never
# misread as the "MALE" substring it contains.
_GENDER_RE = re.compile(r'FEMALE|MALE')
_PAN_NAME_CHARS_RE = re.compile(r'[A-Z\s.]+')
_NOT_FOUND = "Not found"


def _collect_lines(result) -> list:
    """Flatten an OCR result into a list of non-empty, stripped text lines."""
    lines = []
    # Some PaddleOCR versions appear to return None (for the whole result or
    # for a page) when no text is detected; treat that as "no lines".
    for block in result or []:
        for entry in block or []:
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def _find_dob(lines) -> str:
    """Return the first DD/MM/YYYY or DD-MM-YYYY date found, else "Not found"."""
    for line in lines:
        match = _DOB_RE.search(line)
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_gender(lines) -> str:
    """Return "MALE" or "FEMALE" from the first line naming one, else "Not found"."""
    for line in lines:
        match = _GENDER_RE.search(line.upper())
        if match:
            return match.group(0)
    return _NOT_FOUND


def _find_aadhaar_number(lines, full_text):
    """Return the first 12-digit Aadhaar-style number, or None if absent."""
    # Prefer a match inside a single line: on the joined text, \s also
    # matches the newline separator, so a trailing year on one line could
    # be glued onto the digits at the start of the next line.
    for line in lines:
        match = _AADHAAR_RE.search(line)
        if match:
            return match.group(0)
    # Fall back to the joined text so a number split across lines is still
    # detected, as before.
    match = _AADHAAR_RE.search(full_text)
    return match.group(0) if match else None


def _find_pan_name(lines) -> str:
    """Return the first upper-case name-like line after the PAN card header."""
    for index, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for candidate in lines[index + 1:]:
            candidate = candidate.strip()
            if (
                _PAN_NAME_CHARS_RE.fullmatch(candidate)
                # Reject OCR noise made only of dots/whitespace.
                and any(ch.isalpha() for ch in candidate)
                and not any(word in candidate for word in ("INDIA", "DEPARTMENT", "GOVT"))
            ):
                return candidate
        # Only the first header line is considered.
        break
    return _NOT_FOUND


def _find_aadhaar_name(lines) -> str:
    """Return the line directly above the first usable "DOB" line."""
    for index, line in enumerate(lines):
        if index == 0 or "DOB" not in line.upper():
            continue
        candidate = lines[index - 1]
        upper = candidate.upper()
        if any(word in upper for word in ("GOVERNMENT", "MALE", "FEMALE")):
            continue
        if re.search(r'\d', candidate):
            continue
        return candidate.strip()
    return _NOT_FOUND


def _parse_kyc_lines(lines) -> dict:
    """Classify the document from its text lines and extract its fields."""
    full_text = "\n".join(lines)

    # A PAN match takes precedence over an Aadhaar match.
    pan_match = _PAN_RE.search(full_text)
    if pan_match:
        return {
            "card_type": "PAN",
            "pan_number": pan_match.group(0),
            "dob": _find_dob(lines),
            "name": _find_pan_name(lines),
        }

    aadhaar_number = _find_aadhaar_number(lines, full_text)
    if aadhaar_number is not None:
        return {
            "card_type": "AADHAAR",
            "aadhaar_number": aadhaar_number,
            "dob": _find_dob(lines),
            "gender": _find_gender(lines),
            "name": _find_aadhaar_name(lines),
        }

    return {
        "card_type": "UNKNOWN",
        "error": "Unable to determine document type.",
    }


def extract_kyc_fields(file_path, *, ocr_engine=None) -> dict:
    """Run OCR on a KYC document image and extract PAN or Aadhaar fields.

    Args:
        file_path: Image input passed straight to the engine's ``ocr`` method.
        ocr_engine: Optional object exposing ``ocr(file_path, cls=True)`` and
            returning PaddleOCR-style results. Defaults to the module-level
            ``ocr`` instance.

    Returns:
        A dict that always contains ``card_type`` on success:

        * PAN: ``card_type``, ``pan_number``, ``dob``, ``name``.
        * AADHAAR: ``card_type``, ``aadhaar_number``, ``dob``, ``gender``,
          ``name``.
        * UNKNOWN: ``card_type`` and ``error``.

        Fields that cannot be located are set to ``"Not found"``. If OCR or
        parsing raises, the function does not propagate the exception and
        instead returns ``{"error": "OCR processing failed: ..."}``.
    """
    try:
        engine = ocr if ocr_engine is None else ocr_engine
        lines = _collect_lines(engine.ocr(file_path, cls=True))
        return _parse_kyc_lines(lines)
    except Exception as e:
        # Top-level boundary: callers expect an error dict, never an exception.
        return {"error": f"OCR processing failed: {str(e)}"}