File size: 7,111 Bytes
7147400
c70099c
7147400
 
 
c70099c
7147400
 
 
 
5ebcb93
ae2e698
7147400
 
 
 
 
 
 
 
 
 
 
 
 
 
254fdf9
7147400
 
 
 
 
 
 
a8683a1
ae2e698
8324e53
7147400
8324e53
 
7147400
8324e53
 
ae2e698
8324e53
ae2e698
254fdf9
 
b07dfbb
254fdf9
7147400
254fdf9
7147400
254fdf9
a726fb2
 
 
2c3e33d
7147400
 
 
2c3e33d
254fdf9
7147400
 
 
a726fb2
254fdf9
 
7147400
 
 
254fdf9
a726fb2
a8683a1
8324e53
254fdf9
7147400
 
 
254fdf9
7147400
 
254fdf9
7147400
 
 
 
 
65bef46
7147400
 
 
65bef46
 
7147400
 
 
65bef46
7147400
 
 
 
65bef46
 
7147400
 
65bef46
7147400
 
 
 
 
 
 
 
 
 
 
 
65bef46
7147400
65bef46
 
7147400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import os
import re
from datetime import datetime
from simple_salesforce import Salesforce
from paddleocr import PaddleOCR

# -----------------------------------
# OCR SETUP
# -----------------------------------
# Default OMP_NUM_THREADS to "1" unless the environment already defines it.
# NOTE(review): this statement runs after `paddleocr` has been imported; if
# the OpenMP runtime reads the variable at import time, the default may be
# applied too late to have any effect -- confirm.
os.environ.setdefault("OMP_NUM_THREADS", "1")  # limit threads for stability
# One shared OCR engine, constructed at import time and used by
# extract_kyc_fields().
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Regex patterns
# PAN number: five capital letters, four digits, one capital letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar number: twelve digits in three groups of four, each group optionally
# separated by a single whitespace character or hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth patterns. Order matters: _extract_dob() treats every entry
# except the last as a complete date, and the last one (a bare 19xx/20xx year)
# as a fallback that is only accepted on a line carrying a birth-related label.
DOB_REGEXES = [
    # two digits, two digits, four digits, separated by '.', '/' or '-'
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
    # four digits, two digits, two digits, separated by '-'
    r'\b\d{4}-\d{2}-\d{2}\b',
    # two digits, three-letter month abbreviation, four digits
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
    # year only (19xx or 20xx) -- must stay last
    r'\b(19|20)\d{2}\b'
]
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]  # kept for completeness (not stored)

# -----------------------------------
# OCR HELPERS
# -----------------------------------
def extract_kyc_fields(file_path, force_type=None):
    """
    Run OCR on a document image and extract PAN / Aadhaar KYC fields.

    Args:
        file_path: Image input handed unchanged to ``ocr.ocr``.
        force_type: Optional card type (e.g. "PAN" or "AADHAAR", any case).
            When given, auto-detection is skipped and the upper-cased value
            is used as the card type.

    Returns a dict with:
      card_type: PAN | AADHAAR | UNKNOWN (or the upper-cased ``force_type``)
      pan_number / aadhaar_number: only for the matching card type
      name (best-guess)
      dob  (best-guess for the detected card)
      error: set when the document is neither PAN nor Aadhaar
    Fields that cannot be located hold the string "Not found".
    If OCR processing raises, the dict contains only ``error``.
    """
    try:
        result = ocr.ocr(file_path, cls=True)
        lines = _collect_ocr_lines(result)
        full_text = "\n".join(lines)

        if force_type:
            card_type = force_type.upper()
        else:
            # PAN is checked first; Aadhaar is only considered when no PAN
            # number is present anywhere in the text.
            card_type = "UNKNOWN"
            if re.search(PAN_REGEX, full_text):
                card_type = "PAN"
            elif re.search(AADHAAR_REGEX, full_text):
                card_type = "AADHAAR"

        response = {"card_type": card_type}

        if card_type == "PAN":
            response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_pan_name(lines)

        elif card_type == "AADHAAR":
            response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_aadhaar_name(lines)

        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
            # best-effort generic fields
            response["dob"] = _extract_dob(lines)
            response["name"] = _extract_generic_name(lines)

        return response
    except Exception as e:
        # Deliberate catch-all: callers receive an error payload, never an exception.
        return {"error": f"OCR processing failed: {str(e)}"}


def _collect_ocr_lines(result):
    """Flatten an OCR result into whitespace-normalised, non-empty text lines."""
    lines = []
    for block in result or []:
        # A page on which no text was detected is reported as None instead of
        # an empty list; skip it so a blank image falls through to the normal
        # "UNKNOWN" handling rather than failing on iteration.
        if not block:
            continue
        for entry in block:
            # entry[1][0] holds the recognised text for this detection.
            text = re.sub(r'\s+', ' ', entry[1][0].strip())
            if text:
                lines.append(text)
    return lines

def _first_match(pattern, text, flags=0):
    """Return the text of the first match of *pattern* in *text*, or None."""
    found = re.search(pattern, text, flags)
    if found is None:
        return None
    return found.group(0)

def _extract_dob(lines):
    """Return the first date-of-birth string found in *lines*, or "Not found".

    Pass 1 scans the lines in order and returns the first match of any
    full-date pattern (every DOB_REGEXES entry except the last), matched
    case-insensitively. Pass 2 accepts a bare year (the last DOB_REGEXES
    entry) only on a line that also contains one of the labels
    YOB / YEAR / BIRTH / DOB.
    """
    full_date_patterns = DOB_REGEXES[:-1]
    for text in lines:
        attempts = (re.search(pat, text, re.IGNORECASE) for pat in full_date_patterns)
        hit = next((found for found in attempts if found), None)
        if hit is not None:
            return hit.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH", "DOB")
    for text in lines:
        year = re.search(DOB_REGEXES[-1], text)
        if year is None:
            continue
        shouted = text.upper()
        if any(label in shouted for label in birth_labels):
            return year.group(0)

    return "Not found"

def _extract_pan_name(lines):
    """Return the best-guess holder name from PAN card text, or "Not found".

    Looks for a line containing "INCOME TAX DEPARTMENT" (any case) and returns
    the first later line that consists only of capital letters, whitespace and
    dots and does not contain one of the header words INDIA / GOVT /
    DEPARTMENT.
    """
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    for position, text in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in text.upper():
            continue
        for following in lines[position + 1:]:
            candidate = following.strip()
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            if re.search(r'\d', candidate):
                continue
            if any(word in candidate.upper() for word in header_words):
                continue
            return candidate
    return "Not found"

def _extract_aadhaar_name(lines):
    """Return the best-guess holder name from Aadhaar card text, or "Not found".

    Prefers the line directly above any line matching a DOB_REGEXES pattern,
    provided it passes _looks_like_name(). Otherwise falls back to the first
    line anywhere that passes the check.
    """
    # Heuristic: the name usually sits on the line just above the DOB.
    for above, current in zip(lines, lines[1:]):
        has_date = any(re.search(pat, current, re.IGNORECASE) for pat in DOB_REGEXES)
        if not has_date:
            continue
        candidate = above.strip()
        if _looks_like_name(candidate):
            return candidate

    # Fallback: first name-like line in reading order.
    stripped_lines = (text.strip() for text in lines)
    return next((text for text in stripped_lines if _looks_like_name(text)), "Not found")

def _extract_generic_name(lines):
    """Return the first stripped line that passes _looks_like_name(), or "Not found"."""
    stripped_lines = (text.strip() for text in lines)
    return next((text for text in stripped_lines if _looks_like_name(text)), "Not found")

def _looks_like_name(text):
    """Heuristically decide whether an OCR line could be a person's name.

    A line is rejected when it contains a digit, has fewer than two
    whitespace-separated words, contains the header words INDIA or GOVERNMENT
    anywhere, or contains DOB / MALE / FEMALE as a whole word.

    Args:
        text: A single (already stripped) OCR line.

    Returns:
        True if the line passes every check, otherwise False.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False

    upper = text.upper()
    # Header words stay substring matches: OCR frequently glues them to their
    # neighbours (e.g. "OFINDIA"), and they are unlikely inside a real name.
    if any(word in upper for word in ("INDIA", "GOVERNMENT")):
        return False
    # Field labels are matched as whole words only, so genuine names that
    # merely contain them (e.g. "KAMALESH", "AMALENDU") are not discarded.
    return re.search(r'\b(?:DOB|MALE|FEMALE)\b', upper) is None

# -----------------------------------
# SALESFORCE HELPERS
# -----------------------------------
# Credentials are read from the environment once, when the module is imported.
# Missing values default to an empty string, except the domain.
SF_USERNAME = os.environ.get("SF_USERNAME", "")
SF_PASSWORD = os.environ.get("SF_PASSWORD", "")
SF_TOKEN = os.environ.get("SF_TOKEN", "")
# "login"=prod, "test"=sandbox
SF_DOMAIN = os.environ.get("SF_DOMAIN", "login")

def connect_salesforce():
    """Log in to Salesforce using the module-level SF_* settings.

    Returns:
        The connected ``Salesforce`` client, or ``None`` when the login
        raised. The outcome is printed either way; failures are never
        re-raised to the caller.
    """
    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_TOKEN,
            domain=SF_DOMAIN
        )
        # The check mark was previously stored as mis-decoded bytes.
        print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
        return sf
    except Exception as e:
        # Best-effort by design: report the failure and let the caller check for None.
        print("❌ Salesforce login failed:", e)
        return None

def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """
    Create a KYC_Record__c row from extracted KYC data.

    The record always carries these fields (blank unless filled below):
      Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
      Pan_Name__c,    Pan_DOB__c,    PAN_Number__c
    Agent__c is added only when agent_id is truthy.

    Args:
        sf: Connected Salesforce client; a falsy value yields an error result.
        kyc_data: Dict with card_type, name, dob and pan_number/aadhaar_number.
        file_name: Currently unused.
        agent_id: Optional value for the Agent__c field.

    Returns:
        {"status": "success", "record_id": ...} on success, otherwise
        {"status": "error", "message": ...}. Exceptions are never propagated.
    """
    # KYC key -> Salesforce field, per card type.
    fields_by_card = {
        "AADHAAR": (
            ("Aadhaar_Name__c", "name"),
            ("Aadhaar_DOB__c", "dob"),
            ("Aadhaar_Number__c", "aadhaar_number"),
        ),
        "PAN": (
            ("Pan_Name__c", "name"),
            ("Pan_DOB__c", "dob"),
            ("PAN_Number__c", "pan_number"),
        ),
    }
    # Unknown card type: best effort — keep name/dob on the Aadhaar side so
    # the data is not lost.
    fallback_fields = (
        ("Aadhaar_Name__c", "name"),
        ("Aadhaar_DOB__c", "dob"),
    )

    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        def cleaned(key):
            # Missing/None values become "", and the OCR placeholder is stripped out.
            raw = kyc_data.get(key) or ""
            return raw.replace("Not found", "")

        payload = dict.fromkeys(
            (
                "Aadhaar_Name__c",
                "Aadhaar_DOB__c",
                "Aadhaar_Number__c",
                "Pan_Name__c",
                "Pan_DOB__c",
                "PAN_Number__c",
            ),
            "",
        )

        card_type = (kyc_data.get("card_type") or "").upper()
        for sf_field, kyc_key in fields_by_card.get(card_type, fallback_fields):
            payload[sf_field] = cleaned(kyc_key)

        # Only send Agent__c when a value was supplied (the field must exist in the org).
        if agent_id:
            payload["Agent__c"] = agent_id

        # A file-name field could be stored here if the org defines one:
        # payload["KYC_File_Name__c"] = file_name or ""

        created = sf.KYC_Record__c.create(payload)
        return {"status": "success", "record_id": created.get("id")}
    except Exception as e:
        return {"status": "error", "message": str(e)}