gopichandra committed on
Commit
7147400
·
verified ·
1 Parent(s): a4d0a87

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +156 -68
utils.py CHANGED
@@ -1,17 +1,44 @@
1
- from paddleocr import PaddleOCR
2
  import re
 
 
 
3
 
4
- # Initialize OCR
 
 
 
5
  ocr = PaddleOCR(use_angle_cls=True, lang='en')
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def extract_kyc_fields(file_path, force_type=None):
 
 
 
 
 
 
 
8
  try:
9
  result = ocr.ocr(file_path, cls=True)
10
-
11
  lines = []
 
12
  for block in result:
13
  for line in block:
14
- text = line[1][0].strip()
15
  if text:
16
  lines.append(text)
17
 
@@ -20,96 +47,157 @@ def extract_kyc_fields(file_path, force_type=None):
20
  if force_type:
21
  card_type = force_type.upper()
22
  else:
23
- pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
24
- aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
25
  card_type = "UNKNOWN"
26
- if pan_match:
27
  card_type = "PAN"
28
- elif aadhaar_match:
29
  card_type = "AADHAAR"
30
 
31
  response = {"card_type": card_type}
32
 
33
  if card_type == "PAN":
34
- pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
35
- if pan_match:
36
- response["pan_number"] = pan_match.group(0)
37
- response["dob"] = extract_dob(lines)
38
- response["name"] = extract_pan_name(lines)
39
 
40
  elif card_type == "AADHAAR":
41
- aadhaar_match = re.search(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text)
42
- if aadhaar_match:
43
- response["aadhaar_number"] = aadhaar_match.group(0)
44
- response["dob"] = extract_dob(lines)
45
- response["gender"] = extract_gender(lines)
46
- response["name"] = extract_aadhaar_name(lines)
47
 
48
  else:
49
  response["error"] = "Could not identify document as PAN or Aadhaar."
 
 
 
50
 
51
  return response
52
  except Exception as e:
53
  return {"error": f"OCR processing failed: {str(e)}"}
54
 
 
 
 
55
 
56
- def extract_dob(lines):
57
- dob = "Not found"
58
- for line in lines:
59
- match = re.search(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b', line)
60
- if match:
61
- return match.group(0)
62
- for line in lines:
63
- match = re.search(r'\b\d{4}-\d{2}-\d{2}\b', line)
64
- if match:
65
- return match.group(0)
66
  for line in lines:
67
- match = re.search(r'\b(19|20)\d{2}\b', line)
68
- if match and any(label in line.upper() for label in ["YOB", "YEAR", "BIRTH"]):
69
- return match.group(0)
70
- return dob
71
-
72
-
73
- def extract_gender(lines):
74
  for line in lines:
75
- if "MALE" in line.upper():
76
- return "MALE"
77
- elif "FEMALE" in line.upper():
78
- return "FEMALE"
79
- elif "TRANSGENDER" in line.upper():
80
- return "TRANSGENDER"
81
  return "Not found"
82
 
83
-
84
- def extract_pan_name(lines):
85
- for i in range(len(lines)):
86
- if "INCOME TAX DEPARTMENT" in lines[i].upper():
87
  for j in range(i + 1, len(lines)):
88
- possible = lines[j].strip()
89
- if (
90
- re.match(r'^[A-Z\s.]+$', possible)
91
- and not any(x in possible for x in ["INDIA", "GOVT", "DEPARTMENT"])
92
- and not re.search(r'\d', possible)
93
- ):
94
- return possible.strip()
95
  return "Not found"
96
 
97
-
98
- def extract_aadhaar_name(lines):
99
  for i, line in enumerate(lines):
100
- if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line) and i > 0:
101
- possible_name = lines[i - 1].strip()
102
- if (
103
- not re.search(r'\d', possible_name)
104
- and len(possible_name.split()) >= 2
105
- and not any(x in possible_name.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
106
- ):
107
- return possible_name
 
 
 
 
108
  for line in lines:
109
- if (
110
- not re.search(r'\d', line)
111
- and len(line.split()) >= 2
112
- and not any(x in line.upper() for x in ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"])
113
- ):
114
  return line.strip()
115
  return "Not found"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
  import re
3
+ from datetime import datetime
4
+ from simple_salesforce import Salesforce
5
+ from paddleocr import PaddleOCR
6
 
7
# -----------------------------------
# OCR SETUP
# -----------------------------------
# NOTE(review): paddleocr is imported before this line runs; OpenMP runtimes
# may read OMP_NUM_THREADS when the native library is loaded, so this default
# might need to be set before the paddleocr import to take effect -- confirm.
os.environ.setdefault("OMP_NUM_THREADS", "1")  # limit threads for stability

# Shared module-level OCR engine, created once at import time and reused by
# extract_kyc_fields().
ocr = PaddleOCR(use_angle_cls=True, lang='en')
12
 
13
# Regex patterns
# PAN: five capital letters, four digits, one capital letter.
PAN_REGEX = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
# Aadhaar: twelve digits in groups of four, optionally separated by
# whitespace or a hyphen.
AADHAAR_REGEX = r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'
# Date-of-birth patterns. Order matters: _extract_dob() treats every entry
# except the last as a "full date" pattern and the LAST entry as the
# year-only fallback, so keep the year-only pattern at the end.
DOB_REGEXES = [
    r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
    r'\b\d{4}-\d{2}-\d{2}\b',
    r'\b\d{2}[./-](JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)[./-]\d{4}\b',
    r'\b(19|20)\d{2}\b',
]
GENDERS = ["MALE", "FEMALE", "TRANSGENDER"]  # kept for completeness (not stored)
23
+
24
+ # -----------------------------------
25
+ # OCR HELPERS
26
+ # -----------------------------------
27
  def extract_kyc_fields(file_path, force_type=None):
28
+ """
29
+ Returns a dict with:
30
+ card_type: PAN | AADHAAR | UNKNOWN
31
+ pan_number / aadhaar_number
32
+ name (best-guess)
33
+ dob (best-guess for the detected card)
34
+ """
35
  try:
36
  result = ocr.ocr(file_path, cls=True)
 
37
  lines = []
38
+
39
  for block in result:
40
  for line in block:
41
+ text = re.sub(r'\s+', ' ', line[1][0].strip())
42
  if text:
43
  lines.append(text)
44
 
 
47
  if force_type:
48
  card_type = force_type.upper()
49
  else:
 
 
50
  card_type = "UNKNOWN"
51
+ if re.search(PAN_REGEX, full_text):
52
  card_type = "PAN"
53
+ elif re.search(AADHAAR_REGEX, full_text):
54
  card_type = "AADHAAR"
55
 
56
  response = {"card_type": card_type}
57
 
58
  if card_type == "PAN":
59
+ response["pan_number"] = _first_match(PAN_REGEX, full_text) or "Not found"
60
+ response["dob"] = _extract_dob(lines)
61
+ response["name"] = _extract_pan_name(lines)
 
 
62
 
63
  elif card_type == "AADHAAR":
64
+ response["aadhaar_number"] = _first_match(AADHAAR_REGEX, full_text) or "Not found"
65
+ response["dob"] = _extract_dob(lines)
66
+ response["name"] = _extract_aadhaar_name(lines)
 
 
 
67
 
68
  else:
69
  response["error"] = "Could not identify document as PAN or Aadhaar."
70
+ # best-effort generic fields
71
+ response["dob"] = _extract_dob(lines)
72
+ response["name"] = _extract_generic_name(lines)
73
 
74
  return response
75
  except Exception as e:
76
  return {"error": f"OCR processing failed: {str(e)}"}
77
 
78
def _first_match(pattern, text, flags=0):
    """Return the first substring of *text* that matches *pattern*.

    Args:
        pattern: Regular expression (string or compiled pattern) to search for.
        text: Text to search.
        flags: Optional ``re`` flags passed through to ``re.search``.

    Returns:
        The whole matched substring (group 0), or ``None`` when nothing matches.
    """
    m = re.search(pattern, text, flags)
    return m.group(0) if m else None
81
 
82
def _extract_dob(lines):
    """Return the first date-of-birth-like string found in the OCR lines.

    Two passes are made over *lines*:

    1. Every pattern in ``DOB_REGEXES`` except the last is tried on each line
       (case-insensitively); the first hit wins.
    2. If no full date was found, the last pattern in ``DOB_REGEXES`` (a bare
       year) is accepted, but only on a line that also contains one of the
       labels YOB / YEAR / BIRTH / DOB.

    Args:
        lines: Iterable of text lines.

    Returns:
        The matched text exactly as it appears in the line, or the string
        ``"Not found"`` when nothing matches.
    """
    # Try common formats
    for line in lines:
        for pattern in DOB_REGEXES[:-1]:
            m = re.search(pattern, line, re.IGNORECASE)
            if m:
                return m.group(0)

    # Year-only with labels: a bare year is too ambiguous on its own, so it
    # must share a line with a birth-related label.
    labels = ("YOB", "YEAR", "BIRTH", "DOB")
    for line in lines:
        m = re.search(DOB_REGEXES[-1], line)
        if m and any(lbl in line.upper() for lbl in labels):
            return m.group(0)

    return "Not found"
95
 
96
def _extract_pan_name(lines):
    """Best-guess the holder name from the OCR lines of a PAN card.

    Finds a line containing "INCOME TAX DEPARTMENT" (case-insensitive) and
    returns the first later line that consists only of capital letters,
    whitespace and dots and does not contain INDIA, GOVT or DEPARTMENT.

    Args:
        lines: List of text lines.

    Returns:
        The stripped candidate line, or ``"Not found"``.
    """
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    for i, line in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in line.upper():
            continue
        for j in range(i + 1, len(lines)):
            candidate = lines[j].strip()
            # The character class already excludes digits, so no separate
            # digit check is needed.
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            if any(word in candidate.upper() for word in header_words):
                continue
            return candidate
    return "Not found"
105
 
106
def _extract_aadhaar_name(lines):
    """Best-guess the holder name from the OCR lines of an Aadhaar card.

    Heuristic: the name usually sits on the line directly above the date of
    birth. For every line matching any pattern in ``DOB_REGEXES``
    (case-insensitive), the preceding line is returned if ``_looks_like_name``
    accepts it. If that yields nothing, the first line anywhere that looks
    like a name is returned.

    Args:
        lines: List of text lines.

    Returns:
        The stripped candidate line, or ``"Not found"``.
    """
    # Pair each line with its predecessor; the first line has no predecessor
    # and therefore never takes part as the "DOB line".
    for previous, current in zip(lines, lines[1:]):
        if any(re.search(p, current, re.IGNORECASE) for p in DOB_REGEXES):
            candidate = previous.strip()
            if _looks_like_name(candidate):
                return candidate

    # Fallback: first name-like line anywhere in the document.
    for line in lines:
        if _looks_like_name(line.strip()):
            return line.strip()
    return "Not found"
119
+
120
def _extract_generic_name(lines):
    """Return the first line that ``_looks_like_name`` accepts.

    Used as a best-effort fallback when the document type is unknown.

    Args:
        lines: Iterable of text lines.

    Returns:
        The stripped line, or ``"Not found"`` when no line qualifies.
    """
    stripped = (line.strip() for line in lines)
    return next((text for text in stripped if _looks_like_name(text)), "Not found")
125
+
126
def _looks_like_name(text):
    """Return True when *text* plausibly is a person's name.

    A candidate is rejected when it contains any digit, has fewer than two
    whitespace-separated words, or contains one of the card-label words
    DOB, INDIA, MALE, FEMALE or GOVERNMENT (case-insensitive).

    Label words are matched as whole words. Plain substring matching would
    reject genuine names that merely contain a label, e.g. "Kamalesh" or
    "Malek" (both contain "MALE").

    Args:
        text: A single, already stripped line of text.

    Returns:
        bool: True if the line passes all checks.
    """
    if re.search(r'\d', text):
        return False
    if len(text.split()) < 2:
        return False
    banned = r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b'
    return re.search(banned, text.upper()) is None
133
+
134
+ # -----------------------------------
135
+ # SALESFORCE HELPERS
136
+ # -----------------------------------
137
# Salesforce credentials are read from the environment once, at import time.
# Missing variables fall back to an empty string (or "login" for the domain).
SF_USERNAME = os.getenv("SF_USERNAME", "")
SF_PASSWORD = os.getenv("SF_PASSWORD", "")
SF_TOKEN = os.getenv("SF_TOKEN", "")
SF_DOMAIN = os.getenv("SF_DOMAIN", "login")  # "login"=prod, "test"=sandbox
141
+
142
def connect_salesforce():
    """Open a Salesforce session using the module-level SF_* settings.

    Returns:
        A ``Salesforce`` client on success, or ``None`` when credentials are
        missing or the login fails. Failures are printed, never raised.
    """
    # Without a username and password the login cannot succeed, so skip the
    # network round-trip. The security token is not checked here because
    # SF_TOKEN legitimately defaults to an empty string.
    if not (SF_USERNAME and SF_PASSWORD):
        print("❌ Salesforce login failed:", "SF_USERNAME / SF_PASSWORD are not set")
        return None

    try:
        sf = Salesforce(
            username=SF_USERNAME,
            password=SF_PASSWORD,
            security_token=SF_TOKEN,
            domain=SF_DOMAIN,
        )
        print(f"✅ Connected to Salesforce ({SF_DOMAIN})")
        return sf
    except Exception as e:
        # Deliberate best-effort: callers check for None instead of handling
        # authentication exceptions.
        print("❌ Salesforce login failed:", e)
        return None
155
+
156
def create_kyc_record(sf, kyc_data, file_name=None, agent_id=None):
    """Create a ``KYC_Record__c`` record from extracted KYC fields.

    The record carries the fields
        Aadhaar_Name__c, Aadhaar_DOB__c, Aadhaar_Number__c
        Pan_Name__c, Pan_DOB__c, PAN_Number__c
    and optionally ``Agent__c`` when *agent_id* is given (that field must
    exist in the target org).

    Args:
        sf: Connected Salesforce client (as returned by connect_salesforce),
            or a falsy value when no connection is available.
        kyc_data: Dict with ``card_type`` plus ``name``, ``dob`` and
            ``pan_number`` / ``aadhaar_number``. The placeholder value
            ``"Not found"`` is stored as an empty string.
        file_name: Currently unused; kept for callers that pass it.
        agent_id: Optional value for the ``Agent__c`` field.

    Returns:
        ``{"status": "success", "record_id": ...}`` on success, otherwise
        ``{"status": "error", "message": ...}``. Exceptions are never raised.
    """
    try:
        if not sf:
            return {"status": "error", "message": "Salesforce not connected"}

        def val_or_blank(key):
            # Missing / falsy values and the exact "Not found" placeholder
            # become "". Compare the whole value rather than using
            # str.replace, so real data that merely contains the phrase is
            # left intact, and coerce to str so non-string values do not
            # raise.
            value = kyc_data.get(key) or ""
            text = str(value)
            return "" if text.strip() == "Not found" else text

        record = {
            "Aadhaar_Name__c": "",
            "Aadhaar_DOB__c": "",
            "Aadhaar_Number__c": "",
            "Pan_Name__c": "",
            "Pan_DOB__c": "",
            "PAN_Number__c": "",
        }

        ct = (kyc_data.get("card_type") or "").upper()
        if ct == "AADHAAR":
            record["Aadhaar_Name__c"] = val_or_blank("name")
            record["Aadhaar_DOB__c"] = val_or_blank("dob")
            record["Aadhaar_Number__c"] = val_or_blank("aadhaar_number")
        elif ct == "PAN":
            record["Pan_Name__c"] = val_or_blank("name")
            record["Pan_DOB__c"] = val_or_blank("dob")
            record["PAN_Number__c"] = val_or_blank("pan_number")
        else:
            # Unknown: best effort — fill name/dob into Aadhaar side to avoid losing data
            record["Aadhaar_Name__c"] = val_or_blank("name")
            record["Aadhaar_DOB__c"] = val_or_blank("dob")

        # Optionally include Agent__c if provided (and exists in your org)
        if agent_id:
            record["Agent__c"] = agent_id

        # Optionally store file name in a text field if you have one:
        # record["KYC_File_Name__c"] = file_name or ""

        resp = sf.KYC_Record__c.create(record)
        return {"status": "success", "record_id": resp.get("id")}
    except Exception as e:
        # Report every failure to the caller as a status dict.
        return {"status": "error", "message": str(e)}