WebashalarForML committed on
Commit
d97b3ca
·
verified ·
1 Parent(s): 13fd947

Update utils/spacy.py

Browse files
Files changed (1) hide show
  1. utils/spacy.py +461 -246
utils/spacy.py CHANGED
@@ -1,246 +1,461 @@
1
- import spacy
2
- import logging
3
- import json
4
- from utils.fileTotext import extract_text_based_on_format
5
- import re
6
-
7
def is_valid_email(email):
    """Return True when *email* looks like ``local@domain.tld``.

    Args:
        email: Candidate address. Anything that is not a ``str`` is rejected
            instead of raising.

    Returns:
        bool: True only if the whole string matches the address pattern.
    """
    if not isinstance(email, str):
        return False
    # fullmatch (rather than match + '$') so a trailing newline is rejected.
    email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    return re.fullmatch(email_regex, email) is not None
10
-
11
# Phone-number layouts accepted by is_valid_contact. Every entry is matched
# against the whole (whitespace-stripped) candidate, so no anchors are needed.
_CONTACT_LAYOUTS = (
    # India, flexible separators
    r'\+91[\s\.\-\/]?\(?0?\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\+91[\s\.\-\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\d{5}[\s\-\.\/]?\d{5}',
    r'\+91[\s\.\-\/]?\d{10}',
    r'\+91[\s\.\-\/]?\(?\d{5}\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\+91\s\d{5}-\d{5}',
    r'\+91\s\d{4}-\d{6}',
    r'\+91\s\d{10}',
    r'0\d{2}-\d{7}',
    r'\+91\d{10}',
    # USA / Canada
    r'\+1\s\(\d{3}\)\s\d{3}-\d{4}',
    r'\(\d{3}\)\s\d{3}-\d{4}',
    r'\(\d{3}\)\s\d{3}\s\d{4}',
    r'\(\d{3}\)\s\d{3}\s\d{3}',
    r'\+1\d{10}',
    r'\+1\s\d{3}\s\d{3}\s\d{4}',
    # Bare digit runs (shared by several national formats)
    r'\d{10}',
    r'\d{5}\s\d{5}',
    r'0\d{7}',
    r'0\d{8}',
    r'0\d{9}',
    r'0\d{10}',
    r'0\d{11}',
    r'8\d{10}',
    r'01\d{7}',
    # UK
    r'\+44\s\d{4}\s\d{6}',
    r'\+44\s\d{3}\s\d{3}\s\d{4}',
    r'0\d{4}\s\d{6}',
    r'0\d{3}\s\d{3}\s\d{4}',
    r'\+44\d{10}',
    # Australia
    r'\+61\s\d\s\d{4}\s\d{4}',
    r'0\d\s\d{4}\s\d{4}',
    r'\+61\d{9}',
    # Germany
    r'\+49\s\d{4}\s\d{8}',
    r'\+49\s\d{3}\s\d{7}',
    r'0\d{3}\s\d{8}',
    r'\+49\d{12}',
    r'\+49\d{10}',
    # China
    r'\+86\s\d{3}\s\d{4}\s\d{4}',
    r'0\d{3}\s\d{4}\s\d{4}',
    r'\+86\d{11}',
    # Japan
    r'\+81\s\d\s\d{4}\s\d{4}',
    r'\+81\s\d{2}\s\d{4}\s\d{4}',
    r'\+81\d{10}',
    r'\+81\d{9}',
    # Brazil
    r'\+55\s\d{2}\s\d{5}-\d{4}',
    r'\+55\s\d{2}\s\d{4}-\d{4}',
    r'0\d{2}\s\d{4}\s\d{4}',
    r'\+55\d{11}',
    r'\+55\d{10}',
    # France
    r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
    r'0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
    r'\+33\d{9}',
    # Russia
    r'\+7\s\d{3}\s\d{3}-\d{2}-\d{2}',
    r'8\s\d{3}\s\d{3}-\d{2}-\d{2}',
    r'\+7\d{10}',
    # South Africa
    r'\+27\s\d{2}\s\d{3}\s\d{4}',
    r'0\d{2}\s\d{3}\s\d{4}',
    r'\+27\d{9}',
    # Mexico
    r'\+52\s\d{3}\s\d{3}\s\d{4}',
    r'\+52\s\d{2}\s\d{4}\s\d{4}',
    r'01\s\d{3}\s\d{4}',
    r'\+52\d{10}',
    # Nigeria
    r'\+234\s\d{3}\s\d{3}\s\d{4}',
    r'\+234\d{10}',
    # UAE
    r'\+971\s\d\s\d{3}\s\d{4}',
    r'0\d\s\d{3}\s\d{4}',
    r'\+971\d{8}',
    # Argentina
    r'\+54\s9\s\d{3}\s\d{3}\s\d{4}',
    r'\+54\s\d{1}\s\d{4}\s\d{4}',
    r'0\d{3}\s\d{4}',
    r'\+54\d{10}',
    r'\+54\d{9}',
    r'\+54\s9\s\d{10}',
    # Saudi Arabia
    r'\+966\s\d\s\d{3}\s\d{4}',
    r'\+966\d{8}',
)

# One alternation, compiled once at import time.
_CONTACT_RE = re.compile('|'.join(f'(?:{layout})' for layout in _CONTACT_LAYOUTS))


def is_valid_contact(contact):
    """Return True when *contact* is a phone number in a recognised layout.

    The candidate is stripped of surrounding whitespace and must match one
    of the layouts in ``_CONTACT_LAYOUTS`` in full.

    Args:
        contact: Candidate phone number. Non-string and blank values are
            rejected instead of raising.

    Returns:
        bool: True if the whole candidate matches a known layout.
    """
    if not isinstance(contact, str):
        return False
    candidate = contact.strip()
    if not candidate:
        return False
    # The previous `any(...) is not None` was always True (a bool is never
    # None), so every contact was accepted; return the match result itself.
    return _CONTACT_RE.fullmatch(candidate) is not None
136
-
137
- # Function to parse resume with SpaCy
138
- # Function to parse resume with SpaCy
139
def Parser_from_model(file_path):
    """Parse a resume with the custom SpaCy NER model and return its fields.

    The file is converted to text, run through the model stored at
    ``Spacy_Models/ner_model_05_3``, and the recognised entities are mapped
    into a nested dict with ``personal`` and ``professional`` sections.
    Only the first entity of each single-valued label is kept.

    Args:
        file_path: Path handed to ``extract_text_based_on_format``.

    Returns:
        dict: The parsed result, or ``{"error": <message>}`` when model
        loading, text extraction or text processing fails.
    """
    labels = (
        'PERSON', 'EMAIL', 'CONTACT', 'LOCATION', 'SKILL', 'SOFT_SKILL',
        'COMPANY', 'PROJECTS', 'JOB_TITLE', 'YEARS_EXPERIENCE', 'EXPERIENCE',
        'QUALIFICATION', 'UNIVERSITY', 'COURSE', 'CERTIFICATE',
    )

    def head(values):
        # First collected value for a label, or '' when nothing was found.
        return values[0] if values else ''

    try:
        nlp = spacy.load("Spacy_Models/ner_model_05_3")
        logging.debug("Model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading model: {e}")
        return {"error": "Model loading failed"}

    try:
        cleaned_text, hyperlinks = extract_text_based_on_format(file_path)
        if not cleaned_text.strip():
            logging.error("No text extracted from the file.")
            return {"error": "Text extraction failed"}
    except Exception as e:
        logging.error(f"Error extracting text from file: {e}")
        return {"error": "Text extraction failed"}

    try:
        doc = nlp(cleaned_text)
    except Exception as e:
        logging.error(f"Error processing text with SpaCy: {e}")
        return {"error": "Text processing failed"}

    # Collect distinct entity texts per label, in document order.
    found = {label: [] for label in labels}
    for ent in doc.ents:
        bucket = found.get(ent.label_)
        if bucket is not None and ent.text not in bucket:
            bucket.append(ent.text)

    name = head(found['PERSON'])

    # Email: keep it when valid, otherwise flag it and remember the raw value.
    email = head(found['EMAIL'])
    email_ok = is_valid_email(email)
    if not email_ok:
        logging.warning(f"Invalid email detected: {email}")

    # Contact: same treatment as the email.
    contact = head(found['CONTACT'])
    contact_ok = is_valid_contact(contact)
    if not contact_ok:
        logging.warning(f"Invalid contact detected: {contact}")

    result = {
        "personal": {
            "name": name,
            "contact": contact if contact_ok else "Invalid contact",
            "email": email if email_ok else "Invalid email",
            "location": head(found['LOCATION']),
            "link": hyperlinks,  # Hyperlinks from extracted text
            "invalid_email": '' if email_ok else email,
            "invalid_contact": '' if contact_ok else contact,
        },
        "professional": {
            "technical_skills": found['SKILL'],
            "non_technical_skills": found['SOFT_SKILL'],
            "tools": [],  # No tools extraction implemented
            "experience": [
                {
                    "company": head(found['COMPANY']),
                    "projects": head(found['PROJECTS']),
                    "role": head(found['JOB_TITLE']),
                    "years": head(found['YEARS_EXPERIENCE']),
                    "project_experience": found['EXPERIENCE'],
                }
            ],
            "education": [
                {
                    "qualification": head(found['QUALIFICATION']),
                    "university": head(found['UNIVERSITY']),
                    "course": head(found['COURSE']),
                    "certificate": head(found['CERTIFICATE']),
                }
            ],
        },
    }

    print(result)
    return result
246
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import logging
3
+ import json
4
+ from utils.fileTotext import extract_text_based_on_format
5
+ import re
6
+
7
def is_valid_email(email):
    """Return True when *email* looks like ``local@domain.tld``.

    Args:
        email: Candidate address. Anything that is not a ``str`` is rejected
            instead of raising.

    Returns:
        bool: True only if the whole string matches the address pattern.
    """
    if not isinstance(email, str):
        return False
    # fullmatch (rather than match + '$') so a trailing newline is rejected.
    email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'
    return re.fullmatch(email_regex, email) is not None
10
+
11
# Phone-number layouts accepted by is_valid_contact. Every entry is matched
# against the whole (whitespace-stripped) candidate, so no anchors are needed.
_CONTACT_LAYOUTS = (
    # India, flexible separators
    r'\+91[\s\.\-\/]?\(?0?\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\+91[\s\.\-\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\d{5}[\s\-\.\/]?\d{5}',
    r'\+91[\s\.\-\/]?\d{10}',
    r'\+91[\s\.\-\/]?\(?\d{5}\)?[\s\-\.\/]?\d{5}[\s\-\.\/]?\d{5}',
    r'\+91\s\d{5}-\d{5}',
    r'\+91\s\d{4}-\d{6}',
    r'\+91\s\d{10}',
    r'0\d{2}-\d{7}',
    r'\+91\d{10}',
    # USA / Canada
    r'\+1\s\(\d{3}\)\s\d{3}-\d{4}',
    r'\(\d{3}\)\s\d{3}-\d{4}',
    r'\(\d{3}\)\s\d{3}\s\d{4}',
    r'\(\d{3}\)\s\d{3}\s\d{3}',
    r'\+1\d{10}',
    r'\+1\s\d{3}\s\d{3}\s\d{4}',
    # Bare digit runs (shared by several national formats)
    r'\d{10}',
    r'\d{5}\s\d{5}',
    r'0\d{7}',
    r'0\d{8}',
    r'0\d{9}',
    r'0\d{10}',
    r'0\d{11}',
    r'8\d{10}',
    r'01\d{7}',
    # UK
    r'\+44\s\d{4}\s\d{6}',
    r'\+44\s\d{3}\s\d{3}\s\d{4}',
    r'0\d{4}\s\d{6}',
    r'0\d{3}\s\d{3}\s\d{4}',
    r'\+44\d{10}',
    # Australia
    r'\+61\s\d\s\d{4}\s\d{4}',
    r'0\d\s\d{4}\s\d{4}',
    r'\+61\d{9}',
    # Germany
    r'\+49\s\d{4}\s\d{8}',
    r'\+49\s\d{3}\s\d{7}',
    r'0\d{3}\s\d{8}',
    r'\+49\d{12}',
    r'\+49\d{10}',
    # China
    r'\+86\s\d{3}\s\d{4}\s\d{4}',
    r'0\d{3}\s\d{4}\s\d{4}',
    r'\+86\d{11}',
    # Japan
    r'\+81\s\d\s\d{4}\s\d{4}',
    r'\+81\s\d{2}\s\d{4}\s\d{4}',
    r'\+81\d{10}',
    r'\+81\d{9}',
    # Brazil
    r'\+55\s\d{2}\s\d{5}-\d{4}',
    r'\+55\s\d{2}\s\d{4}-\d{4}',
    r'0\d{2}\s\d{4}\s\d{4}',
    r'\+55\d{11}',
    r'\+55\d{10}',
    # France
    r'\+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
    r'0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}',
    r'\+33\d{9}',
    # Russia
    r'\+7\s\d{3}\s\d{3}-\d{2}-\d{2}',
    r'8\s\d{3}\s\d{3}-\d{2}-\d{2}',
    r'\+7\d{10}',
    # South Africa
    r'\+27\s\d{2}\s\d{3}\s\d{4}',
    r'0\d{2}\s\d{3}\s\d{4}',
    r'\+27\d{9}',
    # Mexico
    r'\+52\s\d{3}\s\d{3}\s\d{4}',
    r'\+52\s\d{2}\s\d{4}\s\d{4}',
    r'01\s\d{3}\s\d{4}',
    r'\+52\d{10}',
    # Nigeria
    r'\+234\s\d{3}\s\d{3}\s\d{4}',
    r'\+234\d{10}',
    # UAE
    r'\+971\s\d\s\d{3}\s\d{4}',
    r'0\d\s\d{3}\s\d{4}',
    r'\+971\d{8}',
    # Argentina
    r'\+54\s9\s\d{3}\s\d{3}\s\d{4}',
    r'\+54\s\d{1}\s\d{4}\s\d{4}',
    r'0\d{3}\s\d{4}',
    r'\+54\d{10}',
    r'\+54\d{9}',
    r'\+54\s9\s\d{10}',
    # Saudi Arabia
    r'\+966\s\d\s\d{3}\s\d{4}',
    r'\+966\d{8}',
)

# One alternation, compiled once at import time.
_CONTACT_RE = re.compile('|'.join(f'(?:{layout})' for layout in _CONTACT_LAYOUTS))


def is_valid_contact(contact):
    """Return True when *contact* is a phone number in a recognised layout.

    The candidate is stripped of surrounding whitespace and must match one
    of the layouts in ``_CONTACT_LAYOUTS`` in full.

    Args:
        contact: Candidate phone number. Non-string and blank values are
            rejected instead of raising.

    Returns:
        bool: True if the whole candidate matches a known layout.
    """
    if not isinstance(contact, str):
        return False
    candidate = contact.strip()
    if not candidate:
        return False
    # The previous `any(...) is not None` was always True (a bool is never
    # None), so every contact was accepted; return the match result itself.
    return _CONTACT_RE.fullmatch(candidate) is not None
136
+
137
+ # Extracting the Contact Details through Regular Expression
138
# Phone-number layouts searched for in free text. Alternatives are tried in
# order at each position; the surrounding look-arounds stop a match from
# starting or ending in the middle of a longer run of digits.
_PHONE_RE = re.compile(r'''
    (?<!\d)
    (?:
        \+1\s\(\d{3}\)\s\d{3}-\d{4}           |  # USA/Canada Intl +1 (XXX) XXX-XXXX
        \(\d{3}\)\s\d{3}-\d{4}                |  # USA/Canada STD (XXX) XXX-XXXX
        \(\d{3}\)\s\d{3}\s\d{4}               |  # USA/Canada (XXX) XXX XXXX
        \(\d{3}\)\s\d{3}\s\d{3}               |  # USA/Canada (XXX) XXX XXX
        \+1\d{10}                             |  # +1 XXXXXXXXXX
        \d{10}                                |  # XXXXXXXXXX
        \+44\s\d{4}\s\d{6}                    |  # UK Intl +44 XXXX XXXXXX
        \+44\s\d{3}\s\d{3}\s\d{4}             |  # UK Intl +44 XXX XXX XXXX
        0\d{4}\s\d{6}                         |  # UK STD 0XXXX XXXXXX
        0\d{3}\s\d{3}\s\d{4}                  |  # UK/Nigeria STD 0XXX XXX XXXX
        \+44\d{10}                            |  # +44 XXXXXXXXXX
        0\d{10}                               |  # 0XXXXXXXXXX
        \+61\s\d\s\d{4}\s\d{4}                |  # Australia Intl +61 X XXXX XXXX
        0\d\s\d{4}\s\d{4}                     |  # Australia/Japan STD 0X XXXX XXXX
        \+61\d{9}                             |  # +61 XXXXXXXXX
        0\d{9}                                |  # 0XXXXXXXXX
        \+91\s\d{5}-\d{5}                     |  # India Intl +91 XXXXX-XXXXX
        \+91\s\d{4}-\d{6}                     |  # India Intl +91 XXXX-XXXXXX
        \+91\s\d{10}                          |  # India Intl +91 XXXXXXXXXX
        \+91\s\d{3}\s\d{3}\s\d{4}             |  # India Intl +91 XXX XXX XXXX
        \+91\s\d{3}-\d{3}-\d{4}               |  # India Intl +91 XXX-XXX-XXXX
        \+91\s\d{2}\s\d{4}\s\d{4}             |  # India Intl +91 XX XXXX XXXX
        \+91\s\d{2}-\d{4}-\d{4}               |  # India Intl +91 XX-XXXX-XXXX
        \+91\s\d{5}\s\d{5}                    |  # India Intl +91 XXXXX XXXXX
        \d{5}\s\d{5}                          |  # India XXXXX XXXXX
        \d{5}-\d{5}                           |  # India XXXXX-XXXXX
        0\d{2}-\d{7}                          |  # India STD 0XX-XXXXXXX
        \+91\d{10}                            |  # +91 XXXXXXXXXX
        \d{6}-\d{4}                           |  # XXXXXX-XXXX
        \d{4}-\d{6}                           |  # XXXX-XXXXXX
        \d{3}\s\d{3}\s\d{4}                   |  # XXX XXX XXXX
        \d{3}-\d{3}-\d{4}                     |  # XXX-XXX-XXXX
        \d{4}\s\d{3}\s\d{3}                   |  # XXXX XXX XXX
        \d{4}-\d{3}-\d{3}                     |  # XXXX-XXX-XXX
        \+49\s\d{4}\s\d{8}                    |  # Germany Intl +49 XXXX XXXXXXXX
        \+49\s\d{3}\s\d{7}                    |  # Germany Intl +49 XXX XXXXXXX
        0\d{3}\s\d{8}                         |  # Germany STD 0XXX XXXXXXXX
        \+49\d{12}                            |  # +49 XXXXXXXXXXXX
        \+49\d{10}                            |  # +49 XXXXXXXXXX
        0\d{11}                               |  # 0XXXXXXXXXXX
        \+86\s\d{3}\s\d{4}\s\d{4}             |  # China Intl +86 XXX XXXX XXXX
        0\d{3}\s\d{4}\s\d{4}                  |  # China STD 0XXX XXXX XXXX
        \+86\d{11}                            |  # +86 XXXXXXXXXXX
        \+81\s\d\s\d{4}\s\d{4}                |  # Japan Intl +81 X XXXX XXXX
        \+81\s\d{2}\s\d{4}\s\d{4}             |  # Japan Intl +81 XX XXXX XXXX
        \+81\d{10}                            |  # +81 XXXXXXXXXX
        \+81\d{9}                             |  # +81 XXXXXXXXX
        \+55\s\d{2}\s\d{5}-\d{4}              |  # Brazil Intl +55 XX XXXXX-XXXX
        \+55\s\d{2}\s\d{4}-\d{4}              |  # Brazil Intl +55 XX XXXX-XXXX
        0\d{2}\s\d{4}\s\d{4}                  |  # Brazil STD 0XX XXXX XXXX
        \+55\d{11}                            |  # +55 XXXXXXXXXXX
        \+55\d{10}                            |  # +55 XXXXXXXXXX
        \+33\s\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}  |  # France Intl +33 X XX XX XX XX
        0\d\s\d{2}\s\d{2}\s\d{2}\s\d{2}       |  # France STD 0X XX XX XX XX
        \+33\d{9}                             |  # +33 XXXXXXXXX
        \+7\s\d{3}\s\d{3}-\d{2}-\d{2}         |  # Russia Intl +7 XXX XXX-XX-XX
        8\s\d{3}\s\d{3}-\d{2}-\d{2}           |  # Russia STD 8 XXX XXX-XX-XX
        \+7\d{10}                             |  # +7 XXXXXXXXXX
        8\d{10}                               |  # 8 XXXXXXXXXX
        \+27\s\d{2}\s\d{3}\s\d{4}             |  # South Africa Intl +27 XX XXX XXXX
        0\d{2}\s\d{3}\s\d{4}                  |  # South Africa STD 0XX XXX XXXX
        \+27\d{9}                             |  # +27 XXXXXXXXX
        \+52\s\d{3}\s\d{3}\s\d{4}             |  # Mexico Intl +52 XXX XXX XXXX
        \+52\s\d{2}\s\d{4}\s\d{4}             |  # Mexico Intl +52 XX XXXX XXXX
        01\s\d{3}\s\d{4}                      |  # Mexico STD 01 XXX XXXX
        \+52\d{10}                            |  # +52 XXXXXXXXXX
        01\d{7}                               |  # 01 XXXXXXX
        \+234\s\d{3}\s\d{3}\s\d{4}            |  # Nigeria Intl +234 XXX XXX XXXX
        \+234\d{10}                           |  # +234 XXXXXXXXXX
        \+971\s\d\s\d{3}\s\d{4}               |  # UAE Intl +971 X XXX XXXX
        0\d\s\d{3}\s\d{4}                     |  # UAE/Saudi STD 0X XXX XXXX
        \+971\d{8}                            |  # +971 XXXXXXXX
        0\d{8}                                |  # 0XXXXXXXX
        \+54\s9\s\d{3}\s\d{3}\s\d{4}          |  # Argentina Intl +54 9 XXX XXX XXXX
        \+54\s\d{1}\s\d{4}\s\d{4}             |  # Argentina Intl +54 X XXXX XXXX
        0\d{3}\s\d{4}                         |  # Argentina STD 0XXX XXXX
        \+54\d{10}                            |  # +54 XXXXXXXXXX
        \+54\d{9}                             |  # +54 XXXXXXXXX
        0\d{7}                                |  # 0XXXXXXX
        \+966\s\d\s\d{3}\s\d{4}               |  # Saudi Intl +966 X XXX XXXX
        \+966\d{8}                            |  # +966 XXXXXXXX
        \+1\s\d{3}\s\d{3}\s\d{4}              |  # +1 XXX XXX XXXX
        \+54\s9\s\d{10}                       |  # +54 9 XXXXXXXXXX
        \+\d{3}-\d{3}-\d{4}                      # +XXX-XXX-XXXX
    )
    (?!\d)
''', re.VERBOSE)

# Email addresses.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

# Web links. The "www." prefix is mandatory so that the domain part of an
# email address is not reported as a link.
_LINK_RE = re.compile(r'\b(?:https?:\/\/)?(?:www\.)[a-zA-Z0-9-]+\.(?:com|co\.in|co|io|org|net|edu|gov|mil|int|uk|us|in|de|au|app|tech|xyz|info|biz|fr|dev)\b')


# Extracting the Contact Details through Regular Expression
def extract_contact_details(text):
    """Find phone numbers, email addresses and ``www.`` links in *text*.

    Args:
        text (str): Plain text to scan.

    Returns:
        dict: ``{"phone_numbers": [...], "emails": [...], "links_RE": [...]}``
        with every occurrence in the order it appears in *text*.
    """
    phone_numbers = _PHONE_RE.findall(text)
    emails = _EMAIL_RE.findall(text)

    # Very short hits (under 11 characters) are discarded as noise.
    links_RE = [link for link in _LINK_RE.findall(text) if len(link) >= 11]

    # Remove links that contain one of the extracted email addresses.
    links_RE = [link for link in links_RE if not any(email in link for email in emails)]

    return {
        "phone_numbers": phone_numbers,
        "emails": emails,
        "links_RE": links_RE
    }
296
+
297
+ # preprocessing the data
298
def process_extracted_text(extracted_text):
    """Collect contact details from one or more pieces of extracted text.

    Args:
        extracted_text: A single text string, a mapping of name -> text
            (only the values are scanned), or any other iterable of text
            strings. ``None`` is treated as "no text". Entries that are not
            strings, or are blank, are skipped.

    Returns:
        dict: ``{"phone_numbers": [...], "emails": [...], "links_RE": [...]}``
        holding the concatenated results of ``extract_contact_details`` for
        every scanned text.
    """
    combined_results = {
        "phone_numbers": [],
        "emails": [],
        "links_RE": []
    }

    # Normalise the accepted input shapes to a flat list of candidate texts.
    # A bare string used to fail here because `.items()` was called on it.
    if extracted_text is None:
        texts = []
    elif isinstance(extracted_text, str):
        texts = [extracted_text]
    elif isinstance(extracted_text, dict):
        texts = list(extracted_text.values())
    else:
        texts = list(extracted_text)

    for text in texts:
        if not isinstance(text, str) or not text.strip():
            continue
        contact_details = extract_contact_details(text)
        for key in combined_results:
            combined_results[key].extend(contact_details[key])

    # Logged at debug level instead of printed, so contact details do not
    # end up on stdout.
    logging.debug("Combined contact details: %s", combined_results)

    return combined_results
327
+
328
+ # Function to parse resume with SpaCy
329
+
330
def Parser_from_model(file_path):
    """Parse a resume with the custom SpaCy NER model plus regex extraction.

    The file is converted to text, scanned with regular expressions for
    phone numbers, emails and links, and run through the NER model stored at
    ``Spacy_Models/ner_model_05_3``. Both sources are merged, without
    duplicates, into one nested dict.

    Args:
        file_path: Path handed to ``extract_text_based_on_format``.

    Returns:
        dict: ``{"personal": {...}, "professional": {...}}`` where every
        field is a list (``tools`` is ``None``), or ``{"error": <message>}``
        when model loading, text extraction or text processing fails.
    """
    def normalize_to_list(value):
        # Coerce any value to a fresh list; None means "nothing found".
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return list(value)
        return [str(value)]

    def add_unique(target, values):
        # Append values to target, skipping ones that are already present.
        for value in values:
            if value not in target:
                target.append(value)

    try:
        nlp = spacy.load("Spacy_Models/ner_model_05_3")
        logging.debug("Model loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading model: {e}")
        return {"error": "Model loading failed"}

    try:
        cleaned_text, hyperlinks = extract_text_based_on_format(file_path)
        if not cleaned_text.strip():
            logging.error("No text extracted from the file.")
            return {"error": "Text extraction failed"}
    except Exception as e:
        logging.error(f"Error extracting text from file: {e}")
        return {"error": "Text extraction failed"}

    try:
        doc = nlp(cleaned_text)
    except Exception as e:
        logging.error(f"Error processing text with SpaCy: {e}")
        return {"error": "Text processing failed"}

    # Regex-based contact details. process_extracted_text iterates over
    # `.items()`, so the text is wrapped in a one-entry mapping.
    cont_data = process_extracted_text({str(file_path): cleaned_text})

    # Collect distinct entity texts per label, in document order.
    entities = {label: [] for label in [
        'PERSON', 'EMAIL', 'CONTACT', 'LOCATION', 'SKILL', 'SOFT_SKILL',
        'COMPANY', 'PROJECTS', 'JOB_TITLE', 'YEARS_EXPERIENCE', 'EXPERIENCE',
        'QUALIFICATION', 'UNIVERSITY', 'COURSE', 'CERTIFICATE'
    ]}
    for ent in doc.ents:
        if ent.label_ in entities and ent.text not in entities[ent.label_]:
            entities[ent.label_].append(ent.text)

    # Emails: regex hits first, then NER hits that pass validation.
    emails = []
    invalid_emails = []
    add_unique(emails, cont_data['emails'])
    for email in entities['EMAIL']:
        if is_valid_email(email):
            add_unique(emails, [email])
        else:
            add_unique(invalid_emails, [email])

    # Contacts: same merge strategy as the emails.
    contacts = []
    invalid_contacts = []
    add_unique(contacts, cont_data['phone_numbers'])
    for contact in entities['CONTACT']:
        if is_valid_contact(contact):
            add_unique(contacts, [contact])
        else:
            add_unique(invalid_contacts, [contact])

    # Links: hyperlinks embedded in the document plus regex hits.
    links = normalize_to_list(hyperlinks)
    add_unique(links, cont_data['links_RE'])

    result = {
        "personal": {
            "name": normalize_to_list(entities['PERSON']),
            "contact": contacts,
            "email": emails,
            "location": normalize_to_list(entities['LOCATION']),
            "link": links,
            "invalid_email": invalid_emails,
            "invalid_contact": invalid_contacts
        },
        "professional": {
            "technical_skills": normalize_to_list(entities['SKILL']),
            "non_technical_skills": normalize_to_list(entities['SOFT_SKILL']),
            "tools": None,  # Logic for tools can be added if needed
            "experience": [
                {
                    "company": normalize_to_list(entities['COMPANY']),
                    "projects": normalize_to_list(entities['PROJECTS']),
                    "role": normalize_to_list(entities['JOB_TITLE']),
                    "years": normalize_to_list(entities['YEARS_EXPERIENCE']),
                    "project_experience": normalize_to_list(entities['EXPERIENCE'])
                }
            ],
            "education": [
                {
                    "qualification": normalize_to_list(entities['QUALIFICATION']),
                    "university": normalize_to_list(entities['UNIVERSITY']),
                    "course": normalize_to_list(entities['COURSE']),
                    "certificate": normalize_to_list(entities['CERTIFICATE'])
                }
            ]
        }
    }

    # Logged at debug level instead of printed, so personal details do not
    # end up on stdout.
    logging.debug("Parsed resume: %s", result)
    return result
460
+
461
+