husseinelsaadi committed on
Commit
f2a1cfa
·
1 Parent(s): 6248af7
Files changed (1) hide show
  1. backend/services/resume_parser.py +144 -103
backend/services/resume_parser.py CHANGED
@@ -1,7 +1,8 @@
1
  import json
2
  import re
 
3
  from pathlib import Path
4
- from typing import Dict, List, Optional
5
  from pdfminer.high_level import extract_text as pdf_extract_text
6
  from docx import Document
7
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
@@ -30,7 +31,8 @@ class ResumeParser:
30
  "ner",
31
  model=model,
32
  tokenizer=tokenizer,
33
- aggregation_strategy="simple"
 
34
  )
35
  self.model_loaded = True
36
  logger.info("Model loaded successfully")
@@ -45,7 +47,8 @@ class ResumeParser:
45
  self.ner_pipeline = pipeline(
46
  "ner",
47
  model=MODEL_NAME,
48
- aggregation_strategy="simple"
 
49
  )
50
  self.model_loaded = True
51
  logger.info("Fallback model loaded successfully")
@@ -64,6 +67,8 @@ class ResumeParser:
64
 
65
  if path.suffix.lower() == ".pdf":
66
  text = pdf_extract_text(file_path)
 
 
67
  logger.info(f"Extracted {len(text)} characters from PDF")
68
  return text
69
 
@@ -81,37 +86,61 @@ class ResumeParser:
81
  raise
82
 
83
  def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
84
- """Fallback extraction using regex patterns"""
85
  patterns = {
86
  'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
87
- 'phone': r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
88
- 'skills': r'(?i)(?:skills?|technologies?|tools?)[:\-\s]*([^\n]+)',
89
- 'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*([^\n]+)',
90
- 'experience': r'(?i)(?:experience|work|employment|job)[:\-\s]*([^\n]+)'
 
91
  }
92
 
93
  results = {}
94
  for key, pattern in patterns.items():
95
- matches = re.findall(pattern, text, re.MULTILINE)
96
- results[key] = [match.strip() for match in matches if match.strip()]
 
 
 
 
 
 
 
97
 
98
  return results
99
 
100
  def extract_name_from_text(self, text: str) -> str:
101
- """Extract name using heuristics"""
 
 
 
 
 
 
 
 
 
 
 
102
  lines = text.split('\n')
103
- # Usually name is in the first few lines
104
- for line in lines[:5]:
105
  line = line.strip()
106
- if line and len(line.split()) <= 4 and len(line) > 2:
107
  # Check if it looks like a name (not email, phone, etc.)
108
- if not re.search(r'[@\d]', line) and not line.lower().startswith(('resume', 'cv', 'curriculum')):
109
- return line
 
110
  return "Not Found"
111
 
112
  def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
113
  """Process NER entities with improved logic"""
114
- name, skills, education, experience = [], [], [], []
 
 
 
 
 
115
 
116
  logger.info(f"Processing {len(entities)} entities")
117
 
@@ -120,27 +149,77 @@ class ResumeParser:
120
  value = ent.get("word", "").strip()
121
  confidence = ent.get("score", 0)
122
 
123
- logger.debug(f"Entity: {label} = {value} (confidence: {confidence:.2f})")
124
-
125
- # Only consider high-confidence entities
126
- if confidence < 0.5:
127
  continue
128
 
 
129
  if label in ["PERSON", "PER", "NAME"]:
130
- name.append(value)
131
  elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
132
- skills.append(value)
133
- elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"]:
134
- education.append(value)
135
  elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
136
- experience.append(value)
137
 
138
- return {
139
- "name": name,
140
- "skills": skills,
141
- "education": education,
142
- "experience": experience
 
 
 
 
 
 
 
 
 
 
143
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  def parse_resume(self, file_path: str, filename: str = None) -> Dict[str, str]:
146
  """Parse resume with multiple extraction methods"""
@@ -154,59 +233,53 @@ class ResumeParser:
154
  logger.info(f"Text preview: {text[:200]}...")
155
 
156
  # Initialize results
157
- results = {
158
- "name": "Not Found",
159
- "skills": "Not Found",
160
- "education": "Not Found",
161
- "experience": "Not Found"
162
  }
163
 
164
  # Method 1: Try NER model if available
165
  if self.model_loaded and self.ner_pipeline:
166
  try:
167
  logger.info("Using NER model for extraction")
168
- entities = self.ner_pipeline(text)
169
  ner_results = self.process_ner_entities(entities)
170
-
171
- # Update results with NER findings
172
- for key in results.keys():
173
- if ner_results.get(key):
174
- unique_items = list(dict.fromkeys(ner_results[key]))
175
- results[key] = ", ".join(unique_items)
176
-
177
  except Exception as e:
178
  logger.warning(f"NER extraction failed: {e}")
179
 
180
- # Method 2: Regex fallback
181
  logger.info("Using regex patterns for extraction")
182
  regex_results = self.extract_with_regex(text)
 
183
 
184
- # Fill in missing information with regex results
185
- if results["name"] == "Not Found":
186
- results["name"] = self.extract_name_from_text(text)
187
-
188
- if results["skills"] == "Not Found" and regex_results.get("skills"):
189
- results["skills"] = ", ".join(regex_results["skills"][:3]) # Limit to first 3
190
 
191
- if results["education"] == "Not Found" and regex_results.get("education"):
192
- results["education"] = ", ".join(regex_results["education"][:2]) # Limit to first 2
193
 
194
- if results["experience"] == "Not Found" and regex_results.get("experience"):
195
- results["experience"] = ", ".join(regex_results["experience"][:3]) # Limit to first 3
196
-
197
- # Add email and phone if found
198
- if regex_results.get("email"):
199
- results["email"] = regex_results["email"][0]
200
- if regex_results.get("phone"):
201
- results["phone"] = regex_results["phone"][0]
202
 
203
  logger.info("Parsing completed successfully")
204
- return results
205
 
206
  except Exception as e:
207
  logger.error(f"Error parsing resume: {e}")
208
  return {
209
  "name": "Error",
 
 
210
  "skills": "Error",
211
  "education": "Error",
212
  "experience": "Error",
@@ -220,44 +293,12 @@ def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
220
  """Main function to parse resume"""
221
  return resume_parser.parse_resume(file_path, filename)
222
 
223
- # Test function
224
- def test_parser():
225
- """Test the parser with sample text"""
226
- sample_text = """
227
- John Doe
228
- Software Engineer
229
230
- (555) 123-4567
231
-
232
- Skills: Python, JavaScript, React, Node.js, SQL
233
-
234
- Education:
235
- Bachelor of Science in Computer Science
236
- University of Technology, 2020
237
-
238
- Experience:
239
- Senior Software Developer at Tech Corp (2021-2023)
240
- - Developed web applications using React and Node.js
241
- - Managed database systems and APIs
242
- """
243
-
244
- # Create a temporary file for testing
245
- import tempfile
246
- with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
247
- f.write(sample_text)
248
- temp_path = f.name
249
-
250
- try:
251
- # Test regex extraction
252
- regex_results = resume_parser.extract_with_regex(sample_text)
253
- print("Regex Results:", json.dumps(regex_results, indent=2))
254
-
255
- # Test name extraction
256
- name = resume_parser.extract_name_from_text(sample_text)
257
- print(f"Extracted Name: {name}")
258
-
259
- except Exception as e:
260
- print(f"Test error: {e}")
261
- finally:
262
- Path(temp_path).unlink(missing_ok=True)
263
-
 
1
  import json
2
  import re
3
+ import os
4
  from pathlib import Path
5
+ from typing import Dict, List, Optional, Union
6
  from pdfminer.high_level import extract_text as pdf_extract_text
7
  from docx import Document
8
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
31
  "ner",
32
  model=model,
33
  tokenizer=tokenizer,
34
+ aggregation_strategy="simple",
35
+ device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
36
  )
37
  self.model_loaded = True
38
  logger.info("Model loaded successfully")
 
47
  self.ner_pipeline = pipeline(
48
  "ner",
49
  model=MODEL_NAME,
50
+ aggregation_strategy="simple",
51
+ device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
52
  )
53
  self.model_loaded = True
54
  logger.info("Fallback model loaded successfully")
 
67
 
68
  if path.suffix.lower() == ".pdf":
69
  text = pdf_extract_text(file_path)
70
+ # Clean up PDF text extraction artifacts
71
+ text = re.sub(r'\s+', ' ', text).strip()
72
  logger.info(f"Extracted {len(text)} characters from PDF")
73
  return text
74
 
 
86
  raise
87
 
88
  def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
89
+ """Improved regex patterns for extraction"""
90
  patterns = {
91
  'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
92
+ 'phone': r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
93
+ 'skills': r'(?i)(?:skills?|technologies?|tools?|expertise)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
94
+ 'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
95
+ 'experience': r'(?i)(?:experience|work\shistory|employment|job\shistory)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
96
+ 'name': r'^(?!(resume|cv|curriculum vitae|\d))[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+'
97
  }
98
 
99
  results = {}
100
  for key, pattern in patterns.items():
101
+ matches = re.findall(pattern, text, re.MULTILINE | re.IGNORECASE)
102
+ if key == 'name' and matches:
103
+ # Take the first likely name match
104
+ results[key] = [matches[0].strip()]
105
+ else:
106
+ # Clean and filter matches
107
+ cleaned = [m.strip() for m in matches if m.strip()]
108
+ if cleaned:
109
+ results[key] = cleaned
110
 
111
  return results
112
 
113
  def extract_name_from_text(self, text: str) -> str:
114
+ """Improved name extraction heuristics"""
115
+ # First try to find name using regex
116
+ name_match = re.search(
117
+ r'^(?!(resume|cv|curriculum vitae|\d))[A-Z][a-z]+(?:\s+[A-Z][a-z]+)+',
118
+ text,
119
+ re.MULTILINE | re.IGNORECASE
120
+ )
121
+
122
+ if name_match:
123
+ return name_match.group(0).strip()
124
+
125
+ # Fallback to line-based approach
126
  lines = text.split('\n')
127
+ for line in lines[:10]: # Check first 10 lines
 
128
  line = line.strip()
129
+ if line and 2 <= len(line.split()) <= 4:
130
  # Check if it looks like a name (not email, phone, etc.)
131
+ if not re.search(r'[@\d+\-\(\)]', line):
132
+ if line[0].isupper() and not line.lower().startswith(('resume', 'cv', 'curriculum')):
133
+ return line
134
  return "Not Found"
135
 
136
  def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
137
  """Process NER entities with improved logic"""
138
+ results = {
139
+ "name": [],
140
+ "skills": [],
141
+ "education": [],
142
+ "experience": []
143
+ }
144
 
145
  logger.info(f"Processing {len(entities)} entities")
146
 
 
149
  value = ent.get("word", "").strip()
150
  confidence = ent.get("score", 0)
151
 
152
+ # Skip low confidence entities and empty values
153
+ if confidence < 0.7 or not value:
 
 
154
  continue
155
 
156
+ # Normalize labels
157
  if label in ["PERSON", "PER", "NAME"]:
158
+ results["name"].append(value)
159
  elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
160
+ results["skills"].append(value)
161
+ elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"] and "university" not in value.lower():
162
+ results["education"].append(value)
163
  elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
164
+ results["experience"].append(value)
165
 
166
+ # Deduplicate and clean results
167
+ for key in results:
168
+ results[key] = list(dict.fromkeys(results[key])) # Preserve order
169
+
170
+ return results
171
+
172
def merge_results(self, ner_results: Dict, regex_results: Dict) -> Dict[str, str]:
    """Merge NER and regex extraction results into one flat record.

    Args:
        ner_results: Field name -> list of strings found by the NER model.
        regex_results: Field name -> list of strings found by regex.

    Returns:
        A dict with the keys 'name', 'email', 'phone', 'skills',
        'education' and 'experience'. Each value is a string; fields with
        no data are set to "Not Found".
    """
    merged = {
        "name": "Not Found",
        "email": "Not Found",
        "phone": "Not Found",
        "skills": "Not Found",
        "education": "Not Found",
        "experience": "Not Found",
    }

    # Name: the NER result takes priority over the regex result; only the
    # first candidate of the chosen source is used.
    if ner_results.get("name"):
        merged["name"] = ner_results["name"][0]
    elif regex_results.get("name"):
        merged["name"] = regex_results["name"][0]

    # Email and phone come from regex only.
    for field in ("email", "phone"):
        if regex_results.get(field):
            merged[field] = regex_results[field][0]

    # List-valued fields: NER items first, then regex items, de-duplicated
    # in first-seen order and capped at a per-field limit.
    for field, limit in (("skills", 10), ("education", 3), ("experience", 3)):
        combined = [
            *(ner_results.get(field) or []),
            *(regex_results.get(field) or []),
        ]
        if combined:
            unique_items = list(dict.fromkeys(combined))
            merged[field] = ", ".join(unique_items[:limit])

    return merged
223
 
224
  def parse_resume(self, file_path: str, filename: str = None) -> Dict[str, str]:
225
  """Parse resume with multiple extraction methods"""
 
233
  logger.info(f"Text preview: {text[:200]}...")
234
 
235
  # Initialize results
236
+ ner_results = {
237
+ "name": [],
238
+ "skills": [],
239
+ "education": [],
240
+ "experience": []
241
  }
242
 
243
  # Method 1: Try NER model if available
244
  if self.model_loaded and self.ner_pipeline:
245
  try:
246
  logger.info("Using NER model for extraction")
247
+ entities = self.ner_pipeline(text[:5120]) # Limit input size for NER
248
  ner_results = self.process_ner_entities(entities)
249
+ logger.info(f"NER results: {json.dumps(ner_results, indent=2)}")
 
 
 
 
 
 
250
  except Exception as e:
251
  logger.warning(f"NER extraction failed: {e}")
252
 
253
+ # Method 2: Regex extraction
254
  logger.info("Using regex patterns for extraction")
255
  regex_results = self.extract_with_regex(text)
256
+ logger.info(f"Regex results: {json.dumps(regex_results, indent=2)}")
257
 
258
+ # Method 3: Name extraction fallback
259
+ if not ner_results.get("name") and not regex_results.get("name"):
260
+ name = self.extract_name_from_text(text)
261
+ if name != "Not Found":
262
+ regex_results["name"] = [name]
 
263
 
264
+ # Merge all results
265
+ final_results = self.merge_results(ner_results, regex_results)
266
 
267
+ # If name still not found, try filename
268
+ if final_results["name"] == "Not Found" and filename:
269
+ # Try to extract name from filename (common pattern: "Firstname Lastname - Resume.pdf")
270
+ name_from_file = re.sub(r'[-_].*', '', filename).strip()
271
+ if len(name_from_file.split()) >= 2:
272
+ final_results["name"] = name_from_file
 
 
273
 
274
  logger.info("Parsing completed successfully")
275
+ return final_results
276
 
277
  except Exception as e:
278
  logger.error(f"Error parsing resume: {e}")
279
  return {
280
  "name": "Error",
281
+ "email": "Error",
282
+ "phone": "Error",
283
  "skills": "Error",
284
  "education": "Error",
285
  "experience": "Error",
 
293
  """Main function to parse resume"""
294
  return resume_parser.parse_resume(file_path, filename)
295
 
296
if __name__ == "__main__":
    # Manual smoke test: prompt for a resume path and print the parsed
    # fields as JSON.
    # Surrounding whitespace and quotes are stripped because paths pasted
    # or dragged into a terminal often carry them.
    test_file = input("Enter path to resume file: ").strip().strip("\"'")
    # isfile (not exists) so a directory is reported here rather than
    # failing inside the parser.
    if os.path.isfile(test_file):
        results = parse_resume(test_file, os.path.basename(test_file))
        print("\nParsing Results:")
        print(json.dumps(results, indent=2))
    else:
        print("File not found")