husseinelsaadi committed on
Commit
c0dac84
·
1 Parent(s): f2a1cfa
Files changed (1) hide show
  1. backend/services/resume_parser.py +73 -265
backend/services/resume_parser.py CHANGED
@@ -1,304 +1,112 @@
1
- import json
2
  import re
3
- import os
4
  from pathlib import Path
5
- from typing import Dict, List, Optional, Union
6
  from pdfminer.high_level import extract_text as pdf_extract_text
7
  from docx import Document
8
- from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
9
- import logging
10
-
11
- # Set up logging
12
- logging.basicConfig(level=logging.INFO)
13
- logger = logging.getLogger(__name__)
14
 
15
class ResumeParser:
    """Parse resumes by combining a Hugging Face NER pipeline with regex heuristics.

    The NER model is loaded once in ``__init__``. If neither the primary nor
    the fallback model can be loaded, ``model_loaded`` stays ``False`` and
    ``parse_resume`` relies on the regex heuristics alone.
    """

    # Title-case words at the start of a line, confined to that line. Only the
    # header-word lookahead is case-insensitive: the capitalisation test must
    # stay case-sensitive, otherwise any two words would qualify as a name.
    _NAME_RE = re.compile(
        r'^(?!(?i:resume|cv|curriculum)\b)[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+',
        re.MULTILINE,
    )

    # Section patterns capture the text after a header keyword up to the end of
    # the line or the next blank line. None of these patterns has a stray
    # capture group, so ``findall`` yields exactly the text of interest.
    _FIELD_PATTERNS = {
        'email': re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),
        'phone': re.compile(r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'),
        'skills': re.compile(
            r'(?:skills?|technologies?|tools?|expertise)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            re.MULTILINE | re.IGNORECASE,
        ),
        'education': re.compile(
            r'(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            re.MULTILINE | re.IGNORECASE,
        ),
        'experience': re.compile(
            r'(?:experience|work\shistory|employment|job\shistory)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
            re.MULTILINE | re.IGNORECASE,
        ),
    }

    def __init__(self):
        self.ner_pipeline = None
        self.model_loaded = False
        self._load_model()

    def _load_model(self):
        """Load the primary NER model, falling back to a generic CoNLL-03 model.

        Failures are logged rather than raised; on total failure
        ``model_loaded`` is left ``False``.
        """
        # GPU 0 is used only when the L4_GPU environment variable is "true".
        device = 0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
        try:
            MODEL_NAME = "manishiitg/resume-ner"
            logger.info(f"Attempting to load model: {MODEL_NAME}")

            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
            self.ner_pipeline = pipeline(
                "ner",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple",
                device=device,
            )
            self.model_loaded = True
            logger.info("Model loaded successfully")

        except Exception as e:
            logger.warning(f"Failed to load primary model: {e}")
            try:
                MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
                logger.info(f"Trying fallback model: {MODEL_NAME}")

                self.ner_pipeline = pipeline(
                    "ner",
                    model=MODEL_NAME,
                    aggregation_strategy="simple",
                    device=device,
                )
                self.model_loaded = True
                logger.info("Fallback model loaded successfully")

            except Exception as e2:
                logger.error(f"Failed to load fallback model: {e2}")
                self.model_loaded = False

    @staticmethod
    def _clean_pdf_text(raw: str) -> str:
        """Tidy extracted PDF text while keeping its line structure.

        Whitespace runs inside a line become one space and runs of blank lines
        become a single blank line. Newlines are preserved because the name
        and section heuristics in this class are line-based.
        """
        lines = [re.sub(r'\s+', ' ', line).strip() for line in raw.split('\n')]
        return re.sub(r'\n{3,}', '\n\n', '\n'.join(lines)).strip()

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        Raises:
            FileNotFoundError: if ``file_path`` does not exist.
            ValueError: if the extension is neither ``.pdf`` nor ``.docx``.
            Any error from the extraction library is logged and re-raised.
        """
        try:
            path = Path(file_path)

            if not path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")

            suffix = path.suffix.lower()
            if suffix == ".pdf":
                text = self._clean_pdf_text(pdf_extract_text(file_path))
                logger.info(f"Extracted {len(text)} characters from PDF")
                return text

            if suffix == ".docx":
                doc = Document(file_path)
                text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
                logger.info(f"Extracted {len(text)} characters from DOCX")
                return text

            raise ValueError(f"Unsupported file format: {path.suffix}")

        except Exception as e:
            logger.error(f"Error extracting text: {e}")
            raise

    def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
        """Run the regex heuristics over ``text``.

        Returns a dict holding only the fields that matched. ``email``,
        ``phone``, ``skills``, ``education`` and ``experience`` map to every
        non-empty match; ``name`` maps to a one-element list with the first
        plausible name.
        """
        results: Dict[str, List[str]] = {}
        for key, pattern in self._FIELD_PATTERNS.items():
            cleaned = [m.strip() for m in pattern.findall(text) if m.strip()]
            if cleaned:
                results[key] = cleaned

        # ``search`` + ``group(0)`` is used instead of ``findall`` so the whole
        # match is returned, never the contents of a group.
        name_match = self._NAME_RE.search(text)
        if name_match:
            results['name'] = [name_match.group(0).strip()]

        return results

    def extract_name_from_text(self, text: str) -> str:
        """Return the most likely candidate name, or ``"Not Found"``."""
        name_match = self._NAME_RE.search(text)
        if name_match:
            return name_match.group(0).strip()

        # Fallback: first short, capitalised line among the first ten that is
        # not contact information or a document title.
        for line in text.split('\n')[:10]:
            line = line.strip()
            if not line or not 2 <= len(line.split()) <= 4:
                continue
            if re.search(r'[@\d+\-\(\)]', line):
                continue
            if line[0].isupper() and not line.lower().startswith(('resume', 'cv', 'curriculum')):
                return line
        return "Not Found"

    def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
        """Bucket NER entities into name / skills / education / experience.

        Entities with a score below 0.7 or an empty ``word`` are dropped.
        Each bucket is de-duplicated with first-seen order preserved.
        """
        results = {
            "name": [],
            "skills": [],
            "education": [],
            "experience": []
        }

        logger.info(f"Processing {len(entities)} entities")

        for ent in entities:
            label = ent.get("entity_group", "").upper()
            value = ent.get("word", "").strip()
            confidence = ent.get("score", 0)

            if confidence < 0.7 or not value:
                continue

            if label in ["PERSON", "PER", "NAME"]:
                results["name"].append(value)
            elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
                results["skills"].append(value)
            # NOTE(review): values containing "university" are excluded from
            # education here, which looks inverted - confirm the intent.
            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"] and "university" not in value.lower():
                results["education"].append(value)
            elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
                results["experience"].append(value)

        for key in results:
            results[key] = list(dict.fromkeys(results[key]))  # Preserve order

        return results

    @staticmethod
    def _join_unique(ner_values, regex_values, limit: int) -> str:
        """Join the first ``limit`` distinct values (NER first) or return "Not Found"."""
        combined = list(dict.fromkeys([*(ner_values or []), *(regex_values or [])]))
        return ", ".join(combined[:limit]) if combined else "Not Found"

    def merge_results(self, ner_results: Dict, regex_results: Dict) -> Dict[str, str]:
        """Merge NER and regex results into one flat dict of strings.

        Name prefers NER over regex; email and phone come from regex only;
        skills (max 10), education (max 3) and experience (max 3) combine both
        sources. Any field without a value is ``"Not Found"``.
        """
        merged = {
            "name": "Not Found",
            "email": "Not Found",
            "phone": "Not Found",
            "skills": "Not Found",
            "education": "Not Found",
            "experience": "Not Found"
        }

        if ner_results.get("name"):
            merged["name"] = ner_results["name"][0]
        elif regex_results.get("name"):
            merged["name"] = regex_results["name"][0]

        if regex_results.get("email"):
            merged["email"] = regex_results["email"][0]
        if regex_results.get("phone"):
            merged["phone"] = regex_results["phone"][0]

        merged["skills"] = self._join_unique(
            ner_results.get("skills"), regex_results.get("skills"), 10)
        merged["education"] = self._join_unique(
            ner_results.get("education"), regex_results.get("education"), 3)
        merged["experience"] = self._join_unique(
            ner_results.get("experience"), regex_results.get("experience"), 3)

        return merged

    def parse_resume(self, file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
        """Parse a resume file and return the merged fields.

        Never raises: on any failure every field is ``"Error"`` and an extra
        ``"error"`` key carries the exception message.

        Args:
            file_path: path of the PDF/DOCX file to read.
            filename: optional original file name, used as a last-resort
                source for the candidate name.
        """
        try:
            text = self.extract_text(file_path)

            if not text or len(text.strip()) < 10:
                raise ValueError("Extracted text is too short or empty")

            logger.info(f"Text preview: {text[:200]}...")

            ner_results = {
                "name": [],
                "skills": [],
                "education": [],
                "experience": []
            }

            # Method 1: NER model, when one was loaded.
            if self.model_loaded and self.ner_pipeline:
                try:
                    logger.info("Using NER model for extraction")
                    entities = self.ner_pipeline(text[:5120])  # Limit input size for NER
                    ner_results = self.process_ner_entities(entities)
                    logger.info(f"NER results: {json.dumps(ner_results, indent=2)}")
                except Exception as e:
                    logger.warning(f"NER extraction failed: {e}")

            # Method 2: regex extraction.
            logger.info("Using regex patterns for extraction")
            regex_results = self.extract_with_regex(text)
            logger.info(f"Regex results: {json.dumps(regex_results, indent=2)}")

            # Method 3: line-based name fallback.
            if not ner_results.get("name") and not regex_results.get("name"):
                name = self.extract_name_from_text(text)
                if name != "Not Found":
                    regex_results["name"] = [name]

            final_results = self.merge_results(ner_results, regex_results)

            # Last resort: text before the first '-' or '_' in the file name,
            # e.g. "Firstname Lastname - Resume.pdf".
            if final_results["name"] == "Not Found" and filename:
                name_from_file = re.sub(r'[-_].*', '', filename).strip()
                if len(name_from_file.split()) >= 2:
                    final_results["name"] = name_from_file

            logger.info("Parsing completed successfully")
            return final_results

        except Exception as e:
            logger.error(f"Error parsing resume: {e}")
            return {
                "name": "Error",
                "email": "Error",
                "phone": "Error",
                "skills": "Error",
                "education": "Error",
                "experience": "Error",
                "error": str(e)
            }
288
 
289
# Shared parser instance used by the module-level parse_resume() helper.
# Constructing it runs ResumeParser.__init__, so this happens at import time.
resume_parser = ResumeParser()
291
 
292
def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Parse the resume at ``file_path`` with the shared ``resume_parser``.

    Args:
        file_path: path of the resume file to read.
        filename: optional original file name, forwarded unchanged.

    Returns:
        Whatever ``resume_parser.parse_resume`` returns for these arguments.
    """
    return resume_parser.parse_resume(file_path, filename)
295
-
296
if __name__ == "__main__":
    # Manual smoke test: prompt for a path and pretty-print the parsed fields.
    test_file = input("Enter path to resume file: ")
    if os.path.exists(test_file):
        results = parse_resume(test_file, os.path.basename(test_file))
        print("\nParsing Results:")
        print(json.dumps(results, indent=2))
    else:
        print("File not found")
 
 
1
  import re
 
2
  from pathlib import Path
 
3
  from pdfminer.high_level import extract_text as pdf_extract_text
4
  from docx import Document
 
 
 
 
 
 
5
 
6
class ResumeParser:
    """Regex-only resume parser for PDF and DOCX files.

    The parser keeps no state; every method works solely on its arguments.
    """

    # Strict name shape: 2-4 title-case words filling the whole line.
    _NAME_LINE_RE = re.compile(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$')
    # Words that mark a document title rather than a person's name.
    _HEADER_WORDS = frozenset({'resume', 'cv', 'curriculum'})

    def __init__(self):
        """Nothing to initialise; kept so ``ResumeParser()`` callers keep working."""

    @staticmethod
    def _clean_pdf_text(raw: str) -> str:
        """Tidy extracted PDF text while keeping its line structure.

        Whitespace runs inside a line become one space and runs of blank lines
        become a single blank line. Newlines are preserved because
        ``extract_name`` and ``extract_sections`` are line-based.
        """
        lines = [re.sub(r'\s+', ' ', line).strip() for line in raw.split('\n')]
        return re.sub(r'\n{3,}', '\n\n', '\n'.join(lines)).strip()

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        Raises:
            ValueError: if the extension is neither ``.pdf`` nor ``.docx``.
        """
        suffix = Path(file_path).suffix.lower()

        if suffix == ".pdf":
            return self._clean_pdf_text(pdf_extract_text(file_path))
        if suffix == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        raise ValueError("Unsupported file format")

    @classmethod
    def _has_header_word(cls, line: str) -> bool:
        """Return True if ``line`` contains a title word such as "resume" or "cv"."""
        return any(word in cls._HEADER_WORDS for word in re.findall(r'[a-z]+', line.lower()))

    def extract_name(self, text: str) -> str:
        """Return the candidate name found in the first ten lines, or "Not Found".

        A strict title-case match anywhere in those lines wins. Otherwise the
        first short capitalised line is used. Title lines ("Curriculum Vitae")
        are skipped in both passes, and lines containing ``@`` or a digit are
        skipped in the second pass so contact details are never returned.
        """
        first_lines = [line.strip() for line in text.split('\n')[:10] if line.strip()]
        candidates = [line for line in first_lines if not self._has_header_word(line)]

        for line in candidates:
            if self._NAME_LINE_RE.match(line):
                return line

        for line in candidates:
            if re.search(r'[@\d]', line):
                continue
            if 2 <= len(line.split()) <= 4 and line[0].isupper():
                return line

        return "Not Found"

    @staticmethod
    def _section_body(header_pattern: str, text: str) -> str:
        """Return the text after the first matching header, up to the next blank line.

        ``re.DOTALL`` lets the captured body span several lines; without it a
        multi-line section could never match.
        """
        match = re.search(
            header_pattern + r'[:\s]*(.*?)(?:\n\s*\n|\Z)',
            text, re.IGNORECASE | re.DOTALL
        )
        return match.group(1) if match else ""

    def extract_sections(self, text: str) -> dict:
        """Extract skills, education and experience using regex.

        Returns a dict with the keys ``skills``, ``education`` and
        ``experience``, each a (possibly empty) list of stripped strings.
        Skills are split on commas, semicolons and newlines; the other two
        sections are split on newlines.
        """
        skills_text = self._section_body(r'(?:skills|technologies|expertise)', text)
        education_text = self._section_body(r'(?:education|degrees?)', text)
        experience_text = self._section_body(r'(?:experience|work history|employment)', text)

        return {
            "skills": [s.strip() for s in re.split(r'[,;\n]', skills_text) if s.strip()],
            "education": [e.strip() for e in education_text.split('\n') if e.strip()],
            "experience": [x.strip() for x in experience_text.split('\n') if x.strip()],
        }

    def parse_resume(self, file_path: str) -> dict:
        """Parse a resume file into name, skills, education and experience.

        Never raises: on failure the ``name`` field carries an ``"Error: ..."``
        message and the three list fields are empty.
        """
        try:
            text = self.extract_text(file_path)

            if not text or len(text.strip()) < 10:
                return {
                    "name": "Error: Empty file",
                    "skills": [],
                    "education": [],
                    "experience": []
                }

            name = self.extract_name(text)
            sections = self.extract_sections(text)

            return {
                "name": name,
                "skills": sections["skills"][:10],  # Limit to 10 skills
                "education": sections["education"][:3],  # Limit to 3 items
                "experience": sections["experience"][:3]  # Limit to 3 items
            }

        except Exception as e:
            return {
                "name": f"Error: {str(e)}",
                "skills": [],
                "education": [],
                "experience": []
            }
106
 
107
# Shared parser instance used by the module-level parse_resume() helper.
resume_parser = ResumeParser()
109
 
110
def parse_resume(file_path: str) -> dict:
    """Parse the resume at ``file_path`` with the shared ``resume_parser``.

    Returns whatever ``resume_parser.parse_resume`` returns for that path.
    """
    return resume_parser.parse_resume(file_path)