husseinelsaadi committed
Commit 6248af7 · 1 Parent(s): 2489359
Files changed (1)
  1. backend/services/resume_parser.py +256 -51
backend/services/resume_parser.py CHANGED
@@ -1,58 +1,263 @@
 import json
+import re
 from pathlib import Path
-from typing import Dict
-
+from typing import Dict, List, Optional
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import logging
 
-# --------------------
-# Load PyTorch Resume NER Model
-# --------------------
-MODEL_NAME = "manishiitg/resume-ner"
-
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
-ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
-
-# --------------------
-# Extract Text from PDF/DOCX
-# --------------------
-def extract_text(file_path: str) -> str:
-    path = Path(file_path)
-    if path.suffix.lower() == ".pdf":
-        return pdf_extract_text(file_path)
-    elif path.suffix.lower() == ".docx":
-        doc = Document(file_path)
-        return "\n".join([p.text for p in doc.paragraphs])
-    else:
-        raise ValueError("Unsupported file format")
-
-# --------------------
-# Parse Resume
-# --------------------
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class ResumeParser:
+    def __init__(self):
+        self.ner_pipeline = None
+        self.model_loaded = False
+        self._load_model()
+
+    def _load_model(self):
+        """Load the NER model with error handling and fallbacks"""
+        try:
+            # Try the original model first
+            MODEL_NAME = "manishiitg/resume-ner"
+            logger.info(f"Attempting to load model: {MODEL_NAME}")
+
+            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+            model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
+            self.ner_pipeline = pipeline(
+                "ner",
+                model=model,
+                tokenizer=tokenizer,
+                aggregation_strategy="simple"
+            )
+            self.model_loaded = True
+            logger.info("Model loaded successfully")
+
+        except Exception as e:
+            logger.warning(f"Failed to load primary model: {e}")
+            try:
+                # Fallback to a more reliable model
+                MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
+                logger.info(f"Trying fallback model: {MODEL_NAME}")
+
+                self.ner_pipeline = pipeline(
+                    "ner",
+                    model=MODEL_NAME,
+                    aggregation_strategy="simple"
+                )
+                self.model_loaded = True
+                logger.info("Fallback model loaded successfully")
+
+            except Exception as e2:
+                logger.error(f"Failed to load fallback model: {e2}")
+                self.model_loaded = False
+
+    def extract_text(self, file_path: str) -> str:
+        """Extract text from PDF or DOCX files with error handling"""
+        try:
+            path = Path(file_path)
+
+            if not path.exists():
+                raise FileNotFoundError(f"File not found: {file_path}")
+
+            if path.suffix.lower() == ".pdf":
+                text = pdf_extract_text(file_path)
+                logger.info(f"Extracted {len(text)} characters from PDF")
+                return text
+
+            elif path.suffix.lower() == ".docx":
+                doc = Document(file_path)
+                text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
+                logger.info(f"Extracted {len(text)} characters from DOCX")
+                return text
+
+            else:
+                raise ValueError(f"Unsupported file format: {path.suffix}")
+
+        except Exception as e:
+            logger.error(f"Error extracting text: {e}")
+            raise
+
+    def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
+        """Fallback extraction using regex patterns"""
+        patterns = {
+            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
+            'phone': r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
+            'skills': r'(?i)(?:skills?|technologies?|tools?)[:\-\s]*([^\n]+)',
+            'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*([^\n]+)',
+            'experience': r'(?i)(?:experience|work|employment|job)[:\-\s]*([^\n]+)'
+        }
+
+        results = {}
+        for key, pattern in patterns.items():
+            matches = re.findall(pattern, text, re.MULTILINE)
+            results[key] = [match.strip() for match in matches if match.strip()]
+
+        return results
+
+    def extract_name_from_text(self, text: str) -> str:
+        """Extract name using heuristics"""
+        lines = text.split('\n')
+        # Usually name is in the first few lines
+        for line in lines[:5]:
+            line = line.strip()
+            if line and len(line.split()) <= 4 and len(line) > 2:
+                # Check if it looks like a name (not email, phone, etc.)
+                if not re.search(r'[@\d]', line) and not line.lower().startswith(('resume', 'cv', 'curriculum')):
+                    return line
+        return "Not Found"
+
+    def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
+        """Process NER entities with improved logic"""
+        name, skills, education, experience = [], [], [], []
+
+        logger.info(f"Processing {len(entities)} entities")
+
+        for ent in entities:
+            label = ent.get("entity_group", "").upper()
+            value = ent.get("word", "").strip()
+            confidence = ent.get("score", 0)
+
+            logger.debug(f"Entity: {label} = {value} (confidence: {confidence:.2f})")
+
+            # Only consider high-confidence entities
+            if confidence < 0.5:
+                continue
+
+            if label in ["PERSON", "PER", "NAME"]:
+                name.append(value)
+            elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
+                skills.append(value)
+            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"]:
+                education.append(value)
+            elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
+                experience.append(value)
+
+        return {
+            "name": name,
+            "skills": skills,
+            "education": education,
+            "experience": experience
+        }
+
+    def parse_resume(self, file_path: str, filename: str = None) -> Dict[str, str]:
+        """Parse resume with multiple extraction methods"""
+        try:
+            # Extract text
+            text = self.extract_text(file_path)
+
+            if not text or len(text.strip()) < 10:
+                raise ValueError("Extracted text is too short or empty")
+
+            logger.info(f"Text preview: {text[:200]}...")
+
+            # Initialize results
+            results = {
+                "name": "Not Found",
+                "skills": "Not Found",
+                "education": "Not Found",
+                "experience": "Not Found"
+            }
+
+            # Method 1: Try NER model if available
+            if self.model_loaded and self.ner_pipeline:
+                try:
+                    logger.info("Using NER model for extraction")
+                    entities = self.ner_pipeline(text)
+                    ner_results = self.process_ner_entities(entities)
+
+                    # Update results with NER findings
+                    for key in results.keys():
+                        if ner_results.get(key):
+                            unique_items = list(dict.fromkeys(ner_results[key]))
+                            results[key] = ", ".join(unique_items)
+
+                except Exception as e:
+                    logger.warning(f"NER extraction failed: {e}")
+
+            # Method 2: Regex fallback
+            logger.info("Using regex patterns for extraction")
+            regex_results = self.extract_with_regex(text)
+
+            # Fill in missing information with regex results
+            if results["name"] == "Not Found":
+                results["name"] = self.extract_name_from_text(text)
+
+            if results["skills"] == "Not Found" and regex_results.get("skills"):
+                results["skills"] = ", ".join(regex_results["skills"][:3])  # Limit to first 3
+
+            if results["education"] == "Not Found" and regex_results.get("education"):
+                results["education"] = ", ".join(regex_results["education"][:2])  # Limit to first 2
+
+            if results["experience"] == "Not Found" and regex_results.get("experience"):
+                results["experience"] = ", ".join(regex_results["experience"][:3])  # Limit to first 3
+
+            # Add email and phone if found
+            if regex_results.get("email"):
+                results["email"] = regex_results["email"][0]
+            if regex_results.get("phone"):
+                results["phone"] = regex_results["phone"][0]
+
+            logger.info("Parsing completed successfully")
+            return results
+
+        except Exception as e:
+            logger.error(f"Error parsing resume: {e}")
+            return {
+                "name": "Error",
+                "skills": "Error",
+                "education": "Error",
+                "experience": "Error",
+                "error": str(e)
+            }
+
+# Create global instance
+resume_parser = ResumeParser()
+
  def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
-    text = extract_text(file_path)
-    entities = ner_pipeline(text)
-
-    name, skills, education, experience = [], [], [], []
-
-    for ent in entities:
-        label = ent["entity_group"].upper()
-        value = ent["word"].strip()
-
-        if label == "NAME":
-            name.append(value)
-        elif label == "SKILL":
-            skills.append(value)
-        elif label in ["EDUCATION", "DEGREE"]:
-            education.append(value)
-        elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
-            experience.append(value)
-
-    return {
-        "name": " ".join(dict.fromkeys(name)) or "Not Found",
-        "skills": ", ".join(dict.fromkeys(skills)) or "Not Found",
-        "education": ", ".join(dict.fromkeys(education)) or "Not Found",
-        "experience": ", ".join(dict.fromkeys(experience)) or "Not Found"
-    }
+ """Main function to parse resume"""
221
+ return resume_parser.parse_resume(file_path, filename)
222
+
223
+ # Test function
224
+ def test_parser():
225
+ """Test the parser with sample text"""
226
+ sample_text = """
227
+ John Doe
228
+ Software Engineer
229
230
+ (555) 123-4567
231
+
232
+ Skills: Python, JavaScript, React, Node.js, SQL
233
+
234
+ Education:
235
+ Bachelor of Science in Computer Science
236
+ University of Technology, 2020
237
+
238
+ Experience:
239
+ Senior Software Developer at Tech Corp (2021-2023)
240
+ - Developed web applications using React and Node.js
241
+ - Managed database systems and APIs
242
+ """
243
+
244
+ # Create a temporary file for testing
245
+ import tempfile
246
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
247
+ f.write(sample_text)
248
+ temp_path = f.name
249
+
250
+ try:
251
+ # Test regex extraction
252
+ regex_results = resume_parser.extract_with_regex(sample_text)
253
+ print("Regex Results:", json.dumps(regex_results, indent=2))
254
+
255
+ # Test name extraction
256
+ name = resume_parser.extract_name_from_text(sample_text)
257
+ print(f"Extracted Name: {name}")
258
+
259
+ except Exception as e:
260
+ print(f"Test error: {e}")
261
+ finally:
262
+ Path(temp_path).unlink(missing_ok=True)
263
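
For reference, a minimal sketch of how the refactored module might be exercised from application code (the backend.services import path follows this repo's layout; the file name sample_resume.pdf and the __main__ guard are hypothetical illustrations, not part of this commit):

# usage sketch: parse a resume on disk, then run the module's self-test
import json

from backend.services.resume_parser import parse_resume, test_parser

if __name__ == "__main__":
    # hypothetical input file; parse_resume returns plain strings with
    # "Not Found" placeholders, optional "email"/"phone" keys, and an
    # "error" key if extraction fails
    result = parse_resume("sample_resume.pdf")
    print(json.dumps(result, indent=2))

    # exercises the regex patterns and the name heuristic on built-in sample text
    test_parser()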