husseinelsaadi committed on
Commit
9e72d2c
·
1 Parent(s): 22b00f2
backend/models/resume_parser/resume_to_features.py CHANGED
@@ -1,39 +1,251 @@
1
  import os
2
- from pyresparser import ResumeParser
 
 
 
 
 
3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
def extract_resume_features(resume_path):
    """
    Parse a resume with pyresparser and return the extracted fields.

    Args:
        resume_path (str): Path to the resume file

    Returns:
        dict: Whatever ``ResumeParser(...).get_extracted_data()`` produces,
        or a dictionary of empty placeholder values if parsing raises.
    """
    # Placeholder result handed back when the parser cannot process the file.
    fallback = {
        'name': '',
        'email': '',
        'mobile_number': '',
        'skills': [],
        'experience': [],
        'no_of_pages': 0,
        'total_experience': 0
    }
    try:
        return ResumeParser(resume_path).get_extracted_data()
    except Exception as e:
        print(f"Error parsing resume: {e}")
        return fallback
29
 
30
-
31
# Demo entry point: only runs when this file is executed as a script.
if __name__ == "__main__":
    # Resolve the sample resume relative to this file, not the working directory.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    resume_path = os.path.join(current_dir, '../../../data/resumes/Hussein El Saadi - CV.pdf')

    # Show the parsed result on stdout.
    print(extract_resume_features(resume_path))
 
1
  import os
2
+ import re
3
+ import json
4
+ from pathlib import Path
5
+ import PyPDF2
6
+ from docx import Document
7
+ import textract
8
 
9
class SimpleResumeParser:
    """Heuristic resume parser built on keyword lists and regular expressions.

    Text is read from PDF, DOCX or DOC files and then scanned for contact
    details, skills, education, work experience and a short summary.  Every
    extractor is best-effort: it returns an empty string or list instead of
    raising when nothing is found.
    """

    def __init__(self):
        """Set up the keyword lists used by the extractors."""
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
        ]

        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
            'business', 'mba', 'certification', 'diploma'
        ]

        # Experience keywords (not read by any method of this class; kept as a
        # public attribute for callers that may rely on it)
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
        ]

    def extract_text_from_pdf(self, file_path):
        """Return the text of all pages of a PDF, each followed by a newline.

        Any error is printed and an empty string is returned.
        """
        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # ``or ""`` guards against a page whose extraction yields None,
                # which would otherwise discard the text of the whole document.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraphs of a DOCX file, each followed by a newline.

        Any error is printed and an empty string is returned.
        """
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def extract_text_from_doc(self, file_path):
        """Return the text of a DOC file using textract.

        Any error is printed and an empty string is returned.
        """
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Extract text based on file extension.

        Supports ``.pdf``, ``.docx`` and ``.doc`` (case-insensitive); any
        other extension yields an empty string.
        """
        extractors = {
            '.pdf': self.extract_text_from_pdf,
            '.docx': self.extract_text_from_docx,
            '.doc': self.extract_text_from_doc,
        }
        extractor = extractors.get(Path(file_path).suffix.lower())
        return extractor(file_path) if extractor else ""

    def extract_email(self, text):
        """Return the first e-mail address found in ``text``, or ""."""
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe and would wrongly be accepted as part of the address.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        match = re.search(email_pattern, text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in ``text``, or "".

        Patterns are tried in order and the first hit wins.  Patterns with
        several capture groups return the groups concatenated (digits only).
        """
        phone_patterns = [
            # NANP style, optional +1 and optional parentheses: +1 (555) 123-4567
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            # Country code followed by three digit groups
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
        ]

        for pattern in phone_patterns:
            matches = re.findall(pattern, text)
            if matches:
                first = matches[0]
                # findall yields tuples when the pattern has several groups.
                return ''.join(first) if isinstance(first, tuple) else first
        return ""

    def extract_name(self, text):
        """Guess the candidate's name from the first five lines of ``text``.

        A line qualifies when it is exactly two alphabetic words and neither
        word is a document title word ("resume", "cv", "curriculum").  The
        name is returned title-cased, or "" when no line qualifies.
        """
        title_words = {'resume', 'cv', 'curriculum'}
        for line in text.split('\n')[:5]:  # Check first 5 lines
            words = line.split()
            if len(words) != 2 or not all(word.isalpha() for word in words):
                continue
            # Compare whole words so that names merely containing "cv"
            # (e.g. "Mcvey") are not rejected.
            if title_words.intersection(word.lower() for word in words):
                continue
            return ' '.join(words).title()
        return ""

    def extract_skills(self, text):
        """Return the title-cased skills from ``skills_keywords`` found in ``text``.

        Keywords are matched as whole terms (an optional plural "s" is
        tolerated) so that e.g. "java" is not reported for "javascript" nor
        "go" for "google".  The result follows the order of
        ``skills_keywords`` and contains no duplicates.
        """
        text_lower = text.lower()
        found_skills = []

        for skill in self.skills_keywords:
            # Lookarounds instead of \b because keywords such as "c++" or
            # "c#" end in non-word characters.
            pattern = rf'(?<!\w){re.escape(skill.lower())}s?(?!\w)'
            if re.search(pattern, text_lower):
                label = skill.title()
                if label not in found_skills:
                    found_skills.append(label)

        return found_skills

    def _keyword_contexts(self, text):
        """Return unique snippets (up to 50 chars either side) around education keywords."""
        snippets = []
        for keyword in self.education_keywords:
            # "." does not cross newlines, so each snippet stays on one line.
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for match in re.findall(pattern, text, re.IGNORECASE):
                snippet = match.strip()
                # A short line matched by several keywords yields the same
                # snippet each time; keep only the first occurrence.
                if snippet and snippet not in snippets:
                    snippets.append(snippet)
        return snippets

    def extract_education(self, text):
        """Return up to three education snippets found in ``text``.

        The lines between an education heading ("education", "academic",
        "qualification") and the next experience/project heading are searched
        first; if that yields nothing the whole text is searched.
        """
        section_lines = []
        in_education_section = False

        for line in text.split('\n'):
            line_lower = line.lower()
            if not in_education_section:
                if any(keyword in line_lower for keyword in ['education', 'academic', 'qualification']):
                    in_education_section = True
                continue
            if any(keyword in line_lower for keyword in ['experience', 'work', 'employment', 'project']):
                break
            section_lines.append(line)

        education = self._keyword_contexts('\n'.join(section_lines))
        if not education:
            # No usable education section: fall back to the full document.
            education = self._keyword_contexts(text)

        return education[:3]  # Return top 3 education entries

    def extract_experience(self, text):
        """Return up to three work-experience entries found in ``text``.

        Collection starts after the first line mentioning "experience",
        "work", "employment" or "career" and stops at the next line mentioning
        "education", "skill" or "project".  Blank lines separate entries; the
        lines of one entry are joined with single spaces.
        """
        experience = []
        current_lines = []
        in_experience_section = False

        for line in text.split('\n'):
            line_lower = line.lower()
            if not in_experience_section:
                # Only look for the heading until the section has started, so
                # that entry lines such as "Worked on ..." are not skipped.
                if any(keyword in line_lower for keyword in ['experience', 'work', 'employment', 'career']):
                    in_experience_section = True
                continue
            if any(keyword in line_lower for keyword in ['education', 'skill', 'project']):
                break
            stripped = line.strip()
            if stripped:
                current_lines.append(stripped)
            elif current_lines:
                experience.append(' '.join(current_lines))
                current_lines = []

        # Single flush point for the trailing entry (also reached after
        # ``break``), so the last entry is never appended twice.
        if current_lines:
            experience.append(' '.join(current_lines))

        return experience[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Return up to 200 characters of summary/objective text, or "".

        The summary is the non-blank content of the three lines following the
        first line that mentions "summary", "objective", "profile" or "about".
        """
        lines = text.split('\n')
        summary = ""

        for i, line in enumerate(lines):
            line_lower = line.lower()
            if any(keyword in line_lower for keyword in ['summary', 'objective', 'profile', 'about']):
                # Get next few lines as summary
                following = (candidate.strip() for candidate in lines[i + 1:i + 4])
                summary = ' '.join(part for part in following if part)
                break

        return summary[:200]  # Limit to 200 characters
203
+
204
def extract_resume_features(file_path):
    """
    Extract structured features from the resume at ``file_path``.

    Returns a dictionary with the keys ``name``, ``email``,
    ``mobile_number``, ``skills``, ``experience``, ``education`` and
    ``summary``.  When no text can be read from the file, or when any step
    raises, every field holds its empty value ('' or []).
    """
    # Result used both for unreadable files and for unexpected errors.
    empty_features = {
        'name': '',
        'email': '',
        'mobile_number': '',
        'skills': [],
        'experience': [],
        'education': [],
        'summary': ''
    }

    try:
        resume_parser = SimpleResumeParser()
        resume_text = resume_parser.extract_text(file_path)

        # Nothing to analyse: unsupported extension or failed extraction.
        if not resume_text:
            return empty_features

        return {
            'name': resume_parser.extract_name(resume_text),
            'email': resume_parser.extract_email(resume_text),
            'mobile_number': resume_parser.extract_phone(resume_text),
            'skills': resume_parser.extract_skills(resume_text),
            'experience': resume_parser.extract_experience(resume_text),
            'education': resume_parser.extract_education(resume_text),
            'summary': resume_parser.extract_summary(resume_text)
        }

    except Exception as e:
        print(f"Error extracting resume features: {e}")
        return empty_features
248
 
249
# Legacy entry point kept so existing callers of parse_resume keep working.
def parse_resume(file_path):
    """Return the feature dictionary produced by ``extract_resume_features``."""
    features = extract_resume_features(file_path)
    return features