Spaces:
Paused
Paused
File size: 9,320 Bytes
504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c 504df0f 9e72d2c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
import os
import re
import json
from pathlib import Path
import PyPDF2
from docx import Document
import textract
class SimpleResumeParser:
    """Keyword- and regex-based extractor of structured fields from resumes.

    Every ``extract_*`` field method takes the plain text of a resume and
    returns either a string (``""`` when nothing was found) or a list of
    strings. All matching is heuristic: it relies on the keyword lists set
    up in ``__init__`` and on simple line-oriented rules.
    """

    # A line counts as a section heading only if it has at most this many
    # alphabetic words; longer lines are treated as body text even when
    # they mention a heading keyword (e.g. "Worked with a team of five").
    _MAX_HEADING_WORDS = 3

    _EMAIL_PATTERN = re.compile(
        r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    )

    # Tried in order; the first pattern that matches anywhere in the text wins.
    _PHONE_PATTERNS = (
        re.compile(r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'),
        re.compile(r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})'),
        re.compile(r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})'),
        re.compile(r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'),
    )

    def __init__(self):
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
        ]
        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
            'business', 'mba', 'certification', 'diploma'
        ]
        # Experience keywords
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
        ]

    def extract_text_from_pdf(self, file_path):
        """Extract text from a PDF file.

        Returns the text of all pages, each followed by a newline, or ``""``
        if reading fails (the error is printed).
        """
        try:
            with open(file_path, 'rb') as handle:
                reader = PyPDF2.PdfReader(handle)
                # ``or ""`` guards against a page whose extract_text() yields
                # None, which would otherwise raise and discard every page.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Extract text from a DOCX file.

        Returns the paragraph texts, each followed by a newline, or ``""``
        if reading fails (the error is printed).
        """
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOC: {e}".replace("DOC", "DOCX"))
            return ""

    def extract_text_from_doc(self, file_path):
        """Extract text from a DOC file using textract.

        Returns the decoded text, or ``""`` if reading fails (the error is
        printed).
        """
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Extract text based on the file extension (case-insensitive).

        Supports ``.pdf``, ``.docx`` and ``.doc``; any other extension
        returns ``""``.
        """
        readers = {
            '.pdf': self.extract_text_from_pdf,
            '.docx': self.extract_text_from_docx,
            '.doc': self.extract_text_from_doc,
        }
        reader = readers.get(Path(file_path).suffix.lower())
        return reader(file_path) if reader else ""

    def extract_email(self, text):
        """Return the first email address found in ``text``, or ``""``."""
        match = self._EMAIL_PATTERN.search(text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in ``text``, or ``""``.

        The patterns are tried in order. The result is the concatenation of
        the winning pattern's capture groups, so for the first two patterns
        separators, parentheses and any leading ``+``/``1`` are dropped.
        """
        for pattern in self._PHONE_PATTERNS:
            match = pattern.search(text)
            if match:
                return ''.join(match.groups())
        return ""

    def extract_name(self, text):
        """Extract a name from the text (simple heuristic).

        Scans the first five lines for one made of exactly two alphabetic
        words that does not contain 'resume', 'cv' or 'curriculum', and
        returns it title-cased. Returns ``""`` if no line qualifies.
        """
        excluded = ('resume', 'cv', 'curriculum')
        for raw_line in text.split('\n')[:5]:
            candidate = raw_line.strip()
            if len(candidate.split()) != 2:
                continue
            if not candidate.replace(' ', '').isalpha():
                continue
            lowered = candidate.lower()
            if not any(word in lowered for word in excluded):
                return candidate.title()
        return ""

    def extract_skills(self, text):
        """Return the known skills mentioned in ``text``, title-cased.

        A skill must appear as a whole token (not adjacent to other word
        characters), so 'go' does not fire on 'google' and 'java' does not
        fire on 'javascript'. Results follow the order of
        ``self.skills_keywords`` and contain no duplicates.
        """
        haystack = text.lower()
        found = []
        for skill in self.skills_keywords:
            needle = re.escape(skill.lower())
            if re.search(rf'(?<!\w){needle}(?!\w)', haystack):
                label = skill.title()
                if label not in found:
                    found.append(label)
        return found

    def extract_education(self, text):
        """Return up to three distinct snippets that mention education keywords.

        For each keyword in ``self.education_keywords`` (case-insensitive),
        collects the match plus up to 50 characters of same-line context on
        either side. Snippets are whitespace-stripped and de-duplicated so a
        line that contains several keywords is reported once.
        """
        snippets = []
        for keyword in self.education_keywords:
            # Find context around the keyword
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for raw in re.findall(pattern, text, re.IGNORECASE):
                snippet = raw.strip()
                if snippet and snippet not in snippets:
                    snippets.append(snippet)
        return snippets[:3]  # Return top 3 education entries

    def _is_section_heading(self, line, keywords):
        """Return True if ``line`` is short and one of its words starts with a keyword."""
        words = re.findall(r'[a-z]+', line.lower())
        if not words or len(words) > self._MAX_HEADING_WORDS:
            return False
        return any(word.startswith(keyword) for word in words for keyword in keywords)

    def extract_experience(self, text):
        """Return up to three work-experience entries.

        Collection starts after a short heading line mentioning experience,
        work, employment or career, and stops at a short heading line
        mentioning education, skill or project. Within the section, blank
        lines separate entries and the lines of one entry are joined with
        single spaces.
        """
        start_keywords = ('experience', 'work', 'employment', 'career')
        end_keywords = ('education', 'skill', 'project')

        entries = []
        pending = []
        in_section = False
        for line in text.split('\n'):
            if self._is_section_heading(line, start_keywords):
                in_section = True
                continue
            if not in_section:
                continue
            if self._is_section_heading(line, end_keywords):
                break
            if line.strip():
                pending.append(line)
            elif pending:
                entries.append(' '.join(pending).strip())
                pending = []
        # Flush the entry in progress exactly once, whether the loop ended
        # by break or by running out of lines.
        if pending:
            entries.append(' '.join(pending).strip())
        return entries[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Extract the summary/objective.

        Finds the first line containing 'summary', 'objective', 'profile' or
        'about' (substring match), joins the non-blank lines among the next
        three, and truncates the result to 200 characters. Returns ``""``
        when no such line exists.
        """
        markers = ('summary', 'objective', 'profile', 'about')
        lines = text.split('\n')
        for index, line in enumerate(lines):
            lowered = line.lower()
            if any(marker in lowered for marker in markers):
                following = (item.strip() for item in lines[index + 1:index + 4])
                return ' '.join(item for item in following if item)[:200]
        return ""
def extract_resume_features(file_path):
    """
    Main function to extract features from a resume.

    Reads the file's text with SimpleResumeParser and returns a dictionary
    with the keys 'name', 'email', 'mobile_number', 'skills', 'experience',
    'education' and 'summary'. If no text could be read, or any exception
    is raised along the way (it is printed), the same keys are returned
    with empty strings / empty lists.
    """
    def blank_result():
        # Fresh dict and lists on every call so callers never share state.
        return {
            'name': '',
            'email': '',
            'mobile_number': '',
            'skills': [],
            'experience': [],
            'education': [],
            'summary': ''
        }

    try:
        parser = SimpleResumeParser()
        text = parser.extract_text(file_path)
        if not text:
            return blank_result()

        # Field name -> extractor, evaluated in this order.
        extractors = (
            ('name', parser.extract_name),
            ('email', parser.extract_email),
            ('mobile_number', parser.extract_phone),
            ('skills', parser.extract_skills),
            ('experience', parser.extract_experience),
            ('education', parser.extract_education),
            ('summary', parser.extract_summary),
        )
        return {field: extract(text) for field, extract in extractors}
    except Exception as e:
        print(f"Error extracting resume features: {e}")
        return blank_result()
# For backward compatibility
def parse_resume(file_path):
    """Alias for extract_resume_features; returns its result unchanged."""
    features = extract_resume_features(file_path)
    return features