Spaces:
Paused
Paused
import os | |
import re | |
import json | |
from pathlib import Path | |
import PyPDF2 | |
from docx import Document | |
import textract | |
class SimpleResumeParser:
    """Keyword- and regex-based resume parser.

    Reads plain text out of PDF, DOCX and DOC files and extracts contact
    details, skills, education, experience and a summary using simple
    heuristics. Extractors return an empty string or list when nothing is
    found instead of raising.
    """

    def __init__(self):
        """Set up the keyword lists used by the extractors."""
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
        ]
        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
            'business', 'mba', 'certification', 'diploma'
        ]
        # Experience keywords
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
        ]

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, one page per line block.

        Returns an empty string (after printing the error) if the file
        cannot be read.
        """
        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Guard against a page yielding None instead of a string, so
                # one unreadable page does not discard the whole document.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraph text of a DOCX file, one paragraph per line.

        Returns an empty string (after printing the error) on failure.
        """
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def extract_text_from_doc(self, file_path):
        """Return the text of a legacy DOC file using textract.

        Returns an empty string (after printing the error) on failure.
        """
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Extract text based on the file extension (case-insensitive).

        Supports ``.pdf``, ``.docx`` and ``.doc``; any other extension
        yields an empty string.
        """
        file_extension = Path(file_path).suffix.lower()
        if file_extension == '.pdf':
            return self.extract_text_from_pdf(file_path)
        if file_extension == '.docx':
            return self.extract_text_from_docx(file_path)
        if file_extension == '.doc':
            return self.extract_text_from_doc(file_path)
        return ""

    def extract_email(self, text):
        """Return the first email address found in ``text``, or ``""``."""
        # The TLD class is [A-Za-z]; a '|' inside a character class is a
        # literal pipe, which would glue neighbouring addresses together.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        match = re.search(email_pattern, text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in ``text``, or ``""``.

        Patterns are tried in order. Patterns with several capture groups
        return the captured digit groups concatenated (separators and
        parentheses dropped); single-group patterns return the matched text.
        """
        phone_patterns = [
            # Optional country code 1, optional parentheses around area code.
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            # International layout: country code followed by three groups.
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
        ]
        for pattern in phone_patterns:
            match = re.search(pattern, text)
            if match:
                # Every group is mandatory, so none of them can be None.
                return ''.join(match.groups())
        return ""

    def extract_name(self, text):
        """Guess the candidate name from the first five lines of ``text``.

        A line qualifies when it is exactly two alphabetic words and does not
        contain 'resume', 'cv' or 'curriculum'. Returns the line in title
        case, or ``""`` when no line qualifies.
        """
        banned = ('resume', 'cv', 'curriculum')
        for line in text.split('\n')[:5]:
            line = line.strip()
            if len(line.split()) != 2 or not line.replace(' ', '').isalpha():
                continue
            if not any(keyword in line.lower() for keyword in banned):
                return line.title()
        return ""

    def extract_skills(self, text):
        """Return the known skills mentioned in ``text``.

        Keywords are matched as whole tokens (an optional plural 's' and
        trailing digits such as 'python3' are tolerated), so 'go' no longer
        matches 'going' and 'java' no longer matches 'javascript'. The
        result is title-cased, free of duplicates and ordered like
        ``self.skills_keywords``.
        """
        text_lower = text.lower()
        found_skills = []
        for skill in self.skills_keywords:
            # Not preceded by a letter/digit and not followed by a letter.
            pattern = rf'(?<![a-z0-9]){re.escape(skill.lower())}s?(?![a-z])'
            if re.search(pattern, text_lower):
                found_skills.append(skill.title())
        # dict.fromkeys removes duplicates while keeping a stable order.
        return list(dict.fromkeys(found_skills))

    def _find_education_snippets(self, text):
        """Return unique, stripped snippets of up to 50 chars either side of an education keyword."""
        snippets = []
        for keyword in self.education_keywords:
            # '.' does not cross newlines, so each snippet stays on one line.
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for match in re.findall(pattern, text, re.IGNORECASE):
                snippet = match.strip()
                if snippet and snippet not in snippets:
                    snippets.append(snippet)
        return snippets

    def extract_education(self, text):
        """Return up to three education snippets found in ``text``.

        The lines following an 'education' / 'academic' / 'qualification'
        heading (up to the next experience/work/employment/project line) are
        searched first; if that yields nothing the whole text is searched.
        """
        section_lines = []
        in_education_section = False
        for line in text.split('\n'):
            line_lower = line.lower()
            if not in_education_section:
                if any(keyword in line_lower
                       for keyword in ('education', 'academic', 'qualification')):
                    in_education_section = True
                continue
            if any(keyword in line_lower
                   for keyword in ('experience', 'work', 'employment', 'project')):
                break
            section_lines.append(line)

        education = self._find_education_snippets('\n'.join(section_lines))
        if not education:
            # No usable section (e.g. heading and degree on the same line).
            education = self._find_education_snippets(text)
        return education[:3]  # Return top 3 education entries

    def extract_experience(self, text):
        """Return up to three work-experience entries found in ``text``.

        The section starts after the first line containing 'experience',
        'work', 'employment' or 'career' and ends at the first later line
        containing 'education', 'skill' or 'project'. Blank lines separate
        entries; the lines of one entry are joined with single spaces.
        """
        experience = []
        current_lines = []
        in_experience_section = False

        for line in text.split('\n'):
            line_lower = line.lower()
            if not in_experience_section:
                # Only look for the heading while outside the section, so
                # body lines such as "Worked on ..." are not discarded.
                if any(keyword in line_lower
                       for keyword in ('experience', 'work', 'employment', 'career')):
                    in_experience_section = True
                continue
            if any(keyword in line_lower
                   for keyword in ('education', 'skill', 'project')):
                break
            if line.strip():
                current_lines.append(line.strip())
            elif current_lines:
                experience.append(' '.join(current_lines))
                current_lines = []

        # Flush the entry in progress exactly once.
        if current_lines:
            experience.append(' '.join(current_lines))
        return experience[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Return the summary/objective paragraph, truncated to 200 characters.

        Takes the non-blank lines among the three lines following the first
        line containing 'summary', 'objective', 'profile' or 'about'.
        Returns ``""`` when no such line exists.
        """
        lines = text.split('\n')
        headings = ('summary', 'objective', 'profile', 'about')
        for i, line in enumerate(lines):
            if any(keyword in line.lower() for keyword in headings):
                following = (candidate.strip() for candidate in lines[i + 1:i + 4])
                return ' '.join(part for part in following if part)[:200]
        return ""
def extract_resume_features(file_path):
    """
    Parse the resume at ``file_path`` and return its extracted fields.

    The result is a dictionary with the keys 'name', 'email',
    'mobile_number', 'skills', 'experience', 'education' and 'summary'.
    When no text can be extracted, or any step raises, every field is
    empty (strings are '' and lists are []).
    """
    def blank_result():
        # Fresh dict on every call so callers never share the lists.
        return {
            'name': '',
            'email': '',
            'mobile_number': '',
            'skills': [],
            'experience': [],
            'education': [],
            'summary': ''
        }

    try:
        resume_parser = SimpleResumeParser()
        raw_text = resume_parser.extract_text(file_path)
        if not raw_text:
            return blank_result()

        # Run every extractor over the same text
        return {
            'name': resume_parser.extract_name(raw_text),
            'email': resume_parser.extract_email(raw_text),
            'mobile_number': resume_parser.extract_phone(raw_text),
            'skills': resume_parser.extract_skills(raw_text),
            'experience': resume_parser.extract_experience(raw_text),
            'education': resume_parser.extract_education(raw_text),
            'summary': resume_parser.extract_summary(raw_text)
        }
    except Exception as e:
        print(f"Error extracting resume features: {e}")
        return blank_result()
# For backward compatibility
def parse_resume(file_path):
    """Alias of extract_resume_features kept for existing callers."""
    features = extract_resume_features(file_path)
    return features