"""Keyword- and regex-based resume parsing for PDF, DOCX and DOC files."""

import json
import os
import re
from pathlib import Path

# The document backends are third-party packages.  Each one is only needed for
# its own file type, so a missing backend must not make the whole module
# unimportable; the matching extractor reports the problem when it is used.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    from docx import Document
except ImportError:
    Document = None

try:
    import textract
except ImportError:
    textract = None


class SimpleResumeParser:
    """Extract contact details, skills and sections from resume text.

    All ``extract_*`` methods that take ``text`` are pure string heuristics;
    only the ``extract_text*`` methods touch the file system.
    """

    # Inside a section, only lines with at most this many words are treated as
    # headings.  Longer lines that merely mention a heading keyword ("Worked
    # closely with ...", "Led a project to ...") are body text.
    _MAX_HEADING_WORDS = 4

    def __init__(self):
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express',
            'django', 'flask', 'spring', 'laravel', 'rails', 'asp.net',
            'jquery', 'bootstrap', 'sql', 'mysql', 'postgresql', 'mongodb',
            'redis', 'elasticsearch', 'aws', 'azure', 'gcp', 'docker',
            'kubernetes', 'jenkins', 'git', 'github', 'machine learning',
            'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau',
            'power bi', 'agile', 'scrum', 'devops', 'ci/cd', 'microservices',
            'api', 'rest', 'graphql'
        ]

        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college',
            'institute', 'computer science', 'engineering', 'mathematics',
            'physics', 'chemistry', 'business', 'mba', 'certification', 'diploma'
        ]

        # Experience keywords
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created',
            'built', 'designed', 'implemented', 'maintained', 'optimized',
            'improved', 'years'
        ]

    # ------------------------------------------------------------------
    # Text extraction
    # ------------------------------------------------------------------

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, one page per line block.

        Any failure (missing backend, unreadable file, parse error) is printed
        and reported as an empty string.
        """
        try:
            if PyPDF2 is None:
                raise ImportError("PyPDF2 is not installed")
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # ``or ""`` keeps one page without extractable text from
                # discarding the whole document via the except branch.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraphs of a DOCX file, newline-terminated.

        Any failure is printed and reported as an empty string.
        """
        try:
            if Document is None:
                raise ImportError("python-docx is not installed")
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def extract_text_from_doc(self, file_path):
        """Return the text of a legacy DOC file, decoded as UTF-8 via textract.

        Any failure is printed and reported as an empty string.
        """
        try:
            if textract is None:
                raise ImportError("textract is not installed")
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Dispatch on the (case-insensitive) file extension.

        Returns the extracted text, or an empty string for unsupported
        extensions and for any extraction failure.
        """
        file_extension = Path(file_path).suffix.lower()

        if file_extension == '.pdf':
            return self.extract_text_from_pdf(file_path)
        elif file_extension == '.docx':
            return self.extract_text_from_docx(file_path)
        elif file_extension == '.doc':
            return self.extract_text_from_doc(file_path)
        else:
            return ""

    # ------------------------------------------------------------------
    # Contact details
    # ------------------------------------------------------------------

    def extract_email(self, text):
        """Return the first e-mail address found in ``text``, or ``""``."""
        # The TLD class is [A-Za-z]; a '|' inside a character class is a
        # literal pipe, not alternation, and would let "com|x" match.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        match = re.search(email_pattern, text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in ``text``, or ``""``.

        Patterns are tried in order and the first one that matches anywhere
        in the text wins.  The result is the concatenation of that pattern's
        capture groups, so numbers matched by the first two patterns come
        back as digits only (separators and parentheses removed).
        """
        phone_patterns = [
            # North-American style, optional +1 and optional (area) parentheses.
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            # International: country code followed by three digit groups.
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
            # The two shapes below are already covered by the first pattern;
            # they are kept as explicit fallbacks in case it is ever narrowed.
            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
        ]

        for pattern in phone_patterns:
            match = re.search(pattern, text)
            if match:
                return ''.join(match.groups())
        return ""

    def extract_name(self, text):
        """Guess the candidate's name from the first five lines.

        The first line made of exactly two alphabetic words that does not
        contain 'resume', 'cv' or 'curriculum' is returned title-cased;
        ``""`` when no such line exists.
        """
        lines = text.split('\n')
        for line in lines[:5]:
            line = line.strip()
            if len(line.split()) == 2 and line.replace(' ', '').isalpha():
                if not any(keyword in line.lower() for keyword in ['resume', 'cv', 'curriculum']):
                    return line.title()
        return ""

    # ------------------------------------------------------------------
    # Skills
    # ------------------------------------------------------------------

    def extract_skills(self, text):
        """Return the known skills mentioned in ``text``, title-cased.

        A skill must appear as its own token: it may not be glued to other
        letters, so "Django" does not report "Go" and "JavaScript" does not
        report "Java".  A plural "s" and trailing digits are tolerated
        ("APIs", "HTML5").  Results follow the order of
        ``self.skills_keywords`` and contain no duplicates.
        """
        text_lower = text.lower()
        found_skills = []
        seen = set()

        for skill in self.skills_keywords:
            pattern = rf'(?<![a-z]){re.escape(skill.lower())}s?(?![a-z])'
            if re.search(pattern, text_lower):
                label = skill.title()
                if label not in seen:
                    seen.add(label)
                    found_skills.append(label)

        return found_skills

    # ------------------------------------------------------------------
    # Section helpers
    # ------------------------------------------------------------------

    def _looks_like_heading(self, line):
        """Return True for a non-blank line short enough to be a heading."""
        return 0 < len(line.split()) <= self._MAX_HEADING_WORDS

    def _iter_section_lines(self, lines, start_keywords, end_keywords):
        """Yield the body lines of the first section named by ``start_keywords``.

        The section opens at the first line containing a start keyword (the
        opening line itself is not yielded).  Once inside, a short
        heading-like line containing a start keyword is skipped as a repeated
        heading, and a short heading-like line containing an end keyword
        closes the section.  Every other line, blank lines included, is
        yielded unchanged.
        """
        in_section = False
        for line in lines:
            line_lower = line.lower()
            if not in_section:
                if any(keyword in line_lower for keyword in start_keywords):
                    in_section = True
                continue
            if self._looks_like_heading(line):
                if any(keyword in line_lower for keyword in start_keywords):
                    continue
                if any(keyword in line_lower for keyword in end_keywords):
                    return
            yield line

    def _education_snippets(self, search_text):
        """Return unique, stripped keyword contexts found in ``search_text``.

        Each snippet is the keyword plus up to 50 characters on either side,
        confined to a single line.  Snippets are ordered by keyword, then by
        position.
        """
        snippets = []
        seen = set()
        for keyword in self.education_keywords:
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for snippet in re.findall(pattern, search_text, re.IGNORECASE):
                snippet = snippet.strip()
                if snippet and snippet not in seen:
                    seen.add(snippet)
                    snippets.append(snippet)
        return snippets

    # ------------------------------------------------------------------
    # Sections
    # ------------------------------------------------------------------

    def extract_education(self, text):
        """Return up to three distinct education snippets.

        The education section (opened by 'education', 'academic' or
        'qualification') is searched first so that job titles elsewhere in
        the resume do not crowd out real entries.  When no section is found,
        or it yields nothing, the whole text is searched instead.
        """
        section_lines = self._iter_section_lines(
            text.split('\n'),
            ['education', 'academic', 'qualification'],
            ['experience', 'work', 'employment', 'project'],
        )
        education = self._education_snippets('\n'.join(section_lines))
        if not education:
            education = self._education_snippets(text)

        return education[:3]  # Return top 3 education entries

    def extract_experience(self, text):
        """Return up to three entries from the work-experience section.

        The section opens at the first line containing 'experience', 'work',
        'employment' or 'career' and closes at a heading mentioning
        'education', 'skill' or 'project'.  Within it, each run of
        consecutive non-blank lines becomes one space-joined entry.
        """
        experience = []
        current_lines = []

        section_lines = self._iter_section_lines(
            text.split('\n'),
            ['experience', 'work', 'employment', 'career'],
            ['education', 'skill', 'project'],
        )
        for line in section_lines:
            if line.strip():
                current_lines.append(line)
            elif current_lines:
                # A blank line separates one entry from the next.
                experience.append(' '.join(current_lines).strip())
                current_lines = []

        if current_lines:
            experience.append(' '.join(current_lines).strip())

        return experience[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Return the summary/objective blurb, truncated to 200 characters.

        The first line containing 'summary', 'objective', 'profile' or
        'about' is taken as the heading; the non-blank lines among the three
        lines that follow it are stripped and space-joined.  Returns ``""``
        when no such line exists.
        """
        lines = text.split('\n')
        summary = ""

        for i, line in enumerate(lines):
            line_lower = line.lower()
            if any(keyword in line_lower for keyword in ['summary', 'objective', 'profile', 'about']):
                following = lines[i + 1:i + 4]
                summary = ' '.join(part.strip() for part in following if part.strip())
                break

        return summary[:200]  # Limit to 200 characters


def _empty_features():
    """Return a fresh feature dictionary with every field empty."""
    return {
        'name': '',
        'email': '',
        'mobile_number': '',
        'skills': [],
        'experience': [],
        'education': [],
        'summary': ''
    }


def extract_resume_features(file_path):
    """Extract features from the resume at ``file_path``.

    Returns a dictionary with the keys 'name', 'email', 'mobile_number',
    'skills', 'experience', 'education' and 'summary'.  When no text can be
    extracted, or any step fails, every field is empty; failures are printed
    rather than raised.
    """
    try:
        parser = SimpleResumeParser()
        text = parser.extract_text(file_path)

        if not text:
            return _empty_features()

        return {
            'name': parser.extract_name(text),
            'email': parser.extract_email(text),
            'mobile_number': parser.extract_phone(text),
            'skills': parser.extract_skills(text),
            'experience': parser.extract_experience(text),
            'education': parser.extract_education(text),
            'summary': parser.extract_summary(text)
        }

    except Exception as e:
        # Top-level boundary: callers expect a dictionary, never an exception.
        print(f"Error extracting resume features: {e}")
        return _empty_features()


# For backward compatibility
def parse_resume(file_path):
    """Alias of :func:`extract_resume_features` kept for older callers."""
    return extract_resume_features(file_path)