Spaces:
Paused
Paused
| import os | |
| import re | |
| import json | |
| from pathlib import Path | |
| import PyPDF2 | |
| from docx import Document | |
| import textract | |
class SimpleResumeParser:
    """Heuristic, keyword- and regex-based extractor for resume files.

    The parser turns a PDF, DOCX or DOC file into plain text and then pulls
    out a name, e-mail address, phone number, skills, education, experience
    and a short summary.  Every extractor is best-effort: when nothing is
    recognised it returns an empty string or an empty list rather than
    raising.
    """

    def __init__(self):
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
        ]
        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
            'business', 'mba', 'certification', 'diploma'
        ]
        # Experience keywords
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
        ]

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, one page per line block.

        Any failure is reported on stdout and yields an empty string.
        """
        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Defensive: a page that yields no text (None) is treated as
                # empty so that one page cannot discard the whole document.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraphs of a DOCX file, each terminated by a newline.

        Any failure is reported on stdout and yields an empty string.
        """
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def extract_text_from_doc(self, file_path):
        """Return the text of a legacy DOC file, decoded as UTF-8 via textract.

        Any failure is reported on stdout and yields an empty string.
        """
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Extract text with the reader that matches the file extension.

        The extension is compared case-insensitively; anything other than
        ``.pdf``, ``.docx`` or ``.doc`` yields an empty string.
        """
        file_extension = Path(file_path).suffix.lower()
        if file_extension == '.pdf':
            return self.extract_text_from_pdf(file_path)
        if file_extension == '.docx':
            return self.extract_text_from_docx(file_path)
        if file_extension == '.doc':
            return self.extract_text_from_doc(file_path)
        return ""

    def extract_email(self, text):
        """Return the first e-mail address found in ``text``, or ``""``."""
        # The TLD class is [A-Za-z]; a '|' inside a character class is a
        # literal pipe, not alternation, and must not be accepted.
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        match = re.search(email_pattern, text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in ``text``, or ``""``.

        Patterns are tried in order.  Patterns with several capture groups
        return the captured digit groups concatenated (separators dropped);
        single-group patterns return the number as written.
        """
        phone_patterns = [
            # North-American style with optional country code and optional
            # parentheses around the area code.  The digit look-arounds stop
            # it from biting ten digits out of a longer international number.
            r'(?<!\d)\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})(?!\d)',
            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
        ]
        for pattern in phone_patterns:
            match = re.search(pattern, text)
            if match:
                return ''.join(match.groups())
        return ""

    def extract_name(self, text):
        """Guess the candidate name from the first five lines of ``text``.

        A line qualifies when it consists of exactly two space-separated
        alphabetic words and neither word is a document title word
        (``resume``, ``cv``, ``curriculum``).  Returns the line in title case,
        or ``""`` when no line qualifies.
        """
        title_words = ('resume', 'cv', 'curriculum')
        for line in text.split('\n')[:5]:  # Check first 5 lines
            candidate = line.strip()
            if len(candidate.split()) != 2:
                continue
            if not candidate.replace(' ', '').isalpha():
                continue
            # Compare whole words so that names merely containing "cv"
            # (e.g. "Cvetkovic") are not mistaken for a document title.
            if any(word in title_words for word in candidate.lower().split()):
                continue
            return candidate.title()
        return ""

    def extract_skills(self, text):
        """Return the known skills mentioned in ``text``.

        Skills are matched case-insensitively as whole tokens (an optional
        plural "s" is tolerated), so "django" does not report "go" and
        "javascript" does not report "java".  The result is title-cased,
        free of duplicates and ordered as in ``self.skills_keywords``.
        """
        text_lower = text.lower()
        found_skills = []
        for skill in self.skills_keywords:
            # Look-arounds are used instead of \b because skills such as
            # "c++" and "c#" end in non-word characters.
            pattern = rf'(?<![a-z0-9]){re.escape(skill.lower())}s?(?![a-z0-9])'
            if re.search(pattern, text_lower):
                found_skills.append(skill.title())
        # dict.fromkeys removes duplicates while keeping a stable order.
        return list(dict.fromkeys(found_skills))

    def _collect_education_snippets(self, text):
        """Return de-duplicated context snippets around education keywords."""
        snippets = []
        for keyword in self.education_keywords:
            # Up to 50 characters of same-line context on either side.
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for match in re.findall(pattern, text, re.IGNORECASE):
                snippet = match.strip()
                if snippet and snippet not in snippets:
                    snippets.append(snippet)
        return snippets

    def extract_education(self, text):
        """Return up to three education snippets found in ``text``.

        The lines between an education heading (``education``, ``academic``,
        ``qualification``) and the next experience/work/employment/project
        line are searched first.  When that section is missing or contains no
        education keyword, the whole text is searched instead.  Each snippet
        is the keyword with up to 50 characters of context on either side;
        identical snippets are reported once.
        """
        section_lines = []
        in_education_section = False
        for line in text.split('\n'):
            line_lower = line.lower()
            if not in_education_section:
                if any(keyword in line_lower
                       for keyword in ('education', 'academic', 'qualification')):
                    in_education_section = True
                continue
            if any(keyword in line_lower
                   for keyword in ('experience', 'work', 'employment', 'project')):
                break
            section_lines.append(line)

        education = self._collect_education_snippets('\n'.join(section_lines))
        if not education:
            # No usable section: fall back to scanning the entire document.
            education = self._collect_education_snippets(text)
        return education[:3]  # Return top 3 education entries

    def extract_experience(self, text):
        """Return up to three work-experience entries found in ``text``.

        Collection starts after the first line mentioning ``experience``,
        ``work``, ``employment`` or ``career`` and stops at the first later
        line mentioning ``education``, ``skill`` or ``project``.  Within the
        section, blank lines separate entries and the lines of one entry are
        joined with single spaces.
        """
        experience = []
        current_lines = []
        in_experience_section = False
        for line in text.split('\n'):
            line_lower = line.lower()
            if not in_experience_section:
                if any(keyword in line_lower
                       for keyword in ('experience', 'work', 'employment', 'career')):
                    in_experience_section = True
                continue
            # Once inside the section, heading keywords are no longer checked,
            # so content such as "Worked on ..." is kept rather than skipped.
            if any(keyword in line_lower
                   for keyword in ('education', 'skill', 'project')):
                break
            if line.strip():
                current_lines.append(line)
            elif current_lines:
                experience.append(' '.join(current_lines).strip())
                current_lines = []
        # Flush the trailing entry exactly once, whether the loop ended on a
        # closing heading or at the end of the text.
        if current_lines:
            experience.append(' '.join(current_lines).strip())
        return experience[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Return up to 200 characters of summary/objective text.

        The summary is the non-blank content of the three lines that follow
        the first line mentioning ``summary``, ``objective``, ``profile`` or
        ``about``, joined with spaces.  Returns ``""`` when no such line
        exists.
        """
        lines = text.split('\n')
        for i, line in enumerate(lines):
            line_lower = line.lower()
            if any(keyword in line_lower
                   for keyword in ('summary', 'objective', 'profile', 'about')):
                # Get next few lines as summary
                following = (candidate.strip() for candidate in lines[i + 1:i + 4])
                summary = ' '.join(part for part in following if part)
                return summary[:200]  # Limit to 200 characters
        return ""
def extract_resume_features(file_path):
    """
    Parse the resume at ``file_path`` and return its extracted features.

    The result is always a dictionary with the keys ``name``, ``email``,
    ``mobile_number``, ``skills``, ``experience``, ``education`` and
    ``summary``.  When no text can be read from the file, or when any step
    raises, every field is empty (``''`` for text fields, ``[]`` for lists);
    in the error case the exception is also printed.
    """
    # Built fresh on every call, so callers may mutate the returned lists.
    empty_features = {
        'name': '',
        'email': '',
        'mobile_number': '',
        'skills': [],
        'experience': [],
        'education': [],
        'summary': ''
    }
    try:
        resume_parser = SimpleResumeParser()
        resume_text = resume_parser.extract_text(file_path)
        if resume_text:
            # Run every extractor over the same text.
            return {
                'name': resume_parser.extract_name(resume_text),
                'email': resume_parser.extract_email(resume_text),
                'mobile_number': resume_parser.extract_phone(resume_text),
                'skills': resume_parser.extract_skills(resume_text),
                'experience': resume_parser.extract_experience(resume_text),
                'education': resume_parser.extract_education(resume_text),
                'summary': resume_parser.extract_summary(resume_text)
            }
        return empty_features
    except Exception as e:
        print(f"Error extracting resume features: {e}")
        return empty_features
| # For backward compatibility | |
def parse_resume(file_path):
    """Alias that forwards to ``extract_resume_features`` and returns its result."""
    features = extract_resume_features(file_path)
    return features