# Spaces: husseinelsaadi / Codingo (Paused) -- File size: 9,320 Bytes

import os
import re
import json
from pathlib import Path
import PyPDF2
from docx import Document
import textract

class SimpleResumeParser:
    """Keyword- and regex-based resume parser.

    Reads plain text out of PDF, DOCX and DOC files and applies simple
    heuristics to pull out a name, contact details, skills, education,
    work experience and a summary.  Every extractor is best-effort and
    returns an empty string or an empty list when nothing is found.
    """

    # Email address; the TLD part accepts letters only.
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Phone patterns, tried in order; the first one that matches wins.
    # Optional parentheses around the area code are written as \(? and \)?.
    _PHONE_PATTERNS = (
        re.compile(r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})'),
        re.compile(r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})'),
        re.compile(r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})'),
        re.compile(r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'),
    )

    # Words that mark a two-word line as a document title rather than a name.
    _TITLE_WORDS = ('resume', 'cv', 'curriculum')

    def __init__(self):
        """Set up the keyword lists used by the extractors."""
        # Common skills keywords
        self.skills_keywords = [
            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
        ]

        # Education keywords
        self.education_keywords = [
            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
            'business', 'mba', 'certification', 'diploma'
        ]

        # Experience keywords
        self.experience_keywords = [
            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
        ]

    def extract_text_from_pdf(self, file_path):
        """Return the text of every page of a PDF, one page per line block.

        Returns "" (after printing the error) if the file cannot be read.
        """
        try:
            with open(file_path, 'rb') as file:
                reader = PyPDF2.PdfReader(file)
                # Defensive `or ""`: if a page yields None instead of a string,
                # concatenation would raise TypeError and the broad handler
                # below would discard the text of every page.
                return "".join(
                    (page.extract_text() or "") + "\n" for page in reader.pages
                )
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return ""

    def extract_text_from_docx(self, file_path):
        """Return the paragraph text of a DOCX file, one paragraph per line.

        Returns "" (after printing the error) if the file cannot be read.
        """
        try:
            doc = Document(file_path)
            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
        except Exception as e:
            print(f"Error reading DOCX: {e}")
            return ""

    def extract_text_from_doc(self, file_path):
        """Return the text of a legacy DOC file using textract.

        Returns "" (after printing the error) if the file cannot be read.
        """
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            print(f"Error reading DOC: {e}")
            return ""

    def extract_text(self, file_path):
        """Extract text from *file_path* based on its (case-insensitive) extension.

        Supports .pdf, .docx and .doc; any other extension yields "".
        """
        readers = {
            '.pdf': self.extract_text_from_pdf,
            '.docx': self.extract_text_from_docx,
            '.doc': self.extract_text_from_doc,
        }
        reader = readers.get(Path(file_path).suffix.lower())
        return reader(file_path) if reader is not None else ""

    def extract_email(self, text):
        """Return the first email address found in *text*, or ""."""
        match = self._EMAIL_RE.search(text)
        return match.group(0) if match else ""

    def extract_phone(self, text):
        """Return the first phone number found in *text*, or "".

        The patterns are tried in order.  The result is the concatenation of
        the winning pattern's capture groups, so a number matched by one of
        the grouped patterns comes back as digits only (separators, the
        parentheses and a leading "+1" are not captured).
        """
        for pattern in self._PHONE_PATTERNS:
            match = pattern.search(text)
            if match:
                return ''.join(match.groups())
        return ""

    def extract_name(self, text):
        """Guess the candidate's name from the first five lines of *text*.

        A line qualifies when it consists of exactly two alphabetic words
        and neither word is a document-title word ("resume", "cv",
        "curriculum").  The match is returned title-cased; "" if no line
        qualifies.
        """
        for line in text.split('\n')[:5]:
            line = line.strip()
            if len(line.split()) != 2 or not line.replace(' ', '').isalpha():
                continue
            # Compare whole words so that names merely containing "cv"
            # (e.g. "Cvetkovic") are not mistaken for a document title.
            if not any(word in self._TITLE_WORDS for word in line.lower().split()):
                return line.title()
        return ""

    def extract_skills(self, text):
        """Return the known skills mentioned in *text*.

        Each keyword must appear as a standalone token (not embedded in a
        longer alphanumeric word), so "good" does not count as "go" and
        "javascript" does not count as "java".  Skills are returned
        title-cased via ``str.title()`` in the order they are listed in
        ``self.skills_keywords``.
        """
        text_lower = text.lower()
        found_skills = []

        for skill in self.skills_keywords:
            # Lookarounds instead of \b because keywords such as "c++" and
            # "c#" end in non-word characters.
            pattern = rf'(?<![a-z0-9]){re.escape(skill.lower())}(?![a-z0-9])'
            if re.search(pattern, text_lower):
                found_skills.append(skill.title())

        # dict.fromkeys removes duplicates while keeping a stable order.
        return list(dict.fromkeys(found_skills))

    def extract_education(self, text):
        """Return up to three distinct education-related snippets from *text*.

        For every education keyword present, the surrounding context (up to
        50 characters on each side, never crossing a line break) is
        collected.  Snippets are stripped and de-duplicated so that one line
        mentioning several keywords is reported once.
        """
        text_lower = text.lower()
        education = []
        seen = set()

        for keyword in self.education_keywords:
            if keyword not in text_lower:
                continue
            # Find context around the keyword
            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
            for snippet in re.findall(pattern, text, re.IGNORECASE):
                snippet = snippet.strip()
                if snippet and snippet not in seen:
                    seen.add(snippet)
                    education.append(snippet)

        return education[:3]  # Return top 3 education entries

    def extract_experience(self, text):
        """Return up to three work-experience entries from *text*.

        The section starts after the first line containing "experience",
        "work", "employment" or "career" and ends at the first later line
        containing "education", "skill" or "project".  Inside the section,
        consecutive non-blank lines are joined into one entry and a blank
        line closes the current entry.

        NOTE(review): any line containing a start keyword is skipped, even
        inside the section (e.g. "Worked at ..."); kept as-is because
        changing it alters the heuristic.
        """
        start_markers = ['experience', 'work', 'employment', 'career']
        end_markers = ['education', 'skill', 'project']

        experience = []
        in_experience_section = False
        current_experience = ""

        for line in text.split('\n'):
            line_lower = line.lower()
            if any(keyword in line_lower for keyword in start_markers):
                in_experience_section = True
                continue
            if not in_experience_section:
                continue
            if any(keyword in line_lower for keyword in end_markers):
                # The pending entry is flushed once, after the loop.
                break
            if line.strip():
                current_experience += line + " "
            elif current_experience:
                experience.append(current_experience.strip())
                current_experience = ""

        if current_experience:
            experience.append(current_experience.strip())

        return experience[:3]  # Return top 3 experience entries

    def extract_summary(self, text):
        """Return the summary/objective paragraph, limited to 200 characters.

        The first line containing "summary", "objective", "profile" or
        "about" is treated as the heading; the non-blank lines among the
        three lines that follow it are joined with spaces.  Returns "" when
        no heading is found.
        """
        lines = text.split('\n')
        summary = ""

        for i, line in enumerate(lines):
            line_lower = line.lower()
            if any(keyword in line_lower for keyword in ['summary', 'objective', 'profile', 'about']):
                # Get next few lines as summary
                following = lines[i + 1:i + 4]
                summary = ' '.join(part.strip() for part in following if part.strip())
                break

        return summary[:200]  # Limit to 200 characters

def extract_resume_features(file_path):
    """
    Parse the resume at *file_path* and return its extracted features.

    The result is a dictionary with the keys 'name', 'email',
    'mobile_number', 'skills', 'experience', 'education' and 'summary'.
    When no text can be read from the file, or any step raises, every
    field is empty (the error is printed, not propagated).
    """
    # Built fresh on every call so callers never share the inner lists.
    blank_features = {
        'name': '',
        'email': '',
        'mobile_number': '',
        'skills': [],
        'experience': [],
        'education': [],
        'summary': ''
    }

    try:
        resume_parser = SimpleResumeParser()
        resume_text = resume_parser.extract_text(file_path)

        # Nothing readable: skip the extractors entirely.
        if not resume_text:
            return blank_features

        return {
            'name': resume_parser.extract_name(resume_text),
            'email': resume_parser.extract_email(resume_text),
            'mobile_number': resume_parser.extract_phone(resume_text),
            'skills': resume_parser.extract_skills(resume_text),
            'experience': resume_parser.extract_experience(resume_text),
            'education': resume_parser.extract_education(resume_text),
            'summary': resume_parser.extract_summary(resume_text)
        }

    except Exception as e:
        print(f"Error extracting resume features: {e}")
        return blank_features

# For backward compatibility
def parse_resume(file_path):
    """Alias for extract_resume_features; returns the same feature dictionary."""
    features = extract_resume_features(file_path)
    return features