File size: 3,977 Bytes
6248af7
33fa314
 
 
6248af7
 
 
c0dac84
6248af7
 
c0dac84
 
6248af7
c0dac84
 
 
 
 
 
 
 
6248af7
c0dac84
 
 
 
 
 
 
 
 
 
f2a1cfa
c0dac84
 
 
 
f2a1cfa
6248af7
 
c0dac84
 
f2a1cfa
 
 
 
 
6248af7
c0dac84
 
 
 
 
 
 
 
f2a1cfa
c0dac84
 
 
 
 
 
 
f2a1cfa
c0dac84
 
 
 
 
 
 
f2a1cfa
c0dac84
6248af7
c0dac84
 
6248af7
 
 
 
c0dac84
 
 
 
 
 
6248af7
c0dac84
 
6248af7
c0dac84
 
 
 
 
6248af7
 
 
 
c0dac84
 
 
 
6248af7
 
c0dac84
6248af7
b336194
c0dac84
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import re
from pathlib import Path
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document

class ResumeParser:
    """Heuristic extractor of name, skills, education and experience from resumes.

    Text is read from PDF (via ``pdfminer``) or DOCX (via ``python-docx``)
    files and then scanned with regular expressions. All extraction is
    best-effort: it relies on common heading words and line layout, not on
    any real document structure.
    """

    # Upper bounds applied by parse_resume() to keep the result compact.
    MAX_SKILLS = 10
    MAX_EDUCATION = 3
    MAX_EXPERIENCE = 3

    # Only the first few lines of the document are searched for a name.
    _NAME_SCAN_LINES = 10
    # 2-4 capitalised words, e.g. "John Doe" or "Mary Ann Smith".
    _NAME_PATTERN = re.compile(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$')
    # Words that mark a document title rather than a person's name.
    _NON_NAME_WORDS = frozenset({'resume', 'cv', 'curriculum', 'vitae'})

    # Every word that can introduce one of the recognised sections.
    _HEADING_WORDS = (
        r'skills|technologies|expertise'
        r'|education|degrees?'
        r'|experience|work history|employment'
    )
    # A line that looks like a section heading: at most two leading words
    # ("Work Experience", "Technical Skills"), a heading word, and then
    # either a colon or the end of the line.
    _HEADING_LINE = (
        r'[ \t]*(?:\w+[ \t]+){0,2}(?:' + _HEADING_WORDS + r')\b[ \t]*(?::|\n|\Z)'
    )
    # A section ends at the first blank line, at the next heading line
    # (needed for text without blank lines, e.g. DOCX output), or at the
    # end of the text.
    _SECTION_END = r'(?:\n\s*\n|\n(?=' + _HEADING_LINE + r')|\s*\Z)'
    # DOTALL lets the captured body span several lines.
    _SECTION_FLAGS = re.IGNORECASE | re.DOTALL
    _SECTION_PATTERNS = {
        "skills": re.compile(
            r'\b(?:skills|technologies|expertise)\b[:\s]*(.*?)' + _SECTION_END,
            _SECTION_FLAGS,
        ),
        "education": re.compile(
            r'\b(?:education|degrees?)\b[:\s]*(.*?)' + _SECTION_END,
            _SECTION_FLAGS,
        ),
        "experience": re.compile(
            r'\b(?:experience|work history|employment)\b[:\s]*(.*?)' + _SECTION_END,
            _SECTION_FLAGS,
        ),
    }

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse spacing inside lines while keeping the line structure."""
        # Newlines must survive: name detection and section boundaries both
        # depend on them.
        lines = [re.sub(r'[^\S\n]+', ' ', line).strip() for line in text.split('\n')]
        # Any run of blank lines becomes exactly one blank line.
        return re.sub(r'\n{3,}', '\n\n', '\n'.join(lines)).strip()

    @staticmethod
    def _error_result(message: str) -> dict:
        """Build the result dict used when parsing fails; ``name`` carries the message."""
        return {"name": message, "skills": [], "education": [], "experience": []}

    def _has_non_name_word(self, line: str) -> bool:
        """Return True if any word of ``line`` is a title word such as "resume"."""
        return any(
            word.strip('.,:;-').lower() in self._NON_NAME_WORDS
            for word in line.split()
        )

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        Args:
            file_path: Path to the file; the format is chosen from its
                suffix, case-insensitively.

        Returns:
            For PDF, the extracted text with whitespace normalised per line
            (newlines are kept). For DOCX, the non-blank paragraphs joined
            with newlines.

        Raises:
            ValueError: If the suffix is neither ``.pdf`` nor ``.docx``.
        """
        suffix = Path(file_path).suffix.lower()

        if suffix == ".pdf":
            return self._normalize_whitespace(pdf_extract_text(file_path))
        if suffix == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        raise ValueError("Unsupported file format")

    def extract_name(self, text: str) -> str:
        """Guess the candidate's name from the top of the resume text.

        Args:
            text: Resume text with one logical line per ``\\n``.

        Returns:
            The first line among the first ten that looks like a name, or
            ``"Not Found"`` when none does.
        """
        first_lines = [
            line.strip()
            for line in text.split('\n')[:self._NAME_SCAN_LINES]
            if line.strip()
        ]

        # Strict pass: 2-4 capitalised words that are not a document title.
        for line in first_lines:
            if self._NAME_PATTERN.match(line) and not self._has_non_name_word(line):
                return line

        # Lenient pass for names the strict pattern rejects ("JOHN DOE",
        # "Mary-Jane O'Neil"). Title lines, and lines holding digits or "@"
        # (phone numbers, e-mail addresses), are still skipped.
        for line in first_lines:
            if not (2 <= len(line.split()) <= 4 and line[0].isupper()):
                continue
            if self._has_non_name_word(line):
                continue
            if '@' in line or any(ch.isdigit() for ch in line):
                continue
            return line

        return "Not Found"

    def extract_sections(self, text: str) -> dict:
        """Extract the skills, education and experience sections using regex.

        A section starts at the first occurrence of one of its heading words
        and runs to the next blank line, the next heading-like line, or the
        end of the text.

        Args:
            text: Resume text.

        Returns:
            A dict with keys ``"skills"``, ``"education"`` and
            ``"experience"``, each a list of stripped, non-empty strings.
            Skills are split on commas, semicolons and newlines; the other
            sections are split on newlines only. A section that is not found
            yields an empty list.
        """
        results = {"skills": [], "education": [], "experience": []}

        for section, pattern in self._SECTION_PATTERNS.items():
            match = pattern.search(text)
            if not match:
                continue
            # Skills are usually listed inline; other sections hold one
            # entry per line and may contain commas inside an entry.
            separator = r'[,;\n]' if section == "skills" else r'\n'
            results[section] = [
                item.strip()
                for item in re.split(separator, match.group(1))
                if item.strip()
            ]

        return results

    def parse_resume(self, file_path: str) -> dict:
        """Parse a resume file into a summary dict.

        This method does not raise: any exception from extraction or parsing
        is reported through the ``name`` field as ``"Error: <message>"``.

        Args:
            file_path: Path to a ``.pdf`` or ``.docx`` resume.

        Returns:
            A dict with ``"name"`` (str) and ``"skills"``, ``"education"``,
            ``"experience"`` (lists of str), truncated to ``MAX_SKILLS``,
            ``MAX_EDUCATION`` and ``MAX_EXPERIENCE`` items respectively.
        """
        try:
            text = self.extract_text(file_path)

            # Fewer than 10 characters of text counts as an empty file.
            if not text or len(text.strip()) < 10:
                return self._error_result("Error: Empty file")

            sections = self.extract_sections(text)
            return {
                "name": self.extract_name(text),
                "skills": sections["skills"][:self.MAX_SKILLS],
                "education": sections["education"][:self.MAX_EDUCATION],
                "experience": sections["experience"][:self.MAX_EXPERIENCE],
            }
        except Exception as e:
            # Deliberate catch-all: callers expect a dict, never an exception.
            return self._error_result(f"Error: {e}")

# Module-level ResumeParser instance, created once when this module is
# imported and used by the parse_resume() function below.
resume_parser = ResumeParser()

def parse_resume(file_path: str) -> dict:
    """Parse the resume at ``file_path`` with the module-level parser.

    Delegates to ``resume_parser.parse_resume`` and returns its result
    dict unchanged.
    """
    parsed = resume_parser.parse_resume(file_path)
    return parsed