import json
import re
from pathlib import Path
from typing import Dict, List, Optional
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ResumeParser:
    def __init__(self):
        self.ner_pipeline = None
        self.model_loaded = False
        self._load_model()
    
    def _load_model(self):
        """Load the NER model with error handling and fallbacks"""
        try:
            # Try the original model first
            MODEL_NAME = "manishiitg/resume-ner"
            logger.info(f"Attempting to load model: {MODEL_NAME}")
            
            tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
            model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
            self.ner_pipeline = pipeline(
                "ner", 
                model=model, 
                tokenizer=tokenizer, 
                aggregation_strategy="simple"
            )
            self.model_loaded = True
            logger.info("Model loaded successfully")
            
        except Exception as e:
            logger.warning(f"Failed to load primary model: {e}")
            try:
                # Fallback to a more reliable model
                MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
                logger.info(f"Trying fallback model: {MODEL_NAME}")
                
                self.ner_pipeline = pipeline(
                    "ner", 
                    model=MODEL_NAME,
                    aggregation_strategy="simple"
                )
                self.model_loaded = True
                logger.info("Fallback model loaded successfully")
                
            except Exception as e2:
                logger.error(f"Failed to load fallback model: {e2}")
                self.model_loaded = False

    def extract_text(self, file_path: str) -> str:
        """Extract text from PDF or DOCX files with error handling"""
        try:
            path = Path(file_path)
            
            if not path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")
            
            if path.suffix.lower() == ".pdf":
                text = pdf_extract_text(file_path)
                logger.info(f"Extracted {len(text)} characters from PDF")
                return text
                
            elif path.suffix.lower() == ".docx":
                doc = Document(file_path)
                text = "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
                logger.info(f"Extracted {len(text)} characters from DOCX")
                return text
                
            else:
                raise ValueError(f"Unsupported file format: {path.suffix}")
                
        except Exception as e:
            logger.error(f"Error extracting text: {e}")
            raise

    def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
        """Fallback extraction using regex patterns"""
        patterns = {
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            # Non-capturing country-code group so findall returns whole numbers
            'phone': r'(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            'skills': r'(?i)(?:skills?|technologies?|tools?)[:\-\s]*([^\n]+)',
            'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*([^\n]+)',
            'experience': r'(?i)(?:experience|work|employment|job)[:\-\s]*([^\n]+)'
        }
        
        results = {}
        for key, pattern in patterns.items():
            matches = re.findall(pattern, text, re.MULTILINE)
            results[key] = [match.strip() for match in matches if match.strip()]
        
        return results

    def extract_name_from_text(self, text: str) -> str:
        """Extract name using heuristics"""
        lines = text.split('\n')
        # Usually name is in the first few lines
        for line in lines[:5]:
            line = line.strip()
            if line and len(line.split()) <= 4 and len(line) > 2:
                # Check if it looks like a name (not email, phone, etc.)
                if not re.search(r'[@\d]', line) and not line.lower().startswith(('resume', 'cv', 'curriculum')):
                    return line
        return "Not Found"

    def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
        """Process NER entities with improved logic"""
        name, skills, education, experience = [], [], [], []
        
        logger.info(f"Processing {len(entities)} entities")
        
        for ent in entities:
            label = ent.get("entity_group", "").upper()
            value = ent.get("word", "").strip()
            confidence = ent.get("score", 0)
            
            logger.debug(f"Entity: {label} = {value} (confidence: {confidence:.2f})")
            
            # Only consider high-confidence entities
            if confidence < 0.5:
                continue
                
            if label in ["PERSON", "PER", "NAME"]:
                name.append(value)
            elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
                skills.append(value)
            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"]:
                education.append(value)
            elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
                experience.append(value)
        
        return {
            "name": name,
            "skills": skills,
            "education": education,
            "experience": experience
        }

    def parse_resume(self, file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
        """Parse resume with multiple extraction methods"""
        try:
            # Extract text
            text = self.extract_text(file_path)
            
            if not text or len(text.strip()) < 10:
                raise ValueError("Extracted text is too short or empty")
            
            logger.info(f"Text preview: {text[:200]}...")
            
            # Initialize results
            results = {
                "name": "Not Found",
                "skills": "Not Found",
                "education": "Not Found",
                "experience": "Not Found"
            }
            
            # Method 1: Try NER model if available
            if self.model_loaded and self.ner_pipeline:
                try:
                    logger.info("Using NER model for extraction")
                    # BERT-style encoders typically cap input around 512 tokens,
                    # so run the pipeline over modest character chunks and merge
                    # the results (chunk size is a conservative heuristic)
                    entities = []
                    chunk_size = 1500  # characters
                    for start in range(0, len(text), chunk_size):
                        entities.extend(self.ner_pipeline(text[start:start + chunk_size]))
                    ner_results = self.process_ner_entities(entities)
                    
                    # Update results with NER findings
                    for key in results.keys():
                        if ner_results.get(key):
                            unique_items = list(dict.fromkeys(ner_results[key]))
                            results[key] = ", ".join(unique_items)
                            
                except Exception as e:
                    logger.warning(f"NER extraction failed: {e}")
            
            # Method 2: Regex fallback
            logger.info("Using regex patterns for extraction")
            regex_results = self.extract_with_regex(text)
            
            # Fill in missing information with regex results
            if results["name"] == "Not Found":
                results["name"] = self.extract_name_from_text(text)
            
            if results["skills"] == "Not Found" and regex_results.get("skills"):
                results["skills"] = ", ".join(regex_results["skills"][:3])  # Limit to first 3
            
            if results["education"] == "Not Found" and regex_results.get("education"):
                results["education"] = ", ".join(regex_results["education"][:2])  # Limit to first 2
            
            if results["experience"] == "Not Found" and regex_results.get("experience"):
                results["experience"] = ", ".join(regex_results["experience"][:3])  # Limit to first 3
            
            # Add email and phone if found
            if regex_results.get("email"):
                results["email"] = regex_results["email"][0]
            if regex_results.get("phone"):
                results["phone"] = regex_results["phone"][0]
            
            logger.info("Parsing completed successfully")
            return results
            
        except Exception as e:
            logger.error(f"Error parsing resume: {e}")
            return {
                "name": "Error",
                "skills": "Error",
                "education": "Error",
                "experience": "Error",
                "error": str(e)
            }

# Create global instance
resume_parser = ResumeParser()

def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Main function to parse resume"""
    return resume_parser.parse_resume(file_path, filename)
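
# Example usage (a minimal sketch; "resume.pdf" is a hypothetical local file):
#
#   result = parse_resume("resume.pdf")
#   print(result.get("name"), result.get("email", "not found"))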

# Test function
def test_parser():
    """Test the parser with sample text"""
    sample_text = """
    John Doe
    Software Engineer
    john.doe@example.com
    (555) 123-4567
    
    Skills: Python, JavaScript, React, Node.js, SQL
    
    Education:
    Bachelor of Science in Computer Science
    University of Technology, 2020
    
    Experience:
    Senior Software Developer at Tech Corp (2021-2023)
    - Developed web applications using React and Node.js
    - Managed database systems and APIs
    """
    
    # Both helpers operate on raw text, so no temporary file is needed
    try:
        # Test regex extraction
        regex_results = resume_parser.extract_with_regex(sample_text)
        print("Regex Results:", json.dumps(regex_results, indent=2))

        # Test name extraction
        name = resume_parser.extract_name_from_text(sample_text)
        print(f"Extracted Name: {name}")

    except Exception as e:
        print(f"Test error: {e}")
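
# Allow a quick smoke test when the module is run directly
if __name__ == "__main__":
    test_parser()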