Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed 17 days ago

Commit

f2a1cfa

1 Parent(s): 6248af7

updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +144 -103

backend/services/resume_parser.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import json
 import re
 from pathlib import Path
-from typing import Dict, List, Optional
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
@@ -30,7 +31,8 @@ class ResumeParser:
                 "ner",
                 model=model,
                 tokenizer=tokenizer,
-                aggregation_strategy="simple"
             )
             self.model_loaded = True
             logger.info("Model loaded successfully")
@@ -45,7 +47,8 @@ class ResumeParser:
                 self.ner_pipeline = pipeline(
                     "ner",
                     model=MODEL_NAME,
-                    aggregation_strategy="simple"
                 )
                 self.model_loaded = True
                 logger.info("Fallback model loaded successfully")
@@ -64,6 +67,8 @@ class ResumeParser:
             if path.suffix.lower() == ".pdf":
                 text = pdf_extract_text(file_path)
                 logger.info(f"Extracted {len(text)} characters from PDF")
                 return text
@@ -81,37 +86,61 @@ class ResumeParser:
             raise
     def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
-        """Fallback extraction using regex patterns"""
         patterns = {
             'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b',
-            'phone': r'(\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
-            'skills': r'(?i)(?:skills?|technologies?|tools?)[:\-\s]*([^\n]+)',
-            'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*([^\n]+)',
-            'experience': r'(?i)(?:experience|work|employment|job)[:\-\s]*([^\n]+)'
         }
         results = {}
         for key, pattern in patterns.items():
-            matches = re.findall(pattern, text, re.MULTILINE)
-            results[key] = [match.strip() for match in matches if match.strip()]
         return results
     def extract_name_from_text(self, text: str) -> str:
-        """Extract name using heuristics"""
         lines = text.split('\n')
-        # Usually name is in the first few lines
-        for line in lines[:5]:
             line = line.strip()
-            if line and len(line.split()) <= 4 and len(line) > 2:
                 # Check if it looks like a name (not email, phone, etc.)
-                if not re.search(r'[@\d]', line) and not line.lower().startswith(('resume', 'cv', 'curriculum')):
-                    return line
         return "Not Found"
     def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
         """Process NER entities with improved logic"""
-        name, skills, education, experience = [], [], [], []
         logger.info(f"Processing {len(entities)} entities")
@@ -120,27 +149,77 @@ class ResumeParser:
             value = ent.get("word", "").strip()
             confidence = ent.get("score", 0)
-            logger.debug(f"Entity: {label} = {value} (confidence: {confidence:.2f})")
-            # Only consider high-confidence entities
-            if confidence < 0.5:
                 continue
             if label in ["PERSON", "PER", "NAME"]:
-                name.append(value)
             elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
-                skills.append(value)
-            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"]:
-                education.append(value)
             elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
-                experience.append(value)
-        return {
-            "name": name,
-            "skills": skills,
-            "education": education,
-            "experience": experience
         }
     def parse_resume(self, file_path: str, filename: str = None) -> Dict[str, str]:
         """Parse resume with multiple extraction methods"""
@@ -154,59 +233,53 @@ class ResumeParser:
             logger.info(f"Text preview: {text[:200]}...")
             # Initialize results
-            results = {
-                "name": "Not Found",
-                "skills": "Not Found",
-                "education": "Not Found",
-                "experience": "Not Found"
             }
             # Method 1: Try NER model if available
             if self.model_loaded and self.ner_pipeline:
                 try:
                     logger.info("Using NER model for extraction")
-                    entities = self.ner_pipeline(text)
                     ner_results = self.process_ner_entities(entities)
-                    # Update results with NER findings
-                    for key in results.keys():
-                        if ner_results.get(key):
-                            unique_items = list(dict.fromkeys(ner_results[key]))
-                            results[key] = ", ".join(unique_items)
                 except Exception as e:
                     logger.warning(f"NER extraction failed: {e}")
-            # Method 2: Regex fallback
             logger.info("Using regex patterns for extraction")
             regex_results = self.extract_with_regex(text)
-            # Fill in missing information with regex results
-            if results["name"] == "Not Found":
-                results["name"] = self.extract_name_from_text(text)
-            if results["skills"] == "Not Found" and regex_results.get("skills"):
-                results["skills"] = ", ".join(regex_results["skills"][:3])  # Limit to first 3
-            if results["education"] == "Not Found" and regex_results.get("education"):
-                results["education"] = ", ".join(regex_results["education"][:2])  # Limit to first 2
-            if results["experience"] == "Not Found" and regex_results.get("experience"):
-                results["experience"] = ", ".join(regex_results["experience"][:3])  # Limit to first 3
-            # Add email and phone if found
-            if regex_results.get("email"):
-                results["email"] = regex_results["email"][0]
-            if regex_results.get("phone"):
-                results["phone"] = regex_results["phone"][0]
             logger.info("Parsing completed successfully")
-            return results
         except Exception as e:
             logger.error(f"Error parsing resume: {e}")
             return {
                 "name": "Error",
                 "skills": "Error",
                 "education": "Error",
                 "experience": "Error",
@@ -220,44 +293,12 @@ def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
     """Main function to parse resume"""
     return resume_parser.parse_resume(file_path, filename)
-# Test function
-def test_parser():
-    """Test the parser with sample text"""
-    sample_text = """
-    John Doe
-    Software Engineer
-    john.[email protected]
-    (555) 123-4567
-    Skills: Python, JavaScript, React, Node.js, SQL
-    Education:
-    Bachelor of Science in Computer Science
-    University of Technology, 2020
-    Experience:
-    Senior Software Developer at Tech Corp (2021-2023)
-    - Developed web applications using React and Node.js
-    - Managed database systems and APIs
-    """
-    # Create a temporary file for testing
-    import tempfile
-    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
-        f.write(sample_text)
-        temp_path = f.name
-    try:
-        # Test regex extraction
-        regex_results = resume_parser.extract_with_regex(sample_text)
-        print("Regex Results:", json.dumps(regex_results, indent=2))
-        # Test name extraction
-        name = resume_parser.extract_name_from_text(sample_text)
-        print(f"Extracted Name: {name}")
-    except Exception as e:
-        print(f"Test error: {e}")
-    finally:
-        Path(temp_path).unlink(missing_ok=True)

 import json
 import re
+import os
 from pathlib import Path
+from typing import Dict, List, Optional, Union
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
                 "ner",
                 model=model,
                 tokenizer=tokenizer,
+                aggregation_strategy="simple",
+                device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
             )
             self.model_loaded = True
             logger.info("Model loaded successfully")
                 self.ner_pipeline = pipeline(
                     "ner",
                     model=MODEL_NAME,
+                    aggregation_strategy="simple",
+                    device=0 if os.environ.get("L4_GPU", "false").lower() == "true" else -1
                 )
                 self.model_loaded = True
                 logger.info("Fallback model loaded successfully")
             if path.suffix.lower() == ".pdf":
                 text = pdf_extract_text(file_path)
+                # Clean up PDF text extraction artifacts
+                text = re.sub(r'\s+', ' ', text).strip()
                 logger.info(f"Extracted {len(text)} characters from PDF")
                 return text
             raise
     def extract_with_regex(self, text: str) -> Dict[str, List[str]]:
         """Extract contact details and resume sections from *text* with regexes.

         Args:
             text: Plain text of the resume.

         Returns:
             A dict that may contain the keys ``email``, ``phone``, ``skills``,
             ``education``, ``experience`` and ``name``.  A key is present only
             when at least one non-empty match was found.  ``name`` holds at
             most one entry (the first match); every other key holds all of
             its matches in document order.
         """
         patterns = {
             # ``[A-Za-z]`` instead of ``[A-Z|a-z]``: inside a character class
             # ``|`` is a literal pipe, not alternation.
             'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
             'phone': r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
             'skills': r'(?i)(?:skills?|technologies?|tools?|expertise)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
             'education': r'(?i)(?:education|degree|university|college|bachelor|master|phd)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
             'experience': r'(?i)(?:experience|work\shistory|employment|job\shistory)[:\-\s]*(.*?)(?:\n\n|\n\s*\n|$)',
             # The lookahead is non-capturing so ``re.findall`` returns the
             # whole match rather than the (always empty) lookahead group.
             # Only the heading keywords are case-insensitive; the name words
             # themselves must be capitalised.  ``[ \t]+`` keeps a name from
             # running on into the following line.
             'name': r'^(?!(?i:resume|cv|curriculum\s+vitae)\b)[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+',
         }
         results = {}
         for key, pattern in patterns.items():
             # The section patterns carry their own ``(?i)``; applying
             # IGNORECASE globally would void the capitalisation check above.
             matches = re.findall(pattern, text, re.MULTILINE)
             cleaned = [m.strip() for m in matches if m.strip()]
             if not cleaned:
                 continue
             # Only the first name-like match is treated as the candidate name.
             results[key] = cleaned[:1] if key == 'name' else cleaned
         return results
     def extract_name_from_text(self, text: str) -> str:
         """Return the most likely candidate name in *text*, or ``"Not Found"``.

         Two passes are made:

         1. A regex looks for a line that begins with two or more capitalised
            words and does not start with a resume/CV heading keyword.
         2. Failing that, the first ten lines are scanned for a 2-4 word line
            that starts with an upper-case character and contains no digits,
            ``@``, ``+``, ``-`` or parentheses.

         Args:
             text: Plain text of the resume.

         Returns:
             The stripped candidate name, or the literal ``"Not Found"``.
         """
         # Only the heading keywords are matched case-insensitively, so the
         # capitalisation check on the name words stays meaningful.  ``[ \t]``
         # (not ``\s``) keeps the match on one line, and the leading
         # ``[ \t]*`` tolerates indented text.
         name_match = re.search(
             r'^[ \t]*(?!(?i:resume|cv|curriculum\s+vitae)\b)'
             r'[A-Z][a-z]+(?:[ \t]+[A-Z][a-z]+)+',
             text,
             re.MULTILINE,
         )
         if name_match:
             return name_match.group(0).strip()

         # Fallback: line-based heuristic over the first 10 lines.
         for line in text.split('\n')[:10]:
             line = line.strip()
             if not line or not 2 <= len(line.split()) <= 4:
                 continue
             # Skip lines that look like an email, phone number or date range.
             if re.search(r'[@\d+\-\(\)]', line):
                 continue
             if line[0].isupper() and not line.lower().startswith(('resume', 'cv', 'curriculum')):
                 return line
         return "Not Found"
     def process_ner_entities(self, entities: List[Dict]) -> Dict[str, List[str]]:
         """Process NER entities with improved logic"""
+        results = {
+            "name": [],
+            "skills": [],
+            "education": [],
+            "experience": []
+        }
         logger.info(f"Processing {len(entities)} entities")
             value = ent.get("word", "").strip()
             confidence = ent.get("score", 0)
+            # Skip low confidence entities and empty values
+            if confidence < 0.7 or not value:
                 continue
+            # Normalize labels
             if label in ["PERSON", "PER", "NAME"]:
+                results["name"].append(value)
             elif label in ["SKILL", "TECH", "TECHNOLOGY"]:
+                results["skills"].append(value)
+            elif label in ["EDUCATION", "DEGREE", "EDU", "ORG"] and "university" not in value.lower():
+                results["education"].append(value)
             elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION", "WORK"]:
+                results["experience"].append(value)
+        # Deduplicate and clean results
+        for key in results:
+            results[key] = list(dict.fromkeys(results[key]))  # Preserve order
+        return results
+    def merge_results(self, ner_results: Dict, regex_results: Dict) -> Dict[str, str]:
+        """Merge NER and regex results intelligently"""
+        merged = {
+            "name": "Not Found",
+            "email": "Not Found",
+            "phone": "Not Found",
+            "skills": "Not Found",
+            "education": "Not Found",
+            "experience": "Not Found"
         }
+        # Name - prioritize NER, then regex, then text extraction
+        if ner_results.get("name"):
+            merged["name"] = " ".join(ner_results["name"][:1])  # Take first name only
+        elif regex_results.get("name"):
+            merged["name"] = regex_results["name"][0]
+        # Email and phone - only from regex
+        if regex_results.get("email"):
+            merged["email"] = regex_results["email"][0]
+        if regex_results.get("phone"):
+            merged["phone"] = regex_results["phone"][0]
+        # Skills - combine both sources
+        all_skills = []
+        if ner_results.get("skills"):
+            all_skills.extend(ner_results["skills"])
+        if regex_results.get("skills"):
+            all_skills.extend(regex_results["skills"])
+        if all_skills:
+            merged["skills"] = ", ".join(list(dict.fromkeys(all_skills))[:10])  # Limit to 10 skills
+        # Education - combine both sources
+        all_edu = []
+        if ner_results.get("education"):
+            all_edu.extend(ner_results["education"])
+        if regex_results.get("education"):
+            all_edu.extend(regex_results["education"])
+        if all_edu:
+            merged["education"] = ", ".join(list(dict.fromkeys(all_edu))[:3]  # Limit to 3 items
+        # Experience - combine both sources
+        all_exp = []
+        if ner_results.get("experience"):
+            all_exp.extend(ner_results["experience"])
+        if regex_results.get("experience"):
+            all_exp.extend(regex_results["experience"])
+        if all_exp:
+            merged["experience"] = ", ".join(list(dict.fromkeys(all_exp))[:3]  # Limit to 3 items
+        return merged
     def parse_resume(self, file_path: str, filename: str = None) -> Dict[str, str]:
         """Parse resume with multiple extraction methods"""
             logger.info(f"Text preview: {text[:200]}...")
             # Initialize results
+            ner_results = {
+                "name": [],
+                "skills": [],
+                "education": [],
+                "experience": []
             }
             # Method 1: Try NER model if available
             if self.model_loaded and self.ner_pipeline:
                 try:
                     logger.info("Using NER model for extraction")
+                    entities = self.ner_pipeline(text[:5120])  # Limit input size for NER
                     ner_results = self.process_ner_entities(entities)
+                    logger.info(f"NER results: {json.dumps(ner_results, indent=2)}")
                 except Exception as e:
                     logger.warning(f"NER extraction failed: {e}")
+            # Method 2: Regex extraction
             logger.info("Using regex patterns for extraction")
             regex_results = self.extract_with_regex(text)
+            logger.info(f"Regex results: {json.dumps(regex_results, indent=2)}")
+            # Method 3: Name extraction fallback
+            if not ner_results.get("name") and not regex_results.get("name"):
+                name = self.extract_name_from_text(text)
+                if name != "Not Found":
+                    regex_results["name"] = [name]
+            # Merge all results
+            final_results = self.merge_results(ner_results, regex_results)
+            # If name still not found, try filename
+            if final_results["name"] == "Not Found" and filename:
+                # Try to extract name from filename (common pattern: "Firstname Lastname - Resume.pdf")
+                name_from_file = re.sub(r'[-_].*', '', filename).strip()
+                if len(name_from_file.split()) >= 2:
+                    final_results["name"] = name_from_file
             logger.info("Parsing completed successfully")
+            return final_results
         except Exception as e:
             logger.error(f"Error parsing resume: {e}")
             return {
                 "name": "Error",
+                "email": "Error",
+                "phone": "Error",
                 "skills": "Error",
                 "education": "Error",
                 "experience": "Error",
     """Main function to parse resume"""
     return resume_parser.parse_resume(file_path, filename)
+if __name__ == "__main__":
+    # Test the parser
+    test_file = input("Enter path to resume file: ")
+    if os.path.exists(test_file):
+        results = parse_resume(test_file, os.path.basename(test_file))
+        print("\nParsing Results:")
+        print(json.dumps(results, indent=2))
+    else:
+        print("File not found")