Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed 14 days ago

Commit

288175b

1 Parent(s): c0dac84

resume parser updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +54 -103

backend/services/resume_parser.py CHANGED Viewed

@@ -1,112 +1,63 @@
-import re
 from pathlib import Path
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
-class ResumeParser:
-    def __init__(self):
-        pass
-    def extract_text(self, file_path: str) -> str:
-        """Extract text from PDF or DOCX files"""
-        path = Path(file_path)
-        if path.suffix.lower() == ".pdf":
-            text = pdf_extract_text(file_path)
-            return re.sub(r'\s+', ' ', text).strip()
-        elif path.suffix.lower() == ".docx":
-            doc = Document(file_path)
-            return "\n".join([p.text for p in doc.paragraphs if p.text.strip()])
-        else:
-            raise ValueError("Unsupported file format")
-    def extract_name(self, text: str) -> str:
-        """Extract name from resume text"""
-        # Try to find name at the beginning of document
-        first_lines = [line.strip() for line in text.split('\n')[:10] if line.strip()]
-        for line in first_lines:
-            # Simple name pattern (2-4 words, all starting with capital)
-            if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$', line):
-                if not any(word.lower() in ['resume', 'cv', 'curriculum'] for word in line.split()):
-                    return line
-        # Fallback: return first non-empty line that looks like a name
-        for line in first_lines:
-            if 2 <= len(line.split()) <= 4 and line[0].isupper():
-                return line
-        return "Not Found"
-    def extract_sections(self, text: str) -> dict:
-        """Extract skills, education, and experience using regex"""
-        results = {
-            "skills": [],
-            "education": [],
-            "experience": []
-        }
-        # Extract skills
-        skills_match = re.search(
-            r'(?:skills|technologies|expertise)[:\s]*(.*?)(?:\n\n|\n\s*\n|$)',
-            text, re.IGNORECASE
-        )
-        if skills_match:
-            skills_text = skills_match.group(1)
-            results["skills"] = [s.strip() for s in re.split(r'[,;]', skills_text) if s.strip()]
-        # Extract education
-        edu_match = re.search(
-            r'(?:education|degrees?)[:\s]*(.*?)(?:\n\n|\n\s*\n|$)',
-            text, re.IGNORECASE
-        )
-        if edu_match:
-            results["education"] = [e.strip() for e in edu_match.group(1).split('\n') if e.strip()]
-        # Extract experience
-        exp_match = re.search(
-            r'(?:experience|work history|employment)[:\s]*(.*?)(?:\n\n|\n\s*\n|$)',
-            text, re.IGNORECASE
-        )
-        if exp_match:
-            results["experience"] = [x.strip() for x in exp_match.group(1).split('\n') if x.strip()]
-        return results
-    def parse_resume(self, file_path: str) -> dict:
-        """Main parsing function"""
-        try:
-            text = self.extract_text(file_path)
-            if not text or len(text.strip()) < 10:
-                return {
-                    "name": "Error: Empty file",
-                    "skills": [],
-                    "education": [],
-                    "experience": []
-                }
-            name = self.extract_name(text)
-            sections = self.extract_sections(text)
-            return {
-                "name": name,
-                "skills": sections["skills"][:10],  # Limit to 10 skills
-                "education": sections["education"][:3],  # Limit to 3 items
-                "experience": sections["experience"][:3]  # Limit to 3 items
-            }
-        except Exception as e:
-            return {
-                "name": f"Error: {str(e)}",
-                "skills": [],
-                "education": [],
-                "experience": []
-            }
-# Global instance
-resume_parser = ResumeParser()
-def parse_resume(file_path: str) -> dict:
-    """Public interface for resume parsing"""
-    return resume_parser.parse_resume(file_path)

+import json
 from pathlib import Path
+from typing import Dict
 from pdfminer.high_level import extract_text as pdf_extract_text
 from docx import Document
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+# --------------------
+# Load PyTorch Resume NER Model
+# --------------------
+MODEL_NAME = "manishiitg/resume-ner"  # Works with PyTorch on Hugging Face Spaces
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)  # module-level: runs once, when this module is imported
+model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)  # module-level as well, so importing this module loads the model
+ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")  # shared by parse_resume; results are read via "entity_group" and "word"
+# --------------------
+# Extract Text from PDF/DOCX
+# --------------------
+def extract_text(file_path: str) -> str:
+    path = Path(file_path)
+    if path.suffix.lower() == ".pdf":
+        return pdf_extract_text(file_path)
+    elif path.suffix.lower() == ".docx":
+        doc = Document(file_path)
+        return "\n".join([p.text for p in doc.paragraphs])
+    else:
+        raise ValueError("Unsupported file format")
+# --------------------
+# Parse Resume (returns only: full name, skills, education, experience)
+# --------------------
+def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
+    text = extract_text(file_path)
+    entities = ner_pipeline(text)
+    name_parts = []
+    skills = []
+    education = []
+    experience = []
+    for ent in entities:
+        label = ent["entity_group"].upper()
+        value = ent["word"].strip()
+        if label == "NAME":
+            name_parts.append(value)
+        elif label == "SKILL":
+            skills.append(value)
+        elif label in ["EDUCATION", "DEGREE"]:
+            education.append(value)
+        elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
+            experience.append(value)
+    full_name = " ".join(dict.fromkeys(name_parts)) or "Not Found"
+    return {
+        "name": full_name,
+        "skills": ", ".join(dict.fromkeys(skills)) or "Not Found",
+        "education": ", ".join(dict.fromkeys(education)) or "Not Found",
+        "experience": ", ".join(dict.fromkeys(experience)) or "Not Found"
+    }