Commit 33fa314 · 1 Parent(s): 682910e · updated
backend/services/resume_parser.py
CHANGED
@@ -1,78 +1,61 @@
-import os
-import re
-import subprocess
-import zipfile
 import json
+from pathlib import Path
+from typing import Dict
+
+from pdfminer.high_level import extract_text as pdf_extract_text
+from docx import Document
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 
 # --------------------
-# Load Model
+# Load Resume NER Model
 # --------------------
-MODEL_NAME = "
+MODEL_NAME = "Ioana23/bert-finetuned-resumes-ner"
 
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
-
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
 # --------------------
-# Extract Text
+# Extract Text from PDF/DOCX
 # --------------------
 def extract_text(file_path: str) -> str:
-    if
-        except:
-            return ""
-    elif file_path.lower().endswith(".docx"):
-        try:
-            with zipfile.ZipFile(file_path) as zf:
-                with zf.open("word/document.xml") as docx_xml:
-                    xml_text = docx_xml.read().decode("utf-8", errors="ignore")
-                    xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
-                    return re.sub(r"<[^>]+>", " ", xml_text)
-        except:
-            return ""
-    return ""
+    path = Path(file_path)
+    if path.suffix.lower() == ".pdf":
+        return pdf_extract_text(file_path)
+    elif path.suffix.lower() == ".docx":
+        doc = Document(file_path)
+        return "\n".join([p.text for p in doc.paragraphs])
+    else:
+        raise ValueError("Unsupported file format")
 
 # --------------------
 # Parse Resume
 # --------------------
-def parse_resume(file_path: str
-    """Extract Name, Skills, Education, Experience from resume."""
+def parse_resume(file_path: str) -> Dict[str, str]:
     text = extract_text(file_path)
     entities = ner_pipeline(text)
 
-    name
+    name = []
+    skills = []
+    education = []
+    experience = []
+
     for ent in entities:
         label = ent["entity_group"].upper()
+        value = ent["word"].strip()
 
         if label == "NAME":
-            name.append(
+            name.append(value)
         elif label == "SKILL":
-            skills.append(
+            skills.append(value)
         elif label in ["EDUCATION", "DEGREE"]:
-            education.append(
-        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
-            experience.append(
+            education.append(value)
+        elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
+            experience.append(value)
 
     return {
-        "name": " ".join(dict.fromkeys(name)),
-        "skills": ", ".join(dict.fromkeys(skills)),
-        "education": ", ".join(dict.fromkeys(education)),
-        "experience": ", ".join(dict.fromkeys(experience))
+        "name": " ".join(dict.fromkeys(name)) or "Not Found",
+        "skills": ", ".join(dict.fromkeys(skills)) or "Not Found",
+        "education": ", ".join(dict.fromkeys(education)) or "Not Found",
+        "experience": ", ".join(dict.fromkeys(experience)) or "Not Found"
     }
-
-# --------------------
-# Example
-# --------------------
-if __name__ == "__main__":
-    resume_path = "resume.pdf"  # Change to test file
-    result = parse_resume(resume_path)
-    print(json.dumps(result, indent=2))
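A note on the loop in parse_resume: with aggregation_strategy="simple", the transformers token-classification pipeline returns one dict per merged entity span, with entity_group, score, word, start, and end keys, which is why the code reads ent["entity_group"] and ent["word"]. A quick inspection sketch follows; the sample sentence and the label shown in the comment are illustrative only, since the real label names come from the Ioana23/bert-finetuned-resumes-ner config.

# Inspection sketch: what ner_pipeline(text) yields with aggregation_strategy="simple".
# Run this in the module's context (or import ner_pipeline from it); the sample text
# and the NAME label in the comment are illustrative, not guaranteed model output.
sample = "Jane Doe is a Python developer with a BSc in Computer Science."
for ent in ner_pipeline(sample):
    # Each aggregated entity is a dict like:
    # {"entity_group": "NAME", "score": 0.97, "word": "Jane Doe", "start": 0, "end": 8}
    print(ent["entity_group"], repr(ent["word"]), round(float(ent["score"]), 3))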
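On the return block: both the old and the new version deduplicate entity strings with dict.fromkeys rather than set(), which keeps the first-seen order of the entities (Python dicts preserve insertion order). A tiny illustration with made-up values:

# dict.fromkeys drops duplicates while preserving first-seen order;
# set() would also deduplicate but makes no ordering guarantee.
skills = ["Python", "SQL", "Python", "Docker", "SQL"]
print(", ".join(dict.fromkeys(skills)))   # -> Python, SQL, Docker
print(", ".join(set(skills)))             # same items, arbitrary order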
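The commit also removes the old if __name__ == "__main__" example at the bottom of the file. A stand-alone equivalent for the new module could look like the sketch below; the import path and the resume.pdf filename are placeholders, and the try/except reflects the new behaviour of extract_text, which raises ValueError for unsupported extensions instead of returning an empty string as before.

# Stand-alone smoke test mirroring the removed __main__ block.
# The import path and "resume.pdf" are placeholders for this sketch.
import json

from backend.services.resume_parser import parse_resume

if __name__ == "__main__":
    try:
        result = parse_resume("resume.pdf")  # swap in a real .pdf or .docx file
    except ValueError as err:
        result = {"error": str(err)}         # raised for unsupported extensions
    print(json.dumps(result, indent=2))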