Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Community

husseinelsaadi committed 15 days ago

Commit

a511250

1 Parent(s): 864c2ae

updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +19 -9

backend/services/resume_parser.py CHANGED Viewed

@@ -1,14 +1,17 @@
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import subprocess, zipfile, re, os
-# === Load pretrained HF model instead of training ===
-MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # or Kiet/autotrain-resume_parser-1159242747
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
-ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 # === Extract text from PDF/DOCX ===
 def extract_text(file_path: str) -> str:
     if file_path.lower().endswith(".pdf"):
         result = subprocess.run(
             ["pdftotext", "-layout", file_path, "-"],
@@ -24,14 +27,21 @@ def extract_text(file_path: str) -> str:
     return ""
 # === Parse resume with NER ===
-def parse_resume(file_path: str) -> dict:
     text = extract_text(file_path)
     entities = ner_pipeline(text)
     name, skills, education, experience = [], [], [], []
     for ent in entities:
         label = ent["entity_group"].upper()
-        word = ent["word"]
         if label == "NAME":
             name.append(word)
         elif label == "SKILL":
@@ -42,8 +52,8 @@ def parse_resume(file_path: str) -> dict:
             experience.append(word)
     return {
-        "name": " ".join(set(name)),
-        "skills": ", ".join(set(skills)),
-        "education": ", ".join(set(education)),
-        "experience": ", ".join(set(experience))
     }

 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import subprocess, zipfile, re, os
+# === Load pretrained HF model ===
+MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # or "Kiet/autotrain-resume_parser-1159242747"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
+# Use CPU for stability (device=-1) to avoid GPU memory issues from other parts of the app
+ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=-1)
 # === Extract text from PDF/DOCX ===
 def extract_text(file_path: str) -> str:
+    """Extract text from PDF or DOCX resumes."""
     if file_path.lower().endswith(".pdf"):
         result = subprocess.run(
             ["pdftotext", "-layout", file_path, "-"],
     return ""
 # === Parse resume with NER ===
+def parse_resume(file_path: str, filename: str = None) -> dict:
+    """Parse resume and extract Name, Skills, Education, Experience."""
     text = extract_text(file_path)
     entities = ner_pipeline(text)
     name, skills, education, experience = [], [], [], []
     for ent in entities:
+        word = ent["word"].strip()
         label = ent["entity_group"].upper()
+        # Skip empty or placeholder tokens
+        if not word or word.startswith("LABEL_"):
+            continue
         if label == "NAME":
             name.append(word)
         elif label == "SKILL":
             experience.append(word)
     return {
+        "name": " ".join(dict.fromkeys(name)),
+        "skills": ", ".join(dict.fromkeys(skills)),
+        "education": ", ".join(dict.fromkeys(education)),
+        "experience": ", ".join(dict.fromkeys(experience))
     }