Spaces:
Paused
Paused
Commit
·
b8deff5
1
Parent(s):
288175b
updated
Browse files
backend/services/resume_parser.py
CHANGED
@@ -6,49 +6,47 @@ from pdfminer.high_level import extract_text as pdf_extract_text
|
|
6 |
from docx import Document
|
7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
8 |
|
9 |
-
|
10 |
-
# Load PyTorch Resume NER Model
|
11 |
-
# --------------------
|
12 |
-
MODEL_NAME = "manishiitg/resume-ner" # Works with PyTorch on Hugging Face Spaces
|
13 |
|
14 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
15 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
16 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
17 |
|
18 |
-
# --------------------
|
19 |
-
# Extract Text from PDF/DOCX
|
20 |
-
# --------------------
|
21 |
def extract_text(file_path: str) -> str:
|
22 |
path = Path(file_path)
|
23 |
if path.suffix.lower() == ".pdf":
|
24 |
-
|
25 |
elif path.suffix.lower() == ".docx":
|
26 |
doc = Document(file_path)
|
27 |
-
|
28 |
else:
|
29 |
raise ValueError("Unsupported file format")
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
# --------------------
|
32 |
-
# Parse Resume (returns only: full name, skills, education, experience)
|
33 |
-
# --------------------
|
34 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
35 |
text = extract_text(file_path)
|
36 |
entities = ner_pipeline(text)
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
for ent in entities:
|
44 |
label = ent["entity_group"].upper()
|
45 |
value = ent["word"].strip()
|
46 |
|
47 |
-
if label
|
48 |
name_parts.append(value)
|
49 |
-
elif label
|
50 |
skills.append(value)
|
51 |
-
elif label in ["EDUCATION", "DEGREE"]:
|
52 |
education.append(value)
|
53 |
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
54 |
experience.append(value)
|
|
|
6 |
from docx import Document
|
7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
8 |
|
9 |
# --------------------
# Resume NER model setup
# --------------------
# Loads the "manishiitg/resume-ner" checkpoint from the Hugging Face Hub
# (PyTorch weights) and wraps it in a token-classification pipeline.
# `aggregation_strategy="simple"` merges sub-word tokens into whole-word
# entity spans, which is why downstream code reads `entity_group` / `word`
# from each detected entity.
# NOTE(review): this executes at import time and downloads the model on the
# first run — confirm that cold-start cost is acceptable for the deployment.
MODEL_NAME = "manishiitg/resume-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
14 |
|
|
|
|
|
|
|
15 |
def extract_text(file_path: str) -> str:
    """Extract plain text from a .pdf or .docx resume file.

    Line breaks are collapsed to spaces so the NER pipeline receives a
    single flat string.

    Args:
        file_path: Path to the uploaded resume on disk.

    Returns:
        The extracted, whitespace-flattened text.

    Raises:
        ValueError: If the file extension is neither .pdf nor .docx.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == ".pdf":
        raw = pdf_extract_text(file_path)
    elif suffix == ".docx":
        paragraphs = Document(file_path).paragraphs
        raw = "\n".join(p.text for p in paragraphs)
    else:
        raise ValueError("Unsupported file format")
    # Flatten newlines/carriage returns into spaces before NER.
    return raw.replace("\n", " ").replace("\r", " ").strip()
|
28 |
|
|
|
|
|
|
|
29 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
30 |
text = extract_text(file_path)
|
31 |
entities = ner_pipeline(text)
|
32 |
|
33 |
+
# Debug: Print actual detected entities
|
34 |
+
print("\n=== DEBUG: Entities Detected ===")
|
35 |
+
for ent in entities:
|
36 |
+
print(f"{ent['entity_group']} => {ent['word']}")
|
37 |
+
print("==============================\n")
|
38 |
+
|
39 |
+
name_parts, skills, education, experience = [], [], [], []
|
40 |
|
41 |
for ent in entities:
|
42 |
label = ent["entity_group"].upper()
|
43 |
value = ent["word"].strip()
|
44 |
|
45 |
+
if label in ["NAME", "PERSON"]:
|
46 |
name_parts.append(value)
|
47 |
+
elif label in ["SKILL", "SKILLS"]:
|
48 |
skills.append(value)
|
49 |
+
elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
|
50 |
education.append(value)
|
51 |
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
52 |
experience.append(value)
|