Spaces:
Paused
Paused
Commit
·
b8deff5
1
Parent(s):
288175b
updated
Browse files
backend/services/resume_parser.py
CHANGED
@@ -6,49 +6,47 @@ from pdfminer.high_level import extract_text as pdf_extract_text
|
|
6 |
from docx import Document
|
7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
8 |
|
9 |
-
|
10 |
-
# Load PyTorch Resume NER Model
|
11 |
-
# --------------------
|
12 |
-
MODEL_NAME = "manishiitg/resume-ner" # Works with PyTorch on Hugging Face Spaces
|
13 |
|
14 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
15 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
16 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
17 |
|
18 |
-
# --------------------
|
19 |
-
# Extract Text from PDF/DOCX
|
20 |
-
# --------------------
|
21 |
def extract_text(file_path: str) -> str:
|
22 |
path = Path(file_path)
|
23 |
if path.suffix.lower() == ".pdf":
|
24 |
-
|
25 |
elif path.suffix.lower() == ".docx":
|
26 |
doc = Document(file_path)
|
27 |
-
|
28 |
else:
|
29 |
raise ValueError("Unsupported file format")
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
# --------------------
|
32 |
-
# Parse Resume (returns only: full name, skills, education, experience)
|
33 |
-
# --------------------
|
34 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
35 |
text = extract_text(file_path)
|
36 |
entities = ner_pipeline(text)
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
42 |
|
43 |
for ent in entities:
|
44 |
label = ent["entity_group"].upper()
|
45 |
value = ent["word"].strip()
|
46 |
|
47 |
-
if label
|
48 |
name_parts.append(value)
|
49 |
-
elif label
|
50 |
skills.append(value)
|
51 |
-
elif label in ["EDUCATION", "DEGREE"]:
|
52 |
education.append(value)
|
53 |
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
54 |
experience.append(value)
|
|
|
6 |
from docx import Document
|
7 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
8 |
|
9 |
# --------------------
# Resume NER model setup
# --------------------
# Loads the "manishiitg/resume-ner" checkpoint from the Hugging Face Hub
# (PyTorch weights) and wraps it in a token-classification pipeline.
# `aggregation_strategy="simple"` merges sub-word tokens into whole-word
# entity spans, which is why downstream code reads `entity_group` / `word`
# from each detected entity.
# NOTE(review): this executes at import time and downloads the model on the
# first run — confirm that cold-start cost is acceptable for the deployment.
MODEL_NAME = "manishiitg/resume-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
14 |
|
|
|
|
|
|
|
15 |
def extract_text(file_path: str) -> str:
    """Extract plain text from a .pdf or .docx resume file.

    Line breaks are collapsed to spaces so the NER pipeline receives a
    single flat string.

    Args:
        file_path: Path to the uploaded resume on disk.

    Returns:
        The extracted, whitespace-flattened text.

    Raises:
        ValueError: If the file extension is neither .pdf nor .docx.
    """
    suffix = Path(file_path).suffix.lower()
    if suffix == ".pdf":
        raw = pdf_extract_text(file_path)
    elif suffix == ".docx":
        paragraphs = Document(file_path).paragraphs
        raw = "\n".join(p.text for p in paragraphs)
    else:
        raise ValueError("Unsupported file format")
    # Flatten newlines/carriage returns into spaces before NER.
    return raw.replace("\n", " ").replace("\r", " ").strip()
|
28 |
|
|
|
|
|
|
|
29 |
def parse_resume(file_path: str, filename: str = None) -> Dict[str, str]:
|
30 |
text = extract_text(file_path)
|
31 |
entities = ner_pipeline(text)
|
32 |
|
33 |
+
# Debug: Print actual detected entities
|
34 |
+
print("\n=== DEBUG: Entities Detected ===")
|
35 |
+
for ent in entities:
|
36 |
+
print(f"{ent['entity_group']} => {ent['word']}")
|
37 |
+
print("==============================\n")
|
38 |
+
|
39 |
+
name_parts, skills, education, experience = [], [], [], []
|
40 |
|
41 |
for ent in entities:
|
42 |
label = ent["entity_group"].upper()
|
43 |
value = ent["word"].strip()
|
44 |
|
45 |
+
if label in ["NAME", "PERSON"]:
|
46 |
name_parts.append(value)
|
47 |
+
elif label in ["SKILL", "SKILLS"]:
|
48 |
skills.append(value)
|
49 |
+
elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
|
50 |
education.append(value)
|
51 |
elif label in ["EXPERIENCE", "JOB", "ROLE", "POSITION"]:
|
52 |
experience.append(value)
|