# Codingo / backend / services / resume_parser.py
# (last update by husseinelsaadi, revision b8deff5, 2.16 kB)
import json
from pathlib import Path
from typing import Dict, Optional

from docx import Document
from pdfminer.high_level import extract_text as pdf_extract_text
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
# Pretrained NER model for resume entity extraction (names, skills, etc.).
MODEL_NAME = "manishiitg/resume-ner"
# Loaded once at import time; the first run downloads the weights from the
# Hugging Face Hub, so importing this module can be slow / require network.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
# aggregation_strategy="simple" merges sub-word tokens into whole-entity
# spans, so each result dict carries an "entity_group" and a full "word".
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def extract_text(file_path: str) -> str:
    """Extract plain text from a .pdf or .docx resume file.

    Args:
        file_path: Path to the resume on disk.

    Returns:
        The document text with all runs of whitespace (newlines, tabs,
        repeated spaces) collapsed to single spaces.

    Raises:
        ValueError: If the file extension is neither .pdf nor .docx.
    """
    suffix = Path(file_path).suffix.lower()  # computed once, not per branch
    if suffix == ".pdf":
        text = pdf_extract_text(file_path)
    elif suffix == ".docx":
        doc = Document(file_path)
        text = "\n".join(p.text for p in doc.paragraphs)
    else:
        # Include the offending extension so the caller can see what failed.
        raise ValueError(f"Unsupported file format: {suffix or file_path!r}")
    # Normalize whitespace in one pass; the previous replace() chain left
    # double spaces and tabs behind.
    return " ".join(text.split())
def parse_resume(file_path: str, filename: Optional[str] = None) -> Dict[str, str]:
    """Parse a resume file into name/skills/education/experience fields.

    Args:
        file_path: Path to a .pdf or .docx resume (see ``extract_text``).
        filename: Original upload filename; currently unused, kept for
            backward compatibility with existing callers.

    Returns:
        Dict with keys "name", "skills", "education" and "experience".
        Each value is a deduplicated, comma-separated string (the name is
        space-separated), or "Not Found" when no entity of that kind was
        detected by the NER model.

    Raises:
        ValueError: Propagated from ``extract_text`` for unsupported files.
    """
    text = extract_text(file_path)
    # NOTE(review): very long resumes may exceed the model's max sequence
    # length — confirm the pipeline's truncation behavior.
    entities = ner_pipeline(text)

    name_parts: list = []
    skills: list = []
    education: list = []
    experience: list = []
    # Map every label alias the model may emit onto its output bucket.
    buckets = {
        "NAME": name_parts, "PERSON": name_parts,
        "SKILL": skills, "SKILLS": skills,
        "EDUCATION": education, "DEGREE": education, "QUALIFICATION": education,
        "EXPERIENCE": experience, "JOB": experience, "ROLE": experience,
        "POSITION": experience,
    }
    for ent in entities:
        target = buckets.get(ent["entity_group"].upper())
        if target is not None:
            target.append(ent["word"].strip())

    def _joined(parts, sep=", "):
        # Deduplicate while preserving first-seen order (dict.fromkeys),
        # falling back to "Not Found" for empty buckets.
        return sep.join(dict.fromkeys(parts)) or "Not Found"

    return {
        "name": _joined(name_parts, " "),
        "skills": _joined(skills),
        "education": _joined(education),
        "experience": _joined(experience),
    }