from __future__ import annotations

import os
import re
import subprocess
import zipfile
from typing import Dict

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# ===============================
# Load Model & Tokenizer
# ===============================
MODEL_ID = "sravya-abburi/ResumeParserBERT"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)

ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 if torch.cuda.is_available() else -1,
)
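
# The label names matched in parse_with_bert below (NAME, SKILL, EDUCATION,
# EXPERIENCE) are assumptions about this checkpoint's label set. To confirm
# what the model actually emits, inspect its config (uncomment locally):
# print(model.config.id2label)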

# ===============================
# Text Extraction
# ===============================
def extract_text(file_path: str) -> str:
    """Extract raw text from a PDF or DOCX file."""
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith(".pdf"):
            # Shell out to pdftotext (poppler-utils); "-" writes to stdout.
            result = subprocess.run(
                ["pdftotext", "-layout", file_path, "-"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode("utf-8", errors="ignore")
        elif lower_name.endswith(".docx"):
            # A .docx file is a ZIP archive; the body text lives in word/document.xml.
            with zipfile.ZipFile(file_path) as zf:
                with zf.open("word/document.xml") as docx_xml:
                    xml_bytes = docx_xml.read()
            xml_text = xml_bytes.decode("utf-8", errors="ignore")
            # Turn paragraph openings into newlines, then strip the remaining tags.
            xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
            text = re.sub(r"<[^>]+>", " ", xml_text)
            # Collapse runs of spaces/tabs, but keep the newlines inserted above
            # (collapsing all whitespace would erase the paragraph breaks).
            text = re.sub(r"[ \t]+", " ", text)
            return re.sub(r"\n\s*\n+", "\n", text).strip()
        else:
            return ""
    except Exception:
        return ""

# ===============================
# Parse Resume using BERT NER
# ===============================
def parse_with_bert(text: str) -> Dict[str, str]:
    """Parse resume text into structured fields using BERT NER.

    Note: BERT-style encoders are typically capped at 512 tokens, so very
    long resumes may be truncated (or rejected) by the pipeline.
    """
    entities = ner_pipeline(text)
    name_tokens, skill_tokens, edu_tokens, exp_tokens = [], [], [], []
    for ent in entities:
        label = ent["entity_group"].upper()
        word = ent["word"].strip()
        if label == "NAME" and word not in name_tokens:
            name_tokens.append(word)
        elif label == "SKILL" and word not in skill_tokens:
            skill_tokens.append(word)
        elif label == "EDUCATION" and word not in edu_tokens:
            edu_tokens.append(word)
        elif label == "EXPERIENCE" and word not in exp_tokens:
            exp_tokens.append(word)
    return {
        "name": " ".join(name_tokens),
        "skills": ", ".join(skill_tokens),
        "education": ", ".join(edu_tokens),
        "experience": ", ".join(exp_tokens),
    }
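
# For inputs longer than the encoder's window, one workable approach (a sketch,
# not part of the original pipeline) is to run NER over overlapping character
# chunks and merge the entity lists before grouping them by label:
def _ner_chunked(text: str, chunk_chars: int = 1500, overlap: int = 200) -> list:
    """Hypothetical helper: run ner_pipeline over overlapping character windows."""
    entities = []
    start = 0
    while start < len(text):
        entities.extend(ner_pipeline(text[start:start + chunk_chars]))
        start += chunk_chars - overlap
    return entities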

# ===============================
# Main Parse Function
# ===============================
def parse_resume(file_path: str, filename: str) -> dict:
    """Main entry point for resume parsing."""
    text = extract_text(file_path)
    if not text:
        return {"name": "", "skills": "", "education": "", "experience": ""}
    ents = parse_with_bert(text)
    # Fallback: derive a name from the filename if the model doesn't find one
    if not ents["name"]:
        base = os.path.basename(filename)
        base = re.sub(r"\.(pdf|docx|doc)$", "", base, flags=re.I)
        ents["name"] = re.sub(r"[\._-]+", " ", base).title().strip()
    return ents
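
# Minimal local smoke test — a usage sketch; the file path is hypothetical
# and not shipped with this repo:
if __name__ == "__main__":
    sample = "sample_resume.pdf"
    print(parse_resume(sample, sample))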