Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

Codingo / backend /services /resume_parser.py

husseinelsaadi

updated

c0dac84 19 days ago

raw

history blame

3.98 kB

	import re
	from pathlib import Path
	from pdfminer.high_level import extract_text as pdf_extract_text
	from docx import Document

	class ResumeParser:
	    """Heuristic, regex-based resume parser.

	    Reads plain text out of PDF or DOCX files and extracts the candidate's
	    name together with the skills, education and experience sections.
	    """

	    # Words that mark a document-title line ("Resume", "Curriculum Vitae")
	    # rather than a person's name.
	    _TITLE_WORDS = frozenset({"resume", "cv", "curriculum"})

	    # Section pattern: a heading keyword, optional colon/whitespace, then the
	    # section body (captured lazily) up to the first blank line or the end of
	    # the text. DOTALL lets the body span several lines, which the per-line
	    # splitting of education and experience relies on.
	    _SECTION_FLAGS = re.IGNORECASE | re.DOTALL
	    _SECTION_TAIL = r'[:\s]*(.*?)(?:\n\s*\n|$)'
	    _SKILLS_RE = re.compile(
	        r'(?:skills|technologies|expertise)' + _SECTION_TAIL, _SECTION_FLAGS
	    )
	    _EDUCATION_RE = re.compile(
	        r'(?:education|degrees?)' + _SECTION_TAIL, _SECTION_FLAGS
	    )
	    _EXPERIENCE_RE = re.compile(
	        r'(?:experience|work history|employment)' + _SECTION_TAIL, _SECTION_FLAGS
	    )

	    def __init__(self):
	        pass

	    @staticmethod
	    def _normalize_whitespace(text: str) -> str:
	        """Collapse in-line whitespace and squeeze runs of blank lines to one.

	        Line breaks are kept on purpose: extract_name() and
	        extract_sections() both depend on them.
	        """
	        lines = [" ".join(line.split()) for line in text.splitlines()]
	        return re.sub(r'\n{3,}', '\n\n', "\n".join(lines)).strip()

	    @classmethod
	    def _is_title_line(cls, line: str) -> bool:
	        """Return True if any word of *line* is a resume-title word."""
	        return any(word.lower() in cls._TITLE_WORDS for word in line.split())

	    @staticmethod
	    def _error_result(message: str) -> dict:
	        """Build the result dict used to report a failure via the name field."""
	        return {
	            "name": f"Error: {message}",
	            "skills": [],
	            "education": [],
	            "experience": []
	        }

	    def extract_text(self, file_path: str) -> str:
	        """Extract text from a PDF or DOCX file.

	        Args:
	            file_path: Path to the file; the format is chosen from its
	                extension (case-insensitive).

	        Returns:
	            The document text. For PDFs, whitespace inside each line is
	            collapsed while line breaks are preserved. For DOCX, non-empty
	            paragraphs are joined with single newlines.

	        Raises:
	            ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
	        """
	        suffix = Path(file_path).suffix.lower()

	        if suffix == ".pdf":
	            # Do not flatten newlines here: a single-line text makes name
	            # detection (which scans the first lines) always fail.
	            return self._normalize_whitespace(pdf_extract_text(file_path))
	        if suffix == ".docx":
	            doc = Document(file_path)
	            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
	        raise ValueError("Unsupported file format")

	    def extract_name(self, text: str) -> str:
	        """Guess the candidate's name from the top of the resume text.

	        Only the non-empty lines among the first ten lines are considered.
	        Lines containing a title word such as "Resume" or "CV" are never
	        returned.

	        Returns:
	            The first line that looks like a name, or ``"Not Found"``.
	        """
	        candidates = [
	            line.strip() for line in text.split('\n')[:10] if line.strip()
	        ]
	        # Title lines ("Curriculum Vitae") satisfy both heuristics below, so
	        # they are filtered out once for the strict and the fallback pass.
	        candidates = [line for line in candidates if not self._is_title_line(line)]

	        # Strict pass: 2-4 words, each capitalised and otherwise lowercase.
	        for line in candidates:
	            if re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$', line):
	                return line

	        # Fallback: any 2-4 word line starting with an uppercase character.
	        for line in candidates:
	            if 2 <= len(line.split()) <= 4 and line[0].isupper():
	                return line

	        return "Not Found"

	    def extract_sections(self, text: str) -> dict:
	        """Extract skills, education and experience using regular expressions.

	        Each section starts at the first occurrence of one of its heading
	        keywords (case-insensitive) and runs to the next blank line or the
	        end of the text.

	        Returns:
	            A dict with the keys ``"skills"`` (split on commas/semicolons),
	            ``"education"`` and ``"experience"`` (split per line); a list is
	            empty when its section is not found.
	        """
	        results = {
	            "skills": [],
	            "education": [],
	            "experience": []
	        }

	        skills_match = self._SKILLS_RE.search(text)
	        if skills_match:
	            pieces = re.split(r'[,;]', skills_match.group(1))
	            results["skills"] = [s.strip() for s in pieces if s.strip()]

	        edu_match = self._EDUCATION_RE.search(text)
	        if edu_match:
	            results["education"] = [
	                e.strip() for e in edu_match.group(1).split('\n') if e.strip()
	            ]

	        exp_match = self._EXPERIENCE_RE.search(text)
	        if exp_match:
	            results["experience"] = [
	                x.strip() for x in exp_match.group(1).split('\n') if x.strip()
	            ]

	        return results

	    def parse_resume(self, file_path: str) -> dict:
	        """Parse a resume file into name, skills, education and experience.

	        This method never raises: any failure is reported through a
	        ``name`` value starting with ``"Error: "`` and empty lists.

	        Returns:
	            A dict with ``"name"``, up to 10 ``"skills"``, up to 3
	            ``"education"`` entries and up to 3 ``"experience"`` entries.
	        """
	        try:
	            text = self.extract_text(file_path)

	            if not text or len(text.strip()) < 10:
	                return self._error_result("Empty file")

	            name = self.extract_name(text)
	            sections = self.extract_sections(text)

	            return {
	                "name": name,
	                "skills": sections["skills"][:10],  # Limit to 10 skills
	                "education": sections["education"][:3],  # Limit to 3 items
	                "experience": sections["experience"][:3]  # Limit to 3 items
	            }

	        except Exception as e:
	            # Deliberate catch-all: callers get an error dict, not an exception.
	            return self._error_result(str(e))

	# Global instance: created once at import time and reused by the
	# module-level parse_resume() function for every call.
	resume_parser = ResumeParser()

	def parse_resume(file_path: str) -> dict:
	    """Parse the resume at *file_path* with the shared module-level parser.

	    Thin functional entry point that delegates to
	    ``resume_parser.parse_resume`` and hands back its result dict unchanged.
	    """
	    parsed = resume_parser.parse_resume(file_path)
	    return parsed