Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

Codingo / backend /models /resume_parser /resume_to_features.py

husseinelsaadi

updated

9e72d2c 28 days ago

raw

history blame

9.32 kB

	import os
	import re
	import json
	from pathlib import Path
	import PyPDF2
	from docx import Document
	import textract

	class SimpleResumeParser:
	    """Heuristic resume parser built on keyword lists and regular expressions.

	    Text is pulled out of PDF, DOCX or DOC files and then scanned for contact
	    details, skills, education, experience and a summary. Extractors return
	    an empty string or an empty list when nothing is found.
	    """

	    def __init__(self):
	        """Set up the keyword lists consulted by the extractors."""
	        # Common skills keywords
	        self.skills_keywords = [
	            'python', 'javascript', 'java', 'c++', 'c#', 'php', 'ruby', 'go', 'rust',
	            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'express', 'django',
	            'flask', 'spring', 'laravel', 'rails', 'asp.net', 'jquery', 'bootstrap',
	            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch',
	            'aws', 'azure', 'gcp', 'docker', 'kubernetes', 'jenkins', 'git', 'github',
	            'machine learning', 'deep learning', 'tensorflow', 'pytorch', 'scikit-learn',
	            'data analysis', 'pandas', 'numpy', 'matplotlib', 'tableau', 'power bi',
	            'agile', 'scrum', 'devops', 'ci/cd', 'microservices', 'api', 'rest', 'graphql'
	        ]

	        # Education keywords
	        self.education_keywords = [
	            'bachelor', 'master', 'phd', 'degree', 'university', 'college', 'institute',
	            'computer science', 'engineering', 'mathematics', 'physics', 'chemistry',
	            'business', 'mba', 'certification', 'diploma'
	        ]

	        # Experience keywords
	        self.experience_keywords = [
	            'experience', 'worked', 'developed', 'managed', 'led', 'created', 'built',
	            'designed', 'implemented', 'maintained', 'optimized', 'improved', 'years'
	        ]

	    def extract_text_from_pdf(self, file_path):
	        """Extract text from a PDF file.

	        Each page's text is followed by a newline. Any failure is printed and
	        an empty string is returned.
	        """
	        try:
	            with open(file_path, 'rb') as file:
	                reader = PyPDF2.PdfReader(file)
	                # `or ""` guards against a page yielding None instead of a
	                # string, which would otherwise discard the whole document.
	                pages = [page.extract_text() or "" for page in reader.pages]
	            return "".join(page_text + "\n" for page_text in pages)
	        except Exception as e:
	            print(f"Error reading PDF: {e}")
	            return ""

	    def extract_text_from_docx(self, file_path):
	        """Extract text from a DOCX file, one paragraph per line.

	        Any failure is printed and an empty string is returned.
	        """
	        try:
	            doc = Document(file_path)
	            return "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
	        except Exception as e:
	            print(f"Error reading DOCX: {e}")
	            return ""

	    def extract_text_from_doc(self, file_path):
	        """Extract text from a DOC file using textract (decoded as UTF-8).

	        Any failure is printed and an empty string is returned.
	        """
	        try:
	            return textract.process(file_path).decode('utf-8')
	        except Exception as e:
	            print(f"Error reading DOC: {e}")
	            return ""

	    def extract_text(self, file_path):
	        """Extract text using the reader that matches the file extension.

	        The extension is compared case-insensitively. Anything other than
	        .pdf, .docx or .doc yields an empty string.
	        """
	        readers = {
	            '.pdf': self.extract_text_from_pdf,
	            '.docx': self.extract_text_from_docx,
	            '.doc': self.extract_text_from_doc,
	        }
	        reader = readers.get(Path(file_path).suffix.lower())
	        return reader(file_path) if reader else ""

	    def extract_email(self, text):
	        """Return the first email address found in ``text``, or ""."""
	        # The top-level domain is letters only; the previous character class
	        # also accepted a literal '|', so "a@b.com|x" came back as "a@b.com|".
	        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
	        match = re.search(email_pattern, text)
	        return match.group(0) if match else ""

	    def extract_phone(self, text):
	        """Return the first phone number found in ``text``, or "".

	        Patterns are tried in order. For patterns with several capture
	        groups the captured digit groups are concatenated, so separators
	        and parentheses are dropped from the result.
	        """
	        phone_patterns = [
	            # Optional +1 prefix, area code with optional parentheses.
	            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
	            # Country code followed by three digit groups.
	            r'\+?([0-9]{1,3})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})[-.\s]?([0-9]{3,4})',
	            r'(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})',
	            r'(\(\d{3}\)\s?\d{3}[-.\s]?\d{4})'
	        ]

	        for pattern in phone_patterns:
	            matches = re.findall(pattern, text)
	            if matches:
	                first = matches[0]
	                # findall yields tuples when a pattern has several groups.
	                return ''.join(first) if isinstance(first, tuple) else first
	        return ""

	    def extract_name(self, text):
	        """Guess the candidate's name from the first five lines.

	        A line qualifies when it is exactly two alphabetic words and does not
	        contain 'resume', 'cv' or 'curriculum'. The result is title-cased;
	        "" is returned when no line qualifies.
	        """
	        title_words = ['resume', 'cv', 'curriculum']
	        for raw_line in text.split('\n')[:5]:
	            candidate = raw_line.strip()
	            if len(candidate.split()) != 2:
	                continue
	            if not candidate.replace(' ', '').isalpha():
	                continue
	            if any(word in candidate.lower() for word in title_words):
	                continue
	            return candidate.title()
	        return ""

	    def extract_skills(self, text):
	        """Return the known skills mentioned in ``text``.

	        Matching is case-insensitive and requires the keyword to stand on
	        its own: it must not be preceded or followed by another letter, apart
	        from an optional plural 's'. This stops 'go' matching "Google" or
	        'java' matching "JavaScript". Results are title-cased and listed in
	        the order of ``self.skills_keywords``.
	        """
	        text_lower = text.lower()
	        found_skills = []

	        for skill in self.skills_keywords:
	            needle = re.escape(skill.lower())
	            # Only letters count as "inside a word", so "HTML5" still
	            # matches 'html' and "C++," still matches 'c++'.
	            pattern = rf'(?<![a-z]){needle}s?(?![a-z])'
	            if re.search(pattern, text_lower):
	                found_skills.append(skill.title())

	        # Remove duplicates while keeping a deterministic order.
	        return list(dict.fromkeys(found_skills))

	    def _find_education_entries(self, text):
	        """Collect unique snippets of up to 50 chars either side of each education keyword."""
	        entries = []
	        for keyword in self.education_keywords:
	            # '.' does not cross newlines, so a snippet stays on one line.
	            pattern = rf'.{{0,50}}{re.escape(keyword)}.{{0,50}}'
	            for match in re.findall(pattern, text, re.IGNORECASE):
	                snippet = match.strip()
	                # Several keywords often hit the same line; keep it once.
	                if snippet and snippet not in entries:
	                    entries.append(snippet)
	        return entries

	    def extract_education(self, text):
	        """Return up to three education snippets.

	        The education section is searched first: the lines after a line
	        containing 'education', 'academic' or 'qualification', up to a line
	        containing 'experience', 'work', 'employment' or 'project'. If that
	        yields nothing, the whole text is searched instead.
	        """
	        start_markers = ['education', 'academic', 'qualification']
	        end_markers = ['experience', 'work', 'employment', 'project']

	        section_lines = []
	        in_education_section = False
	        for line in text.split('\n'):
	            line_lower = line.lower()
	            if any(marker in line_lower for marker in start_markers):
	                in_education_section = True
	                continue
	            if not in_education_section:
	                continue
	            if any(marker in line_lower for marker in end_markers):
	                break
	            section_lines.append(line)

	        education = self._find_education_entries('\n'.join(section_lines))
	        if not education:
	            education = self._find_education_entries(text)

	        return education[:3]  # Return top 3 education entries

	    def extract_experience(self, text):
	        """Return up to three work-experience entries.

	        The section starts after a line containing 'experience', 'work',
	        'employment' or 'career' and ends at a line containing 'education',
	        'skill' or 'project'. Blank lines separate entries; the lines of one
	        entry are joined with single spaces.
	        """
	        start_markers = ['experience', 'work', 'employment', 'career']
	        end_markers = ['education', 'skill', 'project']

	        experience = []
	        current_lines = []
	        in_experience_section = False

	        for line in text.split('\n'):
	            line_lower = line.lower()
	            # NOTE: any line containing a start marker is skipped, including
	            # content lines inside the section (e.g. "Worked on ...").
	            if any(marker in line_lower for marker in start_markers):
	                in_experience_section = True
	                continue
	            if not in_experience_section:
	                continue
	            if any(marker in line_lower for marker in end_markers):
	                # The pending entry is flushed once, after the loop.
	                break
	            if line.strip():
	                current_lines.append(line)
	            elif current_lines:
	                experience.append(' '.join(current_lines).strip())
	                current_lines = []

	        if current_lines:
	            experience.append(' '.join(current_lines).strip())

	        return experience[:3]  # Return top 3 experience entries

	    def extract_summary(self, text):
	        """Return the summary/objective, limited to 200 characters.

	        The summary is the non-blank content of the three lines following
	        the first line that contains 'summary', 'objective', 'profile' or
	        'about'. "" is returned when no such line exists.
	        """
	        heading_words = ['summary', 'objective', 'profile', 'about']
	        lines = text.split('\n')
	        summary = ""

	        for index, line in enumerate(lines):
	            line_lower = line.lower()
	            if any(word in line_lower for word in heading_words):
	                following = (item.strip() for item in lines[index + 1:index + 4])
	                summary = ' '.join(item for item in following if item)
	                break

	        return summary[:200]  # Limit to 200 characters

	def extract_resume_features(file_path):
	    """
	    Parse the resume at ``file_path`` and return its extracted fields.

	    The result is a dictionary with the keys 'name', 'email',
	    'mobile_number', 'skills', 'experience', 'education' and 'summary'.
	    When no text can be read, or any error occurs (the error is printed),
	    every field is empty.
	    """
	    def blank_features():
	        # Built on each call so callers never share the list objects.
	        return {
	            'name': '',
	            'email': '',
	            'mobile_number': '',
	            'skills': [],
	            'experience': [],
	            'education': [],
	            'summary': ''
	        }

	    try:
	        resume_parser = SimpleResumeParser()
	        resume_text = resume_parser.extract_text(file_path)

	        # Unreadable or unsupported files produce no text at all.
	        if not resume_text:
	            return blank_features()

	        # Run every extractor over the same text.
	        return {
	            'name': resume_parser.extract_name(resume_text),
	            'email': resume_parser.extract_email(resume_text),
	            'mobile_number': resume_parser.extract_phone(resume_text),
	            'skills': resume_parser.extract_skills(resume_text),
	            'experience': resume_parser.extract_experience(resume_text),
	            'education': resume_parser.extract_education(resume_text),
	            'summary': resume_parser.extract_summary(resume_text)
	        }

	    except Exception as e:
	        print(f"Error extracting resume features: {e}")
	        return blank_features()

	# For backward compatibility
	def parse_resume(file_path):
	    """Older entry-point name; delegates to extract_resume_features."""
	    features = extract_resume_features(file_path)
	    return features