Spaces:
Paused
Paused
File size: 3,977 Bytes
6248af7 33fa314 6248af7 c0dac84 6248af7 c0dac84 6248af7 c0dac84 6248af7 c0dac84 f2a1cfa c0dac84 f2a1cfa 6248af7 c0dac84 f2a1cfa 6248af7 c0dac84 f2a1cfa c0dac84 f2a1cfa c0dac84 f2a1cfa c0dac84 6248af7 c0dac84 6248af7 c0dac84 6248af7 c0dac84 6248af7 c0dac84 6248af7 c0dac84 6248af7 c0dac84 6248af7 b336194 c0dac84 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import re
from pathlib import Path
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
class ResumeParser:
    """Extract a candidate name and basic sections from PDF/DOCX resumes.

    The parser is heuristic: it works on plain text, relies on line breaks
    to find the name and on heading keywords ("Skills", "Education",
    "Experience", ...) to find sections.
    """

    # Words that mark a document title line rather than a person's name.
    _TITLE_WORDS = frozenset({"resume", "cv", "curriculum"})

    # 2-4 capitalised words, e.g. "John Smith" or "Mary Ann Van Dyke".
    _NAME_PATTERN = re.compile(r"^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$")

    # Heading keywords (regex alternations) for every recognised section.
    _SECTION_KEYWORDS = {
        "skills": r"skills|technologies|expertise",
        "education": r"education|degrees?",
        "experience": r"experience|work history|employment",
    }
    _ANY_HEADING = "|".join(_SECTION_KEYWORDS.values())

    # A section ends at a blank line, at a line that starts with any section
    # heading keyword, or at the end of the text.
    _SECTION_END = rf"\n\s*\n|\n[^\S\n]*(?:{_ANY_HEADING})\b|\Z"

    # Caps applied by parse_resume() to keep the result compact.
    _MAX_SKILLS = 10
    _MAX_SECTION_ITEMS = 3

    def __init__(self):
        pass

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse horizontal whitespace while keeping the line structure.

        Runs of spaces/tabs/form feeds become one space, every line is
        trimmed, and three or more consecutive newlines shrink to a single
        blank line.
        """
        collapsed = re.sub(r"[^\S\n]+", " ", text)
        trimmed = "\n".join(line.strip() for line in collapsed.split("\n"))
        return re.sub(r"\n{3,}", "\n\n", trimmed).strip()

    @classmethod
    def _is_title_line(cls, line: str) -> bool:
        """Return True when the line contains a word such as "Resume" or "CV"."""
        return any(
            word.strip(".,:;-").lower() in cls._TITLE_WORDS
            for word in line.split()
        )

    @staticmethod
    def _result(name: str, skills=None, education=None, experience=None) -> dict:
        """Build the result dictionary returned by parse_resume()."""
        return {
            "name": name,
            "skills": list(skills or []),
            "education": list(education or []),
            "experience": list(experience or []),
        }

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        Args:
            file_path: Path to the resume; the format is chosen from the
                file extension (case-insensitive).

        Returns:
            The document text with line breaks preserved.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
        """
        suffix = Path(file_path).suffix.lower()
        if suffix == ".pdf":
            # Keep newlines: name and section detection both depend on them.
            return self._normalize_whitespace(pdf_extract_text(file_path))
        if suffix == ".docx":
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        raise ValueError("Unsupported file format")

    def extract_name(self, text: str) -> str:
        """Guess the candidate's name from the first lines of the resume.

        Only the first ten lines are considered. A line made of 2-4
        capitalised words wins; otherwise the first 2-4 word line that starts
        with an uppercase letter is used. Title lines ("Resume",
        "Curriculum Vitae", ...) are skipped in both passes.

        Returns:
            The detected name, or ``"Not Found"``.
        """
        first_lines = [
            line.strip() for line in text.split("\n")[:10] if line.strip()
        ]
        candidates = [
            line for line in first_lines if not self._is_title_line(line)
        ]

        # Pass 1: strict "Firstname Lastname" shape.
        for line in candidates:
            if self._NAME_PATTERN.match(line):
                return line

        # Pass 2: looser shape (covers "JANE DOE", "Jean-Luc Picard", ...).
        for line in candidates:
            if 2 <= len(line.split()) <= 4 and line[0].isupper():
                return line

        return "Not Found"

    def extract_sections(self, text: str) -> dict:
        """Extract skills, education and experience using regexes.

        For each section the first occurrence of one of its heading keywords
        is located (case-insensitive) and everything up to the next blank
        line, the next section heading line, or the end of the text is taken
        as the section body.

        Returns:
            A dict with the keys ``"skills"``, ``"education"`` and
            ``"experience"``, each mapping to a list of stripped, non-empty
            strings (empty when the section was not found).
        """
        results = {"skills": [], "education": [], "experience": []}

        for section, keywords in self._SECTION_KEYWORDS.items():
            pattern = rf"(?:{keywords})[:\s]*(.*?)(?={self._SECTION_END})"
            # DOTALL lets the lazy body span several lines.
            match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
            if not match:
                continue
            body = match.group(1)
            # Skills are usually comma/semicolon separated, possibly over
            # several lines; the other sections hold one entry per line.
            separator = r"[,;\n]" if section == "skills" else r"\n"
            results[section] = [
                item.strip() for item in re.split(separator, body) if item.strip()
            ]

        return results

    def parse_resume(self, file_path: str) -> dict:
        """Parse a resume file into name, skills, education and experience.

        This is a best-effort entry point that never raises: any failure is
        reported through the ``"name"`` field as ``"Error: <message>"`` with
        empty section lists.

        Returns:
            A dict with ``"name"`` (str) and the lists ``"skills"`` (at most
            10 items), ``"education"`` and ``"experience"`` (at most 3 items
            each).
        """
        try:
            text = self.extract_text(file_path)
            if not text or len(text.strip()) < 10:
                return self._result("Error: Empty file")

            name = self.extract_name(text)
            sections = self.extract_sections(text)
            return self._result(
                name,
                sections["skills"][: self._MAX_SKILLS],
                sections["education"][: self._MAX_SECTION_ITEMS],
                sections["experience"][: self._MAX_SECTION_ITEMS],
            )
        except Exception as e:  # deliberate catch-all at the public boundary
            return self._result(f"Error: {e}")
# Shared module-level ResumeParser instance, created once at import time and
# used by the module-level parse_resume() function below.
resume_parser = ResumeParser()
def parse_resume(file_path: str) -> dict:
    """Public interface for resume parsing.

    Delegates to the shared module-level ``resume_parser`` instance.

    Args:
        file_path: Path to the resume file to parse.

    Returns:
        The dictionary produced by ``ResumeParser.parse_resume``, with the
        keys ``"name"``, ``"skills"``, ``"education"`` and ``"experience"``.
    """
    return resume_parser.parse_resume(file_path)