# Spaces: Paused  (page-header residue from the hosting site; not part of the module)
import logging
import re
from pathlib import Path

from docx import Document
from pdfminer.high_level import extract_text as pdf_extract_text
class ResumeParser:
    """Heuristic resume parser for PDF and DOCX files.

    Text is extracted with pdfminer (PDF) or python-docx (DOCX), normalised so
    that line and blank-line structure survives, and then scanned with simple
    regular expressions for a candidate name and for the skills, education
    and experience sections.
    """

    # Words that mark a document title line rather than a person's name.
    _TITLE_WORDS = frozenset({"resume", "cv", "curriculum", "vitae"})

    # Strict name shape: 2-4 alphabetic words, each capitalised.
    _NAME_PATTERN = re.compile(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$')

    # Only the first few non-empty lines are considered when looking for a name.
    _MAX_NAME_LINES = 10

    # DOTALL lets a section body span several lines; without it `.` stops at
    # the first newline and multi-line sections can never match.
    _SECTION_FLAGS = re.IGNORECASE | re.DOTALL

    # key -> (pattern capturing the section body, pattern splitting it into items).
    # A section body runs from its heading to the next blank line or end of text.
    _SECTION_RULES = {
        "skills": (
            re.compile(
                r'(?:skills|technologies|expertise)[:\s]*(.*?)(?:\n\s*\n|$)',
                _SECTION_FLAGS,
            ),
            # Skills may be listed inline (comma/semicolon) or one per line.
            re.compile(r'[,;\n]'),
        ),
        "education": (
            re.compile(
                r'(?:education|degrees?)[:\s]*(.*?)(?:\n\s*\n|$)',
                _SECTION_FLAGS,
            ),
            re.compile(r'\n'),
        ),
        "experience": (
            re.compile(
                r'(?:experience|work history|employment)[:\s]*(.*?)(?:\n\s*\n|$)',
                _SECTION_FLAGS,
            ),
            re.compile(r'\n'),
        ),
    }

    # Maximum number of items reported per section by parse_resume().
    _RESULT_LIMITS = {"skills": 10, "education": 3, "experience": 3}

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse whitespace inside each line while keeping line structure.

        Runs of spaces/tabs within a line become a single space, and runs of
        blank lines become one blank line. Newlines are preserved because
        name and section detection rely on line and blank-line boundaries.
        """
        cleaned = []
        for raw_line in text.splitlines():
            line = " ".join(raw_line.split())
            # Keep a blank line only as a single separator between content lines.
            if line or (cleaned and cleaned[-1]):
                cleaned.append(line)
        return "\n".join(cleaned).strip()

    @staticmethod
    def _empty_result(name: str) -> dict:
        """Return a result dict with the given name field and empty sections."""
        return {"name": name, "skills": [], "education": [], "experience": []}

    def extract_text(self, file_path: str) -> str:
        """Extract whitespace-normalised text from a PDF or DOCX file.

        Args:
            file_path: Path to the file; the format is chosen by its suffix
                (case-insensitive).

        Returns:
            The document text with one line per source line/paragraph and
            blank lines preserved as section separators.

        Raises:
            ValueError: If the suffix is neither ``.pdf`` nor ``.docx``.
        """
        suffix = Path(file_path).suffix.lower()
        if suffix == ".pdf":
            raw_text = pdf_extract_text(file_path)
        elif suffix == ".docx":
            # Empty paragraphs are kept so they can act as section separators.
            raw_text = "\n".join(p.text for p in Document(file_path).paragraphs)
        else:
            raise ValueError("Unsupported file format")
        return self._normalize_whitespace(raw_text)

    def extract_name(self, text: str) -> str:
        """Guess the candidate's name from the top of the resume text.

        The first non-empty lines are checked against a strict
        "Capitalised Words" pattern; if none matches, the first 2-4 word line
        starting with an uppercase letter is used. Title lines such as
        "Curriculum Vitae" are never returned.

        Returns:
            The chosen line, or ``"Not Found"`` when no line qualifies.
        """
        stripped = (line.strip() for line in text.splitlines())
        first_lines = [line for line in stripped if line][:self._MAX_NAME_LINES]

        # Pass 1: strict pattern, rejecting lines that mention a title word.
        for line in first_lines:
            if not self._NAME_PATTERN.match(line):
                continue
            if any(word.lower() in self._TITLE_WORDS for word in line.split()):
                continue
            return line

        # Pass 2: looser shape check for names the strict pattern misses
        # (hyphens, apostrophes, all-caps). Lines made up only of title words
        # are skipped so a heading is not reported as the name.
        for line in first_lines:
            words = line.split()
            if not 2 <= len(words) <= 4 or not line[0].isupper():
                continue
            if all(w.strip(".,:;-|").lower() in self._TITLE_WORDS for w in words):
                continue
            return line

        return "Not Found"

    def extract_sections(self, text: str) -> dict:
        """Extract skills, education and experience entries using regexes.

        For each section the first matching heading keyword is located and
        the text up to the next blank line (or end of text) is split into
        items: skills on commas, semicolons and newlines; education and
        experience on newlines.

        Returns:
            A dict with keys ``"skills"``, ``"education"`` and
            ``"experience"``, each a list of non-empty stripped strings
            (empty when the section is not found).
        """
        results = {"skills": [], "education": [], "experience": []}
        for key, (section_pattern, item_splitter) in self._SECTION_RULES.items():
            match = section_pattern.search(text)
            if match is None:
                continue
            items = (item.strip() for item in item_splitter.split(match.group(1)))
            results[key] = [item for item in items if item]
        return results

    def parse_resume(self, file_path: str) -> dict:
        """Parse a resume file into name, skills, education and experience.

        This method never raises: failures are logged and reported through
        the ``"name"`` field as ``"Error: <message>"`` with empty sections.

        Returns:
            A dict with keys ``"name"`` (str), ``"skills"`` (at most 10
            items), ``"education"`` and ``"experience"`` (at most 3 items
            each).
        """
        try:
            text = self.extract_text(file_path)
            if not text or len(text.strip()) < 10:
                return self._empty_result("Error: Empty file")

            result = {"name": self.extract_name(text)}
            sections = self.extract_sections(text)
            for key, limit in self._RESULT_LIMITS.items():
                result[key] = sections[key][:limit]
            return result
        except Exception as exc:
            # Deliberate best-effort boundary: callers get an error result
            # instead of an exception, but the traceback is kept in the log.
            logging.getLogger(__name__).exception(
                "Failed to parse resume %r", file_path
            )
            return self._empty_result(f"Error: {exc}")
# Shared module-level parser instance, created once at import time.
resume_parser = ResumeParser()
def parse_resume(file_path: str) -> dict:
    """Parse the resume at *file_path* using the shared module-level parser.

    Thin public wrapper that delegates to ``resume_parser.parse_resume``.

    Args:
        file_path: Path to the resume file to parse.

    Returns:
        The dict produced by ``ResumeParser.parse_resume`` with keys
        ``"name"``, ``"skills"``, ``"education"`` and ``"experience"``.
    """
    return resume_parser.parse_resume(file_path)