"""Heuristic resume parsing.

Extracts plain text from PDF or DOCX files and pulls a candidate name plus
skills, education and experience sections out of it with regular expressions.
"""

import re
from pathlib import Path

# Each extraction backend is optional: a missing PDF library must not prevent
# DOCX parsing (or the reverse). The failure is reported when the affected
# format is actually requested.
try:
    from pdfminer.high_level import extract_text as pdf_extract_text
except ImportError:
    pdf_extract_text = None

try:
    from docx import Document
except ImportError:
    Document = None


# Strict name shape: 2-4 words, each one capital letter followed by lowercase.
_NAME_RE = re.compile(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,3}$')

# Words that mark a document title ("Curriculum Vitae") rather than a name.
_TITLE_WORDS = frozenset({"resume", "cv", "curriculum"})

# Heading keywords (regex alternations) for every section we extract.
_SECTION_KEYWORDS = {
    "skills": r"skills|technologies|expertise",
    "education": r"education|degrees?",
    "experience": r"experience|work history|employment",
}
_ANY_HEADING = "|".join(_SECTION_KEYWORDS.values())

# A section body ends at the first of:
#   * a blank line,
#   * a line that looks like another section heading: a section keyword,
#     optionally preceded by one qualifier word ("Work Experience"), followed
#     by a colon or the end of the line,
#   * the end of the text.
_SECTION_END = (
    r"(?=\n\s*\n"
    rf"|^[ \t]*(?:\w+[ \t]+)?(?:{_ANY_HEADING})\b[ \t]*(?::|$)"
    r"|\Z)"
)

# DOTALL lets a section body span several lines; MULTILINE makes ^/$ in the
# heading look-ahead work per line (\Z is used for the true end of text).
_SECTION_PATTERNS = {
    name: re.compile(
        rf"(?:{keywords})[:\s]*(.*?){_SECTION_END}",
        re.IGNORECASE | re.DOTALL | re.MULTILINE,
    )
    for name, keywords in _SECTION_KEYWORDS.items()
}


class ResumeParser:
    """Extract a name and skills/education/experience lists from a resume."""

    # Caps applied by parse_resume() to each returned list.
    MAX_SKILLS = 10
    MAX_EDUCATION = 3
    MAX_EXPERIENCE = 3

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        """Collapse whitespace inside each line while keeping line breaks.

        Line and paragraph boundaries are preserved because name and section
        detection depend on them; runs of blank lines shrink to one blank line.
        """
        # splitlines() also treats form feeds (PDF page breaks) as line breaks.
        lines = [" ".join(line.split()) for line in text.splitlines()]
        return re.sub(r"\n{3,}", "\n\n", "\n".join(lines)).strip()

    @staticmethod
    def _is_title_line(line: str) -> bool:
        """Return True if *line* contains a document-title word such as 'CV'."""
        return any(
            word.lower() in _TITLE_WORDS
            for word in re.findall(r"[^\W\d_]+", line)
        )

    @staticmethod
    def _empty_result(name: str) -> dict:
        """Build a result dict with the given name field and empty sections."""
        return {"name": name, "skills": [], "education": [], "experience": []}

    def extract_text(self, file_path: str) -> str:
        """Extract text from a PDF or DOCX file.

        Args:
            file_path: Path to the file; the format is chosen by its suffix
                (case-insensitive).

        Returns:
            The document text with line breaks preserved.

        Raises:
            ValueError: If the suffix is neither ``.pdf`` nor ``.docx``.
            ImportError: If the library needed for the format is not installed.
        """
        suffix = Path(file_path).suffix.lower()

        if suffix == ".pdf":
            if pdf_extract_text is None:
                raise ImportError("pdfminer.six is required to read PDF files")
            # Do not flatten to a single line: the name and section heuristics
            # need the original line structure.
            return self._normalize_whitespace(pdf_extract_text(file_path))

        if suffix == ".docx":
            if Document is None:
                raise ImportError("python-docx is required to read DOCX files")
            doc = Document(file_path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())

        raise ValueError("Unsupported file format")

    def extract_name(self, text: str) -> str:
        """Guess the candidate's name from the top of the resume text.

        Only the first ten lines are considered. A line of 2-4 capitalised
        words wins; otherwise the first 2-4 word line starting with an
        uppercase character is used. Lines containing a title word
        (resume / cv / curriculum) are skipped in both passes.

        Returns:
            The matching line, or ``"Not Found"``.
        """
        candidates = [
            line.strip() for line in text.split('\n')[:10] if line.strip()
        ]

        # Pass 1: strict "Firstname Lastname" shape.
        for line in candidates:
            if _NAME_RE.match(line) and not self._is_title_line(line):
                return line

        # Pass 2: looser shape (e.g. "JOHN SMITH"), still never a title line.
        for line in candidates:
            if (
                2 <= len(line.split()) <= 4
                and line[0].isupper()
                and not self._is_title_line(line)
            ):
                return line

        return "Not Found"

    def extract_sections(self, text: str) -> dict:
        """Extract skills, education and experience entries using regexes.

        Each section starts at the first occurrence of one of its keywords and
        runs until a blank line, the next section heading, or the end of the
        text. Skills are split on commas, semicolons and line breaks; the
        other sections are split on line breaks.

        Returns:
            A dict with ``"skills"``, ``"education"`` and ``"experience"``
            keys, each a list of stripped, non-empty strings.
        """
        results = {"skills": [], "education": [], "experience": []}

        for name, pattern in _SECTION_PATTERNS.items():
            match = pattern.search(text)
            if not match:
                continue
            body = match.group(1)
            # Skills are usually an inline list; other sections are per-line.
            parts = re.split(r'[,;\n]', body) if name == "skills" else body.split('\n')
            results[name] = [part.strip() for part in parts if part.strip()]

        return results

    def parse_resume(self, file_path: str) -> dict:
        """Parse a resume file into name, skills, education and experience.

        Never raises: any failure is reported through the ``"name"`` field as
        ``"Error: <message>"`` with empty section lists.

        Returns:
            A dict with ``"name"`` plus ``"skills"``, ``"education"`` and
            ``"experience"`` lists, each truncated to its MAX_* limit.
        """
        try:
            text = self.extract_text(file_path)
            if not text or len(text.strip()) < 10:
                return self._empty_result("Error: Empty file")

            sections = self.extract_sections(text)
            return {
                "name": self.extract_name(text),
                "skills": sections["skills"][:self.MAX_SKILLS],
                "education": sections["education"][:self.MAX_EDUCATION],
                "experience": sections["experience"][:self.MAX_EXPERIENCE],
            }
        except Exception as e:
            # Public boundary: callers expect a result dict, not an exception.
            return self._empty_result(f"Error: {e}")


# Global instance
resume_parser = ResumeParser()


def parse_resume(file_path: str) -> dict:
    """Public interface for resume parsing; delegates to the shared parser."""
    return resume_parser.parse_resume(file_path)