Spaces:
Sleeping
Sleeping
Commit
·
7b2b80a
1
Parent(s):
50fac90
Enhance CV parsing functionality in app.py by expanding section header patterns to include more variations and improving line cleaning. Implement a more robust categorization process for unstructured content, extracting contact information, education, experience, and skills based on keywords. Additionally, update text extraction from DOCX files to include table content, ensuring comprehensive CV data retrieval. This change improves the accuracy and completeness of CV parsing, contributing to better email generation based on user inputs.
Browse files
app.py
CHANGED
@@ -18,32 +18,71 @@ def parse_cv_sections(text: str) -> Dict[str, str]:
|
|
18 |
'projects': '',
|
19 |
}
|
20 |
|
21 |
-
# Common section headers in CVs
|
22 |
section_patterns = {
|
23 |
-
'contact': r'(?i)(contact|personal\s+information|profile)',
|
24 |
-
'education': r'(?i)(education|academic|qualification)',
|
25 |
-
'experience': r'(?i)(experience|work|employment|professional)',
|
26 |
-
'skills': r'(?i)(skills|technical
|
27 |
-
'projects': r'(?i)(projects|personal
|
28 |
}
|
29 |
|
30 |
-
# Split text into lines
|
31 |
-
lines = text.split('\n')
|
32 |
current_section = None
|
|
|
33 |
|
34 |
-
|
35 |
-
|
36 |
-
if not line:
|
37 |
-
continue
|
38 |
-
|
39 |
# Check if line is a section header
|
40 |
for section, pattern in section_patterns.items():
|
41 |
if re.search(pattern, line, re.IGNORECASE):
|
|
|
|
|
|
|
|
|
42 |
current_section = section
|
|
|
43 |
break
|
44 |
-
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
return sections
|
49 |
|
@@ -59,17 +98,27 @@ def extract_cv_text(file):
|
|
59 |
if file_ext == '.pdf':
|
60 |
reader = PdfReader(file)
|
61 |
for page in reader.pages:
|
62 |
-
text += page.extract_text()
|
63 |
|
64 |
elif file_ext == '.docx':
|
65 |
doc = docx.Document(file)
|
66 |
for paragraph in doc.paragraphs:
|
67 |
-
text += paragraph.text +
|
|
|
|
|
|
|
|
|
|
|
68 |
else:
|
69 |
return "Unsupported file format. Please upload PDF or DOCX files."
|
70 |
|
71 |
# Parse the CV into sections
|
72 |
sections = parse_cv_sections(text)
|
|
|
|
|
|
|
|
|
|
|
73 |
return sections
|
74 |
|
75 |
except Exception as e:
|
|
|
18 |
'projects': '',
|
19 |
}
|
20 |
|
21 |
+
# Common section headers in CVs with more variations
|
22 |
section_patterns = {
|
23 |
+
'contact': r'(?i)(contact|personal\s+information|profile|contact\s+details|about\s+me)',
|
24 |
+
'education': r'(?i)(education|academic|qualification|academic\s+background|educational\s+background)',
|
25 |
+
'experience': r'(?i)(experience|work|employment|professional|work\s+history|career|professional\s+experience)',
|
26 |
+
'skills': r'(?i)(skills|technical\s+skills|competencies|expertise|technologies|tools|programming|languages)',
|
27 |
+
'projects': r'(?i)(projects|personal\s+projects|portfolio|work\s+samples)',
|
28 |
}
|
29 |
|
30 |
+
# Split text into lines and clean
|
31 |
+
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
32 |
current_section = None
|
33 |
+
section_content = []
|
34 |
|
35 |
+
# First pass: identify sections
|
36 |
+
for i, line in enumerate(lines):
|
|
|
|
|
|
|
37 |
# Check if line is a section header
|
38 |
for section, pattern in section_patterns.items():
|
39 |
if re.search(pattern, line, re.IGNORECASE):
|
40 |
+
# If we found a section header
|
41 |
+
if current_section:
|
42 |
+
# Save previous section content
|
43 |
+
sections[current_section] = '\n'.join(section_content)
|
44 |
current_section = section
|
45 |
+
section_content = []
|
46 |
break
|
47 |
+
else:
|
48 |
+
# If line is not a header and we have a current section, add to content
|
49 |
+
if current_section:
|
50 |
+
section_content.append(line)
|
51 |
+
|
52 |
+
# Save the last section
|
53 |
+
if current_section and section_content:
|
54 |
+
sections[current_section] = '\n'.join(section_content)
|
55 |
+
|
56 |
+
# If no sections were found, try to categorize the content
|
57 |
+
if all(not content for content in sections.values()):
|
58 |
+
lines_text = '\n'.join(lines)
|
59 |
+
|
60 |
+
# Look for email addresses and phone numbers for contact
|
61 |
+
email_pattern = r'[\w\.-]+@[\w\.-]+'
|
62 |
+
phone_pattern = r'[\+\d]?(\d{2,3}[-\.\s]?){2}\d{4}'
|
63 |
+
|
64 |
+
emails = re.findall(email_pattern, lines_text)
|
65 |
+
phones = re.findall(phone_pattern, lines_text)
|
66 |
+
if emails or phones:
|
67 |
+
sections['contact'] = '\n'.join(emails + phones)
|
68 |
+
|
69 |
+
# Look for education keywords
|
70 |
+
edu_keywords = r'(?i)(university|college|school|degree|bachelor|master|phd|diploma)'
|
71 |
+
edu_lines = [l for l in lines if re.search(edu_keywords, l)]
|
72 |
+
if edu_lines:
|
73 |
+
sections['education'] = '\n'.join(edu_lines)
|
74 |
+
|
75 |
+
# Look for experience keywords
|
76 |
+
exp_keywords = r'(?i)(worked|developer|engineer|manager|consultant|analyst)'
|
77 |
+
exp_lines = [l for l in lines if re.search(exp_keywords, l)]
|
78 |
+
if exp_lines:
|
79 |
+
sections['experience'] = '\n'.join(exp_lines)
|
80 |
+
|
81 |
+
# Look for skills
|
82 |
+
skill_keywords = r'(?i)(python|java|javascript|react|node|sql|aws|docker|kubernetes|git)'
|
83 |
+
skill_lines = [l for l in lines if re.search(skill_keywords, l)]
|
84 |
+
if skill_lines:
|
85 |
+
sections['skills'] = '\n'.join(skill_lines)
|
86 |
|
87 |
return sections
|
88 |
|
|
|
98 |
if file_ext == '.pdf':
|
99 |
reader = PdfReader(file)
|
100 |
for page in reader.pages:
|
101 |
+
text += page.extract_text() + "\n"
|
102 |
|
103 |
elif file_ext == '.docx':
|
104 |
doc = docx.Document(file)
|
105 |
for paragraph in doc.paragraphs:
|
106 |
+
text += paragraph.text + "\n"
|
107 |
+
# Also check tables in docx
|
108 |
+
for table in doc.tables:
|
109 |
+
for row in table.rows:
|
110 |
+
for cell in row.cells:
|
111 |
+
text += cell.text + "\n"
|
112 |
else:
|
113 |
return "Unsupported file format. Please upload PDF or DOCX files."
|
114 |
|
115 |
# Parse the CV into sections
|
116 |
sections = parse_cv_sections(text)
|
117 |
+
|
118 |
+
# Verify that we have content
|
119 |
+
if all(not content.strip() for content in sections.values()):
|
120 |
+
return f"Could not parse CV sections. Raw text:\n{text}"
|
121 |
+
|
122 |
return sections
|
123 |
|
124 |
except Exception as e:
|