Jeet Paul committed on
Commit
3373af1
·
1 Parent(s): b378084

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -0
app.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import nltk
3
+ from nltk.corpus import stopwords
4
+ from nltk.tokenize import word_tokenize
5
+ from nltk.stem import PorterStemmer
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from PyPDF2 import PdfReader
9
+ import os
10
+ from io import BytesIO
11
+ import pickle
12
+ import pdfminer
13
+ from pdfminer.high_level import extract_text
14
+ import re
15
+ import PyPDF2
16
+ import textract
17
+ import tempfile
18
+ import pandas as pd
19
+ from docx import Document
20
+ import csv
21
+ import base64
22
+
23
+
24
+
25
# Download the NLTK data that preprocess_text relies on: tokenizer models for
# word_tokenize and the stop-word lists for stopwords.words('english').
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from 'punkt_tab' instead of 'punkt';
# requesting both keeps word_tokenize usable across versions.
# NOTE(review): confirm against the NLTK version pinned for this app.
nltk.download('punkt_tab')
nltk.download('stopwords')
27
+
28
def preprocess_text(text):
    """Normalise free text before it is vectorised.

    The text is lower-cased and tokenised, English stop words are dropped,
    every remaining token is reduced to its Porter stem, and the stems are
    joined back into a single space-separated string.
    """
    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    stems = []
    for token in tokens:
        if token in english_stop_words:
            continue
        stems.append(porter.stem(token))

    return ' '.join(stems)
38
+
39
def extract_text_from_pdf(pdf_content):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator between pages.
    """
    pdf_reader = PdfReader(BytesIO(pdf_content))
    # Join once instead of growing a string page by page, and treat a page
    # that yields no text (None or '') as empty so it cannot break the
    # concatenation.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
45
+
46
def extract_text_from_docx(docx_content):
    """Return the paragraph text of a DOCX document as one string.

    ``docx_content`` is the raw bytes of the file. The text of each entry in
    the document's ``paragraphs`` is joined with single spaces.
    """
    document = Document(BytesIO(docx_content))
    paragraph_texts = [para.text for para in document.paragraphs]
    return " ".join(paragraph_texts)
50
+
51
+
52
def extract_text_from_txt(txt_content):
    """Decode the contents of a plain-text resume.

    Args:
        txt_content: Raw bytes of the file. A ``str`` is returned unchanged.

    Returns:
        The text as ``str``. Bytes that are not valid UTF-8 are dropped.
    """
    if isinstance(txt_content, str):
        return txt_content
    # The content is already in memory, so decode it directly rather than
    # routing it through a file-based extractor. This mirrors how the '.txt'
    # branch of rank_and_shortlist decodes uploads.
    return bytes(txt_content).decode('utf-8', errors='ignore')
55
+
56
def extract_text_from_resume(file_path):
    """Extract the text of a resume stored on disk.

    The format is chosen from the file extension (case-insensitive).

    Args:
        file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

    Returns:
        Whatever the matching ``extract_text_from_*`` helper returns.

    Raises:
        ValueError: If the extension is not pdf, docx or txt.
        OSError: If the file cannot be opened or read.
    """
    file_extension = file_path.rsplit('.', 1)[-1].lower()

    # Reject unsupported formats before touching the file system.
    if file_extension not in ('pdf', 'docx', 'txt'):
        raise ValueError(f"Unsupported file format: {file_extension}")

    # The per-format helpers wrap their argument in BytesIO, i.e. they expect
    # the file's bytes rather than its path, so read the file first.
    with open(file_path, 'rb') as resume_file:
        content = resume_file.read()

    if file_extension == 'pdf':
        return extract_text_from_pdf(content)
    if file_extension == 'docx':
        return extract_text_from_docx(content)
    return extract_text_from_txt(content)
67
+
68
def clean_pdf_text(text):
    """Strip URLs, social-media markers, punctuation and non-ASCII characters.

    Args:
        text: Raw text extracted from a resume.

    Returns:
        The text with the noise replaced by spaces and every run of
        whitespace collapsed to a single space.
    """
    # URLs, together with any whitespace that follows them.
    text = re.sub(r'http\S+\s*', ' ', text)
    # Stand-alone "RT" / "cc" markers only. Without the word boundaries the
    # pattern also cut "cc"/"RT" out of ordinary words ("success", "ARTIST").
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    # Hashtags and @mentions.
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    # ASCII punctuation.
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    # Anything outside the 7-bit ASCII range.
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    # Collapse whitespace runs (including newlines and tabs).
    text = re.sub(r'\s+', ' ', text)
    return text
77
+
78
def extract_candidate_name(text):
    """Return the first "Firstname Lastname"-shaped match found in ``text``.

    A match is two consecutive capitalised words separated by one whitespace
    character, optionally preceded by "Mr.", "Ms." or "Mrs."; the title, when
    present, is part of the returned string.

    Args:
        text: Text to search.

    Returns:
        The matched name, or ``"Candidate Name Not Found"`` if nothing matches.
    """
    pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
    match = re.search(pattern, text)
    if match is None:
        return "Candidate Name Not Found"
    # The optional \s? can capture the space or newline in front of the name,
    # so trim it from the result.
    return match.group(0).strip()
84
+
85
def calculate_similarity(job_description, cvs, cv_file_names):
    """Score each CV against the job description with TF-IDF cosine similarity.

    Args:
        job_description: Text of the job description.
        cvs: CV texts, in the same order as ``cv_file_names``.
        cv_file_names: Name reported for each CV.

    Returns:
        A list of ``(file_name, score)`` tuples sorted by descending score.
        Empty when ``cvs`` is empty.
    """
    if not cvs:
        return []

    processed_job_desc = preprocess_text(job_description)
    processed_cvs = [preprocess_text(cv) for cv in cvs]

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([processed_job_desc] + processed_cvs)

    # Row 0 is the job description. Compare just that row with the CV rows
    # instead of building the full pairwise matrix and discarding all but
    # its first row.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda entry: entry[1],
        reverse=True,
    )
101
+
102
def extract_email_phone(text):
    """Find e-mail addresses and phone numbers in ``text``.

    Phone numbers are matched as 10 digits (3-3-4) or 7 digits (3-4), with an
    optional ``-``, ``.`` or whitespace between the groups.

    Args:
        text: Text to scan.

    Returns:
        A ``(emails, phones)`` tuple of lists of matched strings, in order of
        appearance.
    """
    # The TLD class is letters only; a '|' inside [...] is a literal pipe,
    # not alternation, and would let "user@host.co|x" match in full.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    phone_pattern = r'\b(?:\d{3}[-.\s]??\d{3}[-.\s]??\d{4}|\d{3}[-.\s]??\d{4})\b'

    emails = re.findall(email_pattern, text)
    phones = re.findall(phone_pattern, text)

    return emails, phones
110
+
111
+
112
+
113
def rank_and_shortlist(job_description, cv_files, threshold=0.09):
    """Rank uploaded resumes against a job description and shortlist the best.

    Files that have an unsupported extension or fail to parse are skipped
    with a Streamlit warning.

    Args:
        job_description: Text the resumes are compared with.
        cv_files: Uploaded file objects exposing ``.name`` and ``.read()``.
        threshold: Minimum similarity score for a resume to be shortlisted.

    Returns:
        A ``(ranked_cvs, shortlisted_cvs, contact_info_dict)`` tuple:
        ``ranked_cvs`` holds ``(file_name, score)`` pairs as returned by
        ``calculate_similarity``; ``shortlisted_cvs`` is the subset with
        ``score >= threshold``; ``contact_info_dict`` maps each file name to
        ``{'emails': [...], 'phones': [...]}``. When no resume could be read
        an error is shown and ``([], [], {})`` is returned.
    """
    cv_texts = []
    cv_file_names = []
    contact_info_dict = {}

    for cv_file in cv_files:
        file_extension = os.path.splitext(cv_file.name)[1].lower()

        try:
            if file_extension == '.pdf':
                cv_text = extract_text_from_pdf(cv_file.read())
            elif file_extension == '.docx':
                cv_text = extract_text_from_docx(cv_file.read())
            elif file_extension == '.txt':
                cv_text = cv_file.read().decode('utf-8', errors='ignore')
            else:
                st.warning(f"Unsupported file format: {file_extension}. Skipping file: {cv_file.name}")
                continue

            cleaned_text = clean_pdf_text(cv_text)
            # Contact details come from the raw text: cleaning strips the
            # '@' and '.' characters that the e-mail pattern needs.
            emails, phones = extract_email_phone(cv_text)

        except Exception as e:
            # Best effort per file: report the problem and keep going.
            st.warning(f"Error processing file '{cv_file.name}': {str(e)}")
            continue

        # Record the file only once every step has succeeded, so the texts,
        # names and contact details always stay aligned.
        cv_texts.append(cleaned_text)
        cv_file_names.append(cv_file.name)
        contact_info_dict[cv_file.name] = {
            'emails': emails,
            'phones': phones,
        }

    if not cv_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        # Same three-element shape as the normal return so callers can
        # always unpack the result.
        return [], [], {}

    ranked_cvs = calculate_similarity(job_description, cv_texts, cv_file_names)
    shortlisted_cvs = [(cv_name, score) for (cv_name, score) in ranked_cvs if score >= threshold]

    return ranked_cvs, shortlisted_cvs, contact_info_dict
163
+
164
def export_to_csv(data, filename):
    """Write a ``{file name: emails}`` mapping to ``filename`` as CSV.

    Each mapping entry becomes one row under the headers ``File Name`` and
    ``Emails``; no index column is written.
    """
    rows = list(data.items())
    frame = pd.DataFrame(rows, columns=['File Name', 'Emails'])
    frame.to_csv(filename, index=False)
167
+
168
+
169
def main():
    """Render the Streamlit page: collect inputs, rank resumes, show results.

    After "Submit" is pressed with a job title, a job description and at
    least one uploaded file, the resumes are ranked with
    ``rank_and_shortlist``. The ranking and the shortlist are displayed, and
    when any shortlisted resume has e-mail addresses a CSV of them is offered
    as a download next to a link to the e-mail sender app.
    """
    # NOTE(review): the nesting of this function was reconstructed from a
    # flattened diff view. Confirm that the "**Emails:**" line and the e-mail
    # sender link sit at the intended level.
    st.title("Resume Ranking App")

    st.write("Enter Job Title:")
    job_title = st.text_input("Job Title")

    st.write("Enter Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')

    st.write("Upload the Resumes:")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')

    if not st.button("Submit"):
        st.write("Please enter the job title, job description, and upload resumes to proceed.")
        return

    if not (job_title and job_description and cv_files):
        st.error("Please enter the job title, job description, and upload resumes to proceed.")
        return

    job_description_text = f"{job_title} {job_description}"

    results = rank_and_shortlist(job_description_text, cv_files)
    if not results[0]:
        # No resume could be processed. rank_and_shortlist has already shown
        # the error, and its result may not have the usual three elements,
        # so stop before unpacking.
        return
    ranked_cvs, shortlisted_cvs, contact_info_dict = results

    st.markdown("### Ranking of Resumes:")
    for cv_name, score in ranked_cvs:
        st.markdown(f"**File Name:** {cv_name}, **Similarity Score:** {score:.2f}")

    st.markdown("### Shortlisted Candidates:")
    if not shortlisted_cvs:
        st.markdown("None")
        return

    shortlisted_candidates_data = {}
    for cv_name, score in shortlisted_cvs:
        st.markdown(f"**File Name:** {cv_name}, **Similarity Score:** {score:.2f}")

        candidate_emails = contact_info_dict[cv_name].get('emails', [])
        if candidate_emails:
            shortlisted_candidates_data[cv_name] = candidate_emails
            st.markdown(f"**Emails:** {', '.join(candidate_emails)}")

    if not shortlisted_candidates_data:
        return

    export_filename = "shortlisted_candidates.csv"
    # Write into a private temporary directory: a fixed path in the shared
    # temp dir could be overwritten by another session between the write and
    # the read, and would be left behind afterwards.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, export_filename)
        export_to_csv(shortlisted_candidates_data, temp_file_path)
        with open(temp_file_path, 'rb') as file:
            csv_content = file.read()

    # Embed the CSV in a data: URI so it can be downloaded from a plain link.
    b64_encoded_csv = base64.b64encode(csv_content).decode()
    st.markdown(
        f'<a href="data:application/octet-stream;base64,{b64_encoded_csv}" download="{export_filename}">'
        '<button style="padding: 10px; background-color: #4CAF50; color: white; border: none; cursor: pointer;">'
        'Download CSV</button></a>', unsafe_allow_html=True
    )

    st.markdown(
        '<a href="https://huggingface.co/spaces/smallboy713102/Shortlisted_Candidate_Email_Sender" '
        'target="_blank"><button style="padding: 10px; background-color: #008CBA; color: white; border: none; cursor: pointer;">'
        'HR\'s Shortlisted Email Sender</button></a>', unsafe_allow_html=True
    )
229
+
230
# Script entry point: build the page only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()