Amrrs committed on
Commit
d4cec72
·
1 Parent(s): a5590cc

Create app.py

Files changed (1)
app.py +406 -0
app.py ADDED
@@ -0,0 +1,406 @@
+ ##############################
+ #### All library imports #####
+ ##############################
+ 
+ import streamlit as st  # web app framework
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import warnings
+ warnings.filterwarnings('ignore')
+ 
+ import nltk
+ from nltk.tokenize import word_tokenize
+ from nltk.stem import WordNetLemmatizer
+ from nltk.corpus import stopwords  # stop word list
+ from nltk.util import ngrams
+ 
+ from bs4 import BeautifulSoup
+ import re
+ from wordcloud import WordCloud
+ 
+ import PyPDF2
+ 
+ ############# Streamlit page config #############
+ 
+ st.set_page_config(
+     page_title = 'Resume enhancement by extracting keywords using NLP',
+     page_icon = '📖',
+     layout = 'wide'
+ )
+ 
+ st.title("📕 Resume enhancement by extracting keywords 📝")
+ 
+ st.subheader("📢 using NLP 📢")
+ 
+ """✅ **Downloading Models and Basic Setup**"""
+ 
+ #########################################
+ ###### download the NLTK models #########
+ #########################################
+ 
+ nltk.download("popular")    # covers punkt, wordnet and other common resources
+ nltk.download('stopwords')
+ 
+ lemmatizer = WordNetLemmatizer()
+ 
+ #########################################
+ ###### read main resource files #########
+ #########################################
+ 
+ df = pd.read_csv('Resume_skills.csv')
+ df = df.drop(columns=['Unnamed: 0'])
+ 
+ # each line of the 'linkedin skill' file is comma-separated;
+ # keep only the first field (the skill name), lower-cased
+ with open('linkedin skill', 'r') as file1:
+     skills = [line.strip().split(',')[0].lower() for line in file1.readlines()]
+ 
+ def sentence_maker(unique_words):
+     """Join a collection of words/phrases into one space-separated string."""
+     sentences = ''
+     for i in unique_words:
+         sentences += i.strip() + ' '
+     return sentences
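+ 
+ # illustrative example (not executed by the app):
+ #   sentence_maker(['python', ' sql ', 'aws'])  ->  'python sql aws '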
+ 
+ def extract_skills(input_text):
+     """For each token list in input_text, collect the skills found in it."""
+     res = []
+     for i in input_text:
+         # generate bigrams and trigrams (to catch skills such as "artificial intelligence")
+         bigrams_trigrams = list(map(' '.join, nltk.everygrams(i, 2, 3)))
+ 
+         # a set to keep the results in
+         found_skills = set()
+ 
+         # search for each token in our skills database
+         for token in i:
+             if token.lower() in skills:
+                 found_skills.add(token)
+ 
+         # search for each bigram and trigram in our skills database
+         for ngram in bigrams_trigrams:
+             if ngram.lower() in skills:
+                 found_skills.add(ngram)
+ 
+         res.append(found_skills)
+     return res
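+ 
+ # note: nltk.everygrams(tokens, 2, 3) yields every bigram and trigram of the
+ # token list, e.g. for ['machine', 'learning', 'engineer'] it produces
+ # ('machine', 'learning'), ('learning', 'engineer') and
+ # ('machine', 'learning', 'engineer'), which is how multi-word skills match.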
+ 
+ def clean_sentences(df, col_name):
+     """Clean every entry of df[col_name] into a list of meaningful tokens."""
+     reviews = []
+     for sent in df[col_name]:
+         # remove html content
+         review_text = BeautifulSoup(sent, 'html.parser').get_text()
+         # remove non-alphabetic characters
+         review_text = re.sub("[^a-zA-Z]", " ", review_text)
+         # tokenize the sentences
+         words = word_tokenize(review_text.lower())
+         # remove stop words
+         stops = set(stopwords.words("english"))
+         meaningful_words = [w for w in words if w not in stops]
+         reviews.append(meaningful_words)
+     return reviews
+ 
+ def clean_sentences2(df, col_name):
+     """Same as clean_sentences but keeps stop words."""
+     reviews = []
+     for sent in df[col_name]:
+         # remove html content
+         review_text = BeautifulSoup(sent, 'html.parser').get_text()
+         # remove non-alphabetic characters
+         review_text = re.sub("[^a-zA-Z]", " ", review_text)
+         # tokenize the sentences
+         words = word_tokenize(review_text.lower())
+         reviews.append(words)
+     return reviews
+ 
+ def extract_keywords(res):
+     """Flatten a list of skill sets into one set of unique keywords."""
+     keywords = set()
+     for i in res:
+         for j in i:
+             keywords.add(j)
+     return keywords
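+ 
+ # illustrative example:
+ #   extract_keywords([{'python'}, {'sql', 'python'}])  ->  {'python', 'sql'}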
+ 
+ def clean_sentences3(text):
+     """Clean a single block of raw text into a list of meaningful tokens."""
+     reviews = []
+     # remove html content
+     review_text = BeautifulSoup(text, 'html.parser').get_text()
+     # remove non-alphabetic characters
+     review_text = re.sub("[^a-zA-Z]", " ", review_text)
+     # tokenize the sentences
+     words = word_tokenize(review_text.lower())
+     # remove stop words
+     stops = set(stopwords.words("english"))
+     meaningful_words = [w for w in words if w not in stops]
+     reviews.append(meaningful_words)
+     return reviews
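+ 
+ # illustrative walk-through of the cleaning pipeline:
+ #   clean_sentences3('<b>Python, 3 yrs of SQL</b>')  ->  [['python', 'yrs', 'sql']]
+ #   (HTML stripped, non-letters dropped, lower-cased, stop words like 'of' removed)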
+ 
+ def decode_txt1(file_name):
+     """Read a text file and return its cleaned token list."""
+     with open(file_name, "r") as f:
+         full_text = f.read()
+     return clean_sentences3(full_text)
+ 
+ def decode_pdf(filename):
+     """Extract the text of every PDF page into Sample.txt, then clean it."""
+     with open(filename, 'rb') as pdfFileObj:
+         # create a pdf reader object
+         pdfReader1 = PyPDF2.PdfFileReader(pdfFileObj)
+         num_pages = pdfReader1.numPages
+         with open('Sample.txt', 'w') as text:
+             for i in range(num_pages):
+                 pageObj = pdfReader1.getPage(i)     # page object
+                 text.write(pageObj.extractText())   # extracted page text
+     return decode_txt1('Sample.txt')
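+ 
+ # note: PdfFileReader / numPages / getPage / extractText is the legacy
+ # PyPDF2 (pre-3.0) API; on PyPDF2 3.x the equivalents are PdfReader(f),
+ # len(reader.pages), reader.pages[i] and page.extract_text()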
+ 
+ def extract_skills2(input_tokens):
+     """Collect every skill (unigram, bigram or trigram) found in a token list."""
+     found_skills = []
+ 
+     # search for each single token in our skills database
+     for token in input_tokens:
+         if token.lower() in skills:
+             found_skills.append(token)
+ 
+     # generate bigrams and trigrams over the whole token list
+     # (to catch skills such as "artificial intelligence")
+     bigrams_trigrams = list(map(' '.join, nltk.everygrams(input_tokens, 2, 3)))
+     for ngram in bigrams_trigrams:
+         if ngram.lower() in skills:
+             found_skills.append(ngram)
+ 
+     return found_skills
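+ 
+ # illustrative example: if the skills list contains 'python' and
+ # 'machine learning' (but not 'machine' or 'learning' on their own), then
+ #   extract_skills2(['python', 'machine', 'learning'])
+ #   ->  ['python', 'machine learning']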
+ 
+ #########################################
+ ########### Upload your resume ##########
+ #########################################
+ 
+ uploaded_file = st.file_uploader('Choose your .pdf file', type="pdf")
+ 
+ if uploaded_file is None:
+     st.stop()   # halt the script until a resume has been uploaded
+ 
+ # persist the uploaded bytes so PyPDF2 can read them from disk
+ with open("input.pdf", "wb") as f:
+     f.write(uploaded_file.read())
+ 
+ resume_text = decode_pdf("input.pdf")
+ 
+ ##########################################
+ ########## select the category ###########
+ ##########################################
+ 
+ list_of_cats = ['Testing', 'HR', 'DESIGNER', 'INFORMATION-TECHNOLOGY', 'TEACHER', 'ADVOCATE',
+                 'BUSINESS-DEVELOPMENT', 'HEALTHCARE', 'FITNESS', 'AGRICULTURE', 'BPO', 'SALES',
+                 'CONSULTANT', 'DIGITAL-MEDIA', 'AUTOMOBILE', 'CHEF', 'FINANCE', 'APPAREL',
+                 'ENGINEERING', 'ACCOUNTANT', 'CONSTRUCTION', 'PUBLIC-RELATIONS', 'BANKING',
+                 'ARTS', 'AVIATION', 'Data Science', 'Advocate', 'Arts', 'Web Designing',
+                 'Mechanical Engineer', 'Sales', 'Health and fitness', 'Civil Engineer',
+                 'Java Developer', 'Business Analyst', 'SAP Developer', 'Automation Testing',
+                 'Electrical Engineering', 'Operations Manager', 'Python Developer',
+                 'DevOps Engineer', 'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
+                 'ETL Developer', 'DotNet Developer', 'Blockchain']
+ 
+ cat = st.selectbox("Select your desired Category", list_of_cats, index = 0)
+ 
+ # keep only the resumes of the selected category
+ sub_df = df[df['Category'] == cat]
+ 
+ sentences1 = sentence_maker(sub_df['Resume_skills'])
+ 
+ """✅ **Extracting Data from PDF**"""
+ 
+ resume_text2 = extract_skills2(resume_text[0])
+ resume_keywords = set(resume_text2)   # keywords found in the existing resume
+ 
+ # word cloud of the keywords already present in the resume
+ wc = WordCloud(width = 500, height = 500, include_numbers = True, collocations = True,
+                background_color = 'white', min_font_size = 10).generate(sentence_maker(resume_keywords))
+ plt.figure(figsize = (10, 10))
+ plt.imshow(wc, interpolation = 'bilinear')
+ plt.axis("off")
+ plt.title('Existing Keywords')
+ st.pyplot(plt.gcf())   # plt.show() has no effect inside a Streamlit app
+ 
+ """✅ Generating ***Similarity Score*** with the existing skillset"""
+ 
+ from cdifflib import CSequenceMatcher   # C implementation of difflib.SequenceMatcher
+ 
+ def get_similarity_score(s1, s2):
+     """Return the difflib ratio of two sequences as a percentage string."""
+     sm = CSequenceMatcher(None, s1, s2)
+     return str(round(sm.ratio() * 100, 3)) + '%'
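+ 
+ # difflib's ratio() is 2*M/T, where M is the number of matching elements and
+ # T the combined length of both sequences, e.g.
+ #   CSequenceMatcher(None, ['a', 'b'], ['a', 'c']).ratio()  ->  0.5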
+ 
+ # word cloud over the selected category's skills; used only to rank keywords
+ wc_r = WordCloud(width = 500, height = 500, max_words = 200, include_numbers = True,
+                  collocations = True, background_color = 'white',
+                  min_font_size = 10).generate(sentences1)
+ 
+ """✅ **Getting the matching score with the database**"""
+ 
+ # wc_r.words_ maps each word to its relative frequency, so its keys are the
+ # category's most prominent keywords
+ sub_unique_words = list(wc_r.words_.keys())
+ resume_keywords = list(resume_keywords)
+ 
+ # ngrams(..., 1) yields unigrams; keep the ones present in the skills list
+ unigrams = list(map(' '.join, ngrams(sub_unique_words, 1)))
+ sub_keywords = set()
+ for ug in unigrams:
+     if ug in skills:
+         sub_keywords.add(ug)
+ 
+ # also add every individual token of the category keywords
+ tokens = nltk.word_tokenize(sentence_maker(sub_unique_words))
+ for i in tokens:
+     sub_keywords.add(i)
+ 
+ def preprocess(words):
+     """Clean a list of words/phrases into a set of lower-cased tokens."""
+     res = set()
+     for i in words:
+         # remove html content
+         review_text = BeautifulSoup(i, 'html.parser').get_text()
+         # remove non-alphabetic characters
+         review_text = re.sub("[^a-zA-Z]", " ", review_text)
+         # tokenize and collect
+         for j in word_tokenize(review_text.lower()):
+             res.add(j)
+     return res
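+ 
+ # illustrative example:
+ #   preprocess(['Machine Learning', 'SQL'])  ->  {'machine', 'learning', 'sql'}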
+ 
+ with st.spinner():
+     sub_unique_words_match = list(preprocess(sub_unique_words))
+     resume_keywords = list(preprocess(resume_keywords))
+ 
+     # keywords of the category that the resume does not cover yet
+     predicted_keywords_match = [i for i in sub_unique_words_match if i not in resume_keywords]
+     pred_keywords = [i for i in sub_keywords if i not in resume_keywords]
+ 
+ ############################
+ #### final word cloud ######
+ ############################
+ 
+ from collections import Counter
+ 
+ word_cloud_dict = Counter(pred_keywords)
+ wc = WordCloud(width = 500, height = 500, include_numbers = True, collocations = True,
+                background_color = 'white', min_font_size = 10).generate_from_frequencies(word_cloud_dict)
+ plt.figure(figsize = (10, 10))
+ plt.imshow(wc, interpolation = 'bilinear')
+ plt.axis("off")
+ plt.title('Predicted Keywords')
+ wc.to_file('prediction.jpg')   # save the cloud so st.image can display it
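+ 
+ # pred_keywords holds unique entries, so every Counter frequency is 1 and the
+ # predicted keywords get roughly equal weight in the cloud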
+ 
+ st.markdown("# Output")
+ 
+ col1, col2, col3, col4 = st.columns(4)
+ 
+ with col2:
+     st.markdown("### Predicted Keywords WordCloud")
+     st.image('prediction.jpg')
+ 
+ ############################
+ #### similarity score ######
+ ############################
+ 
+ # matching score between the resume keywords and the category keywords
+ existing_score = get_similarity_score(sub_unique_words_match, resume_keywords)
+ 
+ # matching score between the predicted keywords and the category keywords
+ predicted_result_score = get_similarity_score(predicted_keywords_match, sub_unique_words_match)
+ 
+ with col1:
+     st.markdown('### Existing Keywords:')
+     st.metric(label = 'Score', value = existing_score)
+ 
+ with col3:
+     st.markdown(" ")
+ 
+ with col4:
+     st.markdown('### Predicted Keywords:')
+     st.metric(label = 'Score', value = predicted_result_score)