billusanda007 committed on
Commit
312d081
·
verified ·
1 Parent(s): bcbc731

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -105
app.py CHANGED
@@ -3,115 +3,59 @@ import pandas as pd
3
  import numpy as np
4
  import re
5
  import pickle
6
- import pdfminer
7
- from pdfminer.high_level import extract_text
8
- from tensorflow.keras.models import Sequential
9
- from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, GlobalMaxPooling1D
10
  from tensorflow.keras.preprocessing.text import Tokenizer
11
  from tensorflow.keras.preprocessing.sequence import pad_sequences
12
- from tensorflow.keras.utils import to_categorical
13
- from sklearn.preprocessing import LabelEncoder
14
 
 
 
15
 
16
  def cleanResume(resumeText):
17
-
18
- resumeText = re.sub('http\S+\s*', ' ', resumeText)
19
- resumeText = re.sub('RT|cc', ' ', resumeText)
20
- resumeText = re.sub('#\S+', '', resumeText)
21
- resumeText = re.sub('@\S+', ' ', resumeText)
22
- resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)
23
- resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText)
24
- resumeText = re.sub('\s+', ' ', resumeText)
25
  return resumeText
26
 
27
- def pdf_to_text(file):
28
-
29
- text = extract_text(file)
30
- return text
31
-
32
- def predict_category(resumes_data, selected_category,max_sequence_length):
33
-
34
- model = load_deeprank_model(max_sequence_length)
35
-
36
-
37
- resumes_df = pd.DataFrame(resumes_data)
38
- resumes_text = resumes_df['ResumeText'].values
39
-
40
-
41
- tokenized_text = tokenizer.texts_to_sequences(resumes_text)
42
-
43
-
44
- max_sequence_length = 500
45
- padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)
46
-
47
-
48
- predicted_probs = model.predict(padded_text)
49
-
50
-
51
- for i, category in enumerate(label.classes_):
52
- resumes_df[category] = predicted_probs[:, i]
53
-
54
- resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)
55
-
56
-
57
- ranks = []
58
- for rank, (idx, row) in enumerate(resumes_df_sorted.iterrows()):
59
- rank = rank + 1
60
- file_name = row['FileName']
61
- ranks.append({'Rank': rank, 'FileName': file_name})
62
-
63
- return ranks
64
-
65
- def load_deeprank_model(max_sequence_length):
66
-
67
- model.load_weights('deeprank_model.h5')
68
- model = Sequential()
69
-
70
- model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
71
- model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
72
- model.add(MaxPooling1D(pool_size=2))
73
- model.add(LSTM(64))
74
- model.add(Dense(num_classes, activation='softmax'))
75
- model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
76
- return model
77
-
78
- def main():
79
- st.title("Resume Ranking App")
80
- st.text("Upload resumes and select a category to rank them.")
81
-
82
- resumes_data = []
83
- selected_category = ""
84
-
85
-
86
- files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
87
- if files:
88
- for file in files:
89
- text = cleanResume(pdf_to_text(file))
90
- resumes_data.append({'ResumeText': text, 'FileName': file.name})
91
- selected_category = st.selectbox("Select a category to rank by", label.classes_)
92
-
93
- if st.button("Rank Resumes"):
94
- if not resumes_data or not selected_category:
95
- st.warning("Please upload resumes and select a category to continue.")
96
- else:
97
- ranks = predict_category(resumes_data, selected_category,max_sequence_length)
98
- st.write(pd.DataFrame(ranks))
99
-
100
- if __name__ == '__main__':
101
-
102
- df = pd.read_csv('UpdatedResumeDataSet.csv')
103
- df['cleaned'] = df['Resume'].apply(lambda x: cleanResume(x))
104
- label = LabelEncoder()
105
- df['Category'] = label.fit_transform(df['Category'])
106
-
107
-
108
- text = df['cleaned'].values
109
- #text=df['Resume'].values
110
- tokenizer = Tokenizer()
111
- tokenizer.fit_on_texts(text)
112
- vocab_size = len(tokenizer.word_index) + 1
113
- num_classes = len(label.classes_)
114
-
115
- max_sequence_length = 500
116
-
117
- main()
 
3
  import numpy as np
4
  import re
5
  import pickle
6
+ import nltk
7
+ from nltk.corpus import stopwords
8
+ from tensorflow.keras.models import load_model
 
9
  from tensorflow.keras.preprocessing.text import Tokenizer
10
  from tensorflow.keras.preprocessing.sequence import pad_sequences
 
 
11
 
12
+ nltk.download('stopwords')
13
+ stop_words = set(stopwords.words('english'))
14
 
15
def cleanResume(resumeText):
    """Normalize raw resume text for tokenization.

    Strips URLs, retweet/cc markers, hashtags, mentions, punctuation and
    non-ASCII characters, collapses whitespace, and removes English
    stopwords (uses the module-level `stop_words` set built from NLTK).

    Args:
        resumeText: Raw resume text.

    Returns:
        The cleaned, stopword-filtered text as a single space-joined string.
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # drop URLs
    # BUGFIX: the original pattern r'RT|cc' matched *inside* words
    # (e.g. "account" -> "a ount"); anchor with word boundaries so only
    # the standalone tokens "RT" and "cc" are removed.
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)
    resumeText = re.sub(r'#\S+', '', resumeText)   # drop hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # drop mentions
    resumeText = re.sub(r'[%s]' % re.escape("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"), ' ', resumeText)
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # drop non-ASCII
    resumeText = re.sub(r'\s+', ' ', resumeText)            # collapse whitespace
    # NOTE: relies on the module-level `stop_words` set (NLTK English stopwords).
    resumeText = ' '.join([word for word in resumeText.split() if word.lower() not in stop_words])
    return resumeText
25
 
26
def load_resources():
    """Load the fitted tokenizer, label encoder, and trained Keras model from disk.

    Returns:
        A (tokenizer, label_encoder, model) tuple read from
        'tokenizer.pkl', 'label_encoder.pkl' and 'deeprank_model_v2.h5'.
    """
    pickled = {}
    # Both preprocessing artifacts are plain pickles; load them the same way.
    for key, path in (('tokenizer', 'tokenizer.pkl'),
                      ('label_encoder', 'label_encoder.pkl')):
        with open(path, 'rb') as fh:
            pickled[key] = pickle.load(fh)
    return pickled['tokenizer'], pickled['label_encoder'], load_model('deeprank_model_v2.h5')
33
+
34
def infer(text, tokenizer, label_encoder, model, max_sequence_length=500):
    """Predict the job category for a single resume.

    Args:
        text: Raw resume text.
        tokenizer: Fitted Keras Tokenizer used at training time.
        label_encoder: Fitted sklearn LabelEncoder mapping class indices to names.
        model: Trained Keras classification model.
        max_sequence_length: Padding length for the token sequence; defaults
            to 500, which must match what the model was trained with.

    Returns:
        The predicted category label (a single value, typically a string).
    """
    cleaned_text = cleanResume(text)
    # Tokenizer expects a list of documents, hence the single-element list.
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_sequence_length)
    prediction = model.predict(padded_sequence)
    # prediction is assumed shaped (1, num_classes) — argmax picks the top class.
    predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])
    return predicted_class[0]
41
+
42
# --- Streamlit UI: collect resume text and predict its category on demand. ---
st.title("Resume Category Predictor")

st.write("Upload a resume text file or enter text below to predict the job category.")

uploaded_file = st.file_uploader("Upload Resume (TXT file)", type=["txt"])
user_input = st.text_area("Or paste resume text here:")

# An uploaded file takes precedence over pasted text when both are given.
if uploaded_file is not None:
    try:
        resume_text = uploaded_file.read().decode("utf-8")
    except UnicodeDecodeError:
        # ROBUSTNESS: .txt uploads are untrusted and not guaranteed UTF-8;
        # re-decode the full buffer leniently instead of crashing the app.
        resume_text = uploaded_file.getvalue().decode("utf-8", errors="replace")
    st.session_state["resume_text"] = resume_text
elif user_input:
    resume_text = user_input
    st.session_state["resume_text"] = resume_text
else:
    resume_text = ""

if st.button("Predict Category") and resume_text:
    # NOTE(review): resources are reloaded on every click; st.cache_resource
    # on load_resources would avoid that — confirm Streamlit version first.
    tokenizer, label_encoder, model = load_resources()
    prediction = infer(resume_text, tokenizer, label_encoder, model)
    st.write(f"Predicted Category: **{prediction}**")