TKM03 committed on
Commit
59bc749
·
verified ·
1 Parent(s): 1d2993e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -84
app.py CHANGED
@@ -4,121 +4,138 @@ import shutil
4
  import PyPDF2
5
  import gradio as gr
6
  from transformers import pipeline
7
- from collections import defaultdict
8
 
9
- # Load job classification model
10
- text_classifier = pipeline("text-classification", model="serbog/distilbert-jobCategory_410k")
11
-
12
- # Expanded label map (you can update based on actual model labels returned)
13
  CATEGORY_MAP = {
14
- "LABEL_0": "Information Technology / Software Engineering",
15
- "LABEL_1": "Healthcare / Medical / Nursing",
16
- "LABEL_2": "Finance / Accounting / Auditing",
17
- "LABEL_3": "Engineering / Mechanical / Civil / Electrical",
18
- "LABEL_4": "Education / Training / Teaching",
19
- "LABEL_5": "Sales / Marketing / Business Development",
20
- "LABEL_6": "Customer Service / Support",
21
- "LABEL_7": "Human Resources / Recruitment / Talent Acquisition",
22
- "LABEL_8": "Legal / Compliance",
23
- "LABEL_9": "Administration / Clerical",
24
- "LABEL_10": "Operations / Logistics / Supply Chain",
25
- "LABEL_11": "Management / Strategy / Consulting",
26
- "LABEL_12": "Science / Research / R&D",
27
- "LABEL_13": "Design / UI-UX / Creative",
28
- "LABEL_14": "Manufacturing / Production",
29
- "LABEL_15": "Hospitality / Tourism / Travel",
30
- "LABEL_16": "Construction / Architecture",
31
- "LABEL_17": "Media / Communication / PR / Journalism",
32
- "LABEL_18": "Procurement / Purchasing",
33
- "LABEL_19": "Security / Safety",
34
- "LABEL_20": "Real Estate / Property Management",
35
- "LABEL_21": "Energy / Oil & Gas / Utilities",
36
- "LABEL_22": "Agriculture / Environmental / Forestry",
37
- "LABEL_23": "Transportation / Automotive / Aviation",
38
- "LABEL_24": "Retail / Merchandising / E-commerce",
39
- "LABEL_25": "Data Science / Machine Learning / AI",
40
- "LABEL_26": "Product Management / Project Management",
41
- "LABEL_27": "Quality Assurance / Control",
42
- "LABEL_28": "Telecommunication / Network Engineering",
43
- "LABEL_29": "Entrepreneurship / Startups / Freelancing",
44
- "LABEL_30": "Other / Miscellaneous"
45
  }
46
 
47
- # Helper functions
48
- def clean_resume_text(text):
 
 
 
49
  text = re.sub(r'http\S+', ' ', text)
50
- text = re.sub(r'#\S+', '', text)
51
- text = re.sub(r'@\S+', ' ', text)
52
- text = re.sub(r'[^\w\s]', ' ', text)
53
  text = re.sub(r'[^\x00-\x7f]', ' ', text)
54
- return re.sub(r'\s+', ' ', text).strip()
 
 
55
 
56
- def extract_resume_text(file):
57
  try:
58
  reader = PyPDF2.PdfReader(file)
59
  text = ""
60
  for page in reader.pages:
61
- page_text = page.extract_text()
62
- if page_text:
63
- text += page_text + " "
64
- return text, None if text.strip() else "No text found in PDF"
65
  except Exception as e:
66
- return None, f"Error reading PDF: {str(e)}"
67
 
 
68
  def classify_resumes(files):
69
- categorized = defaultdict(list)
70
- label_scores = {}
71
- os.makedirs("classified_resumes", exist_ok=True)
 
 
 
72
 
73
  for file in files:
74
  file_name = os.path.basename(file.name)
75
- resume_text, error = extract_resume_text(file)
76
  if error:
 
77
  continue
78
 
79
- cleaned_text = clean_resume_text(resume_text)
80
- result = text_classifier(cleaned_text[:512])[0]
81
- label = result['label']
 
82
  score = round(result['score'], 4)
83
- category = CATEGORY_MAP.get(label, label)
84
 
85
- # Save to relevant folder
86
- cat_folder = os.path.join("classified_resumes", category.replace(" ", "_"))
87
- os.makedirs(cat_folder, exist_ok=True)
88
- save_path = os.path.join(cat_folder, file_name)
 
89
 
90
- with open(file.name, "rb") as f_in, open(save_path, "wb") as f_out:
 
 
 
 
91
  shutil.copyfileobj(f_in, f_out)
92
 
93
- categorized[category].append(save_path)
94
- label_scores[file_name] = {"Predicted Job Category": category, "Confidence Score": score}
 
 
 
95
 
96
- return label_scores, categorized
 
 
97
 
98
- def show_category_files(selected_category):
99
- category_path = os.path.join("classified_resumes", selected_category.replace(" ", "_"))
100
- if not os.path.exists(category_path):
101
- return []
102
- return [os.path.join(category_path, f) for f in os.listdir(category_path) if f.endswith(".pdf")]
103
 
104
- # Gradio UI
105
- with gr.Blocks(title="🧠 Resume Screening & Categorization") as demo:
106
- gr.Markdown("""## πŸ“„ Resume Screening by Job Role/Industry
107
- Upload resumes below. The app classifies each into categories like IT, HR, Sales, etc. Then click on any category to view/download relevant resumes.""")
108
 
109
- file_input = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Resume PDFs")
110
- classify_button = gr.Button("πŸ“Š Classify Resumes")
111
- output_json = gr.JSON(label="Classification Summary")
112
 
113
- category_dropdown = gr.Dropdown(label="Select Category to View Files", choices=sorted(list(CATEGORY_MAP.values())))
114
- resume_file_list = gr.File(label="Filtered Resumes in Selected Category", file_count="multiple")
115
 
116
- def update_dropdown_options(files):
117
- _, cat_data = classify_resumes(files)
118
- return sorted(list(cat_data.keys()))
 
 
119
 
120
- classify_button.click(fn=classify_resumes, inputs=[file_input], outputs=[output_json, category_dropdown])
121
- category_dropdown.change(fn=show_category_files, inputs=[category_dropdown], outputs=[resume_file_list])
 
 
 
122
 
123
  if __name__ == "__main__":
124
- demo.launch()
 
4
  import PyPDF2
5
  import gradio as gr
6
  from transformers import pipeline
 
7
 
8
# ------------------- Category Mapping -------------------
# Maps the raw label string returned by the text-classification pipeline to a
# human-readable job-category name. classify_resumes() looks labels up here and
# falls back to "Other / Miscellaneous" for any label that is missing; the
# values also populate the "Filter by Job Category" dropdown.
# NOTE(review): the "C<n>" keys are assumed to match the label names emitted by
# the model loaded in this file - confirm against the model's id2label config,
# otherwise every resume silently lands in the fallback category.
CATEGORY_MAP = {
    "C0": "Administration / Clerical",
    "C1": "Agriculture / Environmental / Forestry",
    "C2": "Information Technology / Software Engineering",
    "C3": "Data Science / Machine Learning / AI",
    "C4": "Finance / Accounting / Auditing",
    "C5": "Human Resources / Recruitment / Talent Acquisition",
    "C6": "Sales / Marketing / Business Development",
    "C7": "Engineering / Mechanical / Civil / Electrical",
    "C8": "Customer Service / Support",
    "C9": "Design / UI-UX / Creative",
    "C10": "Healthcare / Medical / Nursing",
    "C11": "Education / Training / Teaching",
    "C12": "Retail / Merchandising / E-commerce",
    "C13": "Telecommunication / Network Engineering",
    "C14": "Operations / Logistics / Supply Chain",
    "C15": "Entrepreneurship / Startups / Freelancing",
    "C16": "Product Management / Project Management",
    "C17": "Legal / Compliance",
    "C18": "Real Estate / Property Management",
    "C19": "Transportation / Automotive / Aviation",
    "C20": "Construction / Architecture",
    "C21": "Energy / Oil & Gas / Utilities",
    "C22": "Security / Safety",
    "C23": "Procurement / Purchasing",
    "C24": "Manufacturing / Production",
    "C25": "Media / Communication / PR / Journalism",
    "C26": "Science / Research / R&D",
    "C27": "Quality Assurance / Control",
    "C28": "Hospitality / Tourism / Travel",
    "C29": "Management / Strategy / Consulting",
    "C30": "Other / Miscellaneous"
}
42
 
43
# ------------------- Load Classification Model -------------------
# Build the text-classification pipeline once at module import so that every
# call to classify_resumes() reuses the same pipeline object.
# NOTE(review): presumably this fetches/loads the model weights at startup -
# confirm the deployment has network access or a warm model cache.
classifier = pipeline("text-classification", model="CleveGreen/JobClassifier_v2")
45
+
46
+ # ------------------- Resume Utilities -------------------
47
def clean_text(text):
    """Normalize raw resume text before it is passed to the classifier.

    The cleaning steps run in this order:

    1. URLs (anything starting with ``http``) are replaced by a space.
    2. Non-ASCII characters are replaced by a space.
    3. Punctuation *and underscores* are replaced by a space.
    4. Runs of whitespace are collapsed and the result is stripped.

    Args:
        text: Text extracted from a resume. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        The cleaned, single-spaced text (possibly empty).
    """
    if not text:
        return ""

    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    # ``\w`` also matches "_", so underscores are listed explicitly; otherwise
    # fill-in rules such as "Name: ________" survive the punctuation strip.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()
53
 
54
def extract_text_from_pdf(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts for the PDF to read.

    Returns:
        A ``(text, error)`` tuple. Exactly one side signals the outcome:

        * ``(text, None)`` - extraction succeeded and ``text`` is non-empty.
        * ``("", "No text found in PDF.")`` - the PDF was readable but held no
          text (or only whitespace).
        * ``(None, message)`` - reading/parsing failed; ``message`` is never
          empty so callers can safely test ``if error:``.
    """
    try:
        reader = PyPDF2.PdfReader(file)
        page_texts = (page.extract_text() for page in reader.pages)
        # The generator is consumed here, inside the try, so per-page
        # extraction failures are reported the same way as open failures.
        text = " ".join(content for content in page_texts if content).strip()
    except Exception as e:
        # Boundary handler: a broken upload must not abort the whole batch.
        # Fall back to the exception type when the message itself is empty.
        return None, str(e) or type(e).__name__

    if not text:
        return "", "No text found in PDF."
    return text, None
65
 
66
# ------------------- Resume Classification & Organization -------------------
def classify_resumes(files):
    """Classify uploaded resumes and copy each into a per-category folder.

    The ``classified_resumes`` directory is deleted and recreated on every
    call, so it only ever holds the results of the most recent run.

    Args:
        files: Iterable of uploaded files, or ``None`` when nothing was
            uploaded. Each item is expected to expose its path as ``.name``;
            plain path strings are accepted as well.

    Returns:
        A ``(predictions, classified_files)`` tuple:

        * ``predictions`` maps each file's base name to either
          ``{"error": message}`` or a dict with the keys
          ``"Predicted Job Category"`` (raw model label), ``"Category Name"``
          and ``"Confidence Score"`` (rounded to 4 decimals).
        * ``classified_files`` maps each category name to the list of copied
          file paths in that category.
    """
    predictions = {}
    classified_files = {}

    # Start from an empty output directory on every run.
    if os.path.exists("classified_resumes"):
        shutil.rmtree("classified_resumes")
    os.makedirs("classified_resumes")

    # ``files`` is None when the button is clicked without any upload.
    for file in files or []:
        source_path = getattr(file, "name", file)
        file_name = os.path.basename(source_path)

        resume_text, error = extract_text_from_pdf(file)
        # Also treat "no error but no text" as a failure so that None/empty
        # text never reaches the cleaning step or the model.
        if error or not resume_text:
            predictions[file_name] = {
                "error": error or "Could not extract text from PDF."
            }
            continue

        cleaned_text = clean_text(resume_text)
        if not cleaned_text:
            predictions[file_name] = {"error": "No classifiable text found in PDF."}
            continue

        result = classifier(cleaned_text[:512])[0]  # Truncate to avoid max token

        label = result['label']  # e.g., C2
        score = round(result['score'], 4)
        category = CATEGORY_MAP.get(label, "Other / Miscellaneous")

        predictions[file_name] = {
            "Predicted Job Category": label,
            "Category Name": category,
            "Confidence Score": score
        }

        # Category names contain " / ", which os.path.join would interpret as
        # nested directories with stray spaces; flatten to one safe folder.
        folder_name = category.replace(" / ", "_").replace("/", "_").replace(" ", "_")
        category_folder = os.path.join("classified_resumes", folder_name)
        os.makedirs(category_folder, exist_ok=True)
        dest_path = os.path.join(category_folder, file_name)

        shutil.copyfile(source_path, dest_path)

        # Keyed by the human-readable name so dropdown selections match.
        classified_files.setdefault(category, []).append(dest_path)

    return predictions, classified_files
107
 
108
+ # ------------------- Gradio App -------------------
109
def filter_by_category(category, all_classified):
    """Return the classified resume files that belong to ``category``.

    Args:
        category: Category name selected in the dropdown (may be ``None``
            when the selection is cleared).
        all_classified: Mapping of category name to a list of file paths, as
            produced by ``classify_resumes``; ``None`` is treated as empty.

    Returns:
        The paths stored for ``category`` that still exist on disk, or an
        empty list when the category is unknown.
    """
    paths = (all_classified or {}).get(category, [])
    # The output directory is wiped on every classification run, so a stored
    # state can reference files that are gone; only hand back real files.
    return [path for path in paths if os.path.isfile(path)]
111
 
112
# Layout: upload + button on one row, then the JSON summary, a category
# dropdown and the list of files in the selected category.
with gr.Blocks(title="Resume Screening & Classification") as app:
    gr.Markdown(
        "\n"
        "# 📄 Resume Screening Tool\n"
        "Upload resumes in PDF format. The system will classify them into "
        "job categories using a pretrained AI model.\n"
    )

    with gr.Row():
        uploaded_files = gr.File(file_types=[".pdf"], file_count="multiple", label="Upload Resumes")
        classify_button = gr.Button("Classify Resumes")

    classification_results = gr.JSON(label="Classification Output")
    category_selector = gr.Dropdown(choices=list(CATEGORY_MAP.values()), label="Filter by Job Category")
    filtered_resumes_output = gr.File(file_types=[".pdf"], file_count="multiple", label="Filtered Resumes")

    # Per-session mapping of category name -> copied file paths from the
    # latest classification run.
    all_classified_state = gr.State({})

    # Classify, then re-apply the current category filter: each run deletes
    # the previous output files, so a list shown from an earlier run would
    # otherwise point at files that no longer exist.
    classify_button.click(
        fn=classify_resumes,
        inputs=[uploaded_files],
        outputs=[classification_results, all_classified_state]
    ).then(
        fn=filter_by_category,
        inputs=[category_selector, all_classified_state],
        outputs=[filtered_resumes_output]
    )

    category_selector.change(
        fn=filter_by_category,
        inputs=[category_selector, all_classified_state],
        outputs=[filtered_resumes_output]
    )

if __name__ == "__main__":
    app.launch()