TKM03 committed on
Commit
8bfcb85
·
verified ·
1 Parent(s): b740a24

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -24
app.py CHANGED
@@ -4,11 +4,13 @@ import gradio as gr
4
  from transformers import pipeline
5
  from collections import Counter
6
 
7
- # Load the Hugging Face NER pipeline
8
  ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")
9
 
 
 
 
10
  def clean_resume_text(text):
11
- """Clean resume text by removing unwanted characters and formatting."""
12
  text = re.sub(r'http\S+', ' ', text)
13
  text = re.sub(r'#\S+', '', text)
14
  text = re.sub(r'@\S+', ' ', text)
@@ -17,7 +19,6 @@ def clean_resume_text(text):
17
  return re.sub(r'\s+', ' ', text).strip()
18
 
19
  def extract_resume_text(file):
20
- """Extract raw text from uploaded PDF file."""
21
  try:
22
  reader = PyPDF2.PdfReader(file)
23
  text = ""
@@ -31,8 +32,7 @@ def extract_resume_text(file):
31
  except Exception as e:
32
  return None, f"Error reading PDF: {str(e)}"
33
 
34
- def classify_resume(entities):
35
- """Classify resume based on dominant entity types."""
36
  orgs = [e['word'] for e in entities if e['entity_group'] == 'ORG']
37
  locs = [e['word'] for e in entities if e['entity_group'] == 'LOC']
38
  jobs = [e['word'] for e in entities if e['entity_group'] == 'MISC']
@@ -44,45 +44,65 @@ def classify_resume(entities):
44
  return {
45
  "Main_Organization": dominant_org[0][0] if dominant_org else "Unknown",
46
  "Main_Location": dominant_loc[0][0] if dominant_loc else "Unknown",
47
- "Possible_Job/Field": dominant_job[0][0] if dominant_job else "General"
48
  }
49
 
50
- def extract_entities_from_pdfs(files):
51
- """Process multiple resumes, extract entities, and classify."""
52
- summary = {}
53
-
54
  for file in files:
55
  file_name = file.name.split("/")[-1]
56
  resume_text, error = extract_resume_text(file)
57
 
58
  if error:
59
- summary[file_name] = {"error": error}
60
  continue
61
 
62
  cleaned_text = clean_resume_text(resume_text)
63
  entities = ner_pipeline(cleaned_text)
 
64
 
65
- result = {
66
  "Persons": list({e["word"] for e in entities if e["entity_group"] == "PER"}),
67
  "Organizations": list({e["word"] for e in entities if e["entity_group"] == "ORG"}),
68
  "Locations": list({e["word"] for e in entities if e["entity_group"] == "LOC"}),
69
  "Other": list({e["word"] for e in entities if e["entity_group"] not in ["PER", "ORG", "LOC"]}),
70
  "Cleaned_Text": cleaned_text,
71
- "Classification": classify_resume(entities)
72
  }
 
73
 
74
- summary[file_name] = result
75
-
76
- return summary
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  # Gradio UI
79
- iface = gr.Interface(
80
- fn=extract_entities_from_pdfs,
81
- inputs=gr.File(file_types=[".pdf"], label="Upload Resumes (PDF)", file_count="multiple"),
82
- outputs=gr.JSON(label="Resume Classification & Entity Summary"),
83
- title="πŸ“‚ Multi-Resume Entity Extractor & Classifier",
84
- description="Upload multiple PDF resumes. This tool extracts text, identifies key entities, and classifies each resume by organizations, locations, and possible job/field."
85
- )
 
 
 
 
 
 
 
 
86
 
87
  if __name__ == "__main__":
88
- iface.launch()
 
4
  from transformers import pipeline
5
  from collections import Counter
6
 
7
+ # Load NER pipeline
8
  ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")
9
 
10
+ # Load text classification model (replace with a job-role classifier if available)
11
+ text_classifier = pipeline("text-classification", model="MoritzLaurer/bert-multilingual-passage-reranking-msmarco")
12
+
13
  def clean_resume_text(text):
 
14
  text = re.sub(r'http\S+', ' ', text)
15
  text = re.sub(r'#\S+', '', text)
16
  text = re.sub(r'@\S+', ' ', text)
 
19
  return re.sub(r'\s+', ' ', text).strip()
20
 
21
  def extract_resume_text(file):
 
22
  try:
23
  reader = PyPDF2.PdfReader(file)
24
  text = ""
 
32
  except Exception as e:
33
  return None, f"Error reading PDF: {str(e)}"
34
 
35
+ def classify_resume_ner(entities):
 
36
  orgs = [e['word'] for e in entities if e['entity_group'] == 'ORG']
37
  locs = [e['word'] for e in entities if e['entity_group'] == 'LOC']
38
  jobs = [e['word'] for e in entities if e['entity_group'] == 'MISC']
 
44
  return {
45
  "Main_Organization": dominant_org[0][0] if dominant_org else "Unknown",
46
  "Main_Location": dominant_loc[0][0] if dominant_loc else "Unknown",
47
+ "Possible_Job/Field (NER)": dominant_job[0][0] if dominant_job else "General"
48
  }
49
 
50
def process_resumes(files):
    """Run NER over each uploaded resume PDF and summarise the entities found.

    Args:
        files: Uploaded files from the file input. Each item may be an object
            exposing a ``name`` path attribute or a plain path string.
            ``None`` or an empty list (button clicked with nothing uploaded)
            is accepted and yields an empty result.

    Returns:
        A dict keyed by each file's base name. The value is
        ``{"error": message}`` when text extraction reported an error,
        otherwise a dict with the sorted unique ``Persons``,
        ``Organizations``, ``Locations`` and ``Other`` entity strings, the
        ``Cleaned_Text`` and the ``Classification (NER)`` summary.
    """
    # Local import: the file's top-level import block is not fully visible here.
    import os

    all_results = {}

    # ``files or []`` keeps a click with no upload from raising TypeError.
    for file in files or []:
        # Accept both wrapper objects carrying ``.name`` and bare path strings;
        # basename also copes with the platform's own path separator.
        file_path = getattr(file, "name", file)
        file_name = os.path.basename(str(file_path))

        resume_text, error = extract_resume_text(file)
        if error:
            all_results[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)
        entities = ner_pipeline(cleaned_text)
        classification = classify_resume_ner(entities)

        # Bucket the entity words in a single pass; anything that is not
        # PER/ORG/LOC lands in ``other``.
        by_group = {"PER": set(), "ORG": set(), "LOC": set()}
        other = set()
        for entity in entities:
            by_group.get(entity["entity_group"], other).add(entity["word"])

        # Sorted so the JSON output is stable between runs.
        all_results[file_name] = {
            "Persons": sorted(by_group["PER"]),
            "Organizations": sorted(by_group["ORG"]),
            "Locations": sorted(by_group["LOC"]),
            "Other": sorted(other),
            "Cleaned_Text": cleaned_text,
            "Classification (NER)": classification,
        }

    return all_results
73
 
74
def classify_resumes_with_model(files):
    """Predict a label for each uploaded resume PDF with the text classifier.

    Args:
        files: Uploaded files from the file input. Each item may be an object
            exposing a ``name`` path attribute or a plain path string.
            ``None`` or an empty list (button clicked with nothing uploaded)
            is accepted and yields an empty result.

    Returns:
        A dict keyed by each file's base name. The value is
        ``{"error": message}`` when text extraction reported an error,
        otherwise the top prediction's label and its score rounded to four
        decimal places.
    """
    # Local import: the file's top-level import block is not fully visible here.
    import os

    predictions = {}

    # ``files or []`` keeps a click with no upload from raising TypeError.
    for file in files or []:
        # Accept both wrapper objects carrying ``.name`` and bare path strings;
        # basename also copes with the platform's own path separator.
        file_path = getattr(file, "name", file)
        file_name = os.path.basename(str(file_path))

        resume_text, error = extract_resume_text(file)
        if error:
            predictions[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)

        # Only the first 512 *characters* are classified. This is a character
        # slice, not a token limit, so long resumes are judged on their opening.
        result = text_classifier(cleaned_text[:512])
        top = result[0]
        predictions[file_name] = {
            "Predicted Label (HuggingFace Classifier)": top["label"],
            "Confidence": round(top["score"], 4),
        }

    return predictions
89
 
90
  # Gradio UI
91
# Two-button UI over a single multi-file upload: one button runs entity
# extraction, the other runs the text classifier. Each writes to its own
# JSON panel.
with gr.Blocks(title="Multi-Resume Entity & Job Classifier") as demo:
    # Emoji restored from mis-encoded bytes in the heading and button labels.
    gr.Markdown("## 📂 Multi-Resume Entity Extractor & Classifier\nUpload multiple PDF resumes below. This tool extracts text, identifies key entities, and classifies job field using a Hugging Face model.")

    with gr.Row():
        file_input = gr.File(file_types=[".pdf"], label="Upload Resume PDFs", file_count="multiple")

    with gr.Row():
        extract_button = gr.Button("🔍 Extract & Analyze Entities")
        classify_button = gr.Button("🧠 Predict Job Role with Classifier")

    output_entities = gr.JSON(label="Entity Extraction & NER Classification")
    output_class = gr.JSON(label="Predicted Job Classification (Model)")

    # Both handlers read the same upload but fill separate output panels.
    extract_button.click(fn=process_resumes, inputs=[file_input], outputs=[output_entities])
    classify_button.click(fn=classify_resumes_with_model, inputs=[file_input], outputs=[output_class])

if __name__ == "__main__":
    demo.launch()