File size: 5,108 Bytes
25c62c3
 
 
 
b740a24
25c62c3
cc2242c
25c62c3
cc2242c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9f062d8
25c62c3
cc2242c
b740a24
 
 
 
 
 
25c62c3
 
cc2242c
25c62c3
 
 
 
e39d53c
 
 
 
b740a24
 
25c62c3
b740a24
 
8bfcb85
cc2242c
b740a24
 
 
 
 
 
 
 
 
 
 
8bfcb85
b740a24
 
8bfcb85
cc2242c
8bfcb85
b740a24
 
 
 
8bfcb85
b740a24
 
 
 
8bfcb85
25c62c3
8bfd778
 
 
 
 
 
 
 
8bfcb85
e39d53c
8bfcb85
cc2242c
8bfcb85
 
 
 
 
 
 
 
cc2242c
 
 
 
8bfcb85
cc2242c
 
3453a71
8bfcb85
 
25c62c3
3453a71
cc2242c
 
8bfcb85
 
9f062d8
8bfcb85
 
9f062d8
 
8bfcb85
9f062d8
cc2242c
8bfcb85
 
 
25c62c3
 
8bfcb85
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import os
import re
from collections import Counter

import gradio as gr
import PyPDF2
from transformers import pipeline

# --- Model setup (module-level side effect: both models are downloaded/loaded
# --- at import time, which may take a while on first run).

# Load NER pipeline.
# aggregation_strategy="simple" merges word-piece tokens into whole-word
# entities, so downstream code can read e["word"] and e["entity_group"].
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")

# Load Job Category Classifier (outputs raw labels like "C1", "D2", ...).
text_classifier = pipeline("text-classification", model="serbog/distilbert-jobCategory_410k")

# Mapping from the classifier's raw category code to a human-readable label.
# NOTE(review): these code→label pairings are assumed from the model card of
# serbog/distilbert-jobCategory_410k — verify against the model's config.
CATEGORY_MAP = {
    "C1": "Engineering",
    "C2": "Information Technology",
    "C3": "Sales & Marketing",
    "C4": "Accounting & Finance",
    "C5": "Healthcare",
    "D1": "Education",
    "D2": "Human Resources",
    "E1": "Operations & Logistics",
    "E2": "Legal",
    "F1": "Customer Support",
    "Other": "General / Undefined"
}

def clean_resume_text(text):
    """Normalize resume text for the NLP models.

    Strips URLs, hashtags, @-mentions, punctuation, and non-ASCII
    characters, then collapses runs of whitespace into single spaces.
    The substitution order matters: URLs/hashtags/mentions must go
    before the generic punctuation pass, which would otherwise split
    them apart.
    """
    substitutions = (
        (r'http\S+', ' '),       # URLs
        (r'#\S+', ''),           # hashtags (removed entirely, no space)
        (r'@\S+', ' '),          # @-mentions
        (r'[^\w\s]', ' '),       # punctuation / symbols
        (r'[^\x00-\x7f]', ' '),  # non-ASCII characters
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return re.sub(r'\s+', ' ', text).strip()

def extract_resume_text(file):
    """Extract raw text from an uploaded PDF.

    Returns a ``(text, error)`` pair: ``(text, None)`` on success, or
    ``(None, message)`` when the PDF is unreadable or yields no text.
    """
    try:
        chunks = []
        for page in PyPDF2.PdfReader(file).pages:
            content = page.extract_text()
            # extract_text() may return None/"" for image-only pages.
            if content:
                chunks.append(content + " ")
        combined = "".join(chunks)
        if not combined.strip():
            return None, "Error: No text extracted from PDF."
        return combined, None
    except Exception as e:
        # Broad catch is deliberate: any parse failure becomes a
        # user-facing error string rather than a crash in the UI.
        return None, f"Error reading PDF: {str(e)}"

def classify_resume_ner(entities):
    """Rule-based classification from NER output.

    Picks the most frequently mentioned ORG, LOC, and MISC entity as the
    dominant organization, location, and possible job field respectively,
    falling back to a default label when a group has no mentions.
    """
    def _top(group, fallback):
        # Most common surface form within one entity group, or the fallback.
        counts = Counter(e['word'] for e in entities if e['entity_group'] == group)
        ranked = counts.most_common(1)
        return ranked[0][0] if ranked else fallback

    return {
        "Main_Organization": _top('ORG', "Unknown"),
        "Main_Location": _top('LOC', "Unknown"),
        "Possible_Job/Field (NER)": _top('MISC', "General"),
    }

def process_resumes(files):
    """Run NER over each uploaded resume and summarize the entities found.

    Parameters
    ----------
    files : list of uploaded file objects (each exposing a ``.name`` path).

    Returns
    -------
    dict mapping file name -> summary dict with deduplicated entity lists,
    the cleaned text, and the rule-based classification; on extraction
    failure the value is ``{"error": message}`` instead.
    """
    all_results = {}
    for file in files:
        # os.path.basename handles both / and \ separators; the previous
        # split("/")[-1] returned the full path for Windows-style paths.
        file_name = os.path.basename(file.name)
        resume_text, error = extract_resume_text(file)
        if error:
            all_results[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)
        entities = ner_pipeline(cleaned_text)
        classification = classify_resume_ner(entities)

        # Set comprehensions deduplicate repeated mentions before the
        # list() conversion needed for JSON serialization in the UI.
        all_results[file_name] = {
            "Persons": list({e["word"] for e in entities if e["entity_group"] == "PER"}),
            "Organizations": list({e["word"] for e in entities if e["entity_group"] == "ORG"}),
            "Locations": list({e["word"] for e in entities if e["entity_group"] == "LOC"}),
            "Other Entities": list({e["word"] for e in entities if e["entity_group"] not in ["PER", "ORG", "LOC"]}),
            "Cleaned_Text": cleaned_text,
            "Classification (NER)": classification
        }
    return all_results

def classify_resumes_with_model(files):
    """Classify each uploaded resume into a readable job field.

    Parameters
    ----------
    files : list of uploaded file objects (each exposing a ``.name`` path).

    Returns
    -------
    dict mapping file name -> {"Predicted Job Category", "Raw Label",
    "Confidence Score"}; on extraction failure the value is
    ``{"error": message}`` instead.
    """
    predictions = {}
    for file in files:
        # os.path.basename handles both / and \ separators; the previous
        # split("/")[-1] returned the full path for Windows-style paths.
        file_name = os.path.basename(file.name)
        resume_text, error = extract_resume_text(file)
        if error:
            predictions[file_name] = {"error": error}
            continue
        cleaned_text = clean_resume_text(resume_text)
        # NOTE(review): [:512] truncates *characters*, not tokens; the model's
        # tokenizer limit is in tokens — confirm this is short enough in practice.
        result = text_classifier(cleaned_text[:512])  # Truncate for safety
        raw_label = result[0]['label']
        # Unmapped codes fall back to "Unknown" rather than raising.
        readable_label = CATEGORY_MAP.get(raw_label, "Unknown")

        predictions[file_name] = {
            "Predicted Job Category": readable_label,
            "Raw Label": raw_label,
            "Confidence Score": round(result[0]['score'], 4)
        }
    return predictions

# --- Gradio Interface -------------------------------------------------------
# Two independent actions share one multi-file PDF input: NER extraction and
# model-based job-category prediction, each rendering into its own JSON panel.
with gr.Blocks(title="Resume Analyzer") as demo:
    gr.Markdown("## πŸ“‚ Multi-Resume Entity Extractor & Job Classifier\nUpload multiple PDF resumes. This tool extracts entities using NER and predicts the job field using a trained classifier model.")

    with gr.Row():
        # file_count="multiple" makes the callbacks receive a list of files.
        file_input = gr.File(file_types=[".pdf"], label="Upload Resume PDFs", file_count="multiple")

    with gr.Row():
        extract_button = gr.Button("πŸ” Extract Entities (NER)")
        classify_button = gr.Button("🧠 Predict Job Category (Model)")

    # gr.JSON renders the dicts returned by the two handlers directly.
    output_entities = gr.JSON(label="NER Results & Classification")
    output_class = gr.JSON(label="Model-Predicted Job Category")

    extract_button.click(fn=process_resumes, inputs=[file_input], outputs=[output_entities])
    classify_button.click(fn=classify_resumes_with_model, inputs=[file_input], outputs=[output_class])

# Launch the web UI only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()