File size: 4,367 Bytes
25c62c3
 
 
 
b740a24
25c62c3
8bfcb85
25c62c3
 
8bfcb85
99e48e5
 
8bfcb85
25c62c3
b740a24
 
 
 
 
 
25c62c3
 
 
 
 
 
e39d53c
 
 
 
b740a24
 
25c62c3
b740a24
 
8bfcb85
b740a24
 
 
 
 
 
 
 
 
 
 
8bfcb85
b740a24
 
8bfcb85
 
b740a24
 
 
 
 
8bfcb85
b740a24
 
 
 
8bfcb85
25c62c3
8bfcb85
b740a24
 
 
 
 
8bfcb85
e39d53c
8bfcb85
e39d53c
8bfcb85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25c62c3
b740a24
8bfcb85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25c62c3
 
8bfcb85
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import re
from collections import Counter

import gradio as gr
import PyPDF2
from transformers import pipeline

# Named-entity recognition pipeline. With the "simple" aggregation strategy
# each returned entity is read downstream through its `word` and
# `entity_group` fields.
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

# Text-classification pipeline used to predict a label for a whole resume
# (a job-title classifier, judging by the checkpoint name).
text_classifier = pipeline(
    "text-classification",
    model="khaimait/job-title-classification-bert",
)


def clean_resume_text(text):
    """Normalise raw resume text into plain, space-separated word tokens.

    The substitutions run in a fixed order: URLs, hashtags and @-mentions
    are dropped first, then every remaining punctuation character and every
    non-ASCII character is turned into a space. Finally, runs of whitespace
    are collapsed to a single space and the ends are trimmed.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    # (pattern, replacement) pairs; order matters because later patterns
    # would otherwise break up the URL/hashtag/mention tokens.
    substitutions = (
        (r'http\S+', ' '),
        (r'#\S+', ''),
        (r'@\S+', ' '),
        (r'[^\w\s]', ' '),
        (r'[^\x00-\x7f]', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def extract_resume_text(file):
    """Extract the text of every page of a PDF.

    Args:
        file: Whatever ``PyPDF2.PdfReader`` accepts as its source argument.

    Returns:
        A ``(text, error)`` pair. On success ``text`` holds the page texts,
        each followed by a single space, and ``error`` is ``None``. On
        failure ``text`` is ``None`` and ``error`` is a human-readable
        message; this covers both PDFs that yield no text and any exception
        raised while reading.
    """
    try:
        pdf = PyPDF2.PdfReader(file)
        page_texts = (page.extract_text() for page in pdf.pages)
        # Pages that yield nothing (None or "") contribute nothing at all.
        combined = "".join(chunk + " " for chunk in page_texts if chunk)
        if combined.strip():
            return combined, None
        return None, "Error: No text extracted from PDF."
    except Exception as exc:
        # Deliberately broad: any reader failure is reported to the caller
        # as an error message instead of propagating.
        return None, f"Error reading PDF: {str(exc)}"

def classify_resume_ner(entities):
    """Summarise NER output by its most frequent ORG, LOC and MISC words.

    Args:
        entities: Entity dicts, each providing ``'entity_group'`` and
            ``'word'`` keys.

    Returns:
        dict with the keys ``"Main_Organization"``, ``"Main_Location"`` and
        ``"Possible_Job/Field (NER)"``. Each value is the most common word
        for the ORG, LOC and MISC group respectively; when a group has no
        entities the value is ``"Unknown"`` (ORG, LOC) or ``"General"``
        (MISC).
    """
    def most_frequent(label, fallback):
        # Count the words tagged with `label` and return the top one.
        tally = Counter(
            ent['word'] for ent in entities if ent['entity_group'] == label
        )
        top = tally.most_common(1)
        if not top:
            return fallback
        word, _count = top[0]
        return word

    main_org = most_frequent('ORG', "Unknown")
    main_loc = most_frequent('LOC', "Unknown")
    # MISC entities are treated as a rough hint at the job or field.
    likely_field = most_frequent('MISC', "General")

    return {
        "Main_Organization": main_org,
        "Main_Location": main_loc,
        "Possible_Job/Field (NER)": likely_field,
    }

def process_resumes(files):
    """Run NER over each uploaded resume and summarise the entities found.

    Args:
        files: Uploaded files from the Gradio file input: objects exposing
            a ``.name`` path, or plain path strings. ``None`` or an empty
            list (the button was clicked before anything was uploaded)
            produces an empty result instead of raising.

    Returns:
        dict mapping each file's base name to either ``{"error": message}``
        when text extraction failed, or a dict with the keys ``"Persons"``,
        ``"Organizations"``, ``"Locations"``, ``"Other"`` (sorted lists of
        unique entity words), ``"Cleaned_Text"`` and
        ``"Classification (NER)"``.
    """
    if not files:
        return {}

    all_results = {}
    for file in files:
        # basename() instead of splitting on "/" so the key is also right
        # on platforms that use a different path separator.
        file_name = os.path.basename(str(getattr(file, "name", file)))
        resume_text, error = extract_resume_text(file)

        if error:
            all_results[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)
        entities = ner_pipeline(cleaned_text)
        classification = classify_resume_ner(entities)

        # Single pass over the entities: every group other than PER/ORG/LOC
        # lands in `other`.
        grouped = {"PER": set(), "ORG": set(), "LOC": set()}
        other = set()
        for entity in entities:
            grouped.get(entity["entity_group"], other).add(entity["word"])

        # Sorted so the output is deterministic from run to run.
        all_results[file_name] = {
            "Persons": sorted(grouped["PER"]),
            "Organizations": sorted(grouped["ORG"]),
            "Locations": sorted(grouped["LOC"]),
            "Other": sorted(other),
            "Cleaned_Text": cleaned_text,
            "Classification (NER)": classification,
        }
    return all_results

def classify_resumes_with_model(files):
    """Predict a label for each uploaded resume with the text classifier.

    Args:
        files: Uploaded files from the Gradio file input: objects exposing
            a ``.name`` path, or plain path strings. ``None`` or an empty
            list (the button was clicked before anything was uploaded)
            produces an empty result instead of raising.

    Returns:
        dict mapping each file's base name to either ``{"error": message}``
        when text extraction failed, or a dict holding the predicted label
        and its confidence rounded to four decimals.
    """
    if not files:
        return {}

    predictions = {}
    for file in files:
        # basename() instead of splitting on "/" so the key is also right
        # on platforms that use a different path separator.
        file_name = os.path.basename(str(getattr(file, "name", file)))
        resume_text, error = extract_resume_text(file)
        if error:
            predictions[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)
        # Only the first 512 *characters* are classified. This is a
        # character cut, not a token cut, so long resumes are judged on
        # their opening text only.
        result = text_classifier(cleaned_text[:512])
        top = result[0]
        predictions[file_name] = {
            "Predicted Label (HuggingFace Classifier)": top['label'],
            "Confidence": round(top['score'], 4),
        }
    return predictions

# Gradio UI: one multi-file PDF upload feeding two independent actions, each
# with its own JSON output panel.
# Emoji are written as \U escapes so the labels survive a re-encoding of this
# file: U+1F4C2 open folder, U+1F50D magnifying glass, U+1F9E0 brain.
with gr.Blocks(title="Multi-Resume Entity & Job Classifier") as demo:
    gr.Markdown("## \U0001F4C2 Multi-Resume Entity Extractor & Classifier\nUpload multiple PDF resumes below. This tool extracts text, identifies key entities, and classifies job field using a Hugging Face model.")

    with gr.Row():
        file_input = gr.File(file_types=[".pdf"], label="Upload Resume PDFs", file_count="multiple")

    with gr.Row():
        extract_button = gr.Button("\U0001F50D Extract & Analyze Entities")
        classify_button = gr.Button("\U0001F9E0 Predict Job Role with Classifier")

    output_entities = gr.JSON(label="Entity Extraction & NER Classification")
    output_class = gr.JSON(label="Predicted Job Classification (Model)")

    # Both buttons read the same upload; each writes to its own panel.
    extract_button.click(fn=process_resumes, inputs=[file_input], outputs=[output_entities])
    classify_button.click(fn=classify_resumes_with_model, inputs=[file_input], outputs=[output_class])

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()