import re
import PyPDF2
import gradio as gr
from transformers import pipeline
from collections import Counter

# Load NER pipeline
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
)

# Load Job Category Classifier
text_classifier = pipeline("text-classification", model="serbog/distilbert-jobCategory_410k")

# Mapping from category code to readable label
CATEGORY_MAP = {
    "C1": "Engineering",
    "C2": "Information Technology",
    "C3": "Sales & Marketing",
    "C4": "Accounting & Finance",
    "C5": "Healthcare",
    "D1": "Education",
    "D2": "Human Resources",
    "E1": "Operations & Logistics",
    "E2": "Legal",
    "F1": "Customer Support",
    "Other": "General / Undefined",
}


def clean_resume_text(text):
    """Clean text by removing unwanted characters and formatting."""
    text = re.sub(r'http\S+', ' ', text)       # URLs
    text = re.sub(r'#\S+', '', text)           # hashtags
    text = re.sub(r'@\S+', ' ', text)          # @-mentions / handles
    text = re.sub(r'[^\w\s]', ' ', text)       # punctuation
    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # non-ASCII characters
    return re.sub(r'\s+', ' ', text).strip()   # collapse whitespace


def extract_resume_text(file):
    """Extract raw text from an uploaded PDF."""
    try:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text += page_text + " "
        if not text.strip():
            return None, "Error: No text extracted from PDF."
        return text, None
    except Exception as e:
        return None, f"Error reading PDF: {str(e)}"


def classify_resume_ner(entities):
    """Basic rule-based classification from NER output, using ORG, LOC, and MISC entities."""
    orgs = [e['word'] for e in entities if e['entity_group'] == 'ORG']
    locs = [e['word'] for e in entities if e['entity_group'] == 'LOC']
    jobs = [e['word'] for e in entities if e['entity_group'] == 'MISC']

    dominant_org = Counter(orgs).most_common(1)
    dominant_loc = Counter(locs).most_common(1)
    dominant_job = Counter(jobs).most_common(1)

    return {
        "Main_Organization": dominant_org[0][0] if dominant_org else "Unknown",
        "Main_Location": dominant_loc[0][0] if dominant_loc else "Unknown",
        "Possible_Job/Field (NER)": dominant_job[0][0] if dominant_job else "General",
    }


def process_resumes(files):
    """Extract entities from each resume and show the rule-based NER classification."""
    all_results = {}
    for file in files:
        file_name = file.name.split("/")[-1]
        resume_text, error = extract_resume_text(file)
        if error:
            all_results[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)
        entities = ner_pipeline(cleaned_text)
        classification = classify_resume_ner(entities)

        all_results[file_name] = {
            "Persons": list({e["word"] for e in entities if e["entity_group"] == "PER"}),
            "Organizations": list({e["word"] for e in entities if e["entity_group"] == "ORG"}),
            "Locations": list({e["word"] for e in entities if e["entity_group"] == "LOC"}),
            "Other Entities": list({e["word"] for e in entities if e["entity_group"] not in ["PER", "ORG", "LOC"]}),
            "Cleaned_Text": cleaned_text,
            "Classification (NER)": classification,
        }
    return all_results


def classify_resumes_with_model(files):
    """Use the job-category model to classify each resume into a readable job field."""
    predictions = {}
    for file in files:
        file_name = file.name.split("/")[-1]
        resume_text, error = extract_resume_text(file)
        if error:
            predictions[file_name] = {"error": error}
            continue

        cleaned_text = clean_resume_text(resume_text)
        result = text_classifier(cleaned_text[:512])  # character-level truncation keeps input safely short
        raw_label = result[0]['label']
        readable_label = CATEGORY_MAP.get(raw_label, "Unknown")

        predictions[file_name] = {
            "Predicted Job Category": readable_label,
            "Raw Label": raw_label,
            "Confidence Score": round(result[0]['score'], 4),
        }
    return predictions


# Gradio Interface
with gr.Blocks(title="Resume Analyzer") as demo:
    gr.Markdown(
        "## 📂 Multi-Resume Entity Extractor & Job Classifier\n"
        "Upload multiple PDF resumes. This tool extracts entities using NER "
        "and predicts the job field using a trained classifier model."
    )

    with gr.Row():
        file_input = gr.File(file_types=[".pdf"], label="Upload Resume PDFs", file_count="multiple")

    with gr.Row():
        extract_button = gr.Button("🔍 Extract Entities (NER)")
        classify_button = gr.Button("🧠 Predict Job Category (Model)")

    output_entities = gr.JSON(label="NER Results & Classification")
    output_class = gr.JSON(label="Model-Predicted Job Category")

    extract_button.click(fn=process_resumes, inputs=[file_input], outputs=[output_entities])
    classify_button.click(fn=classify_resumes_with_model, inputs=[file_input], outputs=[output_class])


if __name__ == "__main__":
    demo.launch()
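
# Usage notes (not part of the app itself; package names and the sample file
# below are assumptions, not something the script ships with):
#
# The imports above suggest these third-party packages, plus torch as the
# backend for the transformers pipelines:
#
#     pip install PyPDF2 gradio transformers torch
#
# The two processing functions can also be exercised outside the Gradio UI
# with any object exposing a ``.name`` path, e.g. a plain file handle on a
# hypothetical sample_resume.pdf:
#
#     with open("sample_resume.pdf", "rb") as f:
#         print(process_resumes([f]))
#         print(classify_resumes_with_model([f]))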