Spaces:
Sleeping
Sleeping
File size: 4,367 Bytes
25c62c3 b740a24 25c62c3 8bfcb85 25c62c3 8bfcb85 99e48e5 8bfcb85 25c62c3 b740a24 25c62c3 e39d53c b740a24 25c62c3 b740a24 8bfcb85 b740a24 8bfcb85 b740a24 8bfcb85 b740a24 8bfcb85 b740a24 8bfcb85 25c62c3 8bfcb85 b740a24 8bfcb85 e39d53c 8bfcb85 e39d53c 8bfcb85 25c62c3 b740a24 8bfcb85 25c62c3 8bfcb85 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
import re
from collections import Counter
from pathlib import Path

import PyPDF2
import gradio as gr
from transformers import pipeline
# Named-entity recognition pipeline (BERT fine-tuned for NER);
# aggregation_strategy="simple" merges word-piece tokens back into whole entities,
# which is why downstream code can read e["word"] / e["entity_group"] directly.
ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")
# Text-classification pipeline used to predict a job title/field from resume text.
# NOTE(review): presumably a job-title classifier per the model id — confirm the
# model exists on the Hub and its label set matches what the UI expects.
text_classifier = pipeline("text-classification", model="khaimait/job-title-classification-bert")
def clean_resume_text(text):
    """Normalize raw resume text for downstream NLP.

    Strips URLs, hashtags and @-mentions, replaces punctuation and
    non-ASCII characters with spaces, then collapses runs of whitespace.
    """
    # (pattern, replacement) pairs applied in order; hashtags are deleted
    # outright while the others are replaced by a space.
    substitutions = (
        (r'http\S+', ' '),        # URLs
        (r'#\S+', ''),            # hashtags
        (r'@\S+', ' '),           # mentions
        (r'[^\w\s]', ' '),        # punctuation / symbols
        (r'[^\x00-\x7f]', ' '),   # non-ASCII characters
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return re.sub(r'\s+', ' ', text).strip()
def extract_resume_text(file):
    """Extract all page text from a PDF resume.

    Args:
        file: A path or binary file-like object accepted by PyPDF2.PdfReader.

    Returns:
        Tuple ``(text, error)``: on success ``text`` is the space-joined page
        text (with a trailing space, as produced by page concatenation) and
        ``error`` is None; on failure ``text`` is None and ``error`` is a
        human-readable message.
    """
    try:
        reader = PyPDF2.PdfReader(file)
        # Collect page texts and join once — avoids quadratic str += in a loop.
        pages = [t for page in reader.pages if (t := page.extract_text())]
        text = " ".join(pages)
        if not text.strip():
            return None, "Error: No text extracted from PDF."
        return text + " ", None
    except Exception as e:
        # Broad catch is deliberate: any parse failure becomes a per-file
        # error entry in the UI instead of crashing the whole batch.
        return None, f"Error reading PDF: {str(e)}"
def classify_resume_ner(entities):
    """Heuristically classify a resume from aggregated NER entities.

    Picks the most frequent ORG, LOC and MISC entity (MISC is used as a
    rough proxy for job field) and returns them in a display-ready dict,
    falling back to a default label when a group is absent.
    """
    def most_frequent(group, fallback):
        # Top entity word for one entity_group, or the fallback if none seen.
        words = [ent['word'] for ent in entities if ent['entity_group'] == group]
        ranked = Counter(words).most_common(1)
        return ranked[0][0] if ranked else fallback

    return {
        "Main_Organization": most_frequent('ORG', "Unknown"),
        "Main_Location": most_frequent('LOC', "Unknown"),
        "Possible_Job/Field (NER)": most_frequent('MISC', "General"),
    }
def process_resumes(files):
    """Run NER extraction and heuristic classification over uploaded PDFs.

    Args:
        files: Iterable of uploaded file objects (each exposes ``.name``),
            as provided by the Gradio File component.

    Returns:
        Dict keyed by file name; each value is either ``{"error": msg}`` or a
        dict with de-duplicated entity lists, the cleaned text, and the
        NER-based classification.
    """
    all_results = {}
    for file in files:
        # Path(...).name is portable across OS path separators, unlike
        # split("/") which leaves the full path intact on Windows.
        file_name = Path(file.name).name
        resume_text, error = extract_resume_text(file)
        if error:
            all_results[file_name] = {"error": error}
            continue
        cleaned_text = clean_resume_text(resume_text)
        entities = ner_pipeline(cleaned_text)
        classification = classify_resume_ner(entities)
        # Sets de-duplicate repeated entity mentions; order is not significant
        # for the JSON display.
        all_results[file_name] = {
            "Persons": list({e["word"] for e in entities if e["entity_group"] == "PER"}),
            "Organizations": list({e["word"] for e in entities if e["entity_group"] == "ORG"}),
            "Locations": list({e["word"] for e in entities if e["entity_group"] == "LOC"}),
            "Other": list({e["word"] for e in entities if e["entity_group"] not in ["PER", "ORG", "LOC"]}),
            "Cleaned_Text": cleaned_text,
            "Classification (NER)": classification
        }
    return all_results
def classify_resumes_with_model(files):
    """Predict a job role for each uploaded PDF with the HF text classifier.

    Args:
        files: Iterable of uploaded file objects (each exposes ``.name``),
            as provided by the Gradio File component.

    Returns:
        Dict keyed by file name; each value is either ``{"error": msg}`` or
        the top predicted label with its rounded confidence score.
    """
    predictions = {}
    for file in files:
        # Path(...).name is portable across OS path separators, unlike
        # split("/") which leaves the full path intact on Windows.
        file_name = Path(file.name).name
        resume_text, error = extract_resume_text(file)
        if error:
            predictions[file_name] = {"error": error}
            continue
        cleaned_text = clean_resume_text(resume_text)
        # Truncate to the first 512 *characters* (not tokens) to stay within
        # the model's input limit. NOTE(review): the tokenizer would truncate
        # at 512 tokens anyway; this char slice is a coarser cut — confirm
        # whether token-level truncation is preferred.
        result = text_classifier(cleaned_text[:512])
        predictions[file_name] = {
            "Predicted Label (HuggingFace Classifier)": result[0]['label'],
            "Confidence": round(result[0]['score'], 4)
        }
    return predictions
# Gradio UI — two independent actions over the same multi-file upload:
# NER entity extraction and model-based job classification.
# NOTE(review): the stray "π" characters in the labels below look like
# mojibake of emoji from a lossy encoding — confirm against the original file.
with gr.Blocks(title="Multi-Resume Entity & Job Classifier") as demo:
    gr.Markdown("## π Multi-Resume Entity Extractor & Classifier\nUpload multiple PDF resumes below. This tool extracts text, identifies key entities, and classifies job field using a Hugging Face model.")
    with gr.Row():
        # Single upload widget accepting one or more PDF files.
        file_input = gr.File(file_types=[".pdf"], label="Upload Resume PDFs", file_count="multiple")
    with gr.Row():
        extract_button = gr.Button("π Extract & Analyze Entities")
        classify_button = gr.Button("π§ Predict Job Role with Classifier")
    # Each action writes to its own JSON viewer.
    output_entities = gr.JSON(label="Entity Extraction & NER Classification")
    output_class = gr.JSON(label="Predicted Job Classification (Model)")
    extract_button.click(fn=process_resumes, inputs=[file_input], outputs=[output_entities])
    classify_button.click(fn=classify_resumes_with_model, inputs=[file_input], outputs=[output_class])
# Launch the app only when run as a script (Spaces executes this module directly).
if __name__ == "__main__":
    demo.launch()
|