TKM03 committed on
Commit
b740a24
·
verified ·
1 Parent(s): e39d53c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -46
app.py CHANGED
@@ -2,18 +2,19 @@ import re
2
  import PyPDF2
3
  import gradio as gr
4
  from transformers import pipeline
 
5
 
6
- # Load the Hugging Face NER model pipeline
7
  ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")
8
 
9
  def clean_resume_text(text):
10
  """Clean resume text by removing unwanted characters and formatting."""
11
- text = re.sub(r'http\S+', ' ', text) # Remove URLs
12
- text = re.sub(r'#\S+', '', text) # Remove hashtags
13
- text = re.sub(r'@\S+', ' ', text) # Remove mentions
14
- text = re.sub(r'[^\w\s]', ' ', text) # Remove punctuation
15
- text = re.sub(r'[^\x00-\x7f]', ' ', text) # Remove non-ASCII characters
16
- return re.sub(r'\s+', ' ', text).strip() # Normalize whitespace
17
 
18
  def extract_resume_text(file):
19
  """Extract raw text from uploaded PDF file."""
@@ -25,54 +26,63 @@ def extract_resume_text(file):
25
  if page_text:
26
  text += page_text + " "
27
  if not text.strip():
28
- return "Error: No text extracted from PDF."
29
- return text
30
  except Exception as e:
31
- return f"Error reading PDF: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- def extract_entities_from_pdf(file):
34
- """Main processing function: Extracts and cleans text, runs NER, and returns structured data."""
35
- try:
36
- resume_text = extract_resume_text(file)
37
- if resume_text.startswith("Error"):
38
- return {"error": resume_text}
39
-
40
- entities = ner_pipeline(resume_text)
41
-
42
  result = {
43
- "Persons": [],
44
- "Organizations": [],
45
- "Locations": [],
46
- "Other": []
 
 
47
  }
48
 
49
- for entity in entities:
50
- label = entity.get("entity_group")
51
- word = entity.get("word")
52
- if label == "PER":
53
- result["Persons"].append(word)
54
- elif label == "ORG":
55
- result["Organizations"].append(word)
56
- elif label == "LOC":
57
- result["Locations"].append(word)
58
- else:
59
- result["Other"].append(word)
60
 
61
- result["Cleaned_Text"] = clean_resume_text(resume_text)
62
- return result
63
-
64
- except Exception as e:
65
- return {"error": f"Exception during processing: {str(e)}"}
66
 
67
- # Gradio interface
68
  iface = gr.Interface(
69
- fn=extract_entities_from_pdf,
70
- inputs=gr.File(file_types=[".pdf"]),
71
- outputs=gr.JSON(),
72
- title="🧹 Resume Cleaner & Entity Extractor",
73
- description="Upload a PDF resume. The app will clean the text and extract entities like Person, Organization, and Location using a Hugging Face NER model."
74
  )
75
 
76
- # Launch
77
  if __name__ == "__main__":
78
  iface.launch()
 
2
  import PyPDF2
3
  import gradio as gr
4
  from transformers import pipeline
5
+ from collections import Counter
6
 
7
# Build the Hugging Face NER pipeline at module import; the same model id is
# used for both the weights and the tokenizer, and "simple" aggregation makes
# the pipeline return grouped entities (dicts carrying "entity_group"/"word").
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
)
9
 
10
  def clean_resume_text(text):
11
  """Clean resume text by removing unwanted characters and formatting."""
12
+ text = re.sub(r'http\S+', ' ', text)
13
+ text = re.sub(r'#\S+', '', text)
14
+ text = re.sub(r'@\S+', ' ', text)
15
+ text = re.sub(r'[^\w\s]', ' ', text)
16
+ text = re.sub(r'[^\x00-\x7f]', ' ', text)
17
+ return re.sub(r'\s+', ' ', text).strip()
18
 
19
  def extract_resume_text(file):
20
  """Extract raw text from uploaded PDF file."""
 
26
  if page_text:
27
  text += page_text + " "
28
  if not text.strip():
29
+ return None, "Error: No text extracted from PDF."
30
+ return text, None
31
  except Exception as e:
32
+ return None, f"Error reading PDF: {str(e)}"
33
+
34
def classify_resume(entities):
    """Classify a resume by its most frequent entity of each kind.

    Counts the "word" of every entity whose "entity_group" is ORG, LOC or
    MISC and reports the most common word per group. MISC is used as a rough
    stand-in for a job/field; this is a heuristic, not a real job classifier.

    Args:
        entities: Iterable of dicts with "entity_group" and "word" keys, as
            produced by the NER pipeline. ``None`` is treated as empty, and
            entries missing either key (or with an empty word) are ignored
            rather than raising ``KeyError``.

    Returns:
        Dict with keys "Main_Organization", "Main_Location" and
        "Possible_Job/Field". Groups with no entities fall back to
        "Unknown", "Unknown" and "General" respectively. Ties go to the
        word seen first.
    """
    # One Counter per group of interest, filled in a single pass.
    tallies = {"ORG": Counter(), "LOC": Counter(), "MISC": Counter()}
    for entity in entities or ():
        group = entity.get("entity_group")
        word = entity.get("word")
        if group in tallies and word:
            tallies[group][word] += 1

    def _dominant(group, fallback):
        """Return the most common word for *group*, or *fallback* if none."""
        top = tallies[group].most_common(1)
        return top[0][0] if top else fallback

    return {
        "Main_Organization": _dominant("ORG", "Unknown"),
        "Main_Location": _dominant("LOC", "Unknown"),
        "Possible_Job/Field": _dominant("MISC", "General"),
    }
49
+
50
def extract_entities_from_pdfs(files):
    """Process multiple resumes, extract entities, and classify.

    For each uploaded file: extract its text, clean it, run the NER pipeline
    on the cleaned text, group the distinct entity words by type, and attach
    the result of ``classify_resume``.

    Args:
        files: Uploaded files. Each item may be a path string or an object
            with a ``.name`` path attribute (Gradio passes one or the other
            depending on version/configuration). ``None`` or an empty list,
            i.e. nothing uploaded, yields an empty summary.

    Returns:
        Dict mapping each file's base name to either
        ``{"error": <message>}`` or a dict with keys "Persons",
        "Organizations", "Locations", "Other", "Cleaned_Text" and
        "Classification". When two uploads share a base name, later ones are
        stored as "name (2)", "name (3)", ... instead of overwriting the
        first. Entity lists are de-duplicated in first-seen order.
    """
    # Local import: only needed here to derive display names from paths.
    import os

    summary = {}

    for file in files or ():
        path = file if isinstance(file, str) else getattr(file, "name", str(file))
        base_name = os.path.basename(path)

        # Keep every upload visible even when base names collide.
        file_name, suffix = base_name, 2
        while file_name in summary:
            file_name = f"{base_name} ({suffix})"
            suffix += 1

        # Per-file boundary: a failure in one resume (e.g. inside the NER
        # model) is reported for that file and must not abort the batch.
        try:
            resume_text, error = extract_resume_text(file)
            if error:
                summary[file_name] = {"error": error}
                continue

            cleaned_text = clean_resume_text(resume_text)
            entities = ner_pipeline(cleaned_text)

            # Dicts used as insertion-ordered sets: one pass, stable output.
            grouped = {"PER": {}, "ORG": {}, "LOC": {}, "OTHER": {}}
            for entity in entities:
                word = entity.get("word")
                if not word:
                    continue
                group = entity.get("entity_group")
                bucket = group if group in ("PER", "ORG", "LOC") else "OTHER"
                grouped[bucket][word] = None

            summary[file_name] = {
                "Persons": list(grouped["PER"]),
                "Organizations": list(grouped["ORG"]),
                "Locations": list(grouped["LOC"]),
                "Other": list(grouped["OTHER"]),
                "Cleaned_Text": cleaned_text,
                "Classification": classify_resume(entities),
            }
        except Exception as e:
            summary[file_name] = {"error": f"Exception during processing: {str(e)}"}

    return summary
 
 
 
 
77
 
78
# Gradio UI: a multi-file PDF upload feeding extract_entities_from_pdfs,
# with the per-file summary rendered as JSON.
iface = gr.Interface(
    fn=extract_entities_from_pdfs,
    inputs=gr.File(
        file_types=[".pdf"],
        label="Upload Resumes (PDF)",
        file_count="multiple",
    ),
    outputs=gr.JSON(label="Resume Classification & Entity Summary"),
    title="📂 Multi-Resume Entity Extractor & Classifier",
    description="Upload multiple PDF resumes. This tool extracts text, identifies key entities, and classifies each resume by organizations, locations, and possible job/field.",
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()