TKM03 committed
Commit e39d53c · verified · 1 Parent(s): 18398f6

Update app.py

Files changed (1)
  1. app.py +52 -40
app.py CHANGED
@@ -3,64 +3,76 @@ import PyPDF2
 import gradio as gr
 from transformers import pipeline

+# Load the Hugging Face NER model pipeline
 ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")

 def clean_resume_text(text):
-    text = re.sub(r'http\S+', ' ', text)
-    text = re.sub(r'#\S+', '', text)
-    text = re.sub(r'@\S+', ' ', text)
-    text = re.sub(r'[^\w\s]', ' ', text)
-    text = re.sub(r'[^\x00-\x7f]', ' ', text)
-    return re.sub(r'\s+', ' ', text).strip()
+    """Clean resume text by removing unwanted characters and formatting."""
+    text = re.sub(r'http\S+', ' ', text)  # Remove URLs
+    text = re.sub(r'#\S+', '', text)  # Remove hashtags
+    text = re.sub(r'@\S+', ' ', text)  # Remove mentions
+    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
+    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # Remove non-ASCII characters
+    return re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

 def extract_resume_text(file):
+    """Extract raw text from uploaded PDF file."""
     try:
         reader = PyPDF2.PdfReader(file)
         text = ""
         for page in reader.pages:
-            extracted = page.extract_text()
-            if extracted:
-                text += extracted + " "
-        return text if text.strip() else "Error: No text extracted."
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + " "
+        if not text.strip():
+            return "Error: No text extracted from PDF."
+        return text
     except Exception as e:
         return f"Error reading PDF: {str(e)}"

 def extract_entities_from_pdf(file):
-    resume_text = extract_resume_text(file)
-    if resume_text.startswith("Error"):
-        return resume_text
-
-    entities = ner_pipeline(resume_text)
-
-    result = {
-        "Persons": [],
-        "Organizations": [],
-        "Locations": [],
-        "Other": []
-    }
-
-    for entity in entities:
-        label = entity.get("entity_group")
-        word = entity.get("word")
-        if label == "PER":
-            result["Persons"].append(word)
-        elif label == "ORG":
-            result["Organizations"].append(word)
-        elif label == "LOC":
-            result["Locations"].append(word)
-        else:
-            result["Other"].append(word)
-
-    result["Cleaned_Text"] = clean_resume_text(resume_text)
-    return result
+    """Main processing function: Extracts and cleans text, runs NER, and returns structured data."""
+    try:
+        resume_text = extract_resume_text(file)
+        if resume_text.startswith("Error"):
+            return {"error": resume_text}
+
+        entities = ner_pipeline(resume_text)
+
+        result = {
+            "Persons": [],
+            "Organizations": [],
+            "Locations": [],
+            "Other": []
+        }
+
+        for entity in entities:
+            label = entity.get("entity_group")
+            word = entity.get("word")
+            if label == "PER":
+                result["Persons"].append(word)
+            elif label == "ORG":
+                result["Organizations"].append(word)
+            elif label == "LOC":
+                result["Locations"].append(word)
+            else:
+                result["Other"].append(word)
+
+        result["Cleaned_Text"] = clean_resume_text(resume_text)
+        return result
+
+    except Exception as e:
+        return {"error": f"Exception during processing: {str(e)}"}

+# Gradio interface
 iface = gr.Interface(
     fn=extract_entities_from_pdf,
     inputs=gr.File(file_types=[".pdf"]),
-    outputs="json",
-    title="Resume Entity Extractor",
-    description="Upload a PDF resume. It will extract names, organizations, and locations using Hugging Face NER."
+    outputs=gr.JSON(),
+    title="🧹 Resume Cleaner & Entity Extractor",
+    description="Upload a PDF resume. The app will clean the text and extract entities like Person, Organization, and Location using a Hugging Face NER model."
 )

+# Launch
 if __name__ == "__main__":
     iface.launch()
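
For reference, a minimal sketch (not part of this commit) of how the updated extract_entities_from_pdf could be exercised outside the Gradio UI; the file name sample_resume.pdf and the driver script itself are hypothetical:

# Local driver sketch for the updated app.py.
# "sample_resume.pdf" is a placeholder path, not part of the commit.
import json

from app import extract_entities_from_pdf

if __name__ == "__main__":
    # PyPDF2.PdfReader accepts a file-like object, so passing an open
    # binary handle approximates what the Gradio File input provides.
    with open("sample_resume.pdf", "rb") as resume_file:
        result = extract_entities_from_pdf(resume_file)
    # Prints either the grouped-entity dict (Persons/Organizations/Locations/
    # Other plus Cleaned_Text) or an {"error": ...} dict, per the new code path.
    print(json.dumps(result, indent=2, ensure_ascii=False))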