TKM03 committed on
Commit
25c62c3
·
verified ·
1 Parent(s): f314cfc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -0
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import PyPDF2
3
+ import gradio as gr
4
+ from transformers import pipeline
5
+
6
# Shared NER pipeline, built once at import time so every request reuses the
# same loaded model. With aggregation_strategy="simple" the pipeline returns
# grouped entities (each result carries an "entity_group" and a "word").
# stride makes the pipeline process the text in overlapping windows instead of
# truncating it at the tokenizer's maximum length, so entities that appear
# late in a long resume are not silently dropped.
ner_pipeline = pipeline(
    "ner",
    model="dslim/bert-base-NER",
    tokenizer="dslim/bert-base-NER",
    aggregation_strategy="simple",
    stride=128,
)
7
+
8
def clean_resume_text(text):
    """Reduce raw resume text to ASCII word characters separated by single spaces.

    URLs, hashtags, @-mentions, punctuation and non-ASCII characters are
    removed; runs of whitespace are then collapsed and the ends trimmed.
    """
    # Order matters: URLs, hashtags and mentions are stripped while their
    # marker characters are still present, i.e. before punctuation removal.
    substitutions = (
        (r'http\S+', ' '),
        (r'#\S+', ''),
        (r'@\S+', ' '),
        (r'[^\w\s]', ' '),
        (r'[^\x00-\x7f]', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    collapsed = re.sub(r'\s+', ' ', cleaned)
    return collapsed.strip()
15
+
16
def extract_resume_text(file):
    """Return the concatenated text of every page in a PDF, or an error string.

    `file` is handed straight to PyPDF2.PdfReader. Pages that yield no text
    are skipped; each remaining page's text is followed by a single space.
    Failures are not raised: they are reported as a string that begins with
    "Error", which callers are expected to check for.
    """
    try:
        pdf = PyPDF2.PdfReader(file)
        chunks = []
        for pdf_page in pdf.pages:
            page_text = pdf_page.extract_text()
            if not page_text:
                continue
            chunks.append(page_text + " ")
        combined = "".join(chunks)
        # A PDF with no extractable text is treated as an error.
        if not combined.strip():
            return "Error: No text extracted."
        return combined
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
27
+
28
def extract_entities_from_pdf(file):
    """Run NER over an uploaded PDF resume and group the entities by type.

    Args:
        file: The uploaded PDF, passed through to extract_resume_text.

    Returns:
        A dict with "Persons", "Organizations", "Locations" and "Other"
        lists of entity strings, plus a "Cleaned_Text" entry holding the
        normalized resume text. When text extraction fails, a dict with a
        single "Error" entry holding the message is returned instead.
    """
    resume_text = extract_resume_text(file)
    # extract_resume_text signals failure with a string starting with "Error".
    if resume_text.startswith("Error"):
        # The result is rendered by a JSON output, which expects a dict/list
        # or a valid JSON string; a bare message would fail to parse, so it
        # is wrapped in a dict.
        return {"Error": resume_text}

    # Entity-group label -> result bucket; any other label falls into "Other".
    buckets = {"PER": "Persons", "ORG": "Organizations", "LOC": "Locations"}
    result = {
        "Persons": [],
        "Organizations": [],
        "Locations": [],
        "Other": [],
    }
    for entity in ner_pipeline(resume_text):
        bucket = buckets.get(entity.get("entity_group"), "Other")
        result[bucket].append(entity.get("word"))

    result["Cleaned_Text"] = clean_resume_text(resume_text)
    return result
56
+
57
# Gradio UI: a single PDF upload in, the extraction result rendered as JSON out.
iface = gr.Interface(
    title="Resume Entity Extractor",
    description="Upload a PDF resume. It will extract names, organizations, and locations using Hugging Face NER.",
    fn=extract_entities_from_pdf,
    inputs=gr.File(file_types=[".pdf"]),
    outputs="json",
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()