Spaces:

TKM03
/

ResumeExtraction

Sleeping

App Files Files Community

TKM03 committed on May 20

Commit

25c62c3

verified ·

1 Parent(s): f314cfc

Create app.py

Browse files

Files changed (1) hide show

app.py +66 -0

app.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import re
+import PyPDF2
+import gradio as gr
+from transformers import pipeline
+ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")
+def clean_resume_text(text):
+    text = re.sub(r'http\S+', ' ', text)
+    text = re.sub(r'#\S+', '', text)
+    text = re.sub(r'@\S+', ' ', text)
+    text = re.sub(r'[^\w\s]', ' ', text)
+    text = re.sub(r'[^\x00-\x7f]', ' ', text)
+    return re.sub(r'\s+', ' ', text).strip()
+def extract_resume_text(file):
+    try:
+        reader = PyPDF2.PdfReader(file)
+        text = ""
+        for page in reader.pages:
+            extracted = page.extract_text()
+            if extracted:
+                text += extracted + " "
+        return text if text.strip() else "Error: No text extracted."
+    except Exception as e:
+        return f"Error reading PDF: {str(e)}"
+def extract_entities_from_pdf(file):
+    resume_text = extract_resume_text(file)
+    if resume_text.startswith("Error"):
+        return resume_text
+    entities = ner_pipeline(resume_text)
+    result = {
+        "Persons": [],
+        "Organizations": [],
+        "Locations": [],
+        "Other": []
+    }
+    for entity in entities:
+        label = entity.get("entity_group")
+        word = entity.get("word")
+        if label == "PER":
+            result["Persons"].append(word)
+        elif label == "ORG":
+            result["Organizations"].append(word)
+        elif label == "LOC":
+            result["Locations"].append(word)
+        else:
+            result["Other"].append(word)
+    result["Cleaned_Text"] = clean_resume_text(resume_text)
+    return result
+iface = gr.Interface(
+    fn=extract_entities_from_pdf,
+    inputs=gr.File(file_types=[".pdf"]),
+    outputs="json",
+    title="Resume Entity Extractor",
+    description="Upload a PDF resume. It will extract names, organizations, and locations using Hugging Face NER."
+)
+if __name__ == "__main__":
+    iface.launch()