TKM03 committed
Commit e39d53c · verified · 1 Parent(s): 18398f6

Update app.py

Files changed (1)
  1. app.py +52 -40
app.py CHANGED
@@ -3,64 +3,76 @@ import PyPDF2
 import gradio as gr
 from transformers import pipeline

+# Load the Hugging Face NER model pipeline
 ner_pipeline = pipeline("ner", model="dslim/bert-base-NER", tokenizer="dslim/bert-base-NER", aggregation_strategy="simple")

 def clean_resume_text(text):
-    text = re.sub(r'http\S+', ' ', text)
-    text = re.sub(r'#\S+', '', text)
-    text = re.sub(r'@\S+', ' ', text)
-    text = re.sub(r'[^\w\s]', ' ', text)
-    text = re.sub(r'[^\x00-\x7f]', ' ', text)
-    return re.sub(r'\s+', ' ', text).strip()
+    """Clean resume text by removing unwanted characters and formatting."""
+    text = re.sub(r'http\S+', ' ', text)  # Remove URLs
+    text = re.sub(r'#\S+', '', text)  # Remove hashtags
+    text = re.sub(r'@\S+', ' ', text)  # Remove mentions
+    text = re.sub(r'[^\w\s]', ' ', text)  # Remove punctuation
+    text = re.sub(r'[^\x00-\x7f]', ' ', text)  # Remove non-ASCII characters
+    return re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace

 def extract_resume_text(file):
+    """Extract raw text from uploaded PDF file."""
     try:
         reader = PyPDF2.PdfReader(file)
         text = ""
         for page in reader.pages:
-            extracted = page.extract_text()
-            if extracted:
-                text += extracted + " "
-        return text if text.strip() else "Error: No text extracted."
+            page_text = page.extract_text()
+            if page_text:
+                text += page_text + " "
+        if not text.strip():
+            return "Error: No text extracted from PDF."
+        return text
     except Exception as e:
         return f"Error reading PDF: {str(e)}"

 def extract_entities_from_pdf(file):
-    resume_text = extract_resume_text(file)
-    if resume_text.startswith("Error"):
-        return resume_text
-
-    entities = ner_pipeline(resume_text)
-
-    result = {
-        "Persons": [],
-        "Organizations": [],
-        "Locations": [],
-        "Other": []
-    }
-
-    for entity in entities:
-        label = entity.get("entity_group")
-        word = entity.get("word")
-        if label == "PER":
-            result["Persons"].append(word)
-        elif label == "ORG":
-            result["Organizations"].append(word)
-        elif label == "LOC":
-            result["Locations"].append(word)
-        else:
-            result["Other"].append(word)
-
-    result["Cleaned_Text"] = clean_resume_text(resume_text)
-    return result
+    """Main processing function: Extracts and cleans text, runs NER, and returns structured data."""
+    try:
+        resume_text = extract_resume_text(file)
+        if resume_text.startswith("Error"):
+            return {"error": resume_text}
+
+        entities = ner_pipeline(resume_text)
+
+        result = {
+            "Persons": [],
+            "Organizations": [],
+            "Locations": [],
+            "Other": []
+        }
+
+        for entity in entities:
+            label = entity.get("entity_group")
+            word = entity.get("word")
+            if label == "PER":
+                result["Persons"].append(word)
+            elif label == "ORG":
+                result["Organizations"].append(word)
+            elif label == "LOC":
+                result["Locations"].append(word)
+            else:
+                result["Other"].append(word)
+
+        result["Cleaned_Text"] = clean_resume_text(resume_text)
+        return result
+
+    except Exception as e:
+        return {"error": f"Exception during processing: {str(e)}"}

+# Gradio interface
 iface = gr.Interface(
     fn=extract_entities_from_pdf,
     inputs=gr.File(file_types=[".pdf"]),
-    outputs="json",
-    title="Resume Entity Extractor",
-    description="Upload a PDF resume. It will extract names, organizations, and locations using Hugging Face NER."
+    outputs=gr.JSON(),
+    title="🧹 Resume Cleaner & Entity Extractor",
+    description="Upload a PDF resume. The app will clean the text and extract entities like Person, Organization, and Location using a Hugging Face NER model."
 )

+# Launch
 if __name__ == "__main__":
     iface.launch()
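
For reference, a minimal sketch (not part of this commit) of how the updated extract_entities_from_pdf could be exercised outside the Gradio UI; the file name sample_resume.pdf and the driver script itself are hypothetical:

# Local driver sketch for the updated app.py.
# "sample_resume.pdf" is a placeholder path, not part of the commit.
import json

from app import extract_entities_from_pdf

if __name__ == "__main__":
    # PyPDF2.PdfReader accepts a file-like object, so passing an open
    # binary handle approximates what the Gradio File input provides.
    with open("sample_resume.pdf", "rb") as resume_file:
        result = extract_entities_from_pdf(resume_file)
    # Prints either the grouped-entity dict (Persons/Organizations/Locations/
    # Other plus Cleaned_Text) or an {"error": ...} dict, per the new code path.
    print(json.dumps(result, indent=2, ensure_ascii=False))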