Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Community

husseinelsaadi committed on Aug 3

Commit

682910e

1 Parent(s): 947d727

updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +45 -35

backend/services/resume_parser.py CHANGED Viewed

@@ -1,56 +1,58 @@
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-import zipfile, re, os
-from PyPDF2 import PdfReader  # Lightweight & already in Spaces
-# ===============================
-# Load Model & Tokenizer
-# ===============================
-MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # Swap to Kiet model if needed
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
-# ===============================
-# Extract Text (PDF & DOCX)
-# ===============================
 def extract_text(file_path: str) -> str:
-    """Extract text from PDF or DOCX without external dependencies."""
-    file_path_lower = file_path.lower()
-    # ✅ PDF reading using PyPDF2 (no fitz, no installs needed)
-    if file_path_lower.endswith(".pdf"):
-        text = ""
-        with open(file_path, "rb") as f:
-            reader = PdfReader(f)
-            for page in reader.pages:
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text + "\n"
-        return text
-    # ✅ DOCX reading by extracting XML content
-    elif file_path_lower.endswith(".docx"):
-        with zipfile.ZipFile(file_path) as zf:
-            with zf.open("word/document.xml") as docx_xml:
-                xml_text = docx_xml.read().decode("utf-8", errors="ignore")
-                xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
-                return re.sub(r"<[^>]+>", " ", xml_text)
     return ""
-# ===============================
 # Parse Resume
-# ===============================
 def parse_resume(file_path: str, filename: str = None) -> dict:
-    """Parse resume and extract structured information."""
     text = extract_text(file_path)
     entities = ner_pipeline(text)
     name, skills, education, experience = [], [], [], []
     for ent in entities:
         label = ent["entity_group"].upper()
         word = ent["word"].strip()
         if label == "NAME":
             name.append(word)
         elif label == "SKILL":
@@ -66,3 +68,11 @@ def parse_resume(file_path: str, filename: str = None) -> dict:
         "education": ", ".join(dict.fromkeys(education)),
         "experience": ", ".join(dict.fromkeys(experience))
     }

+import os
+import re
+import subprocess
+import zipfile
+import json
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+# --------------------
+# Load Model
+# --------------------
+MODEL_NAME = "sravya-abburi/ResumeParserBERT"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+# --------------------
+# Extract Text
+# --------------------
 def extract_text(file_path: str) -> str:
+    """Extract text from PDF/DOCX resumes."""
+    if file_path.lower().endswith(".pdf"):
+        try:
+            result = subprocess.run(
+                ["pdftotext", "-layout", file_path, "-"],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False
+            )
+            return result.stdout.decode("utf-8", errors="ignore")
+        except:
+            return ""
+    elif file_path.lower().endswith(".docx"):
+        try:
+            with zipfile.ZipFile(file_path) as zf:
+                with zf.open("word/document.xml") as docx_xml:
+                    xml_text = docx_xml.read().decode("utf-8", errors="ignore")
+                    xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
+                    return re.sub(r"<[^>]+>", " ", xml_text)
+        except:
+            return ""
     return ""
+# --------------------
 # Parse Resume
+# --------------------
 def parse_resume(file_path: str, filename: str = None) -> dict:
+    """Extract Name, Skills, Education, Experience from resume."""
     text = extract_text(file_path)
     entities = ner_pipeline(text)
     name, skills, education, experience = [], [], [], []
     for ent in entities:
         label = ent["entity_group"].upper()
         word = ent["word"].strip()
         if label == "NAME":
             name.append(word)
         elif label == "SKILL":
         "education": ", ".join(dict.fromkeys(education)),
         "experience": ", ".join(dict.fromkeys(experience))
     }
+# --------------------
+# Example
+# --------------------
+if __name__ == "__main__":
+    resume_path = "resume.pdf"  # Change to test file
+    result = parse_resume(resume_path)
+    print(json.dumps(result, indent=2))