Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed 15 days ago

Commit

6d286f1

1 Parent(s): a511250

updated

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +30 -24

backend/services/resume_parser.py CHANGED Viewed

@@ -1,47 +1,53 @@
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
-import subprocess, zipfile, re, os
-# === Load pretrained HF model ===
-MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # or "Kiet/autotrain-resume_parser-1159242747"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
-# Use CPU for stability (device=-1) to avoid GPU memory issues from other parts of the app
-ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=-1)
-# === Extract text from PDF/DOCX ===
 def extract_text(file_path: str) -> str:
-    """Extract text from PDF or DOCX resumes."""
-    if file_path.lower().endswith(".pdf"):
-        result = subprocess.run(
-            ["pdftotext", "-layout", file_path, "-"],
-            stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False
-        )
-        return result.stdout.decode("utf-8", errors="ignore")
-    elif file_path.lower().endswith(".docx"):
         with zipfile.ZipFile(file_path) as zf:
             with zf.open("word/document.xml") as docx_xml:
                 xml_text = docx_xml.read().decode("utf-8", errors="ignore")
                 xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
                 return re.sub(r"<[^>]+>", " ", xml_text)
     return ""
-# === Parse resume with NER ===
 def parse_resume(file_path: str, filename: str = None) -> dict:
-    """Parse resume and extract Name, Skills, Education, Experience."""
     text = extract_text(file_path)
     entities = ner_pipeline(text)
     name, skills, education, experience = [], [], [], []
     for ent in entities:
-        word = ent["word"].strip()
         label = ent["entity_group"].upper()
-        # Skip empty or placeholder tokens
-        if not word or word.startswith("LABEL_"):
-            continue
         if label == "NAME":
             name.append(word)
         elif label == "SKILL":

 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+import zipfile, re, os
+# ===============================
+# Load Model & Tokenizer
+# ===============================
+MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # Change to Kiet/autotrain-resume_parser-1159242747 if needed
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
+ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
+# ===============================
+# Extract Text (PDF & DOCX)
+# ===============================
 def extract_text(file_path: str) -> str:
+    """Extract text from PDF (via PyMuPDF) or DOCX (via zipfile and regex) without calling external programs."""
+    file_path_lower = file_path.lower()
+    # PDF reading using PyMuPDF (imported as fitz; the package must be installed in the Spaces environment, e.g. via requirements.txt)
+    if file_path_lower.endswith(".pdf"):
+        import fitz  # PyMuPDF
+        text = ""
+        with fitz.open(file_path) as pdf_doc:
+            for page in pdf_doc:
+                text += page.get_text()
+        return text
+    # DOCX reading by extracting XML content
+    elif file_path_lower.endswith(".docx"):
         with zipfile.ZipFile(file_path) as zf:
             with zf.open("word/document.xml") as docx_xml:
                 xml_text = docx_xml.read().decode("utf-8", errors="ignore")
                 xml_text = re.sub(r"<w:p[^>]*>", "\n", xml_text, flags=re.I)
                 return re.sub(r"<[^>]+>", " ", xml_text)
     return ""
+# ===============================
+# Parse Resume
+# ===============================
 def parse_resume(file_path: str, filename: str = None) -> dict:
+    """Parse resume and extract structured information."""
     text = extract_text(file_path)
     entities = ner_pipeline(text)
     name, skills, education, experience = [], [], [], []
     for ent in entities:
         label = ent["entity_group"].upper()
+        word = ent["word"].strip()
         if label == "NAME":
             name.append(word)
         elif label == "SKILL":