Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed on Aug 3

Commit

d4b2339

1 Parent(s): 50d928c

deepseek model loaded

Browse files

Files changed (1) hide show

backend/services/resume_parser.py +108 -51

backend/services/resume_parser.py CHANGED Viewed

@@ -3,49 +3,61 @@ import os
 import re
 import subprocess
 import zipfile
 from typing import List
-from transformers import pipeline
-# Load the NER model for resume parsing
-ner = pipeline("ner", model="Kiet/ResumeParserBERT", aggregation_strategy="simple")
 def extract_text(file_path: str) -> str:
-    """Extract text from PDF or DOCX."""
     if not file_path or not os.path.isfile(file_path):
         return ""
     lower_name = file_path.lower()
     try:
         if lower_name.endswith('.pdf'):
-            try:
-                result = subprocess.run(
-                    ['pdftotext', '-layout', file_path, '-'],
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    check=False
-                )
-                return result.stdout.decode('utf-8', errors='ignore')
-            except Exception:
-                return ""
         elif lower_name.endswith('.docx'):
-            try:
-                with zipfile.ZipFile(file_path) as zf:
-                    with zf.open('word/document.xml') as docx_xml:
-                        xml_bytes = docx_xml.read()
-                        xml_text = xml_bytes.decode('utf-8', errors='ignore')
-                        xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
-                        text = re.sub(r'<[^>]+>', ' ', xml_text)
-                        text = re.sub(r'\s+', ' ', text)
-                        return text
-            except Exception:
-                return ""
         else:
             return ""
     except Exception:
         return ""
 def extract_name(text: str, filename: str) -> str:
-    """Extract candidate's name from text or filename."""
     if text:
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
@@ -59,36 +71,81 @@ def extract_name(text: str, filename: str) -> str:
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
     base = re.sub(r'[\._-]+', ' ', base)
     base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
-    base = re.sub(r'\s+', ' ', base).strip()
-    return base.title() if base else ''
-def extract_entities(text: str) -> dict:
-    """Extract structured info using NER model."""
-    entities = ner(text)
-    skills, education, experience = [], [], []
-    for ent in entities:
-        label = ent['entity_group'].upper()
-        word = ent['word'].strip()
-        if label in ["SKILL", "TECH", "TECHNOLOGY"]:
-            skills.append(word)
-        elif label in ["EDUCATION", "DEGREE", "QUALIFICATION"]:
-            education.append(word)
-        elif label in ["EXPERIENCE", "JOB", "ROLE"]:
-            experience.append(word)
     return {
-        "skills": list(dict.fromkeys(skills)),
-        "education": list(dict.fromkeys(education)),
-        "experience": list(dict.fromkeys(experience))
     }
 def parse_resume(file_path: str, filename: str) -> dict:
-    """Main function to parse resume fields."""
     text = extract_text(file_path)
     name = extract_name(text, filename)
-    ents = extract_entities(text)
     return {
-        'name': name or '',
-        'skills': ', '.join(ents["skills"]) if ents["skills"] else '',
-        'education': ', '.join(ents["education"]) if ents["education"] else '',
-        'experience': ', '.join(ents["experience"]) if ents["experience"] else ''
     }

 import re
 import subprocess
 import zipfile
+import json
+import torch
 from typing import List
+from transformers import AutoModelForCausalLM, AutoTokenizer
# ===============================
# Load DeepSeek Janus-Pro-7B Model
# ===============================
# These statements run at import time: importing this module loads the
# tokenizer and the model weights before any resume is parsed.
# NOTE(review): confirm this checkpoint loads through AutoModelForCausalLM
# without additional options.
MODEL_ID = "deepseek-ai/Janus-Pro-7B"
print(f"Loading {MODEL_ID}... (This may take some time on first run)")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Half-precision weights; device placement is delegated to the loader.
_MODEL_LOAD_KWARGS = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_MODEL_LOAD_KWARGS)
# ===============================
# Text Extraction (PDF/DOCX)
# ===============================
def _docx_to_text(file_path: str) -> str:
    """Return the body text of a DOCX file, one line per paragraph."""
    import html  # local import: only the DOCX path needs entity decoding

    with zipfile.ZipFile(file_path) as zf:
        with zf.open('word/document.xml') as docx_xml:
            xml_text = docx_xml.read().decode('utf-8', errors='ignore')

    # Break lines where a paragraph closes (or on an explicit <w:br/>).
    # Matching the closing tag avoids false hits on <w:pPr>, <w:pStyle>, ...
    xml_text = re.sub(r'</w:p>|<w:br\b[^>]*>', '\n', xml_text, flags=re.I)
    # Drop the remaining markup, then decode entities such as &amp;.
    text = html.unescape(re.sub(r'<[^>]+>', ' ', xml_text))

    # Collapse horizontal whitespace only, so paragraph boundaries survive
    # for line-oriented consumers; empty lines are dropped.
    lines = (re.sub(r'[^\S\n]+', ' ', ln).strip() for ln in text.split('\n'))
    return '\n'.join(ln for ln in lines if ln)


def extract_text(file_path: str) -> str:
    """Extract plain text from a PDF or DOCX resume.

    PDFs are converted with the external ``pdftotext`` tool (``-layout``
    mode); DOCX files are read from ``word/document.xml`` inside the archive
    and returned with one line per paragraph.

    Args:
        file_path: Path to the resume file. The extension (case-insensitive)
            selects the extraction strategy.

    Returns:
        The extracted text, or ``""`` when the path is empty, is not an
        existing file, has an unsupported extension, or extraction fails.
    """
    if not file_path or not os.path.isfile(file_path):
        return ""
    lower_name = file_path.lower()
    try:
        if lower_name.endswith('.pdf'):
            result = subprocess.run(
                ['pdftotext', '-layout', file_path, '-'],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            return result.stdout.decode('utf-8', errors='ignore')
        if lower_name.endswith('.docx'):
            return _docx_to_text(file_path)
        return ""
    except Exception:
        # Deliberate best effort: a missing pdftotext binary, a corrupt
        # archive or a missing document.xml all mean "no text available".
        return ""
+# ===============================
+# Name Extraction (Fallback)
+# ===============================
 def extract_name(text: str, filename: str) -> str:
+    """Extract candidate's name from resume text or filename."""
     if text:
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
     base = re.sub(r'[\._-]+', ' ', base)
     base = re.sub(r'(?i)\b(cv|resume)\b', '', base)
+    return base.title().strip()
# ===============================
# Janus-Pro Parsing
# ===============================
_RESUME_FIELDS = ("name", "skills", "education", "experience")


def _field_to_text(value) -> str:
    """Flatten a decoded JSON value into a plain string."""
    if value is None:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, dict):
        parts = (_field_to_text(item) for item in value.values())
        return " - ".join(part for part in parts if part)
    if isinstance(value, (list, tuple)):
        parts = (_field_to_text(item) for item in value)
        return ", ".join(part for part in parts if part)
    return str(value)


def _first_json_object(response: str) -> dict:
    """Return the first decodable JSON object embedded in *response*, else {}."""
    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(response, start)
        except (ValueError, RecursionError):
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        # Not valid JSON from this brace; try the next opening brace.
        start = response.find("{", start + 1)
    return {}


def parse_with_deepseek(text: str) -> dict:
    """Use DeepSeek Janus-Pro-7B to extract resume details in JSON format.

    Args:
        text: Plain resume text.

    Returns:
        A dict with exactly the keys ``name``, ``skills``, ``education`` and
        ``experience``. Every value is a string; it is ``""`` when the input
        is empty or the model reply holds no decodable JSON object for it.
    """
    if not text or not text.strip():
        # Nothing to analyse: skip generation instead of prompting the model
        # with an empty resume.
        return dict.fromkeys(_RESUME_FIELDS, "")

    prompt = f"""
    Extract the following information from the resume text below:
    - Full Name
    - Skills (comma separated)
    - Education (degrees + institutions)
    - Experience (job titles + companies)
    Return only valid JSON in the following structure:
    {{
      "name": "Full Name",
      "skills": "Skill1, Skill2, Skill3",
      "education": "Degree1 - Institution1; Degree2 - Institution2",
      "experience": "Job1 - Company1; Job2 - Company2"
    }}
    Resume:
    {text}
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=512)

    # The generated sequence starts with the prompt tokens. Decode only the
    # continuation; otherwise the JSON template inside the prompt is picked
    # up instead of the model's answer.
    prompt_len = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    parsed = _first_json_object(response)
    return {key: _field_to_text(parsed.get(key)) for key in _RESUME_FIELDS}
# ===============================
# Fallback Heading-based Parsing
# ===============================
def _heading_values(heading_pattern: str, text: str) -> str:
    """Join the text found after each occurrence of a heading."""
    # The trailing \b stops "Experienced" / "Educational" from being read as
    # headings; the heading alternatives sit in a non-capturing group so
    # findall yields plain strings.
    pattern = rf"(?:{heading_pattern})\b\s*[:\-]?\s*(.*)"
    captured = (value.strip() for value in re.findall(pattern, text, re.I))
    # Drop empty captures and repeated values while keeping first-seen order.
    return ", ".join(dict.fromkeys(value for value in captured if value))


def fallback_parse(text: str) -> dict:
    """Simple heading-based parsing as backup.

    For each of the headings ``Skills``, ``Education`` and
    ``Experience`` / ``Work History`` (case-insensitive, optionally followed
    by ``:`` or ``-``), the rest of the line after the heading is collected.

    Args:
        text: Plain resume text; ``None`` or ``""`` yields empty fields.

    Returns:
        A dict with the string keys ``skills``, ``education`` and
        ``experience``; multiple matches for a field are joined with ``", "``.
    """
    text = text or ""
    return {
        "skills": _heading_values(r"Skills", text),
        "education": _heading_values(r"Education", text),
        "experience": _heading_values(r"Experience|Work History", text),
    }
# ===============================
# Main Parse Function
# ===============================
def _coerce_field(value) -> str:
    """Turn a parsed field (string, list or None) into a plain string."""
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item).strip() for item in value if item)
    return str(value).strip()


def parse_resume(file_path: str, filename: str) -> dict:
    """Main resume parsing function.

    Extracts the resume text, asks the model for structured fields, and
    fills every field the model left empty from the heading-based fallback.

    Args:
        file_path: Path of the uploaded resume on disk.
        filename: Original file name, passed to ``extract_name``.

    Returns:
        A dict with the string keys ``name``, ``skills``, ``education`` and
        ``experience``.
    """
    text = extract_text(file_path)
    name = extract_name(text, filename)

    # Try Janus-Pro parsing; normalise values so callers always get strings.
    ents = parse_with_deepseek(text)
    fields = {
        key: _coerce_field(ents.get(key))
        for key in ("name", "skills", "education", "experience")
    }

    # Consult the fallback when ANY section is missing, including the case
    # where only "experience" came back empty.
    missing = [key for key in ("skills", "education", "experience") if not fields[key]]
    if missing:
        fb = fallback_parse(text)
        for key in missing:
            fields[key] = fb.get(key, "")

    return {
        "name": fields["name"] or name,
        "skills": fields["skills"],
        "education": fields["education"],
        "experience": fields["experience"],
    }