Spaces:

husseinelsaadi
/

Codingo

Paused

App Files Files Community

husseinelsaadi committed 15 days ago

Commit

8d99522

1 Parent(s): a5be571

updated

Browse files

Files changed (2) hide show

backend/services/resume_parser.py +44 -49
requirements.txt +0 -1

backend/services/resume_parser.py CHANGED Viewed

@@ -1,21 +1,18 @@
 from __future__ import annotations
-import os
-import re
-import subprocess
-import zipfile
-import json
-import torch
 from typing import List
-os.environ["OMP_NUM_THREADS"] = "1"
-os.environ["OPENBLAS_NUM_THREADS"] = "1"
-os.environ["MKL_NUM_THREADS"] = "1"
-os.environ["NUMEXPR_NUM_THREADS"] = "1"
-os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-import torch
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16,
@@ -23,10 +20,9 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_quant_type="nf4"
 )
-# --- UPDATED: Using Deepseek-Coder-V2-Lite-Instruct for better performance ---
-tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/Deepseek-Coder-V2-Lite-Instruct", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
-    "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct",
     quantization_config=bnb_config,
     device_map="auto",
     torch_dtype=torch.bfloat16,
@@ -37,13 +33,10 @@ model = AutoModelForCausalLM.from_pretrained(
 # Text Extraction (PDF/DOCX)
 # ===============================
 def extract_text(file_path: str) -> str:
-    """Extract text from PDF or DOCX resumes."""
     if not file_path or not os.path.isfile(file_path):
         return ""
-    lower_name = file_path.lower()
     try:
-        if lower_name.endswith('.pdf'):
             result = subprocess.run(
                 ['pdftotext', '-layout', file_path, '-'],
                 stdout=subprocess.PIPE,
@@ -51,8 +44,7 @@ def extract_text(file_path: str) -> str:
                 check=False
             )
             return result.stdout.decode('utf-8', errors='ignore')
-        elif lower_name.endswith('.docx'):
             with zipfile.ZipFile(file_path) as zf:
                 with zf.open('word/document.xml') as docx_xml:
                     xml_bytes = docx_xml.read()
@@ -60,24 +52,20 @@ def extract_text(file_path: str) -> str:
                     xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                     text = re.sub(r'<[^>]+>', ' ', xml_text)
                     return re.sub(r'\s+', ' ', text)
-        else:
-            return ""
     except Exception:
-        return ""
 # ===============================
 # Name Extraction (Fallback)
 # ===============================
 def extract_name(text: str, filename: str) -> str:
-    """Extract candidate's name from resume text or filename."""
     if text:
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
-            if re.match(r'(?i)resume|curriculum vitae', line):
-                continue
-            words = line.split()
-            if 1 < len(words) <= 4:
-                if all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                     return line
     base = os.path.basename(filename)
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
@@ -86,26 +74,25 @@ def extract_name(text: str, filename: str) -> str:
     return base.title().strip()
 # ===============================
-# Janus-Pro Parsing
 # ===============================
-def parse_with_deepseek(text: str) -> dict:
-    """Use Deepseek-Coder-V2-Lite-Instruct to extract resume details in JSON format."""
     prompt = f"""
-Extract the following information from the resume text provided below. Your response should be a valid JSON object.
 Information to extract:
-- Full Name: The candidate's full name.
-- Email: The candidate's email address.
-- Phone: The candidate's phone number.
-- Skills: A list of technical and soft skills.
-- Education: A list of academic degrees and institutions.
-- Experience: A list of previous jobs, including company, title, and dates.
-Resume Text:
 {text}
-Return only valid JSON in the following format:
 {{
   "name": "Full Name",
   "email": "[email protected]",
@@ -115,17 +102,25 @@ Return only valid JSON in the following format:
   "experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
 }}
 """
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    outputs = model.generate(**inputs, max_new_tokens=512)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    import re, json
     match = re.search(r"\{.*\}", response, re.S)
     if match:
         try:
             return json.loads(match.group())
         except:
             pass
     return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}

 from __future__ import annotations
+import os, re, subprocess, zipfile, json, torch
 from typing import List
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+# Limit threads to avoid Hugging Face Spaces threading issues
+os.environ.update({
+    "OMP_NUM_THREADS": "1",
+    "OPENBLAS_NUM_THREADS": "1",
+    "MKL_NUM_THREADS": "1",
+    "NUMEXPR_NUM_THREADS": "1",
+    "VECLIB_MAXIMUM_THREADS": "1"
+})
+# Load Zephyr in 4-bit
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_compute_dtype=torch.float16,
     bnb_4bit_quant_type="nf4"
 )
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
+    "HuggingFaceH4/zephyr-7b-beta",
     quantization_config=bnb_config,
     device_map="auto",
     torch_dtype=torch.bfloat16,
 # Text Extraction (PDF/DOCX)
 # ===============================
 def extract_text(file_path: str) -> str:
     if not file_path or not os.path.isfile(file_path):
         return ""
     try:
+        if file_path.lower().endswith('.pdf'):
             result = subprocess.run(
                 ['pdftotext', '-layout', file_path, '-'],
                 stdout=subprocess.PIPE,
                 check=False
             )
             return result.stdout.decode('utf-8', errors='ignore')
+        elif file_path.lower().endswith('.docx'):
             with zipfile.ZipFile(file_path) as zf:
                 with zf.open('word/document.xml') as docx_xml:
                     xml_bytes = docx_xml.read()
                     xml_text = re.sub(r'<w:p[^>]*>', '\n', xml_text, flags=re.I)
                     text = re.sub(r'<[^>]+>', ' ', xml_text)
                     return re.sub(r'\s+', ' ', text)
     except Exception:
+        pass
+    return ""
 # ===============================
 # Name Extraction (Fallback)
 # ===============================
 def extract_name(text: str, filename: str) -> str:
     if text:
         lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
         for line in lines[:10]:
+            if not re.match(r'(?i)resume|curriculum vitae', line):
+                words = line.split()
+                if 1 < len(words) <= 4 and all(re.match(r'^[A-ZÀ-ÖØ-Þ][\w\-]*', w) for w in words):
                     return line
     base = os.path.basename(filename)
     base = re.sub(r'\.(pdf|docx|doc)$', '', base, flags=re.I)
     return base.title().strip()
 # ===============================
+# Zephyr Parsing
 # ===============================
+def parse_with_zephyr(text: str) -> dict:
     prompt = f"""
+Extract the following information from the resume text provided below.
+Return ONLY a valid JSON object (no extra commentary).
 Information to extract:
+- Full Name
+- Email
+- Phone
+- Skills (list)
+- Education (list of degrees + institutions)
+- Experience (list of jobs with company, title, and dates)
+Resume:
 {text}
+JSON format:
 {{
   "name": "Full Name",
   "email": "[email protected]",
   "experience": ["Job1 - Company1 (Dates)", "Job2 - Company2 (Dates)"]
 }}
 """
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=256, temperature=0.0)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     match = re.search(r"\{.*\}", response, re.S)
     if match:
         try:
             return json.loads(match.group())
         except:
             pass
     return {"name": "", "email": "", "phone": "", "skills": [], "education": [], "experience": []}
+# ===============================
+# Main Parse Function
+# ===============================
+def parse_resume(file_path: str, filename: str) -> dict:
+    text = extract_text(file_path)
+    name_fallback = extract_name(text, filename)
+    data = parse_with_zephyr(text)
+    if not data.get("name"):
+        data["name"] = name_fallback
+    return data

requirements.txt CHANGED Viewed

@@ -62,5 +62,4 @@ requests>=2.31.0
 psycopg2-binary
 matplotlib
 bitsandbytes>=0.41.0
-flash-attn==2.3.6 --no-build-isolation

 psycopg2-binary
 matplotlib
 bitsandbytes>=0.41.0