husseinelsaadi committed on
Commit
0e43f07
·
1 Parent(s): 722e882

gemini updated

Browse files
Files changed (1) hide show
  1. backend/services/resume_parser.py +17 -74
backend/services/resume_parser.py CHANGED
@@ -23,15 +23,16 @@ bnb_config = BitsAndBytesConfig(
23
  bnb_4bit_quant_type="nf4"
24
  )
25
 
26
- tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")
 
27
  model = AutoModelForCausalLM.from_pretrained(
28
- "tiiuae/falcon-7b-instruct",
29
  quantization_config=bnb_config,
30
- device_map="auto"
 
 
31
  )
32
 
33
-
34
-
35
  # ===============================
36
  # Text Extraction (PDF/DOCX)
37
  # ===============================
@@ -88,75 +89,17 @@ def extract_name(text: str, filename: str) -> str:
88
  # Janus-Pro Parsing
89
  # ===============================
90
  def parse_with_deepseek(text: str) -> dict:
91
- """Use DeepSeek Janus-Pro-7B to extract resume details in JSON format."""
 
92
  prompt = f"""
93
- Extract the following information from the resume text below:
94
-
95
- - Full Name
96
- - Skills (comma separated)
97
- - Education (degrees + institutions)
98
- - Experience (job titles + companies)
99
-
100
- Return only valid JSON in the following structure:
101
- {{
102
- "name": "Full Name",
103
- "skills": "Skill1, Skill2, Skill3",
104
- "education": "Degree1 - Institution1; Degree2 - Institution2",
105
- "experience": "Job1 - Company1; Job2 - Company2"
106
- }}
107
-
108
- Resume:
109
- {text}
110
- """
111
-
112
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
113
- outputs = model.generate(**inputs, max_new_tokens=512)
114
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
115
-
116
- # Extract JSON safely
117
- match = re.search(r"\{.*\}", response, re.S)
118
- if match:
119
- try:
120
- return json.loads(match.group())
121
- except:
122
- pass
123
- return {"name": "", "skills": "", "education": "", "experience": ""}
124
-
125
- # ===============================
126
- # Fallback Heading-based Parsing
127
- # ===============================
128
- def fallback_parse(text: str) -> dict:
129
- """Simple heading-based parsing as backup."""
130
- skills = re.findall(r"Skills\s*[:\-]?\s*(.*)", text, re.I)
131
- education = re.findall(r"Education\s*[:\-]?\s*(.*)", text, re.I)
132
- experience = re.findall(r"(Experience|Work History)\s*[:\-]?\s*(.*)", text, re.I)
133
- return {
134
- "skills": ", ".join(skills),
135
- "education": ", ".join(education),
136
- "experience": ", ".join([exp[1] for exp in experience])
137
- }
138
-
139
- # ===============================
140
- # Main Parse Function
141
- # ===============================
142
- def parse_resume(file_path: str, filename: str) -> dict:
143
- """Main resume parsing function."""
144
- text = extract_text(file_path)
145
- name = extract_name(text, filename)
146
-
147
- # Try Janus-Pro parsing
148
- ents = parse_with_deepseek(text)
149
 
150
- # If Janus-Pro misses fields, use fallback
151
- if not ents.get("skills") or not ents.get("education"):
152
- fb = fallback_parse(text)
153
- ents["skills"] = ents.get("skills") or fb["skills"]
154
- ents["education"] = ents.get("education") or fb["education"]
155
- ents["experience"] = ents.get("experience") or fb["experience"]
 
156
 
157
- return {
158
- "name": ents.get("name") or name,
159
- "skills": ents.get("skills", ""),
160
- "education": ents.get("education", ""),
161
- "experience": ents.get("experience", "")
162
- }
 
23
  bnb_4bit_quant_type="nf4"
24
  )
25
 
26
+ # --- UPDATED: Using Deepseek-Coder-V2-Lite-Instruct for better performance ---
27
+ tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/Deepseek-Coder-V2-Lite-Instruct", trust_remote_code=True)
28
  model = AutoModelForCausalLM.from_pretrained(
29
+ "deepseek-ai/Deepseek-Coder-V2-Lite-Instruct",
30
  quantization_config=bnb_config,
31
+ device_map="auto",
32
+ torch_dtype=torch.bfloat16,
33
+ trust_remote_code=True
34
  )
35
 
 
 
36
  # ===============================
37
  # Text Extraction (PDF/DOCX)
38
  # ===============================
 
89
  # Janus-Pro Parsing
90
  # ===============================
91
  def parse_with_deepseek(text: str) -> dict:
92
+ """Use Deepseek-Coder-V2-Lite-Instruct to extract resume details in JSON format."""
93
+ # --- UPDATED: Refined prompt for better JSON extraction ---
94
  prompt = f"""
95
+ Extract the following information from the resume text provided below. Your response should be a valid JSON object.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
+ **Information to extract:**
98
+ - **Full Name:** The candidate's full name.
99
+ - **Email:** The candidate's email address.
100
+ - **Phone:** The candidate's phone number.
101
+ - **Skills:** A list of technical and soft skills.
102
+ - **Education:** A list of academic degrees and institutions.
103
+ - **Experience:** A list of previous jobs, including company, title, and dates.
104
 
105
+ **Resume Text:**