husseinelsaadi committed on
Commit
947d727
·
1 Parent(s): 6d286f1
Files changed (1) hide show
  1. backend/services/resume_parser.py +10 -7
backend/services/resume_parser.py CHANGED
@@ -1,10 +1,11 @@
1
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
2
  import zipfile, re, os
 
3
 
4
  # ===============================
5
  # Load Model & Tokenizer
6
  # ===============================
7
- MODEL_NAME = "sravya-abburi/ResumeParserBERT" # Change to Kiet/autotrain-resume_parser-1159242747 if needed
8
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
9
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
10
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
@@ -16,16 +17,18 @@ def extract_text(file_path: str) -> str:
16
  """Extract text from PDF or DOCX without external dependencies."""
17
  file_path_lower = file_path.lower()
18
 
19
- # PDF reading using PyMuPDF (built into Spaces environment)
20
  if file_path_lower.endswith(".pdf"):
21
- import fitz # PyMuPDF
22
  text = ""
23
- with fitz.open(file_path) as pdf_doc:
24
- for page in pdf_doc:
25
- text += page.get_text()
 
 
 
26
  return text
27
 
28
- # DOCX reading by extracting XML content
29
  elif file_path_lower.endswith(".docx"):
30
  with zipfile.ZipFile(file_path) as zf:
31
  with zf.open("word/document.xml") as docx_xml:
 
1
  from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
2
  import zipfile, re, os
3
+ from PyPDF2 import PdfReader # Lightweight & already in Spaces
4
 
5
  # ===============================
6
  # Load Model & Tokenizer
7
  # ===============================
8
+ MODEL_NAME = "sravya-abburi/ResumeParserBERT" # Swap to Kiet model if needed
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
  model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
11
  ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
17
  """Extract text from PDF or DOCX without external dependencies."""
18
  file_path_lower = file_path.lower()
19
 
20
+ # PDF reading using PyPDF2 (no fitz, no installs needed)
21
  if file_path_lower.endswith(".pdf"):
 
22
  text = ""
23
+ with open(file_path, "rb") as f:
24
+ reader = PdfReader(f)
25
+ for page in reader.pages:
26
+ page_text = page.extract_text()
27
+ if page_text:
28
+ text += page_text + "\n"
29
  return text
30
 
31
+ # DOCX reading by extracting XML content
32
  elif file_path_lower.endswith(".docx"):
33
  with zipfile.ZipFile(file_path) as zf:
34
  with zf.open("word/document.xml") as docx_xml: