Spaces:
Paused
Paused
Commit
·
947d727
1
Parent(s):
6d286f1
updated
Browse files
backend/services/resume_parser.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
2 |
import zipfile, re, os
|
|
|
3 |
|
4 |
# ===============================
|
5 |
# Load Model & Tokenizer
|
6 |
# ===============================
|
7 |
-
MODEL_NAME = "sravya-abburi/ResumeParserBERT" #
|
8 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
9 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
10 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
@@ -16,16 +17,18 @@ def extract_text(file_path: str) -> str:
|
|
16 |
"""Extract text from PDF or DOCX without external dependencies."""
|
17 |
file_path_lower = file_path.lower()
|
18 |
|
19 |
-
# PDF reading using PyMuPDF (fitz)
|
20 |
if file_path_lower.endswith(".pdf"):
|
21 |
-
import fitz # PyMuPDF
|
22 |
text = ""
|
23 |
-
with
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
26 |
return text
|
27 |
|
28 |
-
# DOCX reading by extracting XML content
|
29 |
elif file_path_lower.endswith(".docx"):
|
30 |
with zipfile.ZipFile(file_path) as zf:
|
31 |
with zf.open("word/document.xml") as docx_xml:
|
|
|
1 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
2 |
import zipfile, re, os
|
3 |
+
from PyPDF2 import PdfReader # Lightweight & already in Spaces
|
4 |
|
5 |
# ===============================
|
6 |
# Load Model & Tokenizer
|
7 |
# ===============================
|
8 |
+
MODEL_NAME = "sravya-abburi/ResumeParserBERT" # Swap to Kiet model if needed
|
9 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
10 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
11 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
|
|
17 |
"""Extract text from PDF or DOCX without external dependencies."""
|
18 |
file_path_lower = file_path.lower()
|
19 |
|
20 |
+
# ✅ PDF reading using PyPDF2 (no fitz, no installs needed)
|
21 |
if file_path_lower.endswith(".pdf"):
|
|
|
22 |
text = ""
|
23 |
+
with open(file_path, "rb") as f:
|
24 |
+
reader = PdfReader(f)
|
25 |
+
for page in reader.pages:
|
26 |
+
page_text = page.extract_text()
|
27 |
+
if page_text:
|
28 |
+
text += page_text + "\n"
|
29 |
return text
|
30 |
|
31 |
+
# ✅ DOCX reading by extracting XML content
|
32 |
elif file_path_lower.endswith(".docx"):
|
33 |
with zipfile.ZipFile(file_path) as zf:
|
34 |
with zf.open("word/document.xml") as docx_xml:
|