Spaces:

husseinelsaadi
/

Codingo

Paused

husseinelsaadi committed 14 days ago

Commit

947d727

1 Parent(s): 6d286f1

updated

Files changed (1) hide show

backend/services/resume_parser.py CHANGED Viewed

@@ -1,10 +1,11 @@
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import zipfile, re, os
 # ===============================
 # Load Model & Tokenizer
 # ===============================
-MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # Change to Kiet/autotrain-resume_parser-1159242747 if needed
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
@@ -16,16 +17,18 @@ def extract_text(file_path: str) -> str:
     """Extract text from PDF or DOCX without external dependencies."""
     file_path_lower = file_path.lower()
-    # PDF reading using PyMuPDF (built into Spaces environment)
     if file_path_lower.endswith(".pdf"):
-        import fitz  # PyMuPDF
         text = ""
-        with fitz.open(file_path) as pdf_doc:
-            for page in pdf_doc:
-                text += page.get_text()
         return text
-    # DOCX reading by extracting XML content
     elif file_path_lower.endswith(".docx"):
         with zipfile.ZipFile(file_path) as zf:
             with zf.open("word/document.xml") as docx_xml:

 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
 import zipfile, re, os
+from PyPDF2 import PdfReader  # Lightweight & already in Spaces
 # ===============================
 # Load Model & Tokenizer
 # ===============================
+MODEL_NAME = "sravya-abburi/ResumeParserBERT"  # Swap to Kiet model if needed
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
 ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
     """Extract text from PDF or DOCX without external dependencies."""
     file_path_lower = file_path.lower()
+    # ✅ PDF reading using PyPDF2 (no fitz, no installs needed)
     if file_path_lower.endswith(".pdf"):
         text = ""
+        with open(file_path, "rb") as f:
+            reader = PdfReader(f)
+            for page in reader.pages:
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
         return text
+    # ✅ DOCX reading by extracting XML content
     elif file_path_lower.endswith(".docx"):
         with zipfile.ZipFile(file_path) as zf:
             with zf.open("word/document.xml") as docx_xml: