Spaces:
Running
Running
# text_extractor.py
import logging
import os

import docx2txt
import PyPDF2
def extract_text_from_file(file_path) -> str:
    """Extract plain text from a PDF or DOCX file.

    The handler is selected from the extension of *file_path*, compared
    case-insensitively. Extraction is best-effort: a failure is logged with
    its traceback and reported to the caller as a placeholder string instead
    of being raised.

    Args:
        file_path: Path of the file to read.

    Returns:
        For ``.pdf``: the text of every page joined with a single space.
        For ``.docx``: whatever ``docx2txt.process`` returns for the file.
        On an extraction error: ``"[Error extracting PDF text]"`` or
        ``"[Error extracting DOCX text]"``.
        For any other extension: ``"[Unsupported file type]"``.
    """
    log = logging.getLogger(__name__)
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # `or ""` substitutes an empty string when a page yields a
                # falsy result, so join() only ever receives strings.
                return " ".join(
                    page.extract_text() or "" for page in reader.pages
                )
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit still propagate.
            log.exception("Failed to extract text from PDF %r", file_path)
            return "[Error extracting PDF text]"

    if ext == ".docx":
        try:
            return docx2txt.process(file_path)
        except Exception:
            log.exception("Failed to extract text from DOCX %r", file_path)
            return "[Error extracting DOCX text]"

    return "[Unsupported file type]"