Spaces:
Running
Running
File size: 631 Bytes
0c91aa8 7cc953c 0c91aa8 7cc953c 0c91aa8 7cc953c 0c91aa8 7cc953c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# text_extractor.py
import os
import docx2txt
import PyPDF2
def extract_text_from_file(file_path):
    """Return the plain text extracted from a PDF or DOCX file.

    The file type is chosen from the extension of ``file_path``, compared
    case-insensitively.

    Args:
        file_path: Path of the file to read.

    Returns:
        For ``.pdf``: the text of every page joined with single spaces; a
        page for which ``extract_text()`` yields nothing contributes an
        empty string.
        For ``.docx``: whatever ``docx2txt.process`` returns for the file.
        On failure, a bracketed placeholder string is returned instead of
        raising: ``"[Error extracting PDF text]"``,
        ``"[Error extracting DOCX text]"`` or ``"[Unsupported file type]"``.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # Pages must be read while the file handle is still open.
                return " ".join(page.extract_text() or "" for page in reader.pages)
        except Exception:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate to the caller.
            return "[Error extracting PDF text]"

    if ext == ".docx":
        try:
            return docx2txt.process(file_path)
        except Exception:
            # Same reasoning as above: do not swallow interpreter-exit signals.
            return "[Error extracting DOCX text]"

    return "[Unsupported file type]"
|