File size: 631 Bytes
0c91aa8
7cc953c
 
0c91aa8
7cc953c
0c91aa8
 
7cc953c
 
 
 
 
 
 
 
 
0c91aa8
7cc953c
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# text_extractor.py

import os
import docx2txt
import PyPDF2

def extract_text_from_file(file_path):
    """Extract plain text from a PDF or DOCX file.

    The file type is chosen from the extension of ``file_path``, compared
    case-insensitively. This function is best-effort: extraction failures
    are reported through placeholder strings instead of exceptions.

    Args:
        file_path: Path to the file to read.

    Returns:
        For ``.pdf``: the text of every page joined with single spaces
        (a page with no extractable text contributes an empty string).
        For ``.docx``: the result of ``docx2txt.process(file_path)``.
        ``"[Error extracting PDF text]"`` or
        ``"[Error extracting DOCX text]"`` if extraction fails, and
        ``"[Unsupported file type]"`` for any other extension.
    """
    ext = os.path.splitext(file_path)[1].lower()

    if ext == ".pdf":
        try:
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                # extract_text() may yield nothing for a page; fall back to
                # "" so the join never sees a non-string. Pages are read
                # while the file is still open.
                pages = [page.extract_text() or "" for page in reader.pages]
        except Exception:
            # Deliberately broad (any parser/IO failure maps to the
            # placeholder), but not a bare ``except``: KeyboardInterrupt and
            # SystemExit must propagate instead of being swallowed.
            return "[Error extracting PDF text]"
        return " ".join(pages)

    if ext == ".docx":
        try:
            return docx2txt.process(file_path)
        except Exception:
            # Same policy as the PDF branch: report failure, never mask
            # interpreter-level interrupts.
            return "[Error extracting DOCX text]"

    return "[Unsupported file type]"