saherPervaiz committed on
Commit
0c91aa8
·
verified ·
1 Parent(s): 94516ce

Create text_extractor.py

Browse files
Files changed (1) hide show
  1. text_extractor.py +18 -0
text_extractor.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # text_extractor.py
2
+ import docx2txt
3
+ import fitz # PyMuPDF
4
+
5
def extract_text_from_file(file_path):
    """Extract plain text from a PDF or DOCX file, chosen by file extension.

    Args:
        file_path: Location of the document as ``str``, ``bytes`` or an
            ``os.PathLike`` object. The extension is matched
            case-insensitively, so ``report.PDF`` is handled like
            ``report.pdf``.

    Returns:
        The extracted text for ``.pdf`` files (via ``extract_text_from_pdf``)
        and ``.docx`` files (via ``docx2txt.process``); for anything else the
        literal string ``"Unsupported file type."``.
    """
    import os  # function-scope import: the module header does not import os

    # Normalize to ``str`` so Path objects and bytes paths work the same as
    # plain strings, both for the extension test and for the extractors.
    path = os.fsdecode(file_path)
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(path)
    if lowered.endswith(".docx"):
        return docx2txt.process(path)
    # Kept as a plain message string (not an exception) for backward
    # compatibility with the original return contract.
    return "Unsupported file type."
12
+
13
def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        One string made of each page's ``get_text()`` output joined with no
        separator. A document with no pages yields ``""``.
    """
    with fitz.open(file_path) as doc:
        # Join once instead of growing a string with ``+=`` per page; the
        # generator is fully consumed before the document is closed.
        return "".join(page.get_text() for page in doc)