Spaces:
Running
Running
Update document_chunker.py
Browse files- document_chunker.py +12 -12
document_chunker.py
CHANGED
@@ -74,18 +74,18 @@ class DocumentChunker:
|
|
74 |
# doc = Document(file_path)
|
75 |
# return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
|
76 |
def extract_text(self, file_path: str) -> str:
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
|
90 |
|
91 |
def detect_document_type(self, text: str) -> str:
|
|
|
74 |
# doc = Document(file_path)
|
75 |
# return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
|
76 |
def extract_text(self, file_path: str) -> str:
    """Return the text content of the file at *file_path*.

    The handler is picked from the file extension, compared
    case-insensitively so that ``REPORT.PDF`` or ``Spec.DOCX`` are not
    mistakenly read as plain text:

    * ``.docx`` -- paragraphs are joined with newlines; a paragraph that
      has at least one run whose ``bold`` attribute is truthy is wrapped
      in ``**`` markers.
    * ``.pdf`` -- the text of every page is concatenated in page order
      with no extra separator (requires PyMuPDF, imported lazily).
    * anything else -- the file is read as text via ``Path.read_text``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text as a single string.
    """
    lowered = file_path.lower()

    if lowered.endswith(".docx"):
        doc = Document(file_path)
        return '\n'.join(
            f"**{p.text}**" if any(r.bold for r in p.runs) else p.text
            for p in doc.paragraphs
        )

    if lowered.endswith(".pdf"):
        # Imported here so PyMuPDF is only needed when a PDF is processed.
        import fitz  # PyMuPDF

        with fitz.open(file_path) as doc:
            # join() avoids repeated string re-allocation on long PDFs.
            return "".join(page.get_text() for page in doc)

    # NOTE(review): uses the platform default encoding, as before; consider
    # passing encoding="utf-8" if inputs are known to be UTF-8.
    return Path(file_path).read_text()
|
89 |
|
90 |
|
91 |
def detect_document_type(self, text: str) -> str:
|