Spaces:

Tesneem
/

document_chunker

Running

Tesneem committed 6 days ago

Commit

f6c9c19

verified ·

1 Parent(s): 7e6f24f

Update document_chunker.py

Files changed (1) hide show

document_chunker.py CHANGED Viewed

@@ -74,18 +74,18 @@ class DocumentChunker:
     #     doc = Document(file_path)
     #     return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
     def extract_text(self, file_path: str) -> str:
-    if file_path.endswith(".docx"):
-        doc = Document(file_path)
-        return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
-    elif file_path.endswith(".pdf"):
-        import fitz  # PyMuPDF
-        text = ""
-        with fitz.open(file_path) as doc:
-            for page in doc:
-                text += page.get_text()
-        return text
-    else:
-        return Path(file_path).read_text()
     def detect_document_type(self, text: str) -> str:

     #     doc = Document(file_path)
     #     return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
     def extract_text(self, file_path: str) -> str:
+        if file_path.endswith(".docx"):
+            doc = Document(file_path)
+            return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
+        elif file_path.endswith(".pdf"):
+            import fitz  # PyMuPDF
+            text = ""
+            with fitz.open(file_path) as doc:
+                for page in doc:
+                    text += page.get_text()
+            return text
+        else:
+            return Path(file_path).read_text()
     def detect_document_type(self, text: str) -> str: