Tesneem committed on
Commit
f6c9c19
·
verified ·
1 Parent(s): 7e6f24f

Update document_chunker.py

Browse files
Files changed (1) hide show
  1. document_chunker.py +12 -12
document_chunker.py CHANGED
@@ -74,18 +74,18 @@ class DocumentChunker:
74
  # doc = Document(file_path)
75
  # return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
76
  def extract_text(self, file_path: str) -> str:
77
- if file_path.endswith(".docx"):
78
- doc = Document(file_path)
79
- return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
80
- elif file_path.endswith(".pdf"):
81
- import fitz # PyMuPDF
82
- text = ""
83
- with fitz.open(file_path) as doc:
84
- for page in doc:
85
- text += page.get_text()
86
- return text
87
- else:
88
- return Path(file_path).read_text()
89
 
90
 
91
  def detect_document_type(self, text: str) -> str:
 
74
  # doc = Document(file_path)
75
  # return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
76
  def extract_text(self, file_path: str) -> str:
77
+ if file_path.endswith(".docx"):
78
+ doc = Document(file_path)
79
+ return '\n'.join([f"**{p.text}**" if any(r.bold for r in p.runs) else p.text for p in doc.paragraphs])
80
+ elif file_path.endswith(".pdf"):
81
+ import fitz # PyMuPDF
82
+ text = ""
83
+ with fitz.open(file_path) as doc:
84
+ for page in doc:
85
+ text += page.get_text()
86
+ return text
87
+ else:
88
+ return Path(file_path).read_text()
89
 
90
 
91
  def detect_document_type(self, text: str) -> str: