John-Jiang's picture
init commit
5301c48
raw
history blame
2.19 kB
# DOCX parsers
import os
from starfish.data_ingest.parsers.base_parser import BaseParser
from typing import Dict, Any
class WordDocumentParser(BaseParser):
    """Parser for Microsoft Word (``.docx``) documents.

    Text extraction relies on the optional ``python-docx`` package, which is
    imported lazily the first time a document is parsed so that constructing
    the parser does not require the package.
    """

    def __init__(self):
        super().__init__()
        # Handle to the lazily imported ``docx`` module; None until first use.
        self._docx = None
        self.supported_extensions = [".docx"]
        # Metadata of the most recently parsed document (see ``parse``).
        self.metadata = {}

    def _load_docx(self):
        """Import the ``docx`` module on first use and cache it on the instance.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if self._docx is not None:
            return
        try:
            import docx
        except ImportError as err:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError(
                "python-docx is required for DOCX parsing. Install it with: pip install python-docx"
            ) from err
        self._docx = docx

    def parse(self, file_path: str) -> str:
        """Parse a DOCX file into plain text.

        Body paragraphs are emitted first, followed by the text of every
        table cell. Empty pieces are dropped and the rest are joined with
        blank lines. As a side effect, ``self.metadata`` is replaced with
        the core properties of the parsed document.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            Extracted text from the document.

        Raises:
            ImportError: If python-docx is not installed.
        """
        self._load_docx()
        # Reset first so a failed load cannot leave metadata from a
        # previously parsed file behind.
        self.metadata = {}
        doc = self._docx.Document(file_path)

        # Read these once; each attribute access rebuilds its result.
        body_paragraphs = doc.paragraphs
        props = doc.core_properties

        self.metadata = {
            "author": props.author,
            "created": props.created,
            "modified": props.modified,
            "title": props.title,
            # Rough heuristic only: assumes about 50 paragraphs per page.
            "pages": len(body_paragraphs) // 50,
        }

        chunks = [para.text for para in body_paragraphs]

        # A merged cell is returned once for every grid position it spans,
        # so remember the underlying XML element of each cell already
        # emitted to avoid repeating its text.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    cell_key = getattr(cell, "_tc", cell)
                    if cell_key in seen_cells:
                        continue
                    seen_cells.add(cell_key)
                    chunks.append(cell.text)

        return "\n\n".join(chunk for chunk in chunks if chunk)

    def get_metadata(self) -> Dict[str, Any]:
        """Return the metadata collected by the most recent ``parse`` call.

        Returns:
            Dictionary containing document metadata; empty if no document
            has been parsed successfully yet.
        """
        return self.metadata

    def is_supported(self, file_path: str) -> bool:
        """Check whether the file's extension is handled by this parser.

        The comparison is case-insensitive and looks only at the extension;
        the file itself is not opened.

        Args:
            file_path: Path to the file.

        Returns:
            True if the file is supported, False otherwise.
        """
        extension = os.path.splitext(file_path)[1].lower()
        return extension in self.supported_extensions