Spaces:
Running
Running
# DOCX parsers
import os | |
from starfish.data_ingest.parsers.base_parser import BaseParser | |
from typing import Dict, Any | |
class WordDocumentParser(BaseParser):
    """Parser for Microsoft Word (.docx) documents.

    Text is extracted with the third-party ``python-docx`` package, which is
    imported lazily on first use so that this module can be imported even
    when the package is not installed.
    """

    # Rough number of paragraphs assumed per page for the "pages" estimate.
    _PARAGRAPHS_PER_PAGE = 50

    def __init__(self):
        """Set up the parser with no docx module loaded and empty metadata."""
        super().__init__()
        self._docx = None  # the ``docx`` module, set by _load_docx()
        self.supported_extensions = [".docx"]
        self.metadata = {}  # replaced by each successful parse() call

    def _load_docx(self):
        """Import the ``docx`` module on first use and cache it on the instance.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if self._docx is not None:
            return
        # Keep the try body to the import alone so unrelated errors are not
        # mistaken for a missing dependency.
        try:
            import docx
        except ImportError as err:
            raise ImportError(
                "python-docx is required for DOCX parsing. Install it with: pip install python-docx"
            ) from err
        self._docx = docx

    def parse(self, file_path: str) -> str:
        """Parse a DOCX file into plain text.

        Body paragraphs come first, followed by the text of every table cell.
        Empty entries are dropped and the remainder is joined with blank
        lines. As a side effect, ``self.metadata`` is replaced with the
        metadata of this document.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            Extracted text from the document.

        Raises:
            ImportError: If python-docx is not installed.
        """
        self._load_docx()
        doc = self._docx.Document(file_path)

        # Read these once and reuse them for both metadata and text.
        body_paragraphs = doc.paragraphs
        props = doc.core_properties

        self.metadata = {
            "author": props.author,
            "created": props.created,
            "modified": props.modified,
            "title": props.title,
            # Heuristic estimate only; a document never has fewer than one
            # page, so short documents are not reported as having 0 pages.
            "pages": max(1, len(body_paragraphs) // self._PARAGRAPHS_PER_PAGE),
        }

        chunks = [paragraph.text for paragraph in body_paragraphs]

        # python-docx returns a merged cell once per grid position it spans
        # (and again for each row of a vertical merge). Track the underlying
        # cell element so the same text is not emitted several times.
        seen_cells = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    marker = getattr(cell, "_tc", cell)
                    if marker in seen_cells:
                        continue
                    seen_cells.add(marker)
                    chunks.append(cell.text)

        return "\n\n".join(chunk for chunk in chunks if chunk)

    def get_metadata(self) -> Dict[str, Any]:
        """Get the metadata collected by the most recent ``parse`` call.

        Returns:
            Dictionary containing document metadata; empty if ``parse`` has
            not been called yet.
        """
        return self.metadata

    def is_supported(self, file_path: str) -> bool:
        """Check whether the file extension is one this parser handles.

        The comparison is case-insensitive and looks only at the extension,
        not at the file contents.

        Args:
            file_path: Path to the file.

        Returns:
            True if the file is supported, False otherwise.
        """
        extension = os.path.splitext(file_path)[1].lower()
        return extension in self.supported_extensions