Spaces:
Running
Running
File size: 2,185 Bytes
5301c48 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
# DOCX parser: extracts plain text and core metadata from Word documents
import os
from starfish.data_ingest.parsers.base_parser import BaseParser
from typing import Dict, Any
class WordDocumentParser(BaseParser):
    """Parser for Microsoft Word (.docx) documents.

    Extracts paragraph and table text as plain text and records a small set
    of core properties for the most recently parsed document.
    """

    # Heuristic behind the "pages" metadata entry: the page count is only an
    # estimate derived from the number of paragraphs.
    PARAGRAPHS_PER_PAGE = 50

    def __init__(self):
        super().__init__()
        # The python-docx module, imported on first use by _load_docx().
        self._docx = None
        self.supported_extensions = [".docx"]
        # Metadata of the last document given to parse(); empty until then.
        self.metadata = {}

    def _load_docx(self):
        """Lazy load the docx module and cache it on the instance.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if self._docx is not None:
            return
        try:
            import docx
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "python-docx is required for DOCX parsing. Install it with: pip install python-docx"
            ) from err
        self._docx = docx

    @staticmethod
    def _iter_table_cell_texts(doc):
        """Yield the text of every table cell in *doc*, each cell only once."""
        seen = set()
        for table in doc.tables:
            for row in table.rows:
                for cell in row.cells:
                    # python-docx returns a merged cell once per grid position
                    # it spans; key on the underlying XML element so the text
                    # of a merged cell is not emitted repeatedly.
                    key = getattr(cell, "_tc", cell)
                    if key in seen:
                        continue
                    seen.add(key)
                    yield cell.text

    def parse(self, file_path: str) -> str:
        """Parse a DOCX file into plain text.

        Paragraph text comes first, followed by table cell text (table by
        table, row by row), so the original interleaving of paragraphs and
        tables is not preserved. Empty entries are dropped and the remaining
        ones are joined with blank lines. As a side effect, ``self.metadata``
        is replaced with the properties of this document.

        Args:
            file_path: Path to the DOCX file

        Returns:
            Extracted text from the document

        Raises:
            ImportError: If python-docx is not installed.
        """
        self._load_docx()
        doc = self._docx.Document(file_path)

        # Read the paragraph list once and reuse it for both the page
        # estimate and the text extraction.
        body_paragraphs = doc.paragraphs
        props = doc.core_properties

        # Round up and never report fewer than one page; plain floor division
        # reported 0 pages for any document shorter than one "page" of
        # paragraphs.
        estimated_pages = max(1, -(-len(body_paragraphs) // self.PARAGRAPHS_PER_PAGE))

        self.metadata = {
            "author": props.author,
            "created": props.created,
            "modified": props.modified,
            "title": props.title,
            "pages": estimated_pages,
        }

        chunks = [paragraph.text for paragraph in body_paragraphs]
        chunks.extend(self._iter_table_cell_texts(doc))
        return "\n\n".join(chunk for chunk in chunks if chunk)

    def get_metadata(self) -> Dict[str, Any]:
        """Get document metadata.

        Returns:
            Dictionary containing the metadata of the last parsed document
            (empty if parse() has not been called yet). This is the
            parser's own dictionary, not a copy.
        """
        return self.metadata

    def is_supported(self, file_path: str) -> bool:
        """Check if the file is supported by this parser.

        The check looks only at the file extension, case-insensitively.

        Args:
            file_path: Path to the file

        Returns:
            True if the file is supported, False otherwise
        """
        extension = os.path.splitext(file_path)[1].lower()
        return extension in self.supported_extensions
|