John-Jiang's picture
init commit
5301c48
raw
history blame
2.4 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
# Excel parser logic
import os
from typing import Dict, Any
from starfish.data_ingest.parsers.base_parser import BaseParser
class ExcelParser(BaseParser):
    """Parser that flattens an Excel workbook into plain text.

    Every sheet is rendered as a ``--- Sheet: <name> ---`` header followed by
    one tab-separated line per row that contains at least one non-empty cell.
    Workbook properties collected during the most recent successful ``parse``
    call are available through ``get_metadata``.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): ".xls" is advertised here, but parse() loads files
        # through openpyxl only -- confirm legacy .xls files are readable.
        self.supported_extensions = [".xlsx", ".xls"]
        # Filled in by parse(); empty until a workbook has been loaded.
        self.metadata = {}

    def parse(self, file_path: str) -> str:
        """Parse an Excel file into text

        Args:
            file_path: Path to the Excel file

        Returns:
            Extracted text from the Excel file: one section per sheet,
            sections separated by a blank line, cells within a row separated
            by tabs. Empty cells and fully empty rows are omitted.

        Raises:
            ImportError: If openpyxl is not installed.
        """
        try:
            import openpyxl
        except ImportError as err:
            # Chain the original failure so the root cause stays visible.
            raise ImportError("openpyxl is required for Excel parsing. Install it with: pip install openpyxl") from err

        # Forget metadata from any earlier file first, so a failed load cannot
        # leave get_metadata() describing a different workbook.
        self.metadata = {}

        # Load workbook and extract metadata
        wb = openpyxl.load_workbook(file_path)
        self.metadata = {
            "file_path": file_path,
            "sheets": wb.sheetnames,
            "creator": wb.properties.creator,
            "created": wb.properties.created,
            "modified": wb.properties.modified,
            "last_modified_by": wb.properties.lastModifiedBy,
        }

        # Extract text from all sheets
        all_text = []
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            sheet_text = [f"--- Sheet: {sheet_name} ---"]

            # sheetnames also lists chart sheets, which have no cell grid and
            # therefore no iter_rows(); keep their header but skip row
            # extraction instead of failing with AttributeError.
            iter_rows = getattr(sheet, "iter_rows", None)
            if iter_rows is not None:
                for row in iter_rows(values_only=True):
                    row_text = [str(cell) for cell in row if cell is not None]
                    if row_text:
                        sheet_text.append("\t".join(row_text))

            all_text.append("\n".join(sheet_text))

        return "\n\n".join(all_text)

    def get_metadata(self) -> Dict[str, Any]:
        """Get Excel file metadata

        Returns:
            Dictionary containing file metadata gathered by the last
            successful ``parse`` call (file path, sheet names, creator,
            created/modified values, last modifier); empty if nothing has
            been parsed yet.
        """
        return self.metadata

    def is_supported(self, file_path: str) -> bool:
        """Check if the file is supported by this parser

        Args:
            file_path: Path to the file

        Returns:
            True if the file extension (compared case-insensitively) is one
            of ``supported_extensions``, False otherwise
        """
        return os.path.splitext(file_path)[1].lower() in self.supported_extensions