import os
from typing import Dict, Any

from starfish.data_ingest.parsers.base_parser import BaseParser


class PPTParser(BaseParser):
    """Parser for PowerPoint presentations.

    Extracts the visible text of every slide in a ``.pptx`` file and keeps
    the core document properties of the most recently parsed presentation
    in ``self.metadata``.
    """

    def __init__(self):
        super().__init__()
        # Extensions accepted by is_supported(); compared case-insensitively.
        self.supported_extensions = [".pptx"]
        # Filled in by parse(); empty until a presentation has been parsed.
        self.metadata = {}

    def parse(self, file_path: str) -> str:
        """Parse a PPTX file into plain text

        Each slide becomes one section that starts with a ``--- Slide N ---``
        header (N is 1-based), followed by a ``Title: ...`` line when the
        slide has a non-empty title, followed by the text of every other
        shape that exposes non-empty text. Sections are separated by a
        blank line.

        As a side effect, ``self.metadata`` is replaced with the core
        properties of the presentation (title, author, created, modified)
        and its slide count.

        Args:
            file_path: Path to the PPTX file

        Returns:
            Extracted text from the presentation

        Raises:
            ImportError: If python-pptx is not installed.
        """
        # Imported lazily so the module can be loaded without python-pptx.
        try:
            from pptx import Presentation
        except ImportError as err:
            raise ImportError(
                "python-pptx is required for PPTX parsing. Install it with: pip install python-pptx"
            ) from err

        prs = Presentation(file_path)

        # Extract metadata
        core = prs.core_properties
        self.metadata = {
            "title": core.title,
            "author": core.author,
            "created": core.created,
            "modified": core.modified,
            "slides": len(prs.slides),
        }

        # Extract text from slides
        all_text = []
        for index, slide in enumerate(prs.slides, start=1):
            slide_text = [f"--- Slide {index} ---"]

            # Get slide title (looked up once per slide).
            title_shape = slide.shapes.title
            title_id = None
            if title_shape is not None and title_shape.text:
                slide_text.append(f"Title: {title_shape.text}")
                title_id = title_shape.shape_id

            # Get text from shapes. The title placeholder is itself one of
            # slide.shapes, so skip it here once it has been emitted above;
            # otherwise its text would appear twice in the output.
            for shape in slide.shapes:
                if title_id is not None and shape.shape_id == title_id:
                    continue
                shape_text = getattr(shape, "text", None)
                if shape_text:
                    slide_text.append(shape_text)

            all_text.append("\n".join(slide_text))

        return "\n\n".join(all_text)

    def get_metadata(self) -> Dict[str, Any]:
        """Get presentation metadata

        Returns:
            Dictionary containing presentation metadata. This is the dict
            stored by the last call to parse(), or an empty dict if parse()
            has not been called yet.
        """
        return self.metadata

    def is_supported(self, file_path: str) -> bool:
        """Check if the file is supported by this parser

        Only the file extension is inspected (case-insensitively); the file
        is not opened.

        Args:
            file_path: Path to the file

        Returns:
            True if the file is supported, False otherwise
        """
        extension = os.path.splitext(file_path)[1].lower()
        return extension in self.supported_extensions