Spaces:

John-Jiang
/

starfish_data_ai

Running

App Files Files Community

starfish_data_ai / src /starfish /data_ingest /parsers /ppt_parser.py

John-Jiang

init commit

5301c48 2 months ago

raw

history blame

2.24 kB

	import os
	from typing import Dict, Any
	from starfish.data_ingest.parsers.base_parser import BaseParser


	class PPTParser(BaseParser):
	    """Parser that turns PowerPoint (.pptx) presentations into plain text."""

	    def __init__(self) -> None:
	        super().__init__()
	        self.supported_extensions = [".pptx"]
	        # Core properties of the most recently parsed presentation;
	        # stays empty until parse() has loaded a file successfully.
	        self.metadata: Dict[str, Any] = {}

	    def parse(self, file_path: str) -> str:
	        """Parse a PPTX file into plain text.

	        Every slide becomes one section that starts with a
	        ``--- Slide N ---`` marker (N is 1-based), followed by a
	        ``Title: ...`` line when the slide has a non-empty title, and then
	        the text of each remaining shape that exposes non-empty ``text``.
	        Sections are joined with a blank line.

	        As a side effect, ``self.metadata`` is replaced with the title,
	        author, created/modified values and slide count of this file.

	        Args:
	            file_path: Path to the PPTX file

	        Returns:
	            Extracted text from the presentation

	        Raises:
	            ImportError: If python-pptx is not installed
	        """
	        # Imported here rather than at module level, so python-pptx is only
	        # required once a presentation is actually parsed.
	        try:
	            from pptx import Presentation
	        except ImportError as err:
	            raise ImportError(
	                "python-pptx is required for PPTX parsing. "
	                "Install it with: pip install python-pptx"
	            ) from err

	        # Drop metadata from any earlier call first, so a failure while
	        # loading this file cannot leave another file's metadata behind.
	        self.metadata = {}

	        prs = Presentation(file_path)

	        # Extract metadata
	        props = prs.core_properties
	        self.metadata = {
	            "title": props.title,
	            "author": props.author,
	            "created": props.created,
	            "modified": props.modified,
	            "slides": len(prs.slides),
	        }

	        # Extract text from slides
	        sections = []
	        for number, slide in enumerate(prs.slides, start=1):
	            lines = [f"--- Slide {number} ---"]

	            # Get slide title
	            title_shape = slide.shapes.title
	            title_id = None
	            if title_shape is not None and title_shape.text:
	                lines.append(f"Title: {title_shape.text}")
	                title_id = title_shape.shape_id

	            # Get text from the remaining shapes. The title placeholder is
	            # itself a member of slide.shapes, so it is skipped here to
	            # avoid emitting the title text a second time.
	            # NOTE(review): shapes without a `text` attribute (presumably
	            # tables, pictures, groups) contribute nothing - confirm whether
	            # their content should be extracted as well.
	            for shape in slide.shapes:
	                if title_id is not None and shape.shape_id == title_id:
	                    continue
	                if hasattr(shape, "text") and shape.text:
	                    lines.append(shape.text)

	            sections.append("\n".join(lines))

	        return "\n\n".join(sections)

	    def get_metadata(self) -> Dict[str, Any]:
	        """Get presentation metadata.

	        Returns:
	            Dictionary containing the metadata stored by the last call to
	            parse(); empty if nothing has been parsed yet
	        """
	        return self.metadata

	    def is_supported(self, file_path: str) -> bool:
	        """Check if the file is supported by this parser.

	        The check looks only at the file extension, ignoring case.

	        Args:
	            file_path: Path to the file

	        Returns:
	            True if the file is supported, False otherwise
	        """
	        extension = os.path.splitext(file_path)[1].lower()
	        return extension in self.supported_extensions