# Studymaker2 / ppt_parser.py
# Last commit by g0th: "Update ppt_parser.py" (3b3c05a, verified); file size 3.67 kB.
import json
import os
from pptx import Presentation
from pptx.util import Inches
from pptx.shapes.group import GroupShape
from pptx.shapes.picture import Picture
from PIL import Image
import io
def print_json(item):
    """Serialize *item* to a pretty-printed JSON string.

    Despite the name, nothing is written to stdout; the formatted text is
    returned to the caller.

    Args:
        item: Any object ``json.dumps`` can serialize.

    Returns:
        The JSON text, indented by 4 spaces, with non-ASCII characters kept
        as-is instead of being escaped.
    """
    return json.dumps(item, ensure_ascii=False, indent=4)
def safe_font_attribute(run, attr):
    """Return ``run.font.<attr>``, or ``None`` when it cannot be read.

    Args:
        run: An object exposing a ``font`` attribute (a text run).
        attr: Name of the font attribute to read, e.g. ``'name'`` or ``'bold'``.

    Returns:
        The attribute value, or ``None`` if reading it raised any exception.
    """
    try:
        return getattr(run.font, attr)
    except Exception:
        # Deliberately broad: extraction is best-effort, and one unreadable
        # font property must not abort parsing of the whole presentation.
        return None
def safe_color(run):
    """Return the run's font colour as a string, or ``None`` if unavailable.

    Args:
        run: An object exposing ``font.color.rgb`` (a text run).

    Returns:
        ``str(run.font.color.rgb)`` when both the colour object and its
        ``rgb`` value are truthy; otherwise ``None``. ``None`` is also
        returned if reading any of these attributes raises.
    """
    try:
        # Read each attribute once instead of re-evaluating the chain.
        color = run.font.color
        rgb = color.rgb if color else None
        return str(rgb) if rgb else None
    except Exception:
        # Deliberately broad: colour lookup is best-effort and must not
        # abort parsing when the colour has no RGB representation.
        return None
def extract_paragraph_data(paragraph):
    """Summarize a paragraph's text, alignment and font as a dict.

    Font properties are taken from the paragraph's FIRST run only; runs
    after it with different formatting are not reflected in the result.

    Args:
        paragraph: An object exposing ``runs``, ``text`` and ``alignment``.

    Returns:
        ``None`` when the paragraph has no runs; otherwise a dict with the
        keys ``'text'``, ``'align'`` and ``'font'`` (itself a dict with
        ``name``, ``bold``, ``italic``, ``underline``, ``color`` and
        ``language_id``). Font values that cannot be read are ``None``.
    """
    runs = paragraph.runs
    if not runs:
        return None

    first_run = runs[0]
    return {
        'text': paragraph.text,
        'align': paragraph.alignment,
        'font': {
            'name': safe_font_attribute(first_run, 'name'),
            'bold': safe_font_attribute(first_run, 'bold'),
            'italic': safe_font_attribute(first_run, 'italic'),
            'underline': safe_font_attribute(first_run, 'underline'),
            'color': safe_color(first_run),
            'language_id': safe_font_attribute(first_run, 'language_id'),
        },
    }
def transfer_textbox_content_in_group(group_shape):
    """Collect the text content of every shape inside a group shape.

    Args:
        group_shape: An object exposing ``shapes``, an iterable of the
            group's member shapes.

    Returns:
        A dict keyed ``"shape_<index>"`` in iteration order. Each value is:

        * for a shape with a text frame: ``{'type': "text", 'location':
          (left, top), 'paragraph_<n>': {...}, ...}``, where paragraphs
          without runs are omitted;
        * for a nested group: ``{'type': "group", 'group_content': {...}}``
          built by recursing into it, so text inside nested groups is not
          lost;
        * for anything else: an empty dict.
    """
    group_shape_item = {}
    for index, shape in enumerate(group_shape.shapes):
        shape_item = {}
        if shape.has_text_frame:
            shape_item['type'] = "text"
            shape_item['location'] = (shape.left, shape.top)
            for r, paragraph in enumerate(shape.text_frame.paragraphs):
                data = extract_paragraph_data(paragraph)
                if data:
                    shape_item[f'paragraph_{r}'] = data
        elif isinstance(shape, GroupShape):
            # Same layout the slide-level parser uses for groups.
            shape_item['type'] = "group"
            shape_item['group_content'] = transfer_textbox_content_in_group(shape)
        group_shape_item[f"shape_{index}"] = shape_item
    return group_shape_item
def _picture_to_item(picture, image_path):
    """Describe *picture* as a dict and export its image to *image_path*.

    The export is best-effort: if the image cannot be read, decoded or
    written, ``'image_path'`` stays ``None`` and the reason is recorded
    under ``'image_error'`` instead of raising.
    """
    left, top = picture.left, picture.top
    picture_item = {
        'type': "picture",
        'image_path': None,
        'size': None,
        'dpi': None,
        'location': (left, top),
        # python-pptx lengths expose ``.inches``; wrapping the value in
        # ``Inches(...)`` would treat the EMU count as inches and hand the
        # same number back unconverted.
        'location_inches': (
            None if left is None else left.inches,
            None if top is None else top.inches,
        ),
    }
    try:
        image = picture.image
        picture_item['size'] = image.size
        picture_item['dpi'] = image.dpi
        with Image.open(io.BytesIO(image.blob)) as pil_image:
            pil_image.save(image_path)
    except Exception as exc:
        # Broad on purpose: one undecodable picture must not abort the
        # whole presentation, but the failure is reported, not hidden.
        picture_item['image_error'] = f"{type(exc).__name__}: {exc}"
    else:
        picture_item['image_path'] = image_path
    return picture_item


def transfer_to_structure(pptx_file, images_dir_path):
    """Convert a presentation into a JSON description and export its pictures.

    Args:
        pptx_file: Path or file-like object passed to ``Presentation``.
        images_dir_path: Directory that receives exported pictures; it is
            created if missing.

    Returns:
        A ``(json_text, image_path_list)`` tuple. ``json_text`` is the
        pretty-printed JSON of a dict keyed ``"slide_<i>"`` ->
        ``"shape_<j>"`` -> shape description. ``image_path_list`` holds the
        paths of the pictures that were actually written, in slide/shape
        order.

        Shape descriptions are ``{'type': "text", 'paragraph_<n>': ...}``
        for text shapes, ``{'type': "group", 'group_content': ...}`` for
        groups, a picture dict (``type``, ``image_path``, ``size``, ``dpi``,
        ``location``, ``location_inches`` and, on failure, ``image_error``)
        for pictures, and ``{}`` for anything else.
    """
    item = {}
    prs = Presentation(pptx_file)
    image_path_list = []
    os.makedirs(images_dir_path, exist_ok=True)

    for i, slide in enumerate(prs.slides):
        slide_item = {}
        for j, shape in enumerate(slide.shapes):
            shape_item = {}
            # Case 1: Normal text box
            if shape.has_text_frame:
                shape_item['type'] = "text"
                for r, paragraph in enumerate(shape.text_frame.paragraphs):
                    data = extract_paragraph_data(paragraph)
                    if data:
                        shape_item[f'paragraph_{r}'] = data
            # Case 2: Grouped shapes
            elif isinstance(shape, GroupShape):
                shape_item['type'] = "group"
                shape_item['group_content'] = transfer_textbox_content_in_group(shape)
            # Case 3: Picture
            elif isinstance(shape, Picture):
                # The slide index is part of the name: the shape index alone
                # repeats across slides and later pictures would overwrite
                # earlier ones.
                image_path = os.path.join(images_dir_path, f"picture_{i}_{j}.png")
                shape_item = _picture_to_item(shape, image_path)
                if shape_item['image_path'] is not None:
                    image_path_list.append(image_path)
            slide_item[f"shape_{j}"] = shape_item
        item[f"slide_{i}"] = slide_item

    return print_json(item), image_path_list