Studymaker / ppt_parser.py
g0th's picture
Update ppt_parser.py
34a7f38 verified
raw
history blame
3.97 kB
import json
import os
from pptx import Presentation
from pptx.util import Inches
from pptx.shapes.group import GroupShape
from pptx.shapes.picture import Picture
from PIL import Image
import io
def print_json(item):
    """Serialize *item* to a JSON string and return it.

    Despite the name, nothing is printed. The text is indented by four
    spaces and non-ASCII characters are kept as-is rather than escaped.
    """
    serialized = json.dumps(
        item,
        indent=4,
        ensure_ascii=False,
    )
    return serialized
def _group_paragraph_to_item(paragraph):
    """Describe one paragraph using the formatting of its first run.

    The caller must ensure ``paragraph.runs`` is non-empty.
    """
    font = paragraph.runs[0].font
    try:
        rgb = font.color.rgb
    except AttributeError:
        # python-pptx raises AttributeError from ``.rgb`` when the color is
        # not an explicit RGB value (e.g. inherited or theme colors).
        rgb = None
    return {
        'text': paragraph.text,
        'align': paragraph.alignment,
        'font': {
            'name': font.name,
            'bold': font.bold,
            'italic': font.italic,
            'underline': font.underline,
            'color': None if rgb is None else str(rgb),
            'language_id': font.language_id,
        }
    }


def transfer_textbox_content_in_group(group_shape):
    """Collect the text content of every shape inside a group shape.

    Args:
        group_shape: Object exposing a ``shapes`` iterable (a python-pptx
            group shape).

    Returns:
        A dict mapping ``"shape_<index>"`` to a per-shape dict. Shapes with
        a text frame get ``type``, ``location`` (left, top) and one
        ``paragraph_<index>`` entry for each paragraph that has at least
        one run; the ``color`` of a paragraph is ``None`` when its first
        run has no explicit RGB color. Shapes without a text frame map to
        an empty dict.
    """
    group_shape_item = {}
    for shape_index, shape in enumerate(group_shape.shapes):
        shape_item = {}
        if shape.has_text_frame:
            shape_item['type'] = "text"
            shape_item['location'] = (shape.left, shape.top)
            for r, paragraph in enumerate(shape.text_frame.paragraphs):
                # Paragraphs without runs carry no font to describe.
                if paragraph.runs:
                    shape_item[f'paragraph_{r}'] = _group_paragraph_to_item(paragraph)
        group_shape_item[f"shape_{shape_index}"] = shape_item
    return group_shape_item
def _emu_to_inches(length):
    """Convert an EMU length to inches, passing ``None`` through unchanged."""
    if length is None:
        return None
    # Inches(1) is the number of EMU in one inch.
    return length / Inches(1)


def _text_shape_to_item(shape):
    """Describe a text shape: one ``paragraph_<n>`` entry per paragraph with runs."""
    shape_item = {'type': "text"}
    for r, paragraph in enumerate(shape.text_frame.paragraphs):
        if not paragraph.runs:
            continue
        # Formatting is sampled from the first run only.
        font = paragraph.runs[0].font
        try:
            rgb = font.color.rgb
        except AttributeError:
            # python-pptx raises AttributeError from ``.rgb`` when the color
            # is not an explicit RGB value (e.g. inherited or theme colors).
            rgb = None
        shape_item[f'paragraph_{r}'] = {
            'text': paragraph.text,
            'align': paragraph.alignment,
            'font': {
                'name': font.name,
                'bold': font.bold,
                'italic': font.italic,
                'underline': font.underline,
                'color': None if rgb is None else str(rgb),
                'language_id': font.language_id,
            }
        }
    return shape_item


def _picture_shape_to_item(shape, image_path):
    """Save the picture's embedded image to *image_path* and describe it."""
    image = shape.image
    shape_item = {
        'type': "picture",
        'image_path': image_path,
        'size': image.size,
        'dpi': image.dpi,
        'location': (shape.left, shape.top),
        'location_inches': (_emu_to_inches(shape.left), _emu_to_inches(shape.top)),
    }
    # Re-encode through PIL so the file content matches the .png extension;
    # the context manager releases the decoder once the file is written.
    with Image.open(io.BytesIO(image.blob)) as shape_image:
        shape_image.save(image_path)
    return shape_item


def transfer_to_structure(pptx_file, images_dir_path):
    """Convert a presentation into a JSON description and extract its pictures.

    Args:
        pptx_file: Path or file-like object accepted by ``Presentation``.
        images_dir_path: Directory that receives the extracted pictures; it
            is created when a picture is found and it does not exist yet.

    Returns:
        A ``(json_text, image_path_list)`` tuple. ``json_text`` maps
        ``slide_<i>`` -> ``shape_<j>`` -> shape description, where text
        shapes, group shapes and pictures get a ``type`` key and any other
        shape is an empty object. ``image_path_list`` holds the path of
        every picture written, in traversal order.
    """
    item = {}
    prs = Presentation(pptx_file)
    image_path_list = []
    for i, slide in enumerate(prs.slides):
        slide_item = {}
        for j, shape in enumerate(slide.shapes):
            if shape.has_text_frame:
                # Case 1: Normal text box
                shape_item = _text_shape_to_item(shape)
            elif isinstance(shape, GroupShape):
                # Case 2: Grouped shapes
                shape_item = {
                    'type': "group",
                    'group_content': transfer_textbox_content_in_group(shape),
                }
            elif isinstance(shape, Picture):
                # Case 3: Picture
                if images_dir_path:
                    os.makedirs(images_dir_path, exist_ok=True)
                # The slide index is part of the name: shape indices restart
                # on every slide, so "picture_<j>" alone would let a later
                # slide overwrite an earlier slide's image.
                image_path = os.path.join(images_dir_path, f"picture_{i}_{j}.png")
                image_path_list.append(image_path)
                shape_item = _picture_shape_to_item(shape, image_path)
            else:
                shape_item = {}
            slide_item[f"shape_{j}"] = shape_item
        item[f"slide_{i}"] = slide_item
    return print_json(item), image_path_list