File size: 3,627 Bytes
b681c52
 
 
 
 
 
 
 
 
 
 
 
b55bd9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b681c52
 
 
 
 
 
 
 
 
b55bd9b
 
 
b681c52
 
 
 
 
 
 
 
 
 
34a7f38
b681c52
 
34a7f38
 
 
 
 
 
b55bd9b
 
 
34a7f38
 
 
b681c52
 
34a7f38
 
b681c52
 
 
 
 
 
 
 
 
b55bd9b
 
 
 
 
 
34a7f38
b681c52
34a7f38
b681c52
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import io
import json
import logging
import os

from PIL import Image
from pptx import Presentation
from pptx.shapes.group import GroupShape
from pptx.shapes.picture import Picture
from pptx.util import Inches

def print_json(item):
    """Serialize *item* to an indented JSON string.

    Despite the name, nothing is printed: the text is returned to the caller.
    Non-ASCII characters are kept as-is rather than escaped, and nesting is
    indented by four spaces.
    """
    serialized = json.dumps(item, indent=4, ensure_ascii=False)
    return serialized

def safe_font_attribute(run, attr):
    """Read ``run.font.<attr>`` without ever raising.

    Returns the attribute value, or ``None`` when looking up the font or the
    attribute raises any ``Exception``.
    """
    value = None
    try:
        font = run.font
        value = getattr(font, attr)
    except Exception:
        # Best effort: a failed lookup is reported as "no value".
        value = None
    return value

def safe_color(run):
    """Return the run's font colour as a string, or ``None``.

    The result is ``str(run.font.color.rgb)`` when both the colour object and
    its ``rgb`` value are truthy. ``None`` is returned when either is falsy or
    when any of the attribute lookups raises an ``Exception``.
    """
    try:
        color = run.font.color
        if not color:
            return None
        rgb = color.rgb
        if not rgb:
            return None
        return str(rgb)
    except Exception:
        # Best effort: a failed lookup is reported as "no colour".
        return None

def extract_paragraph_data(paragraph):
    """Summarize a paragraph's text, alignment and first-run font settings.

    Returns ``None`` when the paragraph has no runs. Otherwise returns a dict
    with the keys ``'text'``, ``'align'`` and ``'font'``. The font details are
    taken from the first run only, and each one falls back to ``None`` when it
    cannot be read.
    """
    runs = paragraph.runs
    if not runs:
        return None

    first_run = runs[0]
    text = paragraph.text
    alignment = paragraph.alignment

    # Key order is kept stable because it determines the serialized output.
    font_info = {
        attr: safe_font_attribute(first_run, attr)
        for attr in ('name', 'bold', 'italic', 'underline')
    }
    font_info['color'] = safe_color(first_run)
    font_info['language_id'] = safe_font_attribute(first_run, 'language_id')

    return {'text': text, 'align': alignment, 'font': font_info}

def transfer_textbox_content_in_group(group_shape):
    """Describe the direct members of a group shape.

    Returns a dict keyed ``shape_<index>`` with one entry per member of
    ``group_shape.shapes``. A member with a text frame is described by its
    ``'type'`` ("text"), its ``'location'`` as ``(left, top)``, and one
    ``paragraph_<index>`` entry for each paragraph that yields data. Any other
    member is recorded as an empty dict.
    """
    described = {}
    for position, member in enumerate(group_shape.shapes):
        key = f"shape_{position}"
        if not member.has_text_frame:
            described[key] = {}
            continue

        entry = {'type': "text", 'location': (member.left, member.top)}
        extracted = (
            (index, extract_paragraph_data(para))
            for index, para in enumerate(member.text_frame.paragraphs)
        )
        # Paragraphs without data are skipped; they still consume an index.
        entry.update(
            (f'paragraph_{index}', data) for index, data in extracted if data
        )
        described[key] = entry
    return described

def _emu_to_inches(emu):
    """Convert a length in EMU to inches; ``None`` is passed through."""
    if emu is None:
        return None
    # Inches(1) is the number of EMU in one inch, so dividing by it converts
    # the EMU offset into a float number of inches.
    return emu / Inches(1)


def _export_picture(shape, image_path):
    """Write the picture's embedded image to *image_path* (best effort)."""
    try:
        target_dir = os.path.dirname(image_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        with Image.open(io.BytesIO(shape.image.blob)) as picture_image:
            picture_image.save(image_path)
    except Exception:
        # A corrupt or unsupported image must not abort the whole extraction,
        # but the failure is logged instead of being silently dropped.
        logging.getLogger(__name__).warning(
            "Could not export picture to %s", image_path, exc_info=True
        )


def transfer_to_structure(pptx_file, images_dir_path):
    """Describe a presentation as JSON text and export its pictures.

    Args:
        pptx_file: Passed unchanged to ``Presentation``.
        images_dir_path: Directory that receives the exported pictures. It is
            created on demand when a picture is exported.

    Returns:
        A ``(json_text, image_path_list)`` tuple. ``json_text`` is the
        structure serialized by ``print_json``: one ``slide_<i>`` entry per
        slide, each holding one ``shape_<j>`` entry per shape. Text shapes
        carry their paragraphs, group shapes carry ``'group_content'``, and
        pictures carry their path, size, dpi and location; any other shape is
        an empty dict. ``image_path_list`` holds the target path of every
        picture encountered, in document order. A path is listed even when
        exporting that picture failed; the failure is logged.
    """
    item = {}
    prs = Presentation(pptx_file)
    image_path_list = []

    for i, slide in enumerate(prs.slides):
        slide_item = {}

        for j, shape in enumerate(slide.shapes):
            shape_item = {}

            # Case 1: Normal text box
            if shape.has_text_frame:
                shape_item['type'] = "text"
                for r, paragraph in enumerate(shape.text_frame.paragraphs):
                    data = extract_paragraph_data(paragraph)
                    if data:
                        shape_item[f'paragraph_{r}'] = data

            # Case 2: Grouped shapes
            elif isinstance(shape, GroupShape):
                shape_item['type'] = "group"
                shape_item['group_content'] = transfer_textbox_content_in_group(shape)

            # Case 3: Picture
            elif isinstance(shape, Picture):
                shape_item['type'] = "picture"
                # Shape indices restart on every slide, so the slide index is
                # part of the file name; otherwise pictures from different
                # slides would overwrite each other.
                image_path = os.path.join(
                    images_dir_path, f"slide_{i}_picture_{j}.png"
                )
                image_path_list.append(image_path)
                shape_item['image_path'] = image_path
                shape_item['size'] = shape.image.size
                shape_item['dpi'] = shape.image.dpi
                shape_item['location'] = (shape.left, shape.top)
                # shape.left / shape.top are EMU values; convert them rather
                # than re-wrapping them as if they were already inches.
                shape_item['location_inches'] = (
                    _emu_to_inches(shape.left),
                    _emu_to_inches(shape.top),
                )
                _export_picture(shape, image_path)

            slide_item[f"shape_{j}"] = shape_item

        item[f"slide_{i}"] = slide_item

    return print_json(item), image_path_list