g0th committed on
Commit
b681c52
·
verified ·
1 Parent(s): f40afe0

Create ppt_parser.py

Browse files
Files changed (1) hide show
  1. ppt_parser.py +67 -0
ppt_parser.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pptx import Presentation
4
+ from pptx.util import Inches
5
+ from pptx.shapes.group import GroupShape
6
+ from pptx.shapes.picture import Picture
7
+ from PIL import Image
8
+ import io
9
+
10
def print_json(item):
    """Serialize *item* to a pretty-printed JSON string.

    Despite the name, nothing is written to stdout: the formatted text is
    returned to the caller. Non-ASCII characters are kept as-is rather than
    being escaped, and nested structures are indented by four spaces.
    """
    dump_options = {"ensure_ascii": False, "indent": 4}
    return json.dumps(item, **dump_options)
12
+
13
def transfer_textbox_content_in_group(group_shape):
    """Describe the text-bearing shapes of a group shape as nested dicts.

    Args:
        group_shape: Object exposing ``.shapes``, an iterable of child shapes
            (a python-pptx ``GroupShape`` in practice).

    Returns:
        A dict keyed ``"shape_<index>"``. Children without a text frame map to
        an empty dict. Children with a text frame map to a dict holding
        ``'type'``, ``'location'`` (``(left, top)`` as reported by the shape)
        and one ``'paragraph_<index>'`` entry per paragraph that has at least
        one run. Font attributes are taken from the first run of each
        paragraph only.
    """
    group_shape_item = {}
    for shape_index, shape in enumerate(group_shape.shapes):
        shape_item = {}
        if shape.has_text_frame:
            shape_item['type'] = "text"
            shape_item['location'] = (shape.left, shape.top)
            paragraphs = shape.text_frame.paragraphs
            for paragraph_index, paragraph in enumerate(paragraphs):
                # Paragraphs without runs carry no font information; skip them
                # while keeping the original paragraph numbering in the keys.
                if not paragraph.runs:
                    continue
                font = paragraph.runs[0].font
                # Theme-based or unset colours expose no usable ``.rgb``
                # (python-pptx raises AttributeError); report None for those
                # instead of aborting the whole parse.
                rgb = getattr(font.color, 'rgb', None)
                shape_item[f'paragraph_{paragraph_index}'] = {
                    'text': paragraph.text,
                    'align': paragraph.alignment,
                    'font': {
                        'name': font.name,
                        'bold': font.bold,
                        'italic': font.italic,
                        'underline': font.underline,
                        'color': str(rgb) if rgb is not None else None,
                        'language_id': font.language_id,
                    },
                }
        group_shape_item[f"shape_{shape_index}"] = shape_item
    return group_shape_item
39
+
40
def transfer_to_structure(pptx_file, images_dir_path):
    """Parse a presentation into a JSON description and export its pictures.

    Args:
        pptx_file: Path or file-like object accepted by ``Presentation``.
        images_dir_path: Directory that receives the exported pictures. It is
            created on demand when the first picture is written.

    Returns:
        A ``(json_text, image_path_list)`` tuple. ``json_text`` is the
        structure serialized by ``print_json``: one ``"slide_<i>"`` entry per
        slide, each holding one ``"shape_<j>"`` entry per shape. Group shapes
        carry their text content, pictures carry path/size/dpi/location, and
        any other shape is recorded as an empty dict. ``image_path_list``
        lists every picture file written, in document order.
    """
    item = {}
    prs = Presentation(pptx_file)
    image_path_list = []
    # Shape offsets are stored in EMU; Inches(1) is the EMU length of one
    # inch, so dividing by it converts an offset to inches.
    emu_per_inch = Inches(1)

    for i, slide in enumerate(prs.slides):
        slide_item = {}
        for j, shape in enumerate(slide.shapes):
            shape_item = {}
            if isinstance(shape, GroupShape):
                shape_item['type'] = "group"
                shape_item['group_content'] = transfer_textbox_content_in_group(shape)
            elif isinstance(shape, Picture):
                shape_item['type'] = "picture"
                # The slide index is part of the file name so that pictures
                # sharing a shape index on different slides do not overwrite
                # one another.
                image_path = os.path.join(
                    images_dir_path, f"slide_{i}_picture_{j}.png"
                )
                image_path_list.append(image_path)
                shape_item['image_path'] = image_path
                shape_item['size'] = shape.image.size
                shape_item['dpi'] = shape.image.dpi
                shape_item['location'] = (shape.left, shape.top)
                shape_item['location_inches'] = (
                    shape.left / emu_per_inch,
                    shape.top / emu_per_inch,
                )
                # An empty path means "current directory"; makedirs("")
                # would raise, so only create a named directory.
                if images_dir_path:
                    os.makedirs(images_dir_path, exist_ok=True)
                # Decode the embedded blob and re-encode it as PNG; the
                # context manager releases the decoder once saved.
                with Image.open(io.BytesIO(shape.image.blob)) as shape_image:
                    shape_image.save(image_path)
            slide_item[f"shape_{j}"] = shape_item
        item[f"slide_{i}"] = slide_item

    return print_json(item), image_path_list