g0th committed on
Commit
b55bd9b
·
verified ·
1 Parent(s): 8f42037

Update ppt_parser.py

Browse files
Files changed (1) hide show
  1. ppt_parser.py +41 -33
ppt_parser.py CHANGED
@@ -10,6 +10,35 @@ import io
10
  def print_json(item):
11
  return json.dumps(item, ensure_ascii=False, indent=4)
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  def transfer_textbox_content_in_group(group_shape):
14
  group_shape_item = {}
15
  for l, shape in enumerate(group_shape.shapes):
@@ -19,21 +48,9 @@ def transfer_textbox_content_in_group(group_shape):
19
  shape_item['location'] = (shape.left, shape.top)
20
  text_frame = shape.text_frame
21
  for r, paragraph in enumerate(text_frame.paragraphs):
22
- if paragraph.runs:
23
- original_run = paragraph.runs[0]
24
- paragraph_item = {
25
- 'text': paragraph.text,
26
- 'align': paragraph.alignment,
27
- 'font': {
28
- 'name': original_run.font.name,
29
- 'bold': original_run.font.bold,
30
- 'italic': original_run.font.italic,
31
- 'underline': original_run.font.underline,
32
- 'color': str(original_run.font.color.rgb),
33
- 'language_id': original_run.font.language_id,
34
- }
35
- }
36
- shape_item[f'paragraph_{r}'] = paragraph_item
37
  group_shape_item[f"shape_{l}"] = shape_item
38
  return group_shape_item
39
 
@@ -53,21 +70,9 @@ def transfer_to_structure(pptx_file, images_dir_path):
53
  shape_item['type'] = "text"
54
  text_frame = shape.text_frame
55
  for r, paragraph in enumerate(text_frame.paragraphs):
56
- if paragraph.runs:
57
- original_run = paragraph.runs[0]
58
- paragraph_item = {
59
- 'text': paragraph.text,
60
- 'align': paragraph.alignment,
61
- 'font': {
62
- 'name': original_run.font.name,
63
- 'bold': original_run.font.bold,
64
- 'italic': original_run.font.italic,
65
- 'underline': original_run.font.underline,
66
- 'color': str(original_run.font.color.rgb),
67
- 'language_id': original_run.font.language_id,
68
- }
69
- }
70
- shape_item[f'paragraph_{r}'] = paragraph_item
71
 
72
  # Case 2: Grouped shapes
73
  elif isinstance(shape, GroupShape):
@@ -84,9 +89,12 @@ def transfer_to_structure(pptx_file, images_dir_path):
84
  shape_item['dpi'] = shape.image.dpi
85
  shape_item['location'] = (shape.left, shape.top)
86
  shape_item['location_inches'] = (Inches(shape.left).inches, Inches(shape.top).inches)
87
- image_stream = io.BytesIO(shape.image.blob)
88
- shape_image = Image.open(image_stream)
89
- shape_image.save(image_path)
 
 
 
90
 
91
  slide_item[f"shape_{j}"] = shape_item
92
 
 
10
  def print_json(item):
11
  return json.dumps(item, ensure_ascii=False, indent=4)
12
 
13
+ def safe_font_attribute(run, attr):
14
+ try:
15
+ return getattr(run.font, attr)
16
+ except Exception:
17
+ return None
18
+
19
+ def safe_color(run):
20
+ try:
21
+ return str(run.font.color.rgb) if run.font.color and run.font.color.rgb else None
22
+ except Exception:
23
+ return None
24
+
25
+ def extract_paragraph_data(paragraph):
26
+ if not paragraph.runs:
27
+ return None
28
+ run = paragraph.runs[0]
29
+ return {
30
+ 'text': paragraph.text,
31
+ 'align': paragraph.alignment,
32
+ 'font': {
33
+ 'name': safe_font_attribute(run, 'name'),
34
+ 'bold': safe_font_attribute(run, 'bold'),
35
+ 'italic': safe_font_attribute(run, 'italic'),
36
+ 'underline': safe_font_attribute(run, 'underline'),
37
+ 'color': safe_color(run),
38
+ 'language_id': safe_font_attribute(run, 'language_id'),
39
+ }
40
+ }
41
+
42
  def transfer_textbox_content_in_group(group_shape):
43
  group_shape_item = {}
44
  for l, shape in enumerate(group_shape.shapes):
 
48
  shape_item['location'] = (shape.left, shape.top)
49
  text_frame = shape.text_frame
50
  for r, paragraph in enumerate(text_frame.paragraphs):
51
+ data = extract_paragraph_data(paragraph)
52
+ if data:
53
+ shape_item[f'paragraph_{r}'] = data
 
 
 
 
 
 
 
 
 
 
 
 
54
  group_shape_item[f"shape_{l}"] = shape_item
55
  return group_shape_item
56
 
 
70
  shape_item['type'] = "text"
71
  text_frame = shape.text_frame
72
  for r, paragraph in enumerate(text_frame.paragraphs):
73
+ data = extract_paragraph_data(paragraph)
74
+ if data:
75
+ shape_item[f'paragraph_{r}'] = data
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
  # Case 2: Grouped shapes
78
  elif isinstance(shape, GroupShape):
 
89
  shape_item['dpi'] = shape.image.dpi
90
  shape_item['location'] = (shape.left, shape.top)
91
  shape_item['location_inches'] = (Inches(shape.left).inches, Inches(shape.top).inches)
92
+ try:
93
+ image_stream = io.BytesIO(shape.image.blob)
94
+ shape_image = Image.open(image_stream)
95
+ shape_image.save(image_path)
96
+ except Exception:
97
+ pass # Corrupt or unsupported image
98
 
99
  slide_item[f"shape_{j}"] = shape_item
100