microsoft/hve-core

Public

mirrored from https://github.com/microsoft/hve-core · Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
ci/884-codeql-python-analysis

Branches

Tags

  • No tags available.
0 Branches · 0 Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

.github/skills/experimental/powerpoint/scripts/extract_content.py

1101 lines · mode: code

1"""Extract content from an existing PPTX into YAML content and style definitions.
2
3Usage::
4
5 python extract_content.py \
6 --input existing-deck.pptx --output-dir content/
7
8 python extract_content.py \
9 --input existing-deck.pptx --output-dir content/ \
10 --slides 3,7,15
11"""
12
13import argparse
14from collections import Counter
15from pathlib import Path
16
17import yaml
18from lxml import etree
19from pptx import Presentation
20from pptx.oxml.ns import qn
21from pptx_charts import extract_chart
22from pptx_colors import extract_color, hex_brightness
23from pptx_fills import extract_effect_list, extract_fill, extract_line
24from pptx_fonts import (
25 extract_alignment,
26 extract_font_info,
27 extract_paragraph_font,
28 normalize_font_family,
29)
30from pptx_shapes import AUTO_SHAPE_NAME_MAP, extract_rotation
31from pptx_tables import extract_table
32from pptx_text import (
33 extract_bullet_properties,
34 extract_paragraph_properties,
35 extract_run_properties,
36 extract_text_frame_properties,
37)
38from pptx_utils import emu_to_inches
39
40
def extract_connector(shape) -> dict:
    """Build the element definition for a connector shape.

    Args:
        shape: Connector shape exposing ``begin_x``/``begin_y``/``end_x``/
            ``end_y`` and ``name``.

    Returns:
        Dict with ``type`` set to ``"connector"``, the four endpoint
        coordinates converted by ``emu_to_inches``, the shape name, and any
        keys reported by ``extract_line`` merged on top.
    """
    endpoints = {
        key: emu_to_inches(getattr(shape, key))
        for key in ("begin_x", "begin_y", "end_x", "end_y")
    }
    connector = {"type": "connector", **endpoints, "name": shape.name}

    stroke = extract_line(shape)
    if stroke:
        connector.update(stroke)
    return connector
55
56
def _is_freeform(shape) -> bool:
    """Return True when the shape's XML contains an ``a:custGeom`` element."""
    drawingml = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    custom_geometry = shape._element.find(".//a:custGeom", drawingml)
    return custom_geometry is not None
61
62
def _is_background_image(shape, slide_w: float, slide_h: float) -> bool:
    """Tell whether a shape is large enough to act as a slide background.

    Args:
        shape: Shape whose ``width`` and ``height`` are converted with
            ``emu_to_inches`` before comparison.
        slide_w: Slide width, in the unit ``emu_to_inches`` returns.
        slide_h: Slide height, in the unit ``emu_to_inches`` returns.

    Returns:
        True when both the width and the height reach at least 95% of the
        corresponding slide dimension. Position is not considered.
    """
    coverage = 0.95
    wide_enough = emu_to_inches(shape.width) >= slide_w * coverage
    tall_enough = emu_to_inches(shape.height) >= slide_h * coverage
    return wide_enough and tall_enough
71
72
def _save_image_blob(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Write an image shape's bytes under ``output_dir/images`` and report the path.

    Args:
        shape: Shape whose ``image`` attribute provides ``content_type`` and
            ``blob``.
        output_dir: Directory that receives the ``images`` sub-directory.
        slide_num: Accepted for signature parity; not used by this function.
        img_count: Number embedded (zero-padded to two digits) in the file name.

    Returns:
        ``{"path": "images/<file>"}`` for a saved image, or
        ``{"path": "LINKED_IMAGE_NOT_EMBEDDED"}`` when reading ``shape.image``
        raises ``ValueError``.
    """
    try:
        picture = shape.image
    except ValueError:
        return {"path": "LINKED_IMAGE_NOT_EMBEDDED"}

    # The file extension is the MIME subtype, with "jpeg" shortened to "jpg".
    subtype = picture.content_type.split("/")[-1]
    suffix = "jpg" if subtype == "jpeg" else subtype
    file_name = f"image-{img_count:02d}.{suffix}"

    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    (images_dir / file_name).write_bytes(picture.blob)

    return {"path": f"images/{file_name}"}
91
92
def extract_freeform(shape) -> dict:
    """Extract a freeform shape together with its custom-geometry paths.

    Args:
        shape: Shape whose XML (``shape._element``) may contain
            ``a:custGeom/a:pathLst/a:path`` elements.

    Returns:
        Dict with ``type`` ``"freeform"``, position/size from
        ``emu_to_inches``, ``name``, optional ``rotation`` and ``fill``, any
        keys from ``extract_line``, and ``paths``: a list (one entry per
        non-empty path) of command dicts. Point coordinates are copied as the
        raw integer attribute values, not converted to inches.
    """
    drawingml = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    # Segment tags that carry exactly one <a:pt>, mapped to the emitted command.
    single_point_cmds = {"moveTo": "moveTo", "lnTo": "lineTo"}

    def point(pt) -> dict:
        # NOTE(review): int() raises ValueError for non-numeric x/y values
        # (e.g. if a coordinate refers to a guide name) — confirm inputs.
        return {"x": int(pt.get("x", 0)), "y": int(pt.get("y", 0))}

    freeform = {
        "type": "freeform",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        freeform["rotation"] = rotation

    # Fill lookup is best-effort: shapes without a usable fill are skipped.
    try:
        fill = extract_fill(shape.fill)
    except (AttributeError, TypeError):
        fill = None
    if fill is not None:
        freeform["fill"] = fill

    stroke = extract_line(shape)
    if stroke:
        freeform.update(stroke)

    paths = []
    for path_el in shape._element.findall(".//a:custGeom/a:pathLst/a:path", drawingml):
        commands = []
        for segment in path_el:
            # Strip the "{namespace}" prefix to get the local tag name.
            tag = segment.tag.split("}")[-1] if "}" in segment.tag else segment.tag
            if tag in single_point_cmds:
                pt = segment.find("a:pt", drawingml)
                if pt is not None:
                    commands.append({"cmd": single_point_cmds[tag], **point(pt)})
            elif tag == "cubicBezTo":
                control_points = segment.findall("a:pt", drawingml)
                commands.append(
                    {"cmd": "cubicBezTo", "pts": [point(p) for p in control_points]}
                )
            elif tag == "close":
                commands.append({"cmd": "close"})
            # Any other segment tag is ignored.
        if commands:
            paths.append(commands)

    if paths:
        freeform["paths"] = paths

    return freeform
167
168
def extract_group(shape, slide_num: int, output_dir, img_count: int) -> dict:
    """Extract a group shape and the elements nested inside it.

    Args:
        shape: Group shape exposing ``shapes`` plus position/size/name.
        slide_num: Slide number forwarded to ``extract_child_shape``.
        output_dir: Output directory forwarded to ``extract_child_shape``.
        img_count: Image counter forwarded unchanged for every child.

    Returns:
        Dict with ``type`` ``"group"``, geometry, ``name`` and ``elements``
        (the truthy results of ``extract_child_shape`` in child order).
    """
    # NOTE(review): every child receives the same img_count, so several
    # pictures inside one group appear to map to the same image file name —
    # confirm whether that is intended.
    group = {
        "type": "group",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
        "elements": [],
    }
    children = group["elements"]
    for member in shape.shapes:
        extracted = extract_child_shape(member, slide_num, output_dir, img_count)
        if extracted:
            children.append(extracted)
    return group
185
186
def _extract_shape_by_type(
    shape, slide_num: int, output_dir, img_count: int
) -> dict | None:
    """Route a shape to the extractor that matches its kind.

    The numeric ``shape_type`` is checked first; shapes that match none of
    the known codes are then probed for a table, a chart, and finally custom
    (freeform) geometry.

    Args:
        shape: Shape to extract.
        slide_num: Slide number, passed to the image and group extractors.
        output_dir: Output directory, passed to the image and group extractors.
        img_count: Image counter, passed to the image and group extractors.

    Returns:
        The element dict produced by the matching extractor, or None when no
        extractor applies.
    """
    kind = shape.shape_type

    # Extractors that only need the shape itself.
    if kind == 17:  # TEXT_BOX
        return extract_textbox(shape)
    if kind == 1:  # AUTO_SHAPE
        return extract_shape(shape)
    if kind == 9:  # LINE / CONNECTOR
        return extract_connector(shape)

    # Extractors that also need the output location and image counter.
    if kind == 13:  # PICTURE
        return extract_image(shape, output_dir, slide_num, img_count)
    if kind == 6:  # GROUP
        return extract_group(shape, slide_num, output_dir, img_count)

    # Tables and charts are detected via attributes; absent attributes count
    # as "not a table/chart".
    if getattr(shape, "has_table", False):
        return extract_table(shape)
    if getattr(shape, "has_chart", False):
        return extract_chart(shape)

    if _is_freeform(shape):
        return extract_freeform(shape)

    return None
217
218
def extract_child_shape(
    shape, slide_num: int, output_dir, img_count: int
) -> dict | None:
    """Extract one shape nested inside a group.

    Recognized shapes are delegated to ``_extract_shape_by_type``. Anything
    else is emitted as a generic rectangle so the element is not lost; its
    original ``shape_type`` (when not None) is recorded under
    ``_unrecognized_shape_type``.

    Returns:
        The extracted element dict. With the fallback in place this function
        always returns a dict.
    """
    recognized = _extract_shape_by_type(shape, slide_num, output_dir, img_count)
    if recognized is not None:
        return recognized

    stand_in = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }
    raw_type = shape.shape_type
    if raw_type is not None:
        stand_in["_unrecognized_shape_type"] = int(raw_type)
    return stand_in
240
241
def _has_formatting_variation(runs: list) -> bool:
    """Report whether the runs of a paragraph differ in formatting.

    ``font``, ``size`` and ``color`` are compared only across runs that
    define the key; ``bold``, ``italic`` and ``underline`` are compared
    across all runs, with a missing key treated as False.

    Args:
        runs: List of run dicts.

    Returns:
        True when any compared property takes more than one distinct value.
        Zero or one run never counts as variation.
    """
    if len(runs) < 2:
        return False

    # Count distinct values per property (all six are always evaluated).
    distinct = [
        len({run[key] for run in runs if key in run})
        for key in ("font", "size", "color")
    ]
    distinct += [
        len({run.get(flag, False) for run in runs})
        for flag in ("bold", "italic", "underline")
    ]
    return any(count > 1 for count in distinct)
260
261
# Canonical formatting key -> key name written to the YAML output.
# Shapes prefix their text formatting with "text_"; text boxes use "font*".
_SHAPE_EXTRACT_KEYS = {
    canonical: f"text_{canonical}" for canonical in ("font", "size", "color", "bold")
}
_TEXTBOX_EXTRACT_KEYS = {
    "font": "font",
    **{canonical: f"font_{canonical}" for canonical in ("size", "color", "bold")},
}

# Keys copied from the first non-empty paragraph up to the element level:
# the mapped font/size/color/bold names followed by the shared extras.
_SHAPE_PROMOTE_KEYS = (
    *_SHAPE_EXTRACT_KEYS.values(),
    "italic",
    "alignment",
    "char_spacing",
)
_TEXTBOX_PROMOTE_KEYS = (
    *_TEXTBOX_EXTRACT_KEYS.values(),
    "italic",
    "alignment",
    "char_spacing",
)
295
296
def _extract_text_content(text_frame, keys: dict, promote_keys: tuple) -> dict:
    """Turn a text frame into the text-related part of an element dict.

    Each paragraph becomes a dict holding its text plus formatting merged
    from the paragraph and from the first run that reports any formatting
    (run values win). Paragraphs whose runs differ in formatting also carry
    the per-run list under ``runs``.

    Args:
        text_frame: python-pptx TextFrame object.
        keys: Mapping of the canonical ``font``/``size``/``color``/``bold``
            keys to the key names used in the output.
        promote_keys: Keys copied from the first non-empty paragraph to the
            element level when a ``paragraphs`` list is emitted.

    Returns:
        Empty dict when the frame has no text after stripping. Otherwise a
        dict with ``text``, any text-frame properties, and either:
        ``paragraphs`` plus promoted keys (several paragraphs, or any
        paragraph with ``runs``), or every non-``text`` key of the single
        non-empty paragraph.
    """
    fragment = {}
    stripped = text_frame.text.strip()
    if not stripped:
        return fragment
    fragment["text"] = stripped

    frame_props = extract_text_frame_properties(text_frame)
    if frame_props:
        fragment.update(frame_props)

    paragraphs = []
    for para in text_frame.paragraphs:
        runs = []
        lead_run = {}
        for run in para.runs:
            formatting = {**extract_font_info(run.font), **extract_run_properties(run)}
            runs.append({"text": run.text, **formatting})
            # Remember the first run that carries any formatting at all.
            if not lead_run:
                lead_run = formatting

        para_font = extract_paragraph_font(para)
        spacing = extract_paragraph_properties(para)
        bullets = extract_bullet_properties(para)
        alignment = extract_alignment(para)
        merged = {**para_font, **lead_run}

        # Insertion order below defines the key order of the emitted YAML.
        entry = {"text": para.text}
        for canonical in ("font", "size", "color"):
            if canonical in merged:
                entry[keys[canonical]] = merged[canonical]
        if merged.get("bold"):
            entry[keys["bold"]] = True
        for flag in ("italic", "underline"):
            if merged.get(flag):
                entry[flag] = True
        if merged.get("hyperlink"):
            entry["hyperlink"] = merged["hyperlink"]
        if "char_spacing" in merged:
            entry["char_spacing"] = merged["char_spacing"]
        if "effect" in merged:
            entry["text_effect"] = merged["effect"]
        if alignment:
            entry["alignment"] = alignment
        if spacing:
            entry.update(spacing)
        if bullets:
            entry.update(bullets)
        if _has_formatting_variation(runs):
            entry["runs"] = runs
        paragraphs.append(entry)

    with_text = [p for p in paragraphs if p["text"].strip()]
    needs_paragraph_list = len(paragraphs) > 1 or any("runs" in p for p in paragraphs)

    if needs_paragraph_list:
        fragment["paragraphs"] = paragraphs
    if with_text:
        first = with_text[0]
        if needs_paragraph_list:
            # Rich content: surface only the agreed defaults at element level.
            promoted = {key: first[key] for key in promote_keys if key in first}
        else:
            # Single plain paragraph: flatten all of its formatting.
            promoted = {key: val for key, val in first.items() if key != "text"}
        fragment.update(promoted)

    return fragment
384
385
def extract_shape(shape) -> dict:
    """Build the element definition for an auto shape.

    Args:
        shape: Shape to describe.

    Returns:
        Dict with ``type`` ``"shape"``, the shape name looked up in
        ``AUTO_SHAPE_NAME_MAP`` (``"rectangle"`` when unknown or unavailable),
        geometry, ``name``, and — when available — ``rotation``,
        ``corner_radius``, ``fill``, line keys, ``effect`` and the text
        content produced by ``_extract_text_content``.
    """
    definition = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        definition["rotation"] = rotation

    # Map the auto_shape_type enum to its YAML name, defaulting to a rectangle.
    try:
        shape_name = AUTO_SHAPE_NAME_MAP.get(shape.auto_shape_type, "rectangle")
    except (AttributeError, TypeError):
        shape_name = "rectangle"
    definition["shape"] = shape_name

    # The first adjustment value is stored as the corner radius (meaningful
    # for rounded rectangles); shapes without adjustments are skipped.
    try:
        adjustments = shape.adjustments
        if adjustments and len(adjustments) > 0:
            definition["corner_radius"] = round(adjustments[0], 5)
    except (AttributeError, TypeError, IndexError):
        pass

    # Fill is best-effort.
    try:
        fill = extract_fill(shape.fill)
    except (AttributeError, TypeError):
        fill = None
    if fill is not None:
        definition["fill"] = fill

    stroke = extract_line(shape)
    if stroke:
        definition.update(stroke)

    # Effect list (outer shadow).
    effect = extract_effect_list(shape)
    if effect:
        definition["effect"] = effect

    if shape.has_text_frame:
        definition.update(
            _extract_text_content(
                shape.text_frame, _SHAPE_EXTRACT_KEYS, _SHAPE_PROMOTE_KEYS
            )
        )

    return definition
441
442
def extract_textbox(shape) -> dict:
    """Build the element definition for a text box.

    Args:
        shape: Shape to describe.

    Returns:
        Dict with ``type`` ``"textbox"``, geometry, the stripped frame text
        (empty string when there is no text frame), ``name``, an optional
        ``rotation``, and the text content keys from
        ``_extract_text_content`` using the text-box key names.
    """
    has_frame = shape.has_text_frame
    textbox = {
        "type": "textbox",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "text": shape.text_frame.text.strip() if has_frame else "",
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        textbox["rotation"] = rotation

    if shape.has_text_frame:
        textbox.update(
            _extract_text_content(
                shape.text_frame, _TEXTBOX_EXTRACT_KEYS, _TEXTBOX_PROMOTE_KEYS
            )
        )

    return textbox
466
467
def extract_image(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Extract a picture element and write its image file to disk.

    Args:
        shape: Picture shape.
        output_dir: Directory that receives the ``images`` sub-directory.
        slide_num: Accepted for signature parity; not used by this function.
        img_count: Number embedded (zero-padded to two digits) in the file name.

    Returns:
        Dict with ``type`` ``"image"``, ``path``, geometry, ``name`` and an
        optional ``rotation``. Embedded images may additionally carry
        ``blip_fill_attrs``, ``crop`` and ``opacity``. When ``shape.image``
        raises ``ValueError`` no file is written; ``path`` is
        ``"LINKED_IMAGE_NOT_EMBEDDED"`` and a ``_note`` explains why.
    """

    def base_element(path: str) -> dict:
        return {
            "type": "image",
            "path": path,
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
        }

    def add_rotation(element: dict) -> dict:
        rotation = extract_rotation(shape)
        if rotation is not None:
            element["rotation"] = rotation
        return element

    try:
        picture = shape.image
    except ValueError:
        # Linked images have no embedded blob.
        linked = base_element("LINKED_IMAGE_NOT_EMBEDDED")
        linked["_note"] = "Image was linked, not embedded in the PPTX"
        return add_rotation(linked)

    # The file extension is the MIME subtype, with "jpeg" shortened to "jpg".
    subtype = picture.content_type.split("/")[-1]
    suffix = "jpg" if subtype == "jpeg" else subtype
    file_name = f"image-{img_count:02d}.{suffix}"

    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    (images_dir / file_name).write_bytes(picture.blob)

    image_elem = add_rotation(base_element(f"images/{file_name}"))

    blip_fill = shape._element.find(qn("p:blipFill"))
    if blip_fill is not None:
        # Preserve selected blipFill attributes verbatim.
        kept_attrs = {
            attr: blip_fill.get(attr)
            for attr in ("rotWithShape", "dpi")
            if blip_fill.get(attr) is not None
        }
        if kept_attrs:
            image_elem["blip_fill_attrs"] = kept_attrs

        # Crop values come from the srcRect l/t/r/b attributes, as integers.
        src_rect = blip_fill.find(qn("a:srcRect"))
        if src_rect is not None and src_rect.attrib:
            crop = {
                side: int(src_rect.get(side))
                for side in ("l", "t", "r", "b")
                if src_rect.get(side) is not None
            }
            if crop:
                image_elem["crop"] = crop

    # Opacity comes from alphaModFix on the blip; amt is divided by 1000.
    blip = shape._element.find(".//" + qn("a:blip"))
    if blip is not None:
        alpha_fix = blip.find(qn("a:alphaModFix"))
        if alpha_fix is not None:
            amount = int(alpha_fix.get("amt", "100000"))
            image_elem["opacity"] = round(amount / 1000, 1)

    return image_elem
544
545
def detect_global_style(prs) -> dict:
    """Analyze the presentation to detect common styling patterns.

    Walks every slide once, counting background, fill, accent and text
    colors plus font names and sizes, and records a per-slide profile that is
    later clustered into light/dark themes.

    Args:
        prs: python-pptx Presentation object.

    Returns:
        Style dict with ``dimensions``, ``defaults`` and ``typography``, plus
        ``colors``, ``themes`` and ``metadata`` when any were detected.
    """
    bg_colors = Counter()
    text_colors = Counter()
    accent_colors = Counter()
    fill_colors = Counter()
    font_names = Counter()
    font_sizes = Counter()

    # Per-slide analysis for theme clustering
    slide_profiles = []

    slide_w = emu_to_inches(prs.slide_width)
    slide_h = emu_to_inches(prs.slide_height)

    for slide_num, slide in enumerate(prs.slides, start=1):
        slide_bg = None
        slide_text_colors = Counter()
        slide_fill_colors = Counter()
        has_bg_image = False

        # Only plain string fills are counted as a background color.
        try:
            fill_result = extract_fill(slide.background.fill)
            if isinstance(fill_result, str):
                bg_colors[fill_result] += 1
                slide_bg = fill_result
        except (AttributeError, TypeError):
            pass

        for i, shape in enumerate(slide.shapes):
            # A full-slide picture counts as a background image only when it
            # is the first shape on the slide; it is excluded from the stats.
            if (
                i == 0
                and shape.shape_type == 13
                and _is_background_image(shape, slide_w, slide_h)
            ):
                has_bg_image = True
                continue

            # Collect fill colors; very thin shapes (< 0.1 high) are treated
            # as accent bars rather than cards.
            try:
                fill_result = extract_fill(shape.fill)
                if isinstance(fill_result, str):
                    if emu_to_inches(shape.height) < 0.1:
                        accent_colors[fill_result] += 1
                    else:
                        fill_colors[fill_result] += 1
                        slide_fill_colors[fill_result] += 1
            except (AttributeError, TypeError):
                pass

            # Collect font information
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    for run in para.runs:
                        if run.font.name:
                            font_names[normalize_font_family(run.font.name)] += 1
                        if run.font.size:
                            font_sizes[int(run.font.size.pt)] += 1
                        try:
                            color = extract_color(run.font.color)
                            # Only "#..." colors are counted.
                            if isinstance(color, str) and color.startswith("#"):
                                text_colors[color] += 1
                                slide_text_colors[color] += 1
                        except (AttributeError, TypeError):
                            pass

        slide_profiles.append(
            {
                "slide": slide_num,
                "bg_color": slide_bg,
                "bg_brightness": _classify_slide_brightness(
                    slide_bg, slide_text_colors, has_bg_image
                ),
                "has_bg_image": has_bg_image,
                "text_colors": dict(slide_text_colors),
                "fill_colors": dict(slide_fill_colors),
            }
        )

    # Build global color map from frequency analysis
    colors = _build_color_map(bg_colors, fill_colors, text_colors, accent_colors)

    # Detect themes by clustering slides into light/dark groups
    themes = _cluster_themes(slide_profiles, text_colors, fill_colors, accent_colors)

    body_font, code_font = _select_primary_fonts(font_names)

    # Sizes are picked from the sorted *distinct* sizes between 8 and 60
    # (exclusive): the middle one for body text, the 85% position for headings.
    heading_size = 28
    body_size = 16
    sorted_sizes = sorted(size for size in font_sizes if 8 < size < 60)
    if sorted_sizes:
        body_size = sorted_sizes[len(sorted_sizes) // 2]
        heading_size = sorted_sizes[int(len(sorted_sizes) * 0.85)]

    style = {
        "dimensions": {
            "width_inches": slide_w,
            "height_inches": slide_h,
            # NOTE(review): the format label is fixed and not derived from
            # the measured dimensions — confirm for non-16:9 decks.
            "format": "16:9",
        },
        "defaults": {
            "speaker_notes_required": True,
        },
        "typography": {
            "body_font": body_font,
            "code_font": code_font,
            "heading_size": heading_size,
            "body_size": body_size,
        },
    }

    if colors:
        style["colors"] = colors

    if themes:
        style["themes"] = themes

    # Extract presentation metadata (only non-empty properties are kept).
    props = prs.core_properties
    metadata = {}
    for attr in ("title", "author", "subject", "keywords", "description", "category"):
        val = getattr(props, attr, None)
        if val:
            metadata[attr] = val
    if metadata:
        style["metadata"] = metadata

    return style


def _select_primary_fonts(font_names: Counter) -> tuple[str, str]:
    """Pick the most common body font and the most common code font.

    A font counts as a code font when its lower-cased name contains one of
    the monospace keywords. Each category is resolved independently, so a
    code font is found even when it is used less often than the body font.

    Args:
        font_names: Counter of normalized font family names.

    Returns:
        ``(body_font, code_font)``; ``"Segoe UI"`` / ``"Cascadia Code"`` are
        used for a category with no match.
    """
    code_keywords = ("cascadia", "consolas", "mono", "courier")
    body_font = None
    code_font = None
    for family, _count in font_names.most_common():
        lowered = family.lower()
        if any(keyword in lowered for keyword in code_keywords):
            if code_font is None:
                code_font = family
        elif body_font is None:
            body_font = family
        if body_font is not None and code_font is not None:
            break
    return body_font or "Segoe UI", code_font or "Cascadia Code"
696
697
698def _classify_slide_brightness(
699 bg_color: str | None, text_colors: Counter, has_bg_image: bool
700) -> str:
701 """Classify a slide as 'light' or 'dark' based on background and text colors."""
702 if has_bg_image and bg_color is None:
703 # Slides with background images and no solid bg — infer from text colors
704 dark_text = sum(
705 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100
706 )
707 light_text = sum(
708 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150
709 )
710 return "light" if dark_text >= light_text else "dark"
711
712 if bg_color and isinstance(bg_color, str) and bg_color.startswith("#"):
713 return "light" if hex_brightness(bg_color) > 128 else "dark"
714
715 # Default: infer from text colors
716 dark_text = sum(
717 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100
718 )
719 light_text = sum(
720 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150
721 )
722 if dark_text > light_text:
723 return "light"
724 if light_text > dark_text:
725 return "dark"
726 return "dark"
727
728
def _build_color_map(
    bg_colors: Counter,
    fill_colors: Counter,
    text_colors: Counter,
    accent_colors: Counter,
) -> dict:
    """Derive the global named-color map from color frequencies.

    Args:
        bg_colors: Counter of slide background colors.
        fill_colors: Counter of shape fill colors.
        text_colors: Counter of text color hex strings.
        accent_colors: Counter of accent fill colors.

    Returns:
        Dict that may contain ``bg_dark`` and ``bg_card`` (most common
        background / fill), ``text_white`` / ``text_dark`` / ``text_gray``
        (first of the five most common text colors whose brightness is above
        200 / below 80 / in between), and up to three ``accent_*`` entries in
        frequency order.
    """
    palette = {}
    if bg_colors:
        palette["bg_dark"] = bg_colors.most_common(1)[0][0]
    if fill_colors:
        palette["bg_card"] = fill_colors.most_common(1)[0][0]

    for color_hex, _count in text_colors.most_common(5):
        level = hex_brightness(color_hex)
        if level > 200:
            role = "text_white"
        elif level < 80:
            role = "text_dark"
        elif 80 <= level <= 200:
            role = "text_gray"
        else:
            continue
        # The most frequent color in each brightness band wins.
        palette.setdefault(role, color_hex)

    accent_roles = ("accent_blue", "accent_teal", "accent_green")
    for role, (color_hex, _count) in zip(accent_roles, accent_colors.most_common(3)):
        palette[role] = color_hex

    return palette
757
758
def _cluster_themes(
    slide_profiles: list[dict],
    text_colors: Counter,
    fill_colors: Counter,
    accent_colors: Counter,
) -> list[dict]:
    """Group slides into a light and a dark theme by their brightness label.

    Args:
        slide_profiles: Per-slide dicts with ``slide``, ``bg_color``,
            ``bg_brightness``, ``text_colors`` and ``fill_colors``.
        text_colors: Not used by this function.
        fill_colors: Not used by this function.
        accent_colors: Not used by this function.

    Returns:
        Empty list unless both light and dark slides exist. Otherwise two
        theme dicts (light first, then dark), each with ``name``, the sorted
        ``slides`` numbers and a ``colors`` map.
    """
    by_label = {
        label: [p for p in slide_profiles if p["bg_brightness"] == label]
        for label in ("light", "dark")
    }
    light_slides, dark_slides = by_label["light"], by_label["dark"]

    # Only produce themes when both light and dark groups exist
    if not (light_slides and dark_slides):
        return []

    def tally(profiles: list[dict], field: str) -> Counter:
        totals = Counter()
        for profile in profiles:
            totals.update(profile[field])
        return totals

    def assign_text_roles(target: dict, text_tally: Counter, is_primary) -> None:
        # Among the five most common text colors, the first one in the
        # primary brightness band and the first mid-brightness one are kept.
        for color_hex, _count in text_tally.most_common(5):
            level = hex_brightness(color_hex)
            if is_primary(level) and "text_primary" not in target:
                target["text_primary"] = color_hex
            elif 80 <= level <= 200 and "text_secondary" not in target:
                target["text_secondary"] = color_hex

    # Light theme: primary text is dark (brightness below 80).
    light_palette = {}
    assign_text_roles(
        light_palette, tally(light_slides, "text_colors"), lambda level: level < 80
    )
    light_fills = tally(light_slides, "fill_colors")
    if light_fills:
        light_palette["bg_card"] = light_fills.most_common(1)[0][0]

    # Dark theme: most common solid background first, then bright primary
    # text (brightness above 200).
    dark_palette = {}
    dark_backgrounds = Counter(p["bg_color"] for p in dark_slides if p["bg_color"])
    if dark_backgrounds:
        dark_palette["bg_dark"] = dark_backgrounds.most_common(1)[0][0]
    assign_text_roles(
        dark_palette, tally(dark_slides, "text_colors"), lambda level: level > 200
    )
    dark_fills = tally(dark_slides, "fill_colors")
    if dark_fills:
        dark_palette["bg_card"] = dark_fills.most_common(1)[0][0]

    return [
        {
            "name": "light",
            "slides": sorted(p["slide"] for p in light_slides),
            "colors": light_palette,
        },
        {
            "name": "dark",
            "slides": sorted(p["slide"] for p in dark_slides),
            "colors": dark_palette,
        },
    ]
831
832
def extract_slide(
    slide,
    slide_num: int,
    output_dir: Path,
    slide_dims: tuple[float, float] | None = None,
) -> tuple[dict, Path]:
    """Extract all elements from a slide into a content.yaml structure.

    Creates ``output_dir/slide-NNN`` and extracts every shape in z-order.
    Pictures are written below that directory. Shapes no extractor
    recognizes are emitted as generic rectangles carrying
    ``_unrecognized_shape_type``.

    Args:
        slide: python-pptx slide object.
        slide_num: 1-based slide number, used for the directory name and the
            ``slide`` key.
        output_dir: Root content directory.
        slide_dims: Slide dimensions; accepted but not used by this function.

    Returns:
        ``(content, slide_dir)``: the content dict (``slide``, ``title``,
        ``elements`` and optional ``layout``, ``background``,
        ``speaker_notes``) and the directory created for this slide.
    """
    slide_dir = output_dir / f"slide-{slide_num:03d}"
    slide_dir.mkdir(parents=True, exist_ok=True)

    content = {
        "slide": slide_num,
        "title": "",
        "elements": [],
    }

    # Extract layout name
    try:
        layout_name = slide.slide_layout.name
        if layout_name:
            content["layout"] = layout_name
    except (AttributeError, TypeError):
        pass

    # Extract slide background (only when the slide overrides its master)
    try:
        if not slide.follow_master_background:
            fill_result = extract_fill(slide.background.fill)
            if fill_result is not None:
                content["background"] = {"fill": fill_result}
    except (AttributeError, TypeError):
        pass

    # Extract speaker notes (include empty string when notes slide exists)
    try:
        if slide.has_notes_slide:
            content["speaker_notes"] = slide.notes_slide.notes_text_frame.text.strip()
    except (AttributeError, TypeError):
        pass

    elements = content["elements"]
    img_count = 0

    for z_index, shape in enumerate(slide.shapes):
        shape_type = shape.shape_type

        # Track image count for filename generation
        if shape_type == 13:
            img_count += 1

        # Placeholders are extracted as marked text boxes; placeholders
        # without a text frame are dropped.
        if shape_type == 14:
            if not shape.has_text_frame:
                continue
            elem = extract_textbox(shape)
            elem["_placeholder"] = True
            elem["z_order"] = z_index
            elements.append(elem)
            continue

        # Use shared dispatcher for all other shape types
        elem = _extract_shape_by_type(shape, slide_num, slide_dir, img_count)
        if elem is not None:
            elem["z_order"] = z_index
            elements.append(elem)

            # The first short text box within the top 1.5 of the slide
            # becomes the slide title.
            if (
                shape_type == 17
                and not content["title"]
                and emu_to_inches(shape.top) < 1.5
            ):
                text = shape.text_frame.text.strip() if shape.has_text_frame else ""
                if text and len(text) < 100:
                    content["title"] = text
            continue

        # Fallback for unrecognized shape types
        fallback = {
            "type": "shape",
            "shape": "rectangle",
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
            "z_order": z_index,
        }
        if shape_type is not None:
            fallback["_unrecognized_shape_type"] = int(shape_type)
        elements.append(fallback)

    return content, slide_dir
926
927
def _resolve_theme_colors(prs) -> dict:
    """Read the theme color scheme of the first slide master.

    The theme part related to the first slide master is parsed with lxml and
    each known ``clrScheme`` child is mapped to a ``#``-prefixed value taken
    from ``srgbClr@val`` or, failing that, ``sysClr@lastClr``. The dark/light
    entries are additionally exposed under their ``text_*`` /
    ``background_*`` aliases.

    Args:
        prs: python-pptx Presentation object.

    Returns:
        Mapping of theme color name to hex string. Empty (or partial) when
        the theme cannot be read; ``AttributeError``, ``TypeError`` and
        ``IndexError`` are swallowed.
    """
    ns_a = "http://schemas.openxmlformats.org/drawingml/2006/main"
    scheme_names = {
        "dk1": "dark_1",
        "dk2": "dark_2",
        "lt1": "light_1",
        "lt2": "light_2",
        "accent1": "accent_1",
        "accent2": "accent_2",
        "accent3": "accent_3",
        "accent4": "accent_4",
        "accent5": "accent_5",
        "accent6": "accent_6",
        "hlink": "hyperlink",
        "folHlink": "followed_hyperlink",
    }
    # Map canonical aliases
    aliases = {
        "dark_1": "text_1",
        "dark_2": "text_2",
        "light_1": "background_1",
        "light_2": "background_2",
    }

    resolved = {}
    try:
        master = prs.slide_masters[0]
        # The theme is a related generic part, so its blob is parsed directly;
        # only the first relationship whose type mentions "theme" is used.
        theme_root = next(
            (
                etree.fromstring(rel.target_part.blob)
                for rel in master.part.rels.values()
                if "theme" in rel.reltype
            ),
            None,
        )
        if theme_root is None:
            return resolved

        scheme = theme_root.find(f".//{{{ns_a}}}clrScheme")
        if scheme is None:
            return resolved

        for entry in scheme:
            tag = entry.tag.split("}")[-1] if "}" in entry.tag else entry.tag
            theme_name = scheme_names.get(tag)
            if theme_name is None:
                continue

            # Prefer an explicit sRGB value, then the system color's lastClr.
            srgb = entry.find(f"{{{ns_a}}}srgbClr")
            if srgb is not None:
                resolved[theme_name] = f"#{srgb.get('val', '000000')}"
            else:
                sys_clr = entry.find(f"{{{ns_a}}}sysClr")
                if sys_clr is not None:
                    resolved[theme_name] = f"#{sys_clr.get('lastClr', '000000')}"

            alias = aliases.get(theme_name)
            if alias is not None and theme_name in resolved:
                resolved[alias] = resolved[theme_name]
    except (AttributeError, TypeError, IndexError):
        pass
    return resolved
993
994
def _resolve_theme_refs_in_content(content: dict, theme_colors: dict) -> dict:
    """Return a copy of ``content`` with ``@name`` strings resolved.

    Dicts and lists are rebuilt recursively. A string starting with ``@`` is
    replaced by ``theme_colors[name]`` when that name is known and kept
    unchanged otherwise. Every other value is returned as-is.

    Args:
        content: Nested structure of dicts, lists and scalars.
        theme_colors: Mapping of theme color name to replacement value.

    Returns:
        The resolved structure; the input containers are not modified.
    """
    if isinstance(content, dict):
        return {
            key: _resolve_theme_refs_in_content(value, theme_colors)
            for key, value in content.items()
        }
    if isinstance(content, list):
        return [_resolve_theme_refs_in_content(item, theme_colors) for item in content]
    if isinstance(content, str) and content.startswith("@"):
        return theme_colors.get(content[1:], content)
    return content
1009
1010
def _write_yaml(path: Path, data: dict) -> None:
    """Dump ``data`` to ``path`` as block-style UTF-8 YAML, keeping key order."""
    with open(path, "w", encoding="utf-8") as f:
        yaml.dump(
            data,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )


def main():
    """CLI entry point for extracting PPTX content into YAML.

    Writes ``global/style.yaml`` and one ``slide-NNN/content.yaml`` per
    extracted slide below ``--output-dir``. ``--slides`` limits extraction to
    the listed 1-based slide numbers; ``--resolve-themes`` replaces
    ``@theme`` references with the hex values found in the deck's theme.

    Raises:
        ValueError: If ``--slides`` contains a token that is not an integer.
    """
    parser = argparse.ArgumentParser(
        description="Extract content from a PPTX into YAML"
    )
    parser.add_argument("--input", required=True, help="Input PPTX file path")
    parser.add_argument("--output-dir", required=True, help="Output content directory")
    parser.add_argument(
        "--slides", help="Comma-separated slide numbers to extract (default: all)"
    )
    parser.add_argument(
        "--resolve-themes",
        action="store_true",
        help="Resolve @theme references to actual hex RGB values from the deck's theme",
    )
    args = parser.parse_args()

    pptx_path = Path(args.input)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    slide_filter = None
    if args.slides:
        # Blank tokens (e.g. from a trailing comma) are skipped instead of
        # crashing on int("").
        slide_filter = {
            int(token) for token in args.slides.split(",") if token.strip()
        } or None

    prs = Presentation(str(pptx_path))
    print(f"Extracting from: {pptx_path}")
    print(f"Slides: {len(prs.slides)}")
    w = emu_to_inches(prs.slide_width)
    h = emu_to_inches(prs.slide_height)
    print(f'Dimensions: {w}" x {h}"')

    # Detect and save global style
    global_style = detect_global_style(prs)

    # Resolve theme colors when requested
    theme_colors = {}
    if args.resolve_themes:
        theme_colors = _resolve_theme_colors(prs)
        if theme_colors:
            global_style["theme_colors"] = theme_colors
            global_style = _resolve_theme_refs_in_content(global_style, theme_colors)
            print(f"Resolved {len(theme_colors)} theme colors")

    global_dir = output_dir / "global"
    global_dir.mkdir(parents=True, exist_ok=True)
    style_path = global_dir / "style.yaml"
    _write_yaml(style_path, global_style)
    print(f"Global style saved to {style_path}")

    # Extract slides (filtered or all)
    slide_dims = (w, h)
    extracted = 0
    for slide_num, slide in enumerate(prs.slides, start=1):
        if slide_filter and slide_num not in slide_filter:
            continue
        content, slide_dir = extract_slide(
            slide, slide_num, output_dir, slide_dims=slide_dims
        )

        # Resolve @theme references to hex values when --resolve-themes is set
        if args.resolve_themes and theme_colors:
            content = _resolve_theme_refs_in_content(content, theme_colors)

        content_path = slide_dir / "content.yaml"
        _write_yaml(content_path, content)

        # "title" is always present (possibly empty), so fall back on
        # emptiness rather than on a missing key.
        title = content.get("title") or "Untitled"
        print(f"Slide {slide_num}: {title} -> {content_path}")
        extracted += 1

    print(f"\nExtraction complete. {extracted} slide(s) extracted to {output_dir}")


if __name__ == "__main__":
    main()
1102