microsoft/hve-core

Public

Mirrored from https://github.com/microsoft/hve-core (available)

Code · Commits · Issues · Pull requests · Actions · Insights · Security
fix/1124-exclude-python-env-dirs-from-skill-validation

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

.github/skills/experimental/powerpoint/scripts/extract_content.py

1279 lines · mode: code

1# Copyright (c) Microsoft Corporation.
2# SPDX-License-Identifier: MIT
3"""Extract content from an existing PPTX into YAML content and style definitions.
4
5Usage::
6
7 python extract_content.py \
8 --input existing-deck.pptx --output-dir content/
9
10 python extract_content.py \
11 --input existing-deck.pptx --output-dir content/ \
12 --slides 3,7,15
13"""
14
15import argparse
16import logging
17from collections import Counter
18from pathlib import Path
19
20import cairosvg
21import yaml
22from lxml import etree
23from pptx import Presentation
24from pptx.oxml.ns import qn
25from pptx_charts import extract_chart
26from pptx_colors import extract_color, hex_brightness
27from pptx_fills import extract_effect_list, extract_fill, extract_line
28from pptx_fonts import (
29 extract_alignment,
30 extract_font_info,
31 extract_paragraph_font,
32 normalize_font_family,
33)
34from pptx_shapes import AUTO_SHAPE_NAME_MAP, extract_rotation
35from pptx_tables import extract_table
36from pptx_text import (
37 extract_bullet_properties,
38 extract_paragraph_properties,
39 extract_run_properties,
40 extract_text_frame_properties,
41)
42from pptx_utils import emu_to_inches
43
# Largest embedded image blob, in bytes, that _save_image_blob will accept;
# anything bigger is rejected with a ValueError before it is written to disk.
MAX_IMAGE_BLOB_BYTES = 100 * 1024 * 1024  # 100 MB
45
46
47class _ImageSecurityError(ValueError):
48 """Security-critical image validation failure that must not be suppressed."""
49
50
# Allowlist of image MIME content types that may be written to disk, mapped
# to the file extension used for the saved file. Content types missing from
# this table are rejected by _save_image_blob.
_CONTENT_TYPE_TO_EXT: dict[str, str] = {
    "image/bmp": "bmp",
    "image/gif": "gif",
    "image/jpeg": "jpg",
    "image/png": "png",
    "image/tiff": "tiff",
    # WMF retained for legitimate PPTX files; validated by magic-byte check.
    # See CVE-2005-4560 for historical WMF risk context.
    "image/x-wmf": "wmf",
    # EMF retained for charts, SmartArt, and diagrams; validated by magic-byte check.
    "image/emf": "emf",
    "image/x-emf": "emf",
    # SVG sanitized via hardened XMLParser and converted to PNG by cairosvg.
    "image/svg+xml": "svg",
}

# WMF file signatures used for magic-byte validation: the first four bytes
# of a blob must equal the Aldus magic or one of the standard prefixes.
_WMF_ALDUS_MAGIC = b"\xd7\xcd\xc6\x9a"
_WMF_STANDARD_PREFIXES = (b"\x01\x00\x09\x00", b"\x02\x00\x09\x00")

# EMF file signatures: EMR_HEADER record type at offset 0, " EMF" at offset 40.
_EMF_RECORD_TYPE = b"\x01\x00\x00\x00"
_EMF_SIGNATURE = b" EMF"
74
75
def _validate_wmf_magic_bytes(blob: bytes) -> None:
    """Ensure *blob* starts with a known WMF signature.

    The first four bytes must equal ``_WMF_ALDUS_MAGIC`` or one of
    ``_WMF_STANDARD_PREFIXES``.

    Raises:
        _ImageSecurityError: If the blob is shorter than four bytes or its
            leading bytes match none of the recognized signatures.
    """
    if len(blob) < 4:
        raise _ImageSecurityError("WMF blob too short for magic-byte validation")

    signature = blob[:4]
    recognized = (_WMF_ALDUS_MAGIC, *_WMF_STANDARD_PREFIXES)
    if signature not in recognized:
        raise _ImageSecurityError(
            "WMF blob does not start with a recognized file signature"
        )
86
87
def _validate_emf_magic_bytes(blob: bytes) -> None:
    """Ensure *blob* carries the expected EMF header record and signature.

    Bytes 0-3 must equal ``_EMF_RECORD_TYPE`` and bytes 40-43 must equal
    ``_EMF_SIGNATURE``.

    Raises:
        _ImageSecurityError: If the blob is shorter than 44 bytes or either
            marker does not match.
    """
    if len(blob) < 44:
        raise _ImageSecurityError("EMF blob too short for magic-byte validation")

    header_ok = blob[:4] == _EMF_RECORD_TYPE
    signature_ok = blob[40:44] == _EMF_SIGNATURE
    if not (header_ok and signature_ok):
        raise _ImageSecurityError("EMF blob does not match expected file signature")
94
95
def _sanitize_svg(blob: bytes) -> bytes:
    """Re-serialize an SVG after parsing it with a locked-down XML parser.

    The parser is configured with entity resolution, network access, DTD
    loading and DTD validation all switched off. A document that still
    reports an internal DTD is rejected outright.

    Returns:
        The document serialized back to UTF-8 bytes with an XML declaration.

    Raises:
        _ImageSecurityError: If the blob is not well-formed XML or declares
            an internal DTD.
    """
    hardened_parser = etree.XMLParser(
        load_dtd=False,
        dtd_validation=False,
        no_network=True,
        resolve_entities=False,
    )
    try:
        svg_root = etree.fromstring(blob, parser=hardened_parser)
    except etree.XMLSyntaxError as exc:
        raise _ImageSecurityError(f"SVG blob is not well-formed XML: {exc}") from exc

    doc_info = svg_root.getroottree().docinfo
    if doc_info.internalDTD is not None:
        raise _ImageSecurityError("SVG blob contains a DTD declaration")

    return etree.tostring(svg_root, xml_declaration=True, encoding="UTF-8")
115
116
def _convert_svg_to_png(blob: bytes) -> bytes:
    """Return PNG bytes rendered by cairosvg from the sanitized form of *blob*.

    Any _ImageSecurityError raised by _sanitize_svg propagates unchanged.
    """
    return cairosvg.svg2png(bytestring=_sanitize_svg(blob))
121
122
def extract_connector(shape) -> dict:
    """Build the element definition for a connector shape.

    The begin/end coordinates are converted with emu_to_inches, and any
    properties returned by extract_line are merged into the result.
    """
    connector = {"type": "connector"}
    for endpoint in ("begin_x", "begin_y", "end_x", "end_y"):
        connector[endpoint] = emu_to_inches(getattr(shape, endpoint))
    connector["name"] = shape.name

    stroke = extract_line(shape)
    if stroke:
        connector.update(stroke)
    return connector
137
138
139def _is_freeform(shape) -> bool:
140 """Check whether a shape is a freeform with custom geometry."""
141 nsmap = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
142 return shape._element.find(".//a:custGeom", nsmap) is not None
143
144
def _is_background_image(shape, slide_w: float, slide_h: float) -> bool:
    """Report whether a shape is large enough to act as a slide background.

    Only size is considered: both the width and the height (converted with
    emu_to_inches) must reach at least 95% of the slide dimensions. The
    shape's position is not examined.
    """
    coverage = 0.95
    shape_w = emu_to_inches(shape.width)
    shape_h = emu_to_inches(shape.height)
    wide_enough = shape_w >= slide_w * coverage
    tall_enough = shape_h >= slide_h * coverage
    return wide_enough and tall_enough
153
154
def _save_image_blob(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Write a picture's embedded blob under ``output_dir/images``.

    Checks, in order: the content type is in ``_CONTENT_TYPE_TO_EXT``; the
    blob is no larger than ``MAX_IMAGE_BLOB_BYTES``; WMF/EMF blobs carry a
    valid signature; SVG blobs are sanitized and converted to PNG; and the
    resolved target path stays inside *output_dir*.

    Args:
        shape: Picture shape exposing ``.image``.
        output_dir: Directory that receives the ``images`` subfolder.
        slide_num: Accepted for signature compatibility; not used here.
        img_count: Number embedded in the file name (``image-NN.ext``).

    Returns:
        ``{"path": "images/<name>"}``, or
        ``{"path": "LINKED_IMAGE_NOT_EMBEDDED"}`` when ``shape.image``
        raises ValueError.

    Raises:
        ValueError: For an unsupported content type or an oversized blob.
        _ImageSecurityError: For signature, SVG or path-containment failures.
    """
    try:
        image = shape.image
    except ValueError:
        return {"path": "LINKED_IMAGE_NOT_EMBEDDED"}

    content_type = image.content_type
    ext = _CONTENT_TYPE_TO_EXT.get(content_type)
    if ext is None:
        raise ValueError(f"Unsupported image content type: {content_type}")

    blob = image.blob
    blob_size = len(blob)
    if blob_size > MAX_IMAGE_BLOB_BYTES:
        raise ValueError(
            f"Image blob size {blob_size} exceeds limit of {MAX_IMAGE_BLOB_BYTES} bytes"
        )

    if ext == "svg":
        # SVG is never stored as-is; only the rendered PNG reaches disk.
        blob, ext = _convert_svg_to_png(blob), "png"
    elif ext == "wmf":
        _validate_wmf_magic_bytes(blob)
    elif ext == "emf":
        _validate_emf_magic_bytes(blob)

    img_name = f"image-{img_count:02d}.{ext}"
    img_path = output_dir / "images" / img_name

    # Defensive containment check on the fully resolved paths.
    inside_output = img_path.resolve().is_relative_to(output_dir.resolve())
    if not inside_output:
        raise _ImageSecurityError(
            f"Image path {img_path} escapes output directory {output_dir}"
        )

    img_path.parent.mkdir(parents=True, exist_ok=True)
    img_path.write_bytes(blob)
    return {"path": f"images/{img_name}"}
196
197
def extract_freeform(shape) -> dict:
    """Build the element definition for a custom-geometry (freeform) shape.

    Besides position, size, rotation, fill and line properties, the result
    carries a ``paths`` list when the shape's custGeom holds at least one
    non-empty path. Each path is a list of command dicts (``moveTo``,
    ``lineTo``, ``cubicBezTo``, ``close``) whose coordinates are the integer
    values read from the XML, not converted to inches.
    """
    freeform = {
        "type": "freeform",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        freeform["rotation"] = rotation

    # Shapes without usable fill information are simply left without a fill key.
    try:
        fill = extract_fill(shape.fill)
    except (AttributeError, TypeError):
        fill = None
    if fill is not None:
        freeform["fill"] = fill

    stroke = extract_line(shape)
    if stroke:
        freeform.update(stroke)

    ns = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}

    def _point(pt) -> dict:
        # Missing coordinates default to 0.
        return {"x": int(pt.get("x", 0)), "y": int(pt.get("y", 0))}

    # XML tag -> emitted command name for the commands that take one point.
    single_point_cmds = {"moveTo": "moveTo", "lnTo": "lineTo"}

    all_paths = []
    for path_el in shape._element.findall(".//a:custGeom/a:pathLst/a:path", ns):
        commands = []
        for child in path_el:
            tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
            if tag in single_point_cmds:
                pt = child.find("a:pt", ns)
                if pt is not None:
                    commands.append({"cmd": single_point_cmds[tag], **_point(pt)})
            elif tag == "cubicBezTo":
                control_points = [_point(p) for p in child.findall("a:pt", ns)]
                commands.append({"cmd": "cubicBezTo", "pts": control_points})
            elif tag == "close":
                commands.append({"cmd": "close"})
        if commands:
            all_paths.append(commands)

    if all_paths:
        freeform["paths"] = all_paths

    return freeform
272
273
# Default nesting limit for group shapes; extract_group raises ValueError
# once the recursion depth reaches this value.
MAX_GROUP_DEPTH = 20
275
276
def extract_group(
    shape,
    slide_num: int,
    output_dir,
    img_count: int,
    *,
    _depth: int = 0,
    max_depth: int = MAX_GROUP_DEPTH,
) -> dict:
    """Build the element definition for a group shape and its children.

    Each child is extracted through extract_child_shape with the depth
    increased by one; falsy child results are dropped.

    Raises:
        ValueError: When *_depth* has reached *max_depth*.
    """
    if _depth >= max_depth:
        raise ValueError(f"Group nesting depth {_depth} exceeds limit of {max_depth}")

    group = {
        "type": "group",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
        "elements": [],
    }

    # NOTE(review): the same img_count is forwarded to every child, so two
    # pictures inside one group appear to resolve to the same image file
    # name -- confirm and fix together with the caller's counter.
    extracted_children = (
        extract_child_shape(
            member,
            slide_num,
            output_dir,
            img_count,
            _depth=_depth + 1,
            max_depth=max_depth,
        )
        for member in shape.shapes
    )
    group["elements"].extend(item for item in extracted_children if item)
    return group
313
314
def _extract_shape_by_type(
    shape,
    slide_num: int,
    output_dir,
    img_count: int,
    *,
    _depth: int = 0,
    max_depth: int = MAX_GROUP_DEPTH,
) -> dict | None:
    """Route a shape to the matching extractor.

    Dispatch order: numeric ``shape_type`` (text box, auto shape, connector,
    picture, group), then table/chart detection through the ``has_table`` /
    ``has_chart`` attributes, then custom geometry.

    Returns:
        The extracted element dict, or None when nothing matches.
    """
    kind = shape.shape_type

    # Extractors that only need the shape itself.
    if kind == 17:  # TEXT_BOX
        return extract_textbox(shape)
    if kind == 1:  # AUTO_SHAPE
        return extract_shape(shape)
    if kind == 9:  # LINE / CONNECTOR
        return extract_connector(shape)

    # Extractors that need output or recursion context.
    if kind == 13:  # PICTURE
        return extract_image(shape, output_dir, slide_num, img_count)
    if kind == 6:  # GROUP
        return extract_group(
            shape,
            slide_num,
            output_dir,
            img_count,
            _depth=_depth,
            max_depth=max_depth,
        )

    # Tables and charts are recognized by attribute, not by shape_type.
    if getattr(shape, "has_table", False):
        return extract_table(shape)
    if getattr(shape, "has_chart", False):
        return extract_chart(shape)

    return extract_freeform(shape) if _is_freeform(shape) else None
358
359
def extract_child_shape(
    shape,
    slide_num: int,
    output_dir,
    img_count: int,
    *,
    _depth: int = 0,
    max_depth: int = MAX_GROUP_DEPTH,
) -> dict | None:
    """Extract one member of a group shape.

    Tries the shared dispatcher first. A shape it does not recognize is
    emitted as a rectangle placeholder that records the original numeric
    ``shape_type`` under ``_unrecognized_shape_type`` when it is not None.
    """
    extracted = _extract_shape_by_type(
        shape,
        slide_num,
        output_dir,
        img_count,
        _depth=_depth,
        max_depth=max_depth,
    )
    if extracted is None:
        # Fallback for unrecognized shape types.
        extracted = {
            "type": "shape",
            "shape": "rectangle",
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
        }
        kind = shape.shape_type
        if kind is not None:
            extracted["_unrecognized_shape_type"] = int(kind)
    return extracted
394
395
396def _has_formatting_variation(runs: list) -> bool:
397 """Check if multiple runs have different formatting properties."""
398 if len(runs) <= 1:
399 return False
400 fonts = {r.get("font") for r in runs if "font" in r}
401 sizes = {r.get("size") for r in runs if "size" in r}
402 colors = {r.get("color") for r in runs if "color" in r}
403 bolds = {r.get("bold", False) for r in runs}
404 italics = {r.get("italic", False) for r in runs}
405 underlines = {r.get("underline", False) for r in runs}
406 return (
407 len(fonts) > 1
408 or len(sizes) > 1
409 or len(colors) > 1
410 or len(bolds) > 1
411 or len(italics) > 1
412 or len(underlines) > 1
413 )
414
415
# Key-mapping for extraction: maps canonical keys to output YAML key names.
# Shapes emit "text_*" keys; text boxes emit "font*" keys.
_SHAPE_EXTRACT_KEYS = {
    "font": "text_font",
    "size": "text_size",
    "color": "text_color",
    "bold": "text_bold",
}
_TEXTBOX_EXTRACT_KEYS = {
    "font": "font",
    "size": "font_size",
    "color": "font_color",
    "bold": "font_bold",
}

# Keys to promote from first paragraph to element level.
# _extract_text_content copies these from the first non-empty paragraph when
# it also emits a "paragraphs" list.
_SHAPE_PROMOTE_KEYS = (
    "text_font",
    "text_size",
    "text_color",
    "text_bold",
    "italic",
    "alignment",
    "char_spacing",
)
_TEXTBOX_PROMOTE_KEYS = (
    "font",
    "font_size",
    "font_color",
    "font_bold",
    "italic",
    "alignment",
    "char_spacing",
)
449
450
def _extract_text_content(text_frame, keys: dict, promote_keys: tuple) -> dict:
    """Turn a text frame into the text-related part of an element dict.

    Every paragraph is described by a dict that merges paragraph-level font
    data with the formatting of the first run that reports any. Paragraphs
    whose runs differ in formatting also carry the individual ``runs``.

    When there is more than one paragraph, or any paragraph carries runs,
    the full ``paragraphs`` list is emitted and only *promote_keys* are
    lifted from the first non-empty paragraph. Otherwise every key of that
    paragraph except ``text`` is lifted to the element level.

    Args:
        text_frame: python-pptx TextFrame object.
        keys: Maps the canonical names ``font``/``size``/``color``/``bold``
            to the output key names.
        promote_keys: Keys lifted from the first non-empty paragraph in the
            multi-paragraph case.

    Returns:
        An empty dict when the frame holds only whitespace; otherwise a dict
        with ``text`` plus frame, paragraph and promoted properties.
    """
    stripped = text_frame.text.strip()
    if not stripped:
        return {}

    fragment = {"text": stripped}

    frame_props = extract_text_frame_properties(text_frame)
    if frame_props:
        fragment.update(frame_props)

    paragraphs = []
    for para in text_frame.paragraphs:
        lead_format = {}
        runs = []
        for run in para.runs:
            run_format = {
                **extract_font_info(run.font),
                **extract_run_properties(run),
            }
            runs.append({"text": run.text, **run_format})
            # The first run that reports any formatting stands for the paragraph.
            if not lead_format:
                lead_format = run_format

        para_font = extract_paragraph_font(para)
        spacing = extract_paragraph_properties(para)
        bullets = extract_bullet_properties(para)
        align = extract_alignment(para)
        # Run-level values take precedence over paragraph-level ones.
        effective = {**para_font, **lead_format}

        entry = {"text": para.text}
        for canonical in ("font", "size", "color"):
            if canonical in effective:
                entry[keys[canonical]] = effective[canonical]
        if effective.get("bold"):
            entry[keys["bold"]] = True
        for flag in ("italic", "underline"):
            if effective.get(flag):
                entry[flag] = True
        if effective.get("hyperlink"):
            entry["hyperlink"] = effective["hyperlink"]
        if "char_spacing" in effective:
            entry["char_spacing"] = effective["char_spacing"]
        if "effect" in effective:
            entry["text_effect"] = effective["effect"]
        if align:
            entry["alignment"] = align
        if spacing:
            entry.update(spacing)
        if bullets:
            entry.update(bullets)
        if _has_formatting_variation(runs):
            entry["runs"] = runs
        paragraphs.append(entry)

    with_text = [entry for entry in paragraphs if entry["text"].strip()]
    needs_paragraph_list = len(paragraphs) > 1 or any(
        "runs" in entry for entry in paragraphs
    )

    if needs_paragraph_list:
        fragment["paragraphs"] = paragraphs
        if with_text:
            lead = with_text[0]
            for key in promote_keys:
                if key in lead:
                    fragment[key] = lead[key]
    elif with_text:
        for key, value in with_text[0].items():
            if key != "text":
                fragment[key] = value

    return fragment
538
539
def extract_shape(shape) -> dict:
    """Build the element definition for an auto shape.

    Collects geometry, rotation, the shape name looked up in
    ``AUTO_SHAPE_NAME_MAP`` (defaulting to ``"rectangle"``), the first
    adjustment value as ``corner_radius``, fill, line and effect properties,
    and any text content using the ``text_*`` key names.
    """
    element = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        element["rotation"] = rotation

    # Resolve the shape name; on failure the "rectangle" default above stays.
    try:
        element["shape"] = AUTO_SHAPE_NAME_MAP.get(shape.auto_shape_type, "rectangle")
    except (AttributeError, TypeError):
        pass

    # The first adjustment value is recorded as the corner radius.
    try:
        adjustments = shape.adjustments
        if len(adjustments) > 0:
            element["corner_radius"] = round(adjustments[0], 5)
    except (AttributeError, TypeError, IndexError):
        pass

    # Fill is optional; shapes without usable fill data get no "fill" key.
    try:
        fill = extract_fill(shape.fill)
    except (AttributeError, TypeError):
        fill = None
    if fill is not None:
        element["fill"] = fill

    stroke = extract_line(shape)
    if stroke:
        element.update(stroke)

    shadow = extract_effect_list(shape)
    if shadow:
        element["effect"] = shadow

    if shape.has_text_frame:
        element.update(
            _extract_text_content(
                shape.text_frame, _SHAPE_EXTRACT_KEYS, _SHAPE_PROMOTE_KEYS
            )
        )

    return element
595
596
def extract_textbox(shape) -> dict:
    """Build the element definition for a text box.

    ``text`` is the stripped frame text, or an empty string when the shape
    has no text frame. Rotation and the detailed text properties (``font*``
    key names) are added when available.
    """
    textbox = {
        "type": "textbox",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "text": "",
        "name": shape.name,
    }
    if shape.has_text_frame:
        textbox["text"] = shape.text_frame.text.strip()

    rotation = extract_rotation(shape)
    if rotation is not None:
        textbox["rotation"] = rotation

    if shape.has_text_frame:
        textbox.update(
            _extract_text_content(
                shape.text_frame, _TEXTBOX_EXTRACT_KEYS, _TEXTBOX_PROMOTE_KEYS
            )
        )

    return textbox
620
621
def extract_image(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Build the element definition for a picture and save its image file.

    Outcomes:
        * The blob is saved: ``path`` points at the file and the rotation,
          blipFill attributes, crop and opacity are added when present.
        * The image is linked rather than embedded: ``path`` is
          ``"LINKED_IMAGE_NOT_EMBEDDED"`` and a ``_note`` explains why.
        * Saving raises a plain ValueError: a warning is logged and a
          ``"SKIPPED"`` element with ``_skipped_reason`` is returned.

    Raises:
        _ImageSecurityError: Re-raised unchanged; never downgraded to a skip.
    """

    def _base(path: str) -> dict:
        # Fields shared by every variant of the image element.
        return {
            "type": "image",
            "path": path,
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
        }

    try:
        saved = _save_image_blob(shape, output_dir, slide_num, img_count)
    except _ImageSecurityError:
        raise
    except ValueError as exc:
        logging.warning("Skipping image on slide %d: %s", slide_num, exc)
        skipped = _base("SKIPPED")
        skipped["_skipped_reason"] = str(exc)
        return skipped

    image = _base(saved["path"])
    is_linked = saved["path"] == "LINKED_IMAGE_NOT_EMBEDDED"
    if is_linked:
        image["_note"] = "Image was linked, not embedded in the PPTX"

    rotation = extract_rotation(shape)
    if rotation is not None:
        image["rotation"] = rotation

    if is_linked:
        return image

    # Crop and blipFill attributes live on the picture's blipFill element.
    blip_fill = shape._element.find(qn("p:blipFill"))
    if blip_fill is not None:
        kept_attrs = {
            attr: value
            for attr in ("rotWithShape", "dpi")
            if (value := blip_fill.get(attr)) is not None
        }
        if kept_attrs:
            image["blip_fill_attrs"] = kept_attrs

        src_rect = blip_fill.find(qn("a:srcRect"))
        if src_rect is not None and src_rect.attrib:
            crop = {
                side: int(value)
                for side in ("l", "t", "r", "b")
                if (value := src_rect.get(side)) is not None
            }
            if crop:
                image["crop"] = crop

    # Opacity comes from alphaModFix on the blip; amt is divided by 1000.
    blip = shape._element.find(".//" + qn("a:blip"))
    if blip is not None:
        alpha_fix = blip.find(qn("a:alphaModFix"))
        if alpha_fix is not None:
            amount = int(alpha_fix.get("amt", "100000"))
            image["opacity"] = round(amount / 1000, 1)

    return image
701
702
def detect_global_style(prs) -> dict:
    """Derive a style definition from frequency analysis of the presentation.

    Walks every slide to count background, fill, accent and text colors as
    well as font names and sizes, builds a per-slide profile used to cluster
    slides into light and dark themes, and assembles the result together
    with slide dimensions, typography defaults and core-property metadata.

    Returns:
        A dict with ``dimensions``, ``defaults`` and ``typography``, plus
        ``colors``, ``themes`` and ``metadata`` when they are non-empty.
    """
    bg_counts = Counter()
    text_counts = Counter()
    accent_counts = Counter()
    fill_counts = Counter()
    font_counts = Counter()
    size_counts = Counter()

    # One entry per slide, consumed by the theme clustering step.
    profiles = []

    slide_w = emu_to_inches(prs.slide_width)
    slide_h = emu_to_inches(prs.slide_height)

    for slide_num, slide in enumerate(prs.slides, start=1):
        solid_bg = None
        local_text = Counter()
        local_fills = Counter()
        bg_image_found = False

        # Only string fill results are counted as background colors.
        try:
            bg_fill = extract_fill(slide.background.fill)
            if isinstance(bg_fill, str):
                bg_counts[bg_fill] += 1
                solid_bg = bg_fill
        except (AttributeError, TypeError):
            pass

        for position, shape in enumerate(slide.shapes):
            # A near-full-size picture in first position is a background image.
            if (
                position == 0
                and shape.shape_type == 13
                and _is_background_image(shape, slide_w, slide_h)
            ):
                bg_image_found = True
                continue

            # Shapes thinner than 0.1 inch count as accents, the rest as fills.
            try:
                shape_fill = extract_fill(shape.fill)
                if isinstance(shape_fill, str):
                    if emu_to_inches(shape.height) < 0.1:
                        accent_counts[shape_fill] += 1
                    else:
                        fill_counts[shape_fill] += 1
                        local_fills[shape_fill] += 1
            except (AttributeError, TypeError):
                pass

            if not shape.has_text_frame:
                continue

            for para in shape.text_frame.paragraphs:
                for run in para.runs:
                    font = run.font
                    if font.name:
                        font_counts[normalize_font_family(font.name)] += 1
                    if font.size:
                        size_counts[int(font.size.pt)] += 1
                    try:
                        color = extract_color(font.color)
                        if isinstance(color, str) and color.startswith("#"):
                            text_counts[color] += 1
                            local_text[color] += 1
                    except (AttributeError, TypeError):
                        pass

        profiles.append(
            {
                "slide": slide_num,
                "bg_color": solid_bg,
                "bg_brightness": _classify_slide_brightness(
                    solid_bg, local_text, bg_image_found
                ),
                "has_bg_image": bg_image_found,
                "text_colors": dict(local_text),
                "fill_colors": dict(local_fills),
            }
        )

    colors = _build_color_map(bg_counts, fill_counts, text_counts, accent_counts)
    themes = _cluster_themes(profiles, text_counts, fill_counts, accent_counts)

    # Walk fonts by frequency. Code-like names seen before the first regular
    # font update code_font; the first regular font becomes body_font.
    body_font = "Segoe UI"
    code_font = "Cascadia Code"
    code_markers = ("cascadia", "consolas", "mono", "courier")
    for family, _uses in font_counts.most_common():
        lowered = family.lower()
        if not any(marker in lowered for marker in code_markers):
            body_font = family
            break
        code_font = family

    # Sizes are picked from the distinct values strictly between 8 and 60:
    # the middle entry for body text, the entry at the 85% position for headings.
    heading_size = 28
    body_size = 16
    plausible_sizes = sorted(size for size in size_counts if 8 < size < 60)
    if plausible_sizes:
        body_size = plausible_sizes[len(plausible_sizes) // 2]
        heading_size = plausible_sizes[int(len(plausible_sizes) * 0.85)]

    style = {
        "dimensions": {
            "width_inches": emu_to_inches(prs.slide_width),
            "height_inches": emu_to_inches(prs.slide_height),
            # NOTE(review): the label is hard-coded and is not derived from
            # the measured dimensions above -- confirm this is intended.
            "format": "16:9",
        },
        "defaults": {
            "speaker_notes_required": True,
        },
        "typography": {
            "body_font": body_font,
            "code_font": code_font,
            "heading_size": heading_size,
            "body_size": body_size,
        },
    }

    if colors:
        style["colors"] = colors

    if themes:
        style["themes"] = themes

    # Only truthy core properties are carried over as metadata.
    props = prs.core_properties
    metadata = {
        attr: value
        for attr in ("title", "author", "subject", "keywords", "description", "category")
        if (value := getattr(props, attr, None))
    }
    if metadata:
        style["metadata"] = metadata

    return style
853
854
855def _classify_slide_brightness(
856 bg_color: str | None, text_colors: Counter, has_bg_image: bool
857) -> str:
858 """Classify a slide as 'light' or 'dark' based on background and text colors."""
859 if has_bg_image and bg_color is None:
860 # Slides with background images and no solid bg — infer from text colors
861 dark_text = sum(
862 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100
863 )
864 light_text = sum(
865 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150
866 )
867 return "light" if dark_text >= light_text else "dark"
868
869 if bg_color and isinstance(bg_color, str) and bg_color.startswith("#"):
870 return "light" if hex_brightness(bg_color) > 128 else "dark"
871
872 # Default: infer from text colors
873 dark_text = sum(
874 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100
875 )
876 light_text = sum(
877 c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150
878 )
879 if dark_text > light_text:
880 return "light"
881 if light_text > dark_text:
882 return "dark"
883 return "dark"
884
885
886def _build_color_map(
887 bg_colors: Counter,
888 fill_colors: Counter,
889 text_colors: Counter,
890 accent_colors: Counter,
891) -> dict:
892 """Build the global color map from frequency analysis."""
893 colors = {}
894 if bg_colors:
895 colors["bg_dark"] = bg_colors.most_common(1)[0][0]
896 if fill_colors:
897 colors["bg_card"] = fill_colors.most_common(1)[0][0]
898
899 for color_hex, _count in text_colors.most_common(5):
900 brightness = hex_brightness(color_hex)
901 if brightness > 200 and "text_white" not in colors:
902 colors["text_white"] = color_hex
903 elif brightness < 80 and "text_dark" not in colors:
904 colors["text_dark"] = color_hex
905 elif 80 <= brightness <= 200 and "text_gray" not in colors:
906 colors["text_gray"] = color_hex
907
908 accent_names = ["accent_blue", "accent_teal", "accent_green"]
909 for i, (color_hex, _count) in enumerate(accent_colors.most_common(3)):
910 if i < len(accent_names):
911 colors[accent_names[i]] = color_hex
912
913 return colors
914
915
916def _cluster_themes(
917 slide_profiles: list[dict],
918 text_colors: Counter,
919 fill_colors: Counter,
920 accent_colors: Counter,
921) -> list[dict]:
922 """Cluster slides into theme groups based on brightness classification."""
923 light_slides = [p for p in slide_profiles if p["bg_brightness"] == "light"]
924 dark_slides = [p for p in slide_profiles if p["bg_brightness"] == "dark"]
925
926 # Only produce themes when both light and dark groups exist
927 if not light_slides or not dark_slides:
928 return []
929
930 themes = []
931
932 # Light theme
933 light_text = Counter()
934 light_fills = Counter()
935 for p in light_slides:
936 light_text.update(p["text_colors"])
937 light_fills.update(p["fill_colors"])
938
939 light_colors = {}
940 for color_hex, _count in light_text.most_common(5):
941 brightness = hex_brightness(color_hex)
942 if brightness < 80 and "text_primary" not in light_colors:
943 light_colors["text_primary"] = color_hex
944 elif 80 <= brightness <= 200 and "text_secondary" not in light_colors:
945 light_colors["text_secondary"] = color_hex
946 if light_fills:
947 light_colors["bg_card"] = light_fills.most_common(1)[0][0]
948
949 themes.append(
950 {
951 "name": "light",
952 "slides": sorted(p["slide"] for p in light_slides),
953 "colors": light_colors,
954 }
955 )
956
957 # Dark theme
958 dark_text = Counter()
959 dark_fills = Counter()
960 dark_bgs = Counter()
961 for p in dark_slides:
962 dark_text.update(p["text_colors"])
963 dark_fills.update(p["fill_colors"])
964 if p["bg_color"]:
965 dark_bgs[p["bg_color"]] += 1
966
967 dark_colors = {}
968 if dark_bgs:
969 dark_colors["bg_dark"] = dark_bgs.most_common(1)[0][0]
970 for color_hex, _count in dark_text.most_common(5):
971 brightness = hex_brightness(color_hex)
972 if brightness > 200 and "text_primary" not in dark_colors:
973 dark_colors["text_primary"] = color_hex
974 elif 80 <= brightness <= 200 and "text_secondary" not in dark_colors:
975 dark_colors["text_secondary"] = color_hex
976 if dark_fills:
977 dark_colors["bg_card"] = dark_fills.most_common(1)[0][0]
978
979 themes.append(
980 {
981 "name": "dark",
982 "slides": sorted(p["slide"] for p in dark_slides),
983 "colors": dark_colors,
984 }
985 )
986
987 return themes
988
989
def extract_slide(
    slide,
    slide_num: int,
    output_dir: Path,
    slide_dims: tuple[float, float] | None = None,
) -> tuple[dict, Path]:
    """Extract all elements from a slide into a content.yaml structure.

    Creates ``output_dir/slide-NNN`` and walks the slide's shapes in z-order.
    Placeholders with a text frame are emitted as text boxes flagged with
    ``_placeholder``; every other shape goes through the shared dispatcher,
    and shapes it does not recognize are emitted as rectangle placeholders.
    The first text box positioned less than 1.5 inches from the top whose
    text is non-empty and shorter than 100 characters becomes the title.

    Args:
        slide: python-pptx slide object.
        slide_num: 1-based slide number used in the directory name and content.
        output_dir: Parent directory that receives the per-slide directory.
        slide_dims: Accepted for interface compatibility; not used here.

    Returns:
        A ``(content, slide_dir)`` tuple: the content dict for the slide and
        the per-slide directory that was created.
    """
    slide_dir = output_dir / f"slide-{slide_num:03d}"
    slide_dir.mkdir(parents=True, exist_ok=True)

    content = {
        "slide": slide_num,
        "title": "",
        "elements": [],
    }

    # Extract layout name
    try:
        layout_name = slide.slide_layout.name
        if layout_name:
            content["layout"] = layout_name
    except (AttributeError, TypeError):
        pass

    # Extract slide background (only when the slide overrides its master)
    try:
        if not slide.follow_master_background:
            fill_result = extract_fill(slide.background.fill)
            if fill_result is not None:
                content["background"] = {"fill": fill_result}
    except (AttributeError, TypeError):
        pass

    # Extract speaker notes (include empty string when notes slide exists)
    try:
        if slide.has_notes_slide:
            notes = slide.notes_slide.notes_text_frame.text.strip()
            content["speaker_notes"] = notes
    except (AttributeError, TypeError):
        pass

    img_count = 0

    # The shapes are only read, so they are iterated directly without a copy.
    for z_index, shape in enumerate(slide.shapes):
        shape_type = shape.shape_type

        # Track image count for filename generation
        if shape_type == 13:
            img_count += 1

        # Handle placeholders specially (extract as textbox with marker)
        if shape_type == 14:
            if not shape.has_text_frame:
                continue
            elem = extract_textbox(shape)
            elem["_placeholder"] = True
            elem["z_order"] = z_index
            content["elements"].append(elem)
            continue

        # Use shared dispatcher for all other shape types
        elem = _extract_shape_by_type(shape, slide_num, slide_dir, img_count)
        if elem is not None:
            elem["z_order"] = z_index
            content["elements"].append(elem)

            # Detect title from textbox near top of slide
            if (
                shape_type == 17
                and not content["title"]
                and emu_to_inches(shape.top) < 1.5
            ):
                text = shape.text_frame.text.strip() if shape.has_text_frame else ""
                if text and len(text) < 100:
                    content["title"] = text
            continue

        # Fallback for unrecognized shape types
        elem_data = {
            "type": "shape",
            "shape": "rectangle",
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
            "z_order": z_index,
        }
        if shape_type is not None:
            elem_data["_unrecognized_shape_type"] = int(shape_type)
        content["elements"].append(elem_data)

    return content, slide_dir
1083
1084
def _resolve_theme_colors(prs) -> dict:
    """Extract theme color name→hex mappings from the presentation's theme XML.

    Reads ``clrScheme`` from the theme part related to the first slide master
    and maps each scheme slot to a canonical name (``dark_1``, ``accent_1``,
    ...). The dark/light slots are additionally exposed under their
    ``text_*`` / ``background_*`` aliases.

    Args:
        prs: Presentation object exposing ``slide_masters``.

    Returns:
        Dict mapping theme color names to ``#RRGGBB`` strings. Empty when the
        master or theme is missing, the theme XML is malformed, or the scheme
        defines no recognized colors.
    """
    scheme_names = {
        "dk1": "dark_1",
        "dk2": "dark_2",
        "lt1": "light_1",
        "lt2": "light_2",
        "accent1": "accent_1",
        "accent2": "accent_2",
        "accent3": "accent_3",
        "accent4": "accent_4",
        "accent5": "accent_5",
        "accent6": "accent_6",
        "hlink": "hyperlink",
        "folHlink": "followed_hyperlink",
    }
    # Map canonical aliases
    aliases = {
        "dark_1": "text_1",
        "dark_2": "text_2",
        "light_1": "background_1",
        "light_2": "background_2",
    }
    # ``lastClr`` is optional on sysClr. When it is absent, fall back to the
    # conventional value of the two system colors themes normally reference
    # instead of reporting every system color as black.
    sys_color_defaults = {
        "window": "FFFFFF",
        "windowText": "000000",
    }
    ns_a = "http://schemas.openxmlformats.org/drawingml/2006/main"
    color_map = {}
    try:
        master = prs.slide_masters[0]
        theme_el = None
        # Theme is stored as a related part (generic Part, not XmlPart),
        # so parse its blob directly with lxml.
        for rel in master.part.rels.values():
            if "theme" in rel.reltype:
                # Entity resolution and network access stay disabled because
                # the deck is untrusted input.
                parser = etree.XMLParser(resolve_entities=False, no_network=True)
                theme_el = etree.fromstring(rel.target_part.blob, parser=parser)
                break
        if theme_el is None:
            return color_map

        clr_scheme = theme_el.find(f".//{{{ns_a}}}clrScheme")
        if clr_scheme is None:
            return color_map

        for child in clr_scheme:
            # Comments and processing instructions have a non-string tag;
            # skip them so they cannot abort the remaining colors.
            if not isinstance(child.tag, str):
                continue
            tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
            theme_name = scheme_names.get(tag)
            if theme_name is None:
                continue

            # Extract hex value from srgbClr or sysClr
            srgb = child.find(f"{{{ns_a}}}srgbClr")
            if srgb is not None:
                color_map[theme_name] = f"#{srgb.get('val', '000000')}"
            else:
                sys_clr = child.find(f"{{{ns_a}}}sysClr")
                if sys_clr is not None:
                    hex_value = sys_clr.get("lastClr") or sys_color_defaults.get(
                        sys_clr.get("val"), "000000"
                    )
                    color_map[theme_name] = f"#{hex_value}"

            # Add alias mappings
            alias = aliases.get(theme_name)
            if alias is not None and theme_name in color_map:
                color_map[alias] = color_map[theme_name]
    except (AttributeError, TypeError, IndexError) as exc:
        # Theme elements missing or malformed; degrade gracefully but leave a
        # trace for anyone debugging an unexpectedly empty color map.
        logging.debug("Theme color resolution stopped early: %r", exc)
    except etree.XMLSyntaxError:
        logging.warning(
            "Malformed theme XML in slide master; skipping theme color resolution"
        )
    return color_map
1156
1157
MAX_THEME_REF_DEPTH = 50


def _resolve_theme_refs_in_content(
    content: dict,
    theme_colors: dict,
    *,
    max_depth: int = MAX_THEME_REF_DEPTH,
) -> dict:
    """Return a copy of *content* with ``@theme_name`` strings resolved to hex.

    Dicts and lists are rebuilt recursively; every other value is returned
    as-is. A string starting with ``@`` is looked up (without the ``@``) in
    *theme_colors*; names that are not present are left untouched.

    Raises ValueError when nesting exceeds *max_depth*.
    """

    def _walk(node, level: int = 0):
        # The depth guard applies to every visited value, scalars included.
        if level >= max_depth:
            raise ValueError(
                f"Theme reference nesting depth {level} exceeds limit of {max_depth}"
            )

        if isinstance(node, dict):
            rebuilt = {}
            for key, child in node.items():
                rebuilt[key] = _walk(child, level + 1)
            return rebuilt

        if isinstance(node, list):
            return [_walk(child, level + 1) for child in node]

        if isinstance(node, str) and node.startswith("@"):
            # Unknown theme names fall back to the original "@..." string.
            return theme_colors.get(node[1:], node)

        return node

    return _walk(content)
1187
1188
def main():
    """CLI entry point for extracting PPTX content into YAML.

    Parses ``--input``, ``--output-dir``, ``--slides`` and
    ``--resolve-themes``, writes ``global/style.yaml`` under the output
    directory, then writes one ``content.yaml`` per extracted slide into the
    directory returned by ``extract_slide``.

    Exits through ``parser.error`` (status 2) when ``--slides`` is not a
    comma-separated list of integers. Requested slide numbers that are not in
    the deck are reported with a warning and otherwise ignored.
    """
    parser = argparse.ArgumentParser(
        description="Extract content from a PPTX into YAML"
    )
    parser.add_argument("--input", required=True, help="Input PPTX file path")
    parser.add_argument("--output-dir", required=True, help="Output content directory")
    parser.add_argument(
        "--slides", help="Comma-separated slide numbers to extract (default: all)"
    )
    parser.add_argument(
        "--resolve-themes",
        action="store_true",
        help="Resolve @theme references to actual hex RGB values from the deck's theme",
    )
    args = parser.parse_args()

    # Validate --slides before any filesystem side effects so a typo produces
    # a usage error instead of a traceback and a stray output directory.
    slide_filter = None
    if args.slides:
        try:
            slide_filter = {int(s.strip()) for s in args.slides.split(",")}
        except ValueError:
            parser.error(
                f"--slides must be comma-separated integers, got {args.slides!r}"
            )

    pptx_path = Path(args.input)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    prs = Presentation(str(pptx_path))
    slide_count = len(prs.slides)
    print(f"Extracting from: {pptx_path}")
    print(f"Slides: {slide_count}")
    slide_dims = (emu_to_inches(prs.slide_width), emu_to_inches(prs.slide_height))
    w, h = slide_dims
    print(f'Dimensions: {w}" x {h}"')

    if slide_filter is not None:
        # Slide numbers are 1-based (see slide_num below).
        missing = sorted(n for n in slide_filter if not 1 <= n <= slide_count)
        if missing:
            logging.warning(
                "Deck has %d slide(s); ignoring requested slide number(s): %s",
                slide_count,
                ", ".join(str(n) for n in missing),
            )

    # Detect and save global style
    global_style = detect_global_style(prs)

    # Resolve theme colors when requested
    theme_colors = {}
    if args.resolve_themes:
        theme_colors = _resolve_theme_colors(prs)
        if theme_colors:
            global_style["theme_colors"] = theme_colors
            global_style = _resolve_theme_refs_in_content(global_style, theme_colors)
            print(f"Resolved {len(theme_colors)} theme colors")

    global_dir = output_dir / "global"
    global_dir.mkdir(parents=True, exist_ok=True)
    style_path = global_dir / "style.yaml"
    with open(style_path, "w", encoding="utf-8") as f:
        yaml.dump(
            global_style,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
    print(f"Global style saved to {style_path}")

    # Extract slides (filtered or all)
    extracted = 0
    for i, slide in enumerate(prs.slides):
        slide_num = i + 1
        if slide_filter is not None and slide_num not in slide_filter:
            continue
        content, slide_dir = extract_slide(
            slide, slide_num, output_dir, slide_dims=slide_dims
        )

        # Resolve @theme references to hex values when --resolve-themes is set
        # (theme_colors is only populated in that case).
        if theme_colors:
            content = _resolve_theme_refs_in_content(content, theme_colors)

        content_path = slide_dir / "content.yaml"
        with open(content_path, "w", encoding="utf-8") as f:
            yaml.dump(
                content,
                f,
                default_flow_style=False,
                sort_keys=False,
                allow_unicode=True,
            )
        # The "title" key may be present but empty, so fall back on falsy
        # values rather than only on a missing key.
        title = content.get("title") or "Untitled"
        print(f"Slide {slide_num}: {title} -> {content_path}")
        extracted += 1

    print(f"\nExtraction complete. {extracted} slide(s) extracted to {output_dir}")


if __name__ == "__main__":
    main()
1280