microsoft/hve-core

Public

mirrored fromhttps://github.com/microsoft/hve-coreAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

fix/1124-exclude-python-env-dirs-from-skill-validation

Find a branch or tag

Branches

fix/1124-exclude-python-env-dirs-from-skill-validation

Clone

HTTPS

Download ZIP

hve-core/.github/skills/experimental/powerpoint/scripts

.github/skills/experimental/powerpoint/scripts/extract_content.py

1279lines · modecode

Raw Download

Latest commit unavailable.

unknown

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: MIT
"""Extract content from an existing PPTX into YAML content and style definitions.

Usage::

    python extract_content.py \
        --input existing-deck.pptx --output-dir content/

    python extract_content.py \
        --input existing-deck.pptx --output-dir content/ \
        --slides 3,7,15
"""
14
15	`import argparse`
16	`import logging`
17	`from collections import Counter`
18	`from pathlib import Path`
19
20	`import cairosvg`
21	`import yaml`
22	`from lxml import etree`
23	`from pptx import Presentation`
24	`from pptx.oxml.ns import qn`
25	`from pptx_charts import extract_chart`
26	`from pptx_colors import extract_color, hex_brightness`
27	`from pptx_fills import extract_effect_list, extract_fill, extract_line`
28	`from pptx_fonts import (`
29	`extract_alignment,`
30	`extract_font_info,`
31	`extract_paragraph_font,`
32	`normalize_font_family,`
33	`)`
34	`from pptx_shapes import AUTO_SHAPE_NAME_MAP, extract_rotation`
35	`from pptx_tables import extract_table`
36	`from pptx_text import (`
37	`extract_bullet_properties,`
38	`extract_paragraph_properties,`
39	`extract_run_properties,`
40	`extract_text_frame_properties,`
41	`)`
42	`from pptx_utils import emu_to_inches`
43
# Largest embedded image blob, in bytes, that will be written to disk.
MAX_IMAGE_BLOB_BYTES = 100 * 1024**2  # 100 MB
45
46
class _ImageSecurityError(ValueError):
    """Image validation failure with security implications.

    Derives from ValueError, so an ``except ValueError`` clause also matches
    it; handlers that must not suppress it need to catch it first and re-raise.
    """
49
50
# Allowlist of image MIME types mapped to the file extension used on disk.
_CONTENT_TYPE_TO_EXT: dict[str, str] = {
    "image/bmp": "bmp",
    "image/gif": "gif",
    "image/jpeg": "jpg",
    "image/png": "png",
    "image/tiff": "tiff",
    # WMF is kept because legitimate PPTX files use it; blobs are checked
    # against known magic bytes. See CVE-2005-4560 for historical WMF risk.
    "image/x-wmf": "wmf",
    # EMF is kept for charts, SmartArt, and diagrams; blobs are checked
    # against known magic bytes.
    "image/emf": "emf",
    "image/x-emf": "emf",
    # SVG is parsed with a hardened XMLParser and converted to PNG by cairosvg.
    "image/svg+xml": "svg",
}
66
# Signatures accepted when validating the first four bytes of a WMF blob.
_WMF_ALDUS_MAGIC = bytes.fromhex("d7cdc69a")
_WMF_STANDARD_PREFIXES = (
    bytes.fromhex("01000900"),
    bytes.fromhex("02000900"),
)

# EMF signatures: EMR_HEADER record type at offset 0, " EMF" at offset 40.
_EMF_RECORD_TYPE = bytes.fromhex("01000000")
_EMF_SIGNATURE = b" EMF"
74
75
def _validate_wmf_magic_bytes(blob: bytes) -> None:
    """Raise _ImageSecurityError unless *blob* starts with a known WMF signature.

    Only the first four bytes are inspected; they must equal the Aldus
    placeable magic or one of the standard WMF header prefixes.
    """
    if len(blob) < 4:
        raise _ImageSecurityError("WMF blob too short for magic-byte validation")
    signature = blob[:4]
    accepted = (_WMF_ALDUS_MAGIC, *_WMF_STANDARD_PREFIXES)
    if signature not in accepted:
        raise _ImageSecurityError(
            "WMF blob does not start with a recognized file signature"
        )
86
87
def _validate_emf_magic_bytes(blob: bytes) -> None:
    """Raise _ImageSecurityError unless *blob* looks like an EMF file.

    Requires at least 44 bytes, the EMR_HEADER record type in bytes 0-3 and
    the " EMF" signature in bytes 40-43.
    """
    if len(blob) < 44:
        raise _ImageSecurityError("EMF blob too short for magic-byte validation")
    record_type_ok = blob[:4] == _EMF_RECORD_TYPE
    signature_ok = blob[40:44] == _EMF_SIGNATURE
    if not (record_type_ok and signature_ok):
        raise _ImageSecurityError("EMF blob does not match expected file signature")
94
95
def _sanitize_svg(blob: bytes) -> bytes:
    """Re-serialize an SVG after parsing it with a locked-down XML parser.

    The parser is configured not to resolve entities, not to touch the
    network, and not to load or validate against a DTD.

    Returns:
        The document re-serialized as UTF-8 bytes with an XML declaration.

    Raises:
        _ImageSecurityError: If the blob is not well-formed XML or the parsed
            document reports an internal DTD.
    """
    hardened_parser = etree.XMLParser(
        load_dtd=False,
        dtd_validation=False,
        no_network=True,
        resolve_entities=False,
    )
    try:
        svg_root = etree.fromstring(blob, parser=hardened_parser)
    except etree.XMLSyntaxError as exc:
        raise _ImageSecurityError(f"SVG blob is not well-formed XML: {exc}") from exc

    doc_info = svg_root.getroottree().docinfo
    if doc_info.internalDTD is not None:
        raise _ImageSecurityError("SVG blob contains a DTD declaration")

    return etree.tostring(svg_root, xml_declaration=True, encoding="UTF-8")
115
116
def _convert_svg_to_png(blob: bytes) -> bytes:
    """Return PNG bytes rendered by cairosvg from the sanitized form of *blob*.

    Sanitization happens first, so any _ImageSecurityError it raises
    propagates before cairosvg sees the data.
    """
    return cairosvg.svg2png(bytestring=_sanitize_svg(blob))
121
122
def extract_connector(shape) -> dict:
    """Build the element dict for a connector shape.

    The begin/end coordinates are converted with ``emu_to_inches``; whatever
    ``extract_line`` returns (when truthy) is merged into the same dict.
    """
    connector = {"type": "connector"}
    for endpoint in ("begin_x", "begin_y", "end_x", "end_y"):
        connector[endpoint] = emu_to_inches(getattr(shape, endpoint))
    connector["name"] = shape.name

    line_style = extract_line(shape)
    if line_style:
        connector.update(line_style)
    return connector
137
138
def _is_freeform(shape) -> bool:
    """Return True when the shape's XML contains an ``a:custGeom`` descendant."""
    drawingml = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    custom_geometry = shape._element.find(".//a:custGeom", drawingml)
    return custom_geometry is not None
143
144
def _is_background_image(shape, slide_w: float, slide_h: float) -> bool:
    """Tell whether a shape is large enough to act as a slide background.

    The shape qualifies when both its width and its height (converted with
    ``emu_to_inches``) reach at least 95% of the given slide dimensions.
    """
    shape_w = emu_to_inches(shape.width)
    shape_h = emu_to_inches(shape.height)
    wide_enough = shape_w >= slide_w * 0.95
    tall_enough = shape_h >= slide_h * 0.95
    return wide_enough and tall_enough
153
154
def _save_image_blob(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Write a picture's embedded blob under ``output_dir/images``.

    The content type must be in ``_CONTENT_TYPE_TO_EXT`` and the blob must not
    exceed ``MAX_IMAGE_BLOB_BYTES``. WMF and EMF blobs are checked for their
    magic bytes; SVG blobs are sanitized and stored as PNG.

    The file name is derived solely from ``img_count``, so two calls with the
    same count and extension write to the same file. ``slide_num`` is accepted
    but not used here.

    Returns:
        ``{"path": "images/<file>"}`` on success, or
        ``{"path": "LINKED_IMAGE_NOT_EMBEDDED"}`` when ``shape.image`` raises
        ValueError.

    Raises:
        ValueError: Unsupported content type or oversized blob.
        _ImageSecurityError: Failed magic-byte/SVG validation, or the target
            path resolves outside ``output_dir``.
    """
    try:
        image = shape.image
    except ValueError:
        return {"path": "LINKED_IMAGE_NOT_EMBEDDED"}

    extension = _CONTENT_TYPE_TO_EXT.get(image.content_type)
    if extension is None:
        raise ValueError(f"Unsupported image content type: {image.content_type}")

    payload = image.blob
    if len(payload) > MAX_IMAGE_BLOB_BYTES:
        raise ValueError(
            f"Image blob size {len(payload)} exceeds limit of "
            f"{MAX_IMAGE_BLOB_BYTES} bytes"
        )

    if extension == "wmf":
        _validate_wmf_magic_bytes(payload)
    elif extension == "emf":
        _validate_emf_magic_bytes(payload)
    elif extension == "svg":
        # SVG is never stored as-is; only the rasterized PNG is written.
        payload = _convert_svg_to_png(payload)
        extension = "png"

    file_name = f"image-{img_count:02d}.{extension}"
    target = output_dir / "images" / file_name

    # Refuse to write when the resolved target leaves the output directory.
    if not target.resolve().is_relative_to(output_dir.resolve()):
        raise _ImageSecurityError(
            f"Image path {target} escapes output directory {output_dir}"
        )

    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(payload)
    return {"path": f"images/{file_name}"}
196
197
def extract_freeform(shape) -> dict:
    """Build the element dict for a freeform (custom-geometry) shape.

    Besides geometry, rotation, fill and line properties, the result carries a
    ``paths`` list when the shape's ``a:custGeom/a:pathLst`` has paths with
    recognized commands. Each path is a list of command dicts (``moveTo``,
    ``lineTo``, ``cubicBezTo``, ``close``); point coordinates are the integer
    ``x``/``y`` attribute values read from the XML, defaulting to 0.
    """
    freeform = {
        "type": "freeform",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        freeform["rotation"] = rotation

    # Fill is best-effort: shapes without a usable fill are left without one.
    try:
        fill = extract_fill(shape.fill)
    except (AttributeError, TypeError):
        fill = None
    if fill is not None:
        freeform["fill"] = fill

    line_style = extract_line(shape)
    if line_style:
        freeform.update(line_style)

    drawingml = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    # XML tag -> emitted command name for commands that carry a single point.
    single_point_commands = {"moveTo": "moveTo", "lnTo": "lineTo"}

    def as_point(pt_el) -> dict:
        return {"x": int(pt_el.get("x", 0)), "y": int(pt_el.get("y", 0))}

    all_paths = []
    path_elements = shape._element.findall(
        ".//a:custGeom/a:pathLst/a:path", drawingml
    )
    for path_el in path_elements:
        commands = []
        for node in path_el:
            local_tag = node.tag
            if "}" in local_tag:
                local_tag = local_tag.split("}")[-1]

            if local_tag in single_point_commands:
                pt_el = node.find("a:pt", drawingml)
                if pt_el is not None:
                    commands.append(
                        {"cmd": single_point_commands[local_tag], **as_point(pt_el)}
                    )
            elif local_tag == "cubicBezTo":
                control_points = node.findall("a:pt", drawingml)
                commands.append(
                    {
                        "cmd": "cubicBezTo",
                        "pts": [as_point(p) for p in control_points],
                    }
                )
            elif local_tag == "close":
                commands.append({"cmd": "close"})
        if commands:
            all_paths.append(commands)

    if all_paths:
        freeform["paths"] = all_paths

    return freeform
272
273
# Default nesting limit applied when extracting group shapes recursively.
MAX_GROUP_DEPTH = 20
275
276
def extract_group(
    shape,
    slide_num: int,
    output_dir,
    img_count: int,
    *,
    _depth: int = 0,
    max_depth: int = MAX_GROUP_DEPTH,
) -> dict:
    """Build the element dict for a group shape, recursing into its children.

    Each child goes through ``extract_child_shape`` one level deeper; only
    truthy results are kept under ``elements``. ``img_count`` is forwarded to
    every child unchanged.

    Raises:
        ValueError: When ``_depth`` has reached ``max_depth``.
    """
    if _depth >= max_depth:
        raise ValueError(f"Group nesting depth {_depth} exceeds limit of {max_depth}")

    group = {
        "type": "group",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
        "elements": [],
    }
    extracted_children = (
        extract_child_shape(
            member,
            slide_num,
            output_dir,
            img_count,
            _depth=_depth + 1,
            max_depth=max_depth,
        )
        for member in shape.shapes
    )
    group["elements"].extend(item for item in extracted_children if item)
    return group
313
314
def _extract_shape_by_type(
    shape,
    slide_num: int,
    output_dir,
    img_count: int,
    *,
    _depth: int = 0,
    max_depth: int = MAX_GROUP_DEPTH,
) -> dict | None:
    """Dispatch extraction based on shape_type, table/chart, or freeform.

    Args:
        shape: Shape object exposing ``shape_type`` (compared against integer
            type codes below).
        slide_num: Slide number, forwarded to image and group extraction.
        output_dir: Output directory, forwarded to image and group extraction.
        img_count: Image counter, forwarded to image and group extraction.
        _depth: Current group nesting depth, forwarded to ``extract_group``.
        max_depth: Group nesting limit, forwarded to ``extract_group``.

    Returns:
        The element dict from the matching extractor, or None when no
        extractor applies to the shape.
    """
    shape_type = shape.shape_type

    # Simple shape_type dispatch (these extractors need no extra context).
    # Built per call because the extractors are defined later in the module.
    simple_extractors = {
        17: extract_textbox,  # TEXT_BOX
        1: extract_shape,  # AUTO_SHAPE
        9: extract_connector,  # LINE / CONNECTOR
    }
    extractor = simple_extractors.get(shape_type)
    if extractor:
        return extractor(shape)

    if shape_type == 13:  # PICTURE
        return extract_image(shape, output_dir, slide_num, img_count)
    if shape_type == 6:  # GROUP
        return extract_group(
            shape,
            slide_num,
            output_dir,
            img_count,
            _depth=_depth,
            max_depth=max_depth,
        )

    # Table and chart detection via attribute check
    if hasattr(shape, "has_table") and shape.has_table:
        return extract_table(shape)
    if hasattr(shape, "has_chart") and shape.has_chart:
        return extract_chart(shape)
    if _is_freeform(shape):
        return extract_freeform(shape)

    return None
358
359
def extract_child_shape(
    shape,
    slide_num: int,
    output_dir,
    img_count: int,
    *,
    _depth: int = 0,
    max_depth: int = MAX_GROUP_DEPTH,
) -> dict | None:
    """Extract a single child shape within a group.

    Delegates to ``_extract_shape_by_type``. When that returns None, a generic
    rectangle element is produced from the shape's geometry and name, tagged
    with ``_unrecognized_shape_type`` when the shape reports a type.

    Returns:
        The extracted element dict; the fallback means a dict is produced for
        every shape.
    """
    result = _extract_shape_by_type(
        shape,
        slide_num,
        output_dir,
        img_count,
        _depth=_depth,
        max_depth=max_depth,
    )
    if result is not None:
        return result

    # Fallback for unrecognized shape types
    elem = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }
    if shape.shape_type is not None:
        elem["_unrecognized_shape_type"] = int(shape.shape_type)
    return elem
394
395
def _has_formatting_variation(runs: list) -> bool:
    """Report whether the run dicts disagree on any formatting property.

    ``font``, ``size`` and ``color`` are compared only across runs that define
    the key; ``bold``, ``italic`` and ``underline`` treat a missing key as
    False. Zero or one run never counts as variation.
    """
    if len(runs) <= 1:
        return False

    # Every value set is built before any is inspected.
    present_only = [
        {run.get(key) for run in runs if key in run}
        for key in ("font", "size", "color")
    ]
    flag_values = [
        {run.get(key, False) for run in runs}
        for key in ("bold", "italic", "underline")
    ]
    return any(len(values) > 1 for values in present_only + flag_values)
414
415
# Canonical formatting keys mapped to the key names written to the output YAML,
# one mapping for shapes and one for text boxes.
_SHAPE_EXTRACT_KEYS = dict(
    font="text_font",
    size="text_size",
    color="text_color",
    bold="text_bold",
)
_TEXTBOX_EXTRACT_KEYS = dict(
    font="font",
    size="font_size",
    color="font_color",
    bold="font_bold",
)

# Keys copied from the first non-empty paragraph up to the element level: the
# mapped font/size/color/bold names followed by the keys shared by both kinds.
_COMMON_PROMOTE_KEYS = ("italic", "alignment", "char_spacing")
_SHAPE_PROMOTE_KEYS = (*_SHAPE_EXTRACT_KEYS.values(), *_COMMON_PROMOTE_KEYS)
_TEXTBOX_PROMOTE_KEYS = (*_TEXTBOX_EXTRACT_KEYS.values(), *_COMMON_PROMOTE_KEYS)
449
450
def _extract_text_content(text_frame, keys: dict, promote_keys: tuple) -> dict:
    """Extract text content from a text frame into an element dict fragment.

    Handles paragraph iteration, run extraction, rich-text detection, and
    paragraph/element-level key promotion.

    Args:
        text_frame: python-pptx TextFrame object.
        keys: Key-mapping dict for font/size/color/bold output names.
        promote_keys: Tuple of keys to promote from first paragraph to element level.

    Returns:
        Dict with text, text frame properties, paragraph data, and promoted
        defaults. Empty when the frame's stripped text is empty.
    """
    result = {}
    text = text_frame.text.strip()
    if not text:
        return result

    result["text"] = text

    tf_props = extract_text_frame_properties(text_frame)
    if tf_props:
        result.update(tf_props)

    para_dicts = []
    for para in text_frame.paragraphs:
        run_info = {}
        para_runs = []
        for run in para.runs:
            font_info = extract_font_info(run.font)
            run_extra = extract_run_properties(run)
            para_runs.append({"text": run.text, **font_info, **run_extra})
            # The first run with any properties stands in for the paragraph.
            if not run_info:
                run_info = {**font_info, **run_extra}

        para_info = extract_paragraph_font(para)
        para_spacing = extract_paragraph_properties(para)
        bullet_props = extract_bullet_properties(para)
        alignment = extract_alignment(para)
        # Run-level values override paragraph-level ones on key collisions.
        merged = {**para_info, **run_info}

        p_dict = {"text": para.text}
        if "font" in merged:
            p_dict[keys["font"]] = merged["font"]
        if "size" in merged:
            p_dict[keys["size"]] = merged["size"]
        if "color" in merged:
            p_dict[keys["color"]] = merged["color"]
        if merged.get("bold"):
            p_dict[keys["bold"]] = True
        if merged.get("italic"):
            p_dict["italic"] = True
        if merged.get("underline"):
            p_dict["underline"] = True
        if merged.get("hyperlink"):
            p_dict["hyperlink"] = merged["hyperlink"]
        if "char_spacing" in merged:
            p_dict["char_spacing"] = merged["char_spacing"]
        if "effect" in merged:
            p_dict["text_effect"] = merged["effect"]
        if alignment:
            p_dict["alignment"] = alignment
        if para_spacing:
            p_dict.update(para_spacing)
        if bullet_props:
            p_dict.update(bullet_props)
        # Per-run detail is only kept when the runs actually differ.
        if _has_formatting_variation(para_runs):
            p_dict["runs"] = para_runs
        para_dicts.append(p_dict)

    non_empty = [p for p in para_dicts if p["text"].strip()]
    any_has_runs = any("runs" in p for p in para_dicts)
    if len(para_dicts) > 1 or any_has_runs:
        # Rich text: keep the paragraph list and promote selected keys only.
        result["paragraphs"] = para_dicts
        if non_empty:
            first = non_empty[0]
            for key in promote_keys:
                if key in first:
                    result[key] = first[key]
    elif non_empty:
        # Single plain paragraph: flatten everything except its text.
        first = non_empty[0]
        for key, val in first.items():
            if key != "text":
                result[key] = val

    return result
538
539
def extract_shape(shape) -> dict:
    """Build the element dict for an auto shape.

    Collects geometry, rotation, the shape name looked up in
    ``AUTO_SHAPE_NAME_MAP`` (falling back to ``"rectangle"``), the first
    adjustment value as ``corner_radius``, fill, line, effect, and any text.
    """
    described = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        described["rotation"] = rotation

    # Resolve the shape name from the auto_shape_type enum.
    try:
        shape_name = AUTO_SHAPE_NAME_MAP.get(shape.auto_shape_type, "rectangle")
    except (AttributeError, TypeError):
        shape_name = "rectangle"
    described["shape"] = shape_name

    # First adjustment value, stored as corner_radius (rounded rectangles).
    try:
        if shape.adjustments and len(shape.adjustments) > 0:
            described["corner_radius"] = round(shape.adjustments[0], 5)
    except (AttributeError, TypeError, IndexError):
        pass

    # Fill is best-effort.
    try:
        fill = extract_fill(shape.fill)
    except (AttributeError, TypeError):
        fill = None
    if fill is not None:
        described["fill"] = fill

    line_style = extract_line(shape)
    if line_style:
        described.update(line_style)

    # Effect list (outer shadow).
    shadow = extract_effect_list(shape)
    if shadow:
        described["effect"] = shadow

    if shape.has_text_frame:
        described.update(
            _extract_text_content(
                shape.text_frame, _SHAPE_EXTRACT_KEYS, _SHAPE_PROMOTE_KEYS
            )
        )

    return described
595
596
def extract_textbox(shape) -> dict:
    """Build the element dict for a text box.

    ``text`` is always present (empty when the shape has no text frame);
    rotation and the extracted text content are merged in when available.
    """
    if shape.has_text_frame:
        plain_text = shape.text_frame.text.strip()
    else:
        plain_text = ""

    textbox = {"type": "textbox"}
    for side in ("left", "top", "width", "height"):
        textbox[side] = emu_to_inches(getattr(shape, side))
    textbox["text"] = plain_text
    textbox["name"] = shape.name

    rotation = extract_rotation(shape)
    if rotation is not None:
        textbox["rotation"] = rotation

    if shape.has_text_frame:
        textbox.update(
            _extract_text_content(
                shape.text_frame, _TEXTBOX_EXTRACT_KEYS, _TEXTBOX_PROMOTE_KEYS
            )
        )

    return textbox
620
621
def extract_image(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Build the element dict for a picture and save its image file.

    ``_ImageSecurityError`` from saving is re-raised. Any other ValueError is
    logged and yields a ``"SKIPPED"`` element carrying ``_skipped_reason``.
    Linked (non-embedded) pictures yield a ``"LINKED_IMAGE_NOT_EMBEDDED"``
    element with a ``_note``. Saved pictures additionally get selected
    ``p:blipFill`` attributes, the ``a:srcRect`` crop values, and an
    ``opacity`` computed as the ``a:alphaModFix`` ``amt`` divided by 1000.
    """

    def describe(path: str) -> dict:
        # Fields shared by every image element, in output order.
        return {
            "type": "image",
            "path": path,
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
        }

    try:
        saved = _save_image_blob(shape, output_dir, slide_num, img_count)
    except _ImageSecurityError:
        raise
    except ValueError as exc:
        logging.warning("Skipping image on slide %d: %s", slide_num, exc)
        skipped = describe("SKIPPED")
        skipped["_skipped_reason"] = str(exc)
        return skipped

    is_linked = saved["path"] == "LINKED_IMAGE_NOT_EMBEDDED"
    image = describe(saved["path"])
    if is_linked:
        image["_note"] = "Image was linked, not embedded in the PPTX"

    rotation = extract_rotation(shape)
    if rotation is not None:
        image["rotation"] = rotation
    if is_linked:
        return image

    blip_fill = shape._element.find(qn("p:blipFill"))
    if blip_fill is not None:
        # Preserve blipFill attributes (rotWithShape, dpi) when present.
        kept_attrs = {
            attr: blip_fill.get(attr)
            for attr in ("rotWithShape", "dpi")
            if blip_fill.get(attr) is not None
        }
        if kept_attrs:
            image["blip_fill_attrs"] = kept_attrs

        # Crop values come from srcRect on the blipFill.
        src_rect = blip_fill.find(qn("a:srcRect"))
        if src_rect is not None and src_rect.attrib:
            crop = {
                side: int(src_rect.get(side))
                for side in ("l", "t", "r", "b")
                if src_rect.get(side) is not None
            }
            if crop:
                image["crop"] = crop

    # Opacity comes from alphaModFix on the blip element.
    blip = shape._element.find(".//" + qn("a:blip"))
    if blip is not None:
        alpha_fix = blip.find(qn("a:alphaModFix"))
        if alpha_fix is not None:
            amount = int(alpha_fix.get("amt", "100000"))
            image["opacity"] = round(amount / 1000, 1)

    return image
701
702
def detect_global_style(prs) -> dict:
    """Analyze the presentation to detect common styling patterns.

    Detects multiple theme zones (e.g., light and dark slides) by clustering
    slides based on background brightness and dominant text colors.

    Args:
        prs: Presentation object exposing ``slides``, ``slide_width``,
            ``slide_height`` and ``core_properties``.

    Returns:
        Style dict with ``dimensions``, ``defaults`` and ``typography``, plus
        ``colors``, ``themes`` and ``metadata`` when any were detected.
    """
    bg_colors = Counter()
    text_colors = Counter()
    accent_colors = Counter()
    fill_colors = Counter()
    font_names = Counter()
    font_sizes = Counter()

    # Per-slide analysis for theme clustering
    slide_profiles = []

    slide_w = emu_to_inches(prs.slide_width)
    slide_h = emu_to_inches(prs.slide_height)

    for slide_idx, slide in enumerate(prs.slides):
        slide_num = slide_idx + 1
        slide_bg = None
        slide_text_colors = Counter()
        slide_fill_colors = Counter()
        has_bg_image = False

        # Detect background colors (only string fill results are counted)
        try:
            fill_result = extract_fill(slide.background.fill)
            if isinstance(fill_result, str):
                bg_colors[fill_result] += 1
                slide_bg = fill_result
        except (AttributeError, TypeError):
            pass

        for i, shape in enumerate(slide.shapes):
            # Detect full-slide background images: only the first shape on
            # the slide is considered, and it is excluded from color stats.
            if (
                i == 0
                and shape.shape_type == 13
                and _is_background_image(shape, slide_w, slide_h)
            ):
                has_bg_image = True
                continue

            # Collect fill colors; shapes under 0.1 inch tall count as accents
            try:
                fill_result = extract_fill(shape.fill)
                if isinstance(fill_result, str):
                    h = emu_to_inches(shape.height)
                    if h < 0.1:
                        accent_colors[fill_result] += 1
                    else:
                        fill_colors[fill_result] += 1
                        slide_fill_colors[fill_result] += 1
            except (AttributeError, TypeError):
                pass

            # Collect font information
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    for run in para.runs:
                        if run.font.name:
                            base_name = normalize_font_family(run.font.name)
                            font_names[base_name] += 1
                        if run.font.size:
                            font_sizes[int(run.font.size.pt)] += 1
                        try:
                            color = extract_color(run.font.color)
                            if isinstance(color, str) and color.startswith("#"):
                                text_colors[color] += 1
                                slide_text_colors[color] += 1
                        except (AttributeError, TypeError):
                            pass

        # Classify slide brightness
        bg_brightness = _classify_slide_brightness(
            slide_bg, slide_text_colors, has_bg_image
        )
        slide_profiles.append(
            {
                "slide": slide_num,
                "bg_color": slide_bg,
                "bg_brightness": bg_brightness,
                "has_bg_image": has_bg_image,
                "text_colors": dict(slide_text_colors),
                "fill_colors": dict(slide_fill_colors),
            }
        )

    # Build global color map from frequency analysis
    colors = _build_color_map(bg_colors, fill_colors, text_colors, accent_colors)

    # Detect themes by clustering slides into light/dark groups
    themes = _cluster_themes(slide_profiles, text_colors, fill_colors, accent_colors)

    # Determine primary fonts: the most frequent code-like family and the most
    # frequent other family are chosen independently, so a code font is found
    # even when it is used less often than the body font.
    body_font = "Segoe UI"
    code_font = "Cascadia Code"
    code_keywords = ("cascadia", "consolas", "mono", "courier")
    found_body = False
    found_code = False
    for family, _count in font_names.most_common():
        is_code_family = any(kw in family.lower() for kw in code_keywords)
        if is_code_family:
            if not found_code:
                code_font = family
                found_code = True
        elif not found_body:
            body_font = family
            found_body = True
        if found_body and found_code:
            break

    # Determine font sizes from the distinct sizes strictly between 8 and 60:
    # the middle one becomes the body size, the one at the 85% position the
    # heading size (usage counts are not weighted in).
    heading_size = 28
    body_size = 16
    if font_sizes:
        filtered = {s: c for s, c in font_sizes.items() if 8 < s < 60}
        if filtered:
            sorted_sizes = sorted(filtered.keys())
            body_size = sorted_sizes[len(sorted_sizes) // 2]
            heading_size = sorted_sizes[int(len(sorted_sizes) * 0.85)]

    style = {
        "dimensions": {
            "width_inches": emu_to_inches(prs.slide_width),
            "height_inches": emu_to_inches(prs.slide_height),
            # NOTE(review): reported as "16:9" regardless of the measured
            # dimensions -- confirm whether other aspect ratios need a label.
            "format": "16:9",
        },
        "defaults": {
            "speaker_notes_required": True,
        },
        "typography": {
            "body_font": body_font,
            "code_font": code_font,
            "heading_size": heading_size,
            "body_size": body_size,
        },
    }

    if colors:
        style["colors"] = colors

    if themes:
        style["themes"] = themes

    # Extract presentation metadata (only non-empty core properties)
    metadata = {}
    props = prs.core_properties
    for attr in ("title", "author", "subject", "keywords", "description", "category"):
        val = getattr(props, attr, None)
        if val:
            metadata[attr] = val
    if metadata:
        style["metadata"] = metadata

    return style
853
854
855	`def _classify_slide_brightness(`
856	`bg_color: str \| None, text_colors: Counter, has_bg_image: bool`
857	`) -> str:`
858	`"""Classify a slide as 'light' or 'dark' based on background and text colors."""`
859	`if has_bg_image and bg_color is None:`
860	`# Slides with background images and no solid bg — infer from text colors`
861	`dark_text = sum(`
862	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100`
863	`)`
864	`light_text = sum(`
865	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150`
866	`)`
867	`return "light" if dark_text >= light_text else "dark"`
868
869	`if bg_color and isinstance(bg_color, str) and bg_color.startswith("#"):`
870	`return "light" if hex_brightness(bg_color) > 128 else "dark"`
871
872	`# Default: infer from text colors`
873	`dark_text = sum(`
874	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100`
875	`)`
876	`light_text = sum(`
877	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150`
878	`)`
879	`if dark_text > light_text:`
880	`return "light"`
881	`if light_text > dark_text:`
882	`return "dark"`
883	`return "dark"`
884
885
def _build_color_map(
    bg_colors: Counter,
    fill_colors: Counter,
    text_colors: Counter,
    accent_colors: Counter,
) -> dict:
    """Derive the global named color map from usage counts.

    The most common background and fill become ``bg_dark`` and ``bg_card``.
    The five most common text colors are bucketed by brightness into
    ``text_white`` (> 200), ``text_dark`` (< 80) and ``text_gray`` (80-200),
    the first color seen in each bucket winning. The three most common accent
    colors are named ``accent_blue``, ``accent_teal``, ``accent_green``.
    """
    palette = {}
    if bg_colors:
        palette["bg_dark"] = bg_colors.most_common(1)[0][0]
    if fill_colors:
        palette["bg_card"] = fill_colors.most_common(1)[0][0]

    for color_hex, _count in text_colors.most_common(5):
        brightness = hex_brightness(color_hex)
        if brightness > 200:
            slot = "text_white"
        elif brightness < 80:
            slot = "text_dark"
        elif 80 <= brightness <= 200:
            slot = "text_gray"
        else:
            continue
        if slot not in palette:
            palette[slot] = color_hex

    accent_slots = ["accent_blue", "accent_teal", "accent_green"]
    for slot, (color_hex, _count) in zip(accent_slots, accent_colors.most_common(3)):
        palette[slot] = color_hex

    return palette
914
915
def _cluster_themes(
    slide_profiles: list[dict],
    text_colors: Counter,
    fill_colors: Counter,
    accent_colors: Counter,
) -> list[dict]:
    """Group slide profiles into a light theme and a dark theme.

    Returns an empty list unless both light and dark slides exist. Otherwise
    returns the light theme followed by the dark theme, each with its sorted
    slide numbers and a color map built from that group's text colors, fill
    colors and (dark only) background colors. The ``text_colors``,
    ``fill_colors`` and ``accent_colors`` arguments are not read here.
    """

    def with_brightness(label: str) -> list[dict]:
        return [p for p in slide_profiles if p["bg_brightness"] == label]

    def combined(profiles: list[dict], field: str) -> Counter:
        totals = Counter()
        for profile in profiles:
            totals.update(profile[field])
        return totals

    light_group = with_brightness("light")
    dark_group = with_brightness("dark")

    # Themes are only meaningful when the deck mixes both kinds of slide.
    if not (light_group and dark_group):
        return []

    # Light theme: dark text is primary, mid-brightness text is secondary.
    light_palette = {}
    for color_hex, _count in combined(light_group, "text_colors").most_common(5):
        brightness = hex_brightness(color_hex)
        if brightness < 80 and "text_primary" not in light_palette:
            light_palette["text_primary"] = color_hex
        elif 80 <= brightness <= 200 and "text_secondary" not in light_palette:
            light_palette["text_secondary"] = color_hex
    light_fill_totals = combined(light_group, "fill_colors")
    if light_fill_totals:
        light_palette["bg_card"] = light_fill_totals.most_common(1)[0][0]

    # Dark theme: background first, then bright text as primary.
    dark_palette = {}
    dark_backgrounds = Counter(p["bg_color"] for p in dark_group if p["bg_color"])
    if dark_backgrounds:
        dark_palette["bg_dark"] = dark_backgrounds.most_common(1)[0][0]
    for color_hex, _count in combined(dark_group, "text_colors").most_common(5):
        brightness = hex_brightness(color_hex)
        if brightness > 200 and "text_primary" not in dark_palette:
            dark_palette["text_primary"] = color_hex
        elif 80 <= brightness <= 200 and "text_secondary" not in dark_palette:
            dark_palette["text_secondary"] = color_hex
    dark_fill_totals = combined(dark_group, "fill_colors")
    if dark_fill_totals:
        dark_palette["bg_card"] = dark_fill_totals.most_common(1)[0][0]

    return [
        {
            "name": "light",
            "slides": sorted(p["slide"] for p in light_group),
            "colors": light_palette,
        },
        {
            "name": "dark",
            "slides": sorted(p["slide"] for p in dark_group),
            "colors": dark_palette,
        },
    ]
988
989
def extract_slide(
    slide,
    slide_num: int,
    output_dir: Path,
    slide_dims: tuple[float, float] | None = None,
) -> tuple[dict, Path]:
    """Extract all elements from a slide into a content.yaml structure.

    Args:
        slide: Slide object whose shapes are extracted.
        slide_num: Slide number; used for the content dict and the name of
            the per-slide directory (``slide-NNN``).
        output_dir: Directory under which the per-slide directory is created.
        slide_dims: Accepted for callers that pass slide dimensions; not read
            by this function.

    Returns:
        A ``(content, slide_dir)`` tuple: the content dict (``slide``,
        ``title``, ``elements`` and optional ``layout``, ``background``,
        ``speaker_notes``) and the directory created for this slide.
    """
    slide_dir = output_dir / f"slide-{slide_num:03d}"
    slide_dir.mkdir(parents=True, exist_ok=True)

    content = {
        "slide": slide_num,
        "title": "",
        "elements": [],
    }

    # Extract layout name
    try:
        layout_name = slide.slide_layout.name
        if layout_name:
            content["layout"] = layout_name
    except (AttributeError, TypeError):
        pass

    # Extract slide background (only when the slide overrides the master)
    try:
        if not slide.follow_master_background:
            fill_result = extract_fill(slide.background.fill)
            if fill_result is not None:
                content["background"] = {"fill": fill_result}
    except (AttributeError, TypeError):
        pass

    # Extract speaker notes (include empty string when notes slide exists)
    try:
        if slide.has_notes_slide:
            notes = slide.notes_slide.notes_text_frame.text.strip()
            content["speaker_notes"] = notes
    except (AttributeError, TypeError):
        pass

    img_count = 0

    for z_index, shape in enumerate(list(slide.shapes)):
        shape_type = shape.shape_type

        # Track image count for filename generation (top-level pictures only)
        if shape_type == 13:
            img_count += 1

        # Handle placeholders specially (extract as textbox with marker);
        # placeholders without a text frame are dropped.
        if shape_type == 14:
            if not shape.has_text_frame:
                continue
            elem = extract_textbox(shape)
            elem["_placeholder"] = True
            elem["z_order"] = z_index
            content["elements"].append(elem)
            continue

        # Use shared dispatcher for all other shape types
        elem = _extract_shape_by_type(shape, slide_num, slide_dir, img_count)
        if elem is not None:
            elem["z_order"] = z_index
            content["elements"].append(elem)

            # Detect title from the first short text box near the top of slide
            if (
                shape_type == 17
                and not content["title"]
                and emu_to_inches(shape.top) < 1.5
            ):
                text = shape.text_frame.text.strip() if shape.has_text_frame else ""
                if text and len(text) < 100:
                    content["title"] = text
            continue

        # Fallback for unrecognized shape types
        elem_data = {
            "type": "shape",
            "shape": "rectangle",
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
            "z_order": z_index,
        }
        if shape_type is not None:
            elem_data["_unrecognized_shape_type"] = int(shape_type)
        content["elements"].append(elem_data)

    return content, slide_dir
1083
1084
def _resolve_theme_colors(prs) -> dict:
    """Build a theme color name -> ``#hex`` mapping from the deck's theme XML.

    The theme part is located through the first slide master's
    relationships and its ``clrScheme`` children are mapped to canonical
    names (dark_1, light_1, accent_1, ...). The dark/light entries are
    additionally published under their text_N / background_N aliases.

    Returns an empty or partial mapping when the theme is missing or
    malformed; a warning is logged when the theme XML cannot be parsed.
    """
    # clrScheme child tag -> canonical theme color name
    canonical_names = {
        "dk1": "dark_1",
        "dk2": "dark_2",
        "lt1": "light_1",
        "lt2": "light_2",
        "accent1": "accent_1",
        "accent2": "accent_2",
        "accent3": "accent_3",
        "accent4": "accent_4",
        "accent5": "accent_5",
        "accent6": "accent_6",
        "hlink": "hyperlink",
        "folHlink": "followed_hyperlink",
    }
    # canonical name -> alias that receives the same resolved value
    alias_for = {
        "dark_1": "text_1",
        "dark_2": "text_2",
        "light_1": "background_1",
        "light_2": "background_2",
    }
    resolved = {}
    try:
        drawingml_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
        first_master = prs.slide_masters[0]

        # The theme is a related part exposed only as a blob, so it is
        # parsed here with lxml (entity resolution and network disabled).
        theme_root = None
        for relationship in first_master.part.rels.values():
            if "theme" not in relationship.reltype:
                continue
            safe_parser = etree.XMLParser(resolve_entities=False, no_network=True)
            theme_root = etree.fromstring(
                relationship.target_part.blob, parser=safe_parser
            )
            break

        scheme = (
            theme_root.find(f".//{{{drawingml_ns}}}clrScheme")
            if theme_root is not None
            else None
        )
        if scheme is not None:
            for node in scheme:
                name = canonical_names.get(node.tag.rpartition("}")[2])
                if name is None:
                    continue

                # Prefer an explicit srgbClr; otherwise fall back to sysClr
                color_node = node.find(f"{{{drawingml_ns}}}srgbClr")
                attribute = "val"
                if color_node is None:
                    color_node = node.find(f"{{{drawingml_ns}}}sysClr")
                    attribute = "lastClr"
                if color_node is not None:
                    resolved[name] = f"#{color_node.get(attribute, '000000')}"

                # Mirror the value under its alias when one is defined
                alias = alias_for.get(name)
                if alias is not None and name in resolved:
                    resolved[alias] = resolved[name]
    except (AttributeError, TypeError, IndexError):
        # Theme elements missing or malformed; degrade gracefully
        pass
    except etree.XMLSyntaxError:
        logging.warning(
            "Malformed theme XML in slide master; skipping theme color resolution"
        )
    return resolved
1156
1157
# Upper bound on container nesting walked while resolving theme references.
MAX_THEME_REF_DEPTH = 50


def _resolve_theme_refs_in_content(
    content: dict,
    theme_colors: dict,
    *,
    max_depth: int = MAX_THEME_REF_DEPTH,
) -> dict:
    """Return a copy of *content* with ``@theme_name`` strings resolved.

    Dicts and lists are rebuilt recursively. Any string that starts with
    ``@`` is looked up (without the ``@``) in *theme_colors*; unknown names
    are left as they are. All other values pass through unchanged.

    Raises:
        ValueError: when the nesting level reaches *max_depth*.
    """

    def _substitute(node, _depth: int = 0):
        # Guard first so that every visited value counts towards the limit.
        if _depth >= max_depth:
            raise ValueError(
                f"Theme reference nesting depth {_depth} exceeds limit of {max_depth}"
            )

        child_depth = _depth + 1
        if isinstance(node, dict):
            return {
                key: _substitute(item, child_depth) for key, item in node.items()
            }
        if isinstance(node, list):
            return [_substitute(item, child_depth) for item in node]
        if isinstance(node, str) and node.startswith("@"):
            return theme_colors.get(node[1:], node)
        return node

    return _substitute(content)
1187
1188
def main():
    """CLI entry point for extracting PPTX content into YAML.

    Parses the command line, writes the detected global style to
    ``<output-dir>/global/style.yaml`` and writes one ``content.yaml`` per
    extracted slide into the directory returned by ``extract_slide``.

    Invalid ``--slides`` values or a missing ``--input`` file are reported
    through ``argparse`` (usage message, exit status 2) before anything is
    written to disk.
    """
    parser = argparse.ArgumentParser(
        description="Extract content from a PPTX into YAML"
    )
    parser.add_argument("--input", required=True, help="Input PPTX file path")
    parser.add_argument("--output-dir", required=True, help="Output content directory")
    parser.add_argument(
        "--slides", help="Comma-separated slide numbers to extract (default: all)"
    )
    parser.add_argument(
        "--resolve-themes",
        action="store_true",
        help="Resolve @theme references to actual hex RGB values from the deck's theme",
    )
    args = parser.parse_args()

    # Validate --slides up front so a typo yields a usage error instead of a
    # traceback, and before any directory is created.
    slide_filter = None
    if args.slides:
        slide_filter = set()
        for token in args.slides.split(","):
            token = token.strip()
            if not token:
                continue  # tolerate stray commas such as "3,,7" or "3,7,"
            try:
                number = int(token)
            except ValueError:
                parser.error(f"--slides: invalid slide number {token!r}")
            if number < 1:
                parser.error(f"--slides: slide numbers start at 1, got {number}")
            slide_filter.add(number)
        if not slide_filter:
            parser.error("--slides: no slide numbers given")

    pptx_path = Path(args.input)
    if not pptx_path.is_file():
        parser.error(f"--input: file not found: {pptx_path}")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    def write_yaml(path, data):
        """Serialize *data* to *path* as block-style, insertion-ordered YAML."""
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(
                data,
                f,
                default_flow_style=False,
                sort_keys=False,
                allow_unicode=True,
            )

    prs = Presentation(str(pptx_path))
    total_slides = len(prs.slides)
    slide_dims = (emu_to_inches(prs.slide_width), emu_to_inches(prs.slide_height))
    print(f"Extracting from: {pptx_path}")
    print(f"Slides: {total_slides}")
    w, h = slide_dims
    print(f'Dimensions: {w}" x {h}"')

    if slide_filter is not None:
        missing = sorted(n for n in slide_filter if n > total_slides)
        if missing:
            logging.warning(
                "Requested slide(s) not present in deck of %d slide(s): %s",
                total_slides,
                ", ".join(str(n) for n in missing),
            )

    # Detect and save global style
    global_style = detect_global_style(prs)

    # Resolve theme colors when requested
    theme_colors = {}
    if args.resolve_themes:
        theme_colors = _resolve_theme_colors(prs)
        if theme_colors:
            global_style["theme_colors"] = theme_colors
            global_style = _resolve_theme_refs_in_content(global_style, theme_colors)
            print(f"Resolved {len(theme_colors)} theme colors")

    global_dir = output_dir / "global"
    global_dir.mkdir(parents=True, exist_ok=True)
    style_path = global_dir / "style.yaml"
    write_yaml(style_path, global_style)
    print(f"Global style saved to {style_path}")

    # Extract slides (filtered or all)
    extracted = 0
    for slide_num, slide in enumerate(prs.slides, start=1):
        if slide_filter is not None and slide_num not in slide_filter:
            continue
        content, slide_dir = extract_slide(
            slide, slide_num, output_dir, slide_dims=slide_dims
        )

        # Resolve @theme references to hex values when --resolve-themes is set
        if args.resolve_themes and theme_colors:
            content = _resolve_theme_refs_in_content(content, theme_colors)

        content_path = slide_dir / "content.yaml"
        write_yaml(content_path, content)
        # "title" is always present (possibly empty), so fall back on
        # falsiness rather than on a missing key.
        title = content.get("title") or "Untitled"
        print(f"Slide {slide_num}: {title} -> {content_path}")
        extracted += 1

    print(f"\nExtraction complete. {extracted} slide(s) extracted to {output_dir}")
1276
1277
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
1280

microsoft/hve-core

Branches

Tags

Clone