microsoft/hve-core

Public

mirrored fromhttps://github.com/microsoft/hve-coreAvailable

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

ci/884-codeql-python-analysis

Find a branch or tag

Branches

ci/884-codeql-python-analysis

Clone

HTTPS

Download ZIP

hve-core/.github/skills/experimental/powerpoint/scripts

.github/skills/experimental/powerpoint/scripts/extract_content.py

1101lines · modecode

Raw Download

Latest commit unavailable.

unknown

"""Extract content from an existing PPTX into YAML content and style definitions.

Usage::

    python extract_content.py \
        --input existing-deck.pptx --output-dir content/

    python extract_content.py \
        --input existing-deck.pptx --output-dir content/ \
        --slides 3,7,15

    python extract_content.py \
        --input existing-deck.pptx --output-dir content/ \
        --resolve-themes
"""
12
13	`import argparse`
14	`from collections import Counter`
15	`from pathlib import Path`
16
17	`import yaml`
18	`from lxml import etree`
19	`from pptx import Presentation`
20	`from pptx.oxml.ns import qn`
21	`from pptx_charts import extract_chart`
22	`from pptx_colors import extract_color, hex_brightness`
23	`from pptx_fills import extract_effect_list, extract_fill, extract_line`
24	`from pptx_fonts import (`
25	`extract_alignment,`
26	`extract_font_info,`
27	`extract_paragraph_font,`
28	`normalize_font_family,`
29	`)`
30	`from pptx_shapes import AUTO_SHAPE_NAME_MAP, extract_rotation`
31	`from pptx_tables import extract_table`
32	`from pptx_text import (`
33	`extract_bullet_properties,`
34	`extract_paragraph_properties,`
35	`extract_run_properties,`
36	`extract_text_frame_properties,`
37	`)`
38	`from pptx_utils import emu_to_inches`
39
40
def extract_connector(shape) -> dict:
    """Build the element dict for a connector shape.

    Args:
        shape: Object exposing ``begin_x``, ``begin_y``, ``end_x``, ``end_y``
            and ``name``.

    Returns:
        Dict with ``type`` set to ``"connector"``, the four endpoint
        coordinates passed through ``emu_to_inches``, the shape name, and any
        keys returned by ``extract_line``.
    """
    connector = {"type": "connector"}
    for endpoint in ("begin_x", "begin_y", "end_x", "end_y"):
        connector[endpoint] = emu_to_inches(getattr(shape, endpoint))
    connector["name"] = shape.name

    stroke = extract_line(shape)
    if stroke:
        connector.update(stroke)
    return connector
55
56
def _is_freeform(shape) -> bool:
    """Return True when the shape's XML contains an ``a:custGeom`` descendant."""
    drawingml = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    custom_geometry = shape._element.find(".//a:custGeom", drawingml)
    return custom_geometry is not None
61
62
def _is_background_image(shape, slide_w: float, slide_h: float) -> bool:
    """Report whether a shape is large enough to count as a slide background.

    Only the size is compared: both the width and the height (after
    ``emu_to_inches``) must reach 95% of the corresponding slide dimension.
    The shape's position is not examined.
    """
    width_in = emu_to_inches(shape.width)
    height_in = emu_to_inches(shape.height)
    covers_width = width_in >= slide_w * 0.95
    covers_height = height_in >= slide_h * 0.95
    return covers_width and covers_height
71
72
def _save_image_blob(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Write a picture's embedded bytes under ``output_dir/images``.

    Args:
        shape: Object whose ``image`` attribute exposes ``content_type`` and
            ``blob``; accessing ``image`` may raise ``ValueError``.
        output_dir: Directory that receives the ``images`` sub-directory.
        slide_num: Accepted for signature compatibility; not used here.
        img_count: Number embedded in the file name (``image-NN.ext``).

    Returns:
        ``{"path": "images/<name>"}`` on success, or a dict whose path is the
        ``LINKED_IMAGE_NOT_EMBEDDED`` marker when ``shape.image`` raises
        ``ValueError``.
    """
    try:
        image = shape.image
    except ValueError:
        return {"path": "LINKED_IMAGE_NOT_EMBEDDED"}

    # The file extension is the MIME subtype, with "jpeg" shortened to "jpg".
    suffix = image.content_type.split("/")[-1]
    suffix = "jpg" if suffix == "jpeg" else suffix

    file_name = f"image-{img_count:02d}.{suffix}"
    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    (images_dir / file_name).write_bytes(image.blob)

    return {"path": f"images/{file_name}"}
91
92
def extract_freeform(shape) -> dict:
    """Build the element dict for a custom-geometry (freeform) shape.

    Besides position, size, name, optional rotation, fill and line
    properties, the result carries a ``paths`` list when the shape's
    ``a:custGeom/a:pathLst`` holds at least one non-empty path. Each path is
    a list of command dicts: ``moveTo`` / ``lineTo`` with integer ``x`` and
    ``y``, ``cubicBezTo`` with a ``pts`` list, and ``close``. Other path
    child elements are skipped.
    """
    freeform = {
        "type": "freeform",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        freeform["rotation"] = rotation

    # Fill is best-effort: shapes without a usable fill are left without the key.
    try:
        fill = extract_fill(shape.fill)
        if fill is not None:
            freeform["fill"] = fill
    except (AttributeError, TypeError):
        pass

    stroke = extract_line(shape)
    if stroke:
        freeform.update(stroke)

    drawingml = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}
    # XML tag -> emitted command name for the commands that carry one point.
    single_point_commands = {"moveTo": "moveTo", "lnTo": "lineTo"}

    def point_of(node) -> dict:
        return {"x": int(node.get("x", 0)), "y": int(node.get("y", 0))}

    all_paths = []
    path_nodes = shape._element.findall(".//a:custGeom/a:pathLst/a:path", drawingml)
    for path_node in path_nodes:
        segment = []
        for node in path_node:
            local = node.tag.split("}")[-1] if "}" in node.tag else node.tag
            if local in single_point_commands:
                anchor = node.find("a:pt", drawingml)
                if anchor is not None:
                    segment.append(
                        {"cmd": single_point_commands[local], **point_of(anchor)}
                    )
            elif local == "cubicBezTo":
                control_points = node.findall("a:pt", drawingml)
                segment.append(
                    {
                        "cmd": "cubicBezTo",
                        "pts": [point_of(p) for p in control_points],
                    }
                )
            elif local == "close":
                segment.append({"cmd": "close"})
        if segment:
            all_paths.append(segment)

    if all_paths:
        freeform["paths"] = all_paths

    return freeform
167
168
def extract_group(shape, slide_num: int, output_dir, img_count: int) -> dict:
    """Build the element dict for a group shape, including its children.

    Every member of ``shape.shapes`` is passed to ``extract_child_shape``;
    falsy results are dropped and the rest are stored, in order, under
    ``elements``.
    """
    group = {
        "type": "group",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
        "elements": [],
    }
    # NOTE(review): the same img_count value is handed to every child, so
    # several pictures in one group receive the same counter — confirm the
    # image writer copes with that.
    extracted = (
        extract_child_shape(member, slide_num, output_dir, img_count)
        for member in shape.shapes
    )
    group["elements"].extend(child for child in extracted if child)
    return group
185
186
def _extract_shape_by_type(
    shape, slide_num: int, output_dir, img_count: int
) -> dict | None:
    """Pick the extractor that matches a shape and return its element dict.

    Dispatch order: the numeric ``shape.shape_type`` (text box, auto shape,
    connector, picture, group), then the ``has_table`` / ``has_chart``
    attributes, then a custom-geometry check.

    Args:
        shape: Shape object to extract.
        slide_num: Slide number, forwarded to the image and group extractors.
        output_dir: Directory forwarded to the image and group extractors.
        img_count: Image counter forwarded to the image and group extractors.

    Returns:
        The element dict, or ``None`` when no extractor applies.
    """
    shape_type = shape.shape_type

    # Extractors that only need the shape. The table is built per call
    # because extract_textbox and extract_shape are defined further down.
    simple_extractors = {
        17: extract_textbox,  # TEXT_BOX
        1: extract_shape,  # AUTO_SHAPE
        9: extract_connector,  # LINE / CONNECTOR
    }
    extractor = simple_extractors.get(shape_type)
    if extractor is not None:
        return extractor(shape)

    if shape_type == 13:  # PICTURE
        return extract_image(shape, output_dir, slide_num, img_count)
    if shape_type == 6:  # GROUP
        return extract_group(shape, slide_num, output_dir, img_count)

    # Tables and charts are detected by attribute rather than by shape_type.
    if getattr(shape, "has_table", False):
        return extract_table(shape)
    if getattr(shape, "has_chart", False):
        return extract_chart(shape)
    if _is_freeform(shape):
        return extract_freeform(shape)

    return None
217
218
def extract_child_shape(
    shape, slide_num: int, output_dir, img_count: int
) -> dict | None:
    """Extract one shape that lives inside a group.

    The shape is first offered to ``_extract_shape_by_type``. When that
    returns ``None`` a generic rectangle element is produced from the shape's
    position, size and name, and the numeric shape type (if any) is recorded
    under ``_unrecognized_shape_type``.

    Returns:
        An element dict. With the visible code paths the result is never
        ``None``; the annotation is kept for compatibility.
    """
    result = _extract_shape_by_type(shape, slide_num, output_dir, img_count)
    if result is not None:
        return result

    # Fallback for unrecognized shape types
    elem = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }
    if shape.shape_type is not None:
        elem["_unrecognized_shape_type"] = int(shape.shape_type)
    return elem
240
241
def _has_formatting_variation(runs: list) -> bool:
    """Tell whether the run dicts of a paragraph differ in formatting.

    ``font``, ``size`` and ``color`` are compared only across the runs that
    carry the key. ``bold``, ``italic`` and ``underline`` are compared across
    all runs, with a missing key counting as ``False``. A list of zero or one
    run never varies.
    """
    if len(runs) <= 1:
        return False

    explicit_values = [
        {run[key] for run in runs if key in run}
        for key in ("font", "size", "color")
    ]
    toggle_values = [
        {run.get(flag, False) for run in runs}
        for flag in ("bold", "italic", "underline")
    ]
    return any(len(values) > 1 for values in explicit_values + toggle_values)
260
261
# Output YAML key names for the canonical font/size/color/bold properties.
# Shape elements prefix each key with "text_"; text boxes use font-based names.
_SHAPE_EXTRACT_KEYS = {
    prop: f"text_{prop}" for prop in ("font", "size", "color", "bold")
}
_TEXTBOX_EXTRACT_KEYS = dict(
    font="font",
    size="font_size",
    color="font_color",
    bold="font_bold",
)

# Paragraph-level keys copied from the first non-empty paragraph up to the
# element: the four mapped names above followed by three shared keys.
_SHAPE_PROMOTE_KEYS = (
    *_SHAPE_EXTRACT_KEYS.values(),
    "italic",
    "alignment",
    "char_spacing",
)
_TEXTBOX_PROMOTE_KEYS = (
    *_TEXTBOX_EXTRACT_KEYS.values(),
    "italic",
    "alignment",
    "char_spacing",
)
295
296
def _extract_text_content(text_frame, keys: dict, promote_keys: tuple) -> dict:
    """Extract text content from a text frame into an element dict fragment.

    Handles paragraph iteration, run extraction, rich-text detection, and
    paragraph/element-level key promotion.

    Args:
        text_frame: python-pptx TextFrame object.
        keys: Key-mapping dict for font/size/color/bold output names.
        promote_keys: Tuple of keys to promote from first paragraph to element level.

    Returns:
        Dict with text, text frame properties, paragraph data, and promoted
        defaults. Empty when the frame's stripped text is empty.
    """
    result = {}
    text = text_frame.text.strip()
    if not text:
        return result

    result["text"] = text

    tf_props = extract_text_frame_properties(text_frame)
    if tf_props:
        result.update(tf_props)

    para_dicts = []
    for para in text_frame.paragraphs:
        run_info = {}
        para_runs = []
        for run in para.runs:
            # ``or {}`` keeps the merges below safe if a helper returns None.
            font_info = extract_font_info(run.font) or {}
            run_extra = extract_run_properties(run) or {}
            para_runs.append({"text": run.text, **font_info, **run_extra})
            # The first run that yields any properties represents the paragraph.
            if not run_info:
                run_info = {**font_info, **run_extra}

        para_info = extract_paragraph_font(para) or {}
        para_spacing = extract_paragraph_properties(para)
        bullet_props = extract_bullet_properties(para)
        alignment = extract_alignment(para)
        # Run-level values take precedence over paragraph-level ones.
        merged = {**para_info, **run_info}

        p_dict = {"text": para.text}
        if "font" in merged:
            p_dict[keys["font"]] = merged["font"]
        if "size" in merged:
            p_dict[keys["size"]] = merged["size"]
        if "color" in merged:
            p_dict[keys["color"]] = merged["color"]
        if merged.get("bold"):
            p_dict[keys["bold"]] = True
        if merged.get("italic"):
            p_dict["italic"] = True
        if merged.get("underline"):
            p_dict["underline"] = True
        if merged.get("hyperlink"):
            p_dict["hyperlink"] = merged["hyperlink"]
        if "char_spacing" in merged:
            p_dict["char_spacing"] = merged["char_spacing"]
        if "effect" in merged:
            p_dict["text_effect"] = merged["effect"]
        if alignment:
            p_dict["alignment"] = alignment
        if para_spacing:
            p_dict.update(para_spacing)
        if bullet_props:
            p_dict.update(bullet_props)
        # Per-run detail is only emitted when the runs actually differ.
        if _has_formatting_variation(para_runs):
            p_dict["runs"] = para_runs
        para_dicts.append(p_dict)

    non_empty = [p for p in para_dicts if p["text"].strip()]
    any_has_runs = any("runs" in p for p in para_dicts)
    if len(para_dicts) > 1 or any_has_runs:
        # Rich text: keep every paragraph and promote selected keys from the
        # first non-empty paragraph as element-level defaults.
        result["paragraphs"] = para_dicts
        if non_empty:
            first = non_empty[0]
            for key in promote_keys:
                if key in first:
                    result[key] = first[key]
    elif non_empty:
        # Single plain paragraph: flatten all of its properties onto the element.
        first = non_empty[0]
        for key, val in first.items():
            if key != "text":
                result[key] = val

    return result
384
385
def extract_shape(shape) -> dict:
    """Build the element dict for an auto shape.

    The result holds position, size and name, plus these optional parts:
    rotation, the shape name looked up in ``AUTO_SHAPE_NAME_MAP`` (falling
    back to ``"rectangle"``), the first adjustment value as
    ``corner_radius``, fill, line properties, the effect list, and any text
    content.
    """
    info = {
        "type": "shape",
        "shape": "rectangle",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        info["rotation"] = rotation

    # Map the auto_shape_type enum to a name; unknown values become "rectangle".
    try:
        preset_name = AUTO_SHAPE_NAME_MAP.get(shape.auto_shape_type, "rectangle")
    except (AttributeError, TypeError):
        preset_name = "rectangle"
    info["shape"] = preset_name

    # The first adjustment value is stored for every shape that has one.
    try:
        if shape.adjustments and len(shape.adjustments) > 0:
            first_adjustment = shape.adjustments[0]
            info["corner_radius"] = round(first_adjustment, 5)
    except (AttributeError, TypeError, IndexError):
        pass

    try:
        fill = extract_fill(shape.fill)
        if fill is not None:
            info["fill"] = fill
    except (AttributeError, TypeError):
        pass

    stroke = extract_line(shape)
    if stroke:
        info.update(stroke)

    effects = extract_effect_list(shape)
    if effects:
        info["effect"] = effects

    if shape.has_text_frame:
        info.update(
            _extract_text_content(
                shape.text_frame, _SHAPE_EXTRACT_KEYS, _SHAPE_PROMOTE_KEYS
            )
        )

    return info
441
442
def extract_textbox(shape) -> dict:
    """Build the element dict for a text box.

    ``text`` is always present (an empty string when the shape has no text
    frame). When a text frame exists, the fragment returned by
    ``_extract_text_content`` is merged on top of the base keys.
    """
    if shape.has_text_frame:
        plain_text = shape.text_frame.text.strip()
    else:
        plain_text = ""

    box = {
        "type": "textbox",
        "left": emu_to_inches(shape.left),
        "top": emu_to_inches(shape.top),
        "width": emu_to_inches(shape.width),
        "height": emu_to_inches(shape.height),
        "text": plain_text,
        "name": shape.name,
    }

    rotation = extract_rotation(shape)
    if rotation is not None:
        box["rotation"] = rotation

    if shape.has_text_frame:
        box.update(
            _extract_text_content(
                shape.text_frame, _TEXTBOX_EXTRACT_KEYS, _TEXTBOX_PROMOTE_KEYS
            )
        )

    return box
466
467
def _unique_image_path(images_dir: Path, img_count: int, ext: str, blob: bytes) -> Path:
    """Pick ``image-NN.ext`` under *images_dir* without clobbering another image.

    Starting at *img_count*, the first name that is either unused or already
    holds exactly *blob* is returned, so re-extracting the same picture
    reuses its file while a different picture gets the next free number.
    """
    index = img_count
    while True:
        candidate = images_dir / f"image-{index:02d}.{ext}"
        if not candidate.is_file() or candidate.read_bytes() == blob:
            return candidate
        index += 1


def extract_image(shape, output_dir: Path, slide_num: int, img_count: int) -> dict:
    """Extract an image element and save the image file.

    Args:
        shape: Picture shape; accessing ``shape.image`` raises ``ValueError``
            for linked (non-embedded) pictures.
        output_dir: Directory that receives the ``images`` sub-directory.
        slide_num: Accepted for signature compatibility; not used here.
        img_count: Preferred number for the ``image-NN.ext`` file name.
            Callers may pass the same value for several pictures (children of
            a group all receive their parent's counter), so a later number is
            used when the preferred name already holds a different image.

    Returns:
        Element dict with ``type``, ``path``, position, size and ``name``,
        plus optional ``rotation``, ``blip_fill_attrs``, ``crop`` and
        ``opacity``. Linked pictures get the ``LINKED_IMAGE_NOT_EMBEDDED``
        path marker and a ``_note`` instead of a saved file.
    """

    def base_element(path: str) -> dict:
        return {
            "type": "image",
            "path": path,
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
        }

    try:
        img = shape.image
    except ValueError:
        # Linked images have no embedded blob
        elem = base_element("LINKED_IMAGE_NOT_EMBEDDED")
        elem["_note"] = "Image was linked, not embedded in the PPTX"
        rot = extract_rotation(shape)
        if rot is not None:
            elem["rotation"] = rot
        return elem

    ext = img.content_type.split("/")[-1]
    if ext == "jpeg":
        ext = "jpg"

    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    blob = img.blob
    img_path = _unique_image_path(images_dir, img_count, ext, blob)
    img_path.write_bytes(blob)

    elem = base_element(f"images/{img_path.name}")
    rot = extract_rotation(shape)
    if rot is not None:
        elem["rotation"] = rot

    # Extract image crop from srcRect on blipFill
    blip_fill = shape._element.find(qn("p:blipFill"))
    if blip_fill is not None:
        # Preserve blipFill attributes (rotWithShape, dpi) as raw strings.
        blip_fill_attrs = {
            attr_name: blip_fill.get(attr_name)
            for attr_name in ("rotWithShape", "dpi")
            if blip_fill.get(attr_name) is not None
        }
        if blip_fill_attrs:
            elem["blip_fill_attrs"] = blip_fill_attrs

        src_rect = blip_fill.find(qn("a:srcRect"))
        if src_rect is not None and src_rect.attrib:
            crop = {
                side: int(src_rect.get(side))
                for side in ("l", "t", "r", "b")
                if src_rect.get(side) is not None
            }
            if crop:
                elem["crop"] = crop

    # Extract image opacity from alphaModFix on the blip element
    blip = shape._element.find(".//" + qn("a:blip"))
    if blip is not None:
        amf = blip.find(qn("a:alphaModFix"))
        if amf is not None:
            # amt is divided by 1000, so the default "100000" yields 100.0.
            amt = int(amf.get("amt", "100000"))
            elem["opacity"] = round(amt / 1000, 1)

    return elem
544
545
def _aspect_ratio_label(width: float, height: float) -> str:
    """Name the slide aspect ratio, or return ``"custom"`` when it is unusual."""
    if not width or not height:
        return "custom"
    ratio = width / height
    for label, expected in (("16:9", 16 / 9), ("16:10", 16 / 10), ("4:3", 4 / 3)):
        if abs(ratio - expected) < 0.01:
            return label
    return "custom"


def detect_global_style(prs) -> dict:
    """Analyze the presentation to detect common styling patterns.

    Walks every slide and tallies background fills, shape fills, run fonts,
    font sizes and text colors. From those tallies it derives a global color
    map, light/dark theme groups, the primary body and code fonts, and
    heading/body sizes. Presentation core properties are added as
    ``metadata`` when any are set.

    Args:
        prs: python-pptx Presentation object.

    Returns:
        Style dict with ``dimensions``, ``defaults`` and ``typography``, plus
        ``colors``, ``themes`` and ``metadata`` when they are non-empty.
    """
    bg_colors = Counter()
    text_colors = Counter()
    accent_colors = Counter()
    fill_colors = Counter()
    font_names = Counter()
    font_sizes = Counter()

    # Per-slide analysis for theme clustering
    slide_profiles = []

    slide_w = emu_to_inches(prs.slide_width)
    slide_h = emu_to_inches(prs.slide_height)

    for slide_num, slide in enumerate(prs.slides, start=1):
        slide_bg = None
        slide_text_colors = Counter()
        slide_fill_colors = Counter()
        has_bg_image = False

        # Only string fill results are counted as background colors.
        try:
            fill_result = extract_fill(slide.background.fill)
            if isinstance(fill_result, str):
                bg_colors[fill_result] += 1
                slide_bg = fill_result
        except (AttributeError, TypeError):
            pass

        for i, shape in enumerate(slide.shapes):
            # A slide-sized picture in the first position counts as a
            # background image and is excluded from the color/font tallies.
            if (
                i == 0
                and shape.shape_type == 13
                and _is_background_image(shape, slide_w, slide_h)
            ):
                has_bg_image = True
                continue

            # Fills on shapes under 0.1" tall count as accents, others as
            # card fills.
            try:
                fill_result = extract_fill(shape.fill)
                if isinstance(fill_result, str):
                    if emu_to_inches(shape.height) < 0.1:
                        accent_colors[fill_result] += 1
                    else:
                        fill_colors[fill_result] += 1
                        slide_fill_colors[fill_result] += 1
            except (AttributeError, TypeError):
                pass

            # Collect font information
            if shape.has_text_frame:
                for para in shape.text_frame.paragraphs:
                    for run in para.runs:
                        if run.font.name:
                            base_name = normalize_font_family(run.font.name)
                            font_names[base_name] += 1
                        if run.font.size:
                            font_sizes[int(run.font.size.pt)] += 1
                        try:
                            color = extract_color(run.font.color)
                            if isinstance(color, str) and color.startswith("#"):
                                text_colors[color] += 1
                                slide_text_colors[color] += 1
                        except (AttributeError, TypeError):
                            pass

        bg_brightness = _classify_slide_brightness(
            slide_bg, slide_text_colors, has_bg_image
        )
        slide_profiles.append(
            {
                "slide": slide_num,
                "bg_color": slide_bg,
                "bg_brightness": bg_brightness,
                "has_bg_image": has_bg_image,
                "text_colors": dict(slide_text_colors),
                "fill_colors": dict(slide_fill_colors),
            }
        )

    # Build global color map from frequency analysis
    colors = _build_color_map(bg_colors, fill_colors, text_colors, accent_colors)

    # Detect themes by clustering slides into light/dark groups
    themes = _cluster_themes(slide_profiles, text_colors, fill_colors, accent_colors)

    # Primary fonts: the most frequent code-like family and the most frequent
    # other family are chosen independently, so a code font is still found
    # when it is used less often than the body font.
    code_markers = ("cascadia", "consolas", "mono", "courier")
    body_font = None
    code_font = None
    for family, _count in font_names.most_common():
        lowered = family.lower()
        if any(marker in lowered for marker in code_markers):
            if code_font is None:
                code_font = family
        elif body_font is None:
            body_font = family
        if body_font is not None and code_font is not None:
            break
    body_font = body_font or "Segoe UI"
    code_font = code_font or "Cascadia Code"

    # Font sizes: positions within the sorted distinct sizes between 8 and
    # 60 pt (exclusive); usage counts are not weighted.
    heading_size = 28
    body_size = 16
    sorted_sizes = sorted(size for size in font_sizes if 8 < size < 60)
    if sorted_sizes:
        body_size = sorted_sizes[len(sorted_sizes) // 2]
        heading_size = sorted_sizes[int(len(sorted_sizes) * 0.85)]

    style = {
        "dimensions": {
            "width_inches": slide_w,
            "height_inches": slide_h,
            # Derived from the real slide size instead of assuming widescreen.
            "format": _aspect_ratio_label(slide_w, slide_h),
        },
        "defaults": {
            "speaker_notes_required": True,
        },
        "typography": {
            "body_font": body_font,
            "code_font": code_font,
            "heading_size": heading_size,
            "body_size": body_size,
        },
    }

    if colors:
        style["colors"] = colors

    if themes:
        style["themes"] = themes

    # Extract presentation metadata
    metadata = {}
    props = prs.core_properties
    for attr in ("title", "author", "subject", "keywords", "description", "category"):
        val = getattr(props, attr, None)
        if val:
            metadata[attr] = val
    if metadata:
        style["metadata"] = metadata

    return style
696
697
698	`def _classify_slide_brightness(`
699	`bg_color: str \| None, text_colors: Counter, has_bg_image: bool`
700	`) -> str:`
701	`"""Classify a slide as 'light' or 'dark' based on background and text colors."""`
702	`if has_bg_image and bg_color is None:`
703	`# Slides with background images and no solid bg — infer from text colors`
704	`dark_text = sum(`
705	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100`
706	`)`
707	`light_text = sum(`
708	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150`
709	`)`
710	`return "light" if dark_text >= light_text else "dark"`
711
712	`if bg_color and isinstance(bg_color, str) and bg_color.startswith("#"):`
713	`return "light" if hex_brightness(bg_color) > 128 else "dark"`
714
715	`# Default: infer from text colors`
716	`dark_text = sum(`
717	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) < 100`
718	`)`
719	`light_text = sum(`
720	`c for hex_c, c in text_colors.items() if hex_brightness(hex_c) > 150`
721	`)`
722	`if dark_text > light_text:`
723	`return "light"`
724	`if light_text > dark_text:`
725	`return "dark"`
726	`return "dark"`
727
728
def _build_color_map(
    bg_colors: Counter,
    fill_colors: Counter,
    text_colors: Counter,
    accent_colors: Counter,
) -> dict:
    """Derive the global color map from the frequency counters.

    The most frequent background becomes ``bg_dark`` and the most frequent
    fill ``bg_card``. Each of the five most frequent text colors fills the
    first free slot for its brightness band (``text_white`` above 200,
    ``text_dark`` below 80, ``text_gray`` in between). Up to three accent
    colors are named blue, teal and green in frequency order.
    """
    palette = {}
    if bg_colors:
        (top_bg, _), = bg_colors.most_common(1)
        palette["bg_dark"] = top_bg
    if fill_colors:
        (top_fill, _), = fill_colors.most_common(1)
        palette["bg_card"] = top_fill

    for hex_value, _ in text_colors.most_common(5):
        level = hex_brightness(hex_value)
        if level > 200:
            slot = "text_white"
        elif level < 80:
            slot = "text_dark"
        elif 80 <= level <= 200:
            slot = "text_gray"
        else:
            continue
        # The first color seen for a band wins.
        if slot not in palette:
            palette[slot] = hex_value

    accent_slots = ("accent_blue", "accent_teal", "accent_green")
    for slot, (hex_value, _) in zip(accent_slots, accent_colors.most_common(3)):
        palette[slot] = hex_value

    return palette
757
758
def _cluster_themes(
    slide_profiles: list[dict],
    text_colors: Counter,
    fill_colors: Counter,
    accent_colors: Counter,
) -> list[dict]:
    """Group slide profiles into a light and a dark theme.

    Returns an empty list unless both brightness groups are present.
    Otherwise returns the light theme followed by the dark theme, each with
    ``name``, sorted ``slides`` and a ``colors`` dict built from that group's
    own text and fill tallies. The three counter arguments are not read.
    """
    by_brightness = {
        tone: [p for p in slide_profiles if p["bg_brightness"] == tone]
        for tone in ("light", "dark")
    }
    if not (by_brightness["light"] and by_brightness["dark"]):
        return []

    def assign_text_slots(target: dict, tally: Counter, is_primary) -> None:
        # The first color in each brightness band claims the slot.
        for hex_value, _ in tally.most_common(5):
            level = hex_brightness(hex_value)
            if is_primary(level) and "text_primary" not in target:
                target["text_primary"] = hex_value
            elif 80 <= level <= 200 and "text_secondary" not in target:
                target["text_secondary"] = hex_value

    def describe(tone: str, palette: dict) -> dict:
        return {
            "name": tone,
            "slides": sorted(p["slide"] for p in by_brightness[tone]),
            "colors": palette,
        }

    # Light theme: primary text is the dark band (< 80).
    light_text, light_fills = Counter(), Counter()
    for profile in by_brightness["light"]:
        light_text.update(profile["text_colors"])
        light_fills.update(profile["fill_colors"])

    light_palette = {}
    assign_text_slots(light_palette, light_text, lambda level: level < 80)
    if light_fills:
        light_palette["bg_card"] = light_fills.most_common(1)[0][0]

    # Dark theme: also records the most common background; primary text is
    # the bright band (> 200).
    dark_text, dark_fills, dark_backgrounds = Counter(), Counter(), Counter()
    for profile in by_brightness["dark"]:
        dark_text.update(profile["text_colors"])
        dark_fills.update(profile["fill_colors"])
        if profile["bg_color"]:
            dark_backgrounds[profile["bg_color"]] += 1

    dark_palette = {}
    if dark_backgrounds:
        dark_palette["bg_dark"] = dark_backgrounds.most_common(1)[0][0]
    assign_text_slots(dark_palette, dark_text, lambda level: level > 200)
    if dark_fills:
        dark_palette["bg_card"] = dark_fills.most_common(1)[0][0]

    return [describe("light", light_palette), describe("dark", dark_palette)]
831
832
def extract_slide(
    slide,
    slide_num: int,
    output_dir: Path,
    slide_dims: tuple[float, float] | None = None,
) -> tuple[dict, Path]:
    """Extract all elements from a slide into a content.yaml structure.

    Creates ``output_dir/slide-NNN`` and walks the slide's shapes in order,
    recording each element's position in that order as ``z_order``.

    Args:
        slide: python-pptx slide object.
        slide_num: 1-based slide number used in the directory name and output.
        output_dir: Root directory under which the slide directory is created.
        slide_dims: Accepted for signature compatibility; not used here.

    Returns:
        ``(content, slide_dir)``: the content dict (``slide``, ``title``,
        ``elements`` and optional ``layout``, ``background``,
        ``speaker_notes``) and the directory created for this slide.
    """
    slide_dir = output_dir / f"slide-{slide_num:03d}"
    slide_dir.mkdir(parents=True, exist_ok=True)

    content = {
        "slide": slide_num,
        "title": "",
        "elements": [],
    }

    # Extract layout name
    try:
        layout_name = slide.slide_layout.name
        if layout_name:
            content["layout"] = layout_name
    except (AttributeError, TypeError):
        pass

    # A background is only recorded when the slide overrides its master's.
    try:
        if not slide.follow_master_background:
            fill_result = extract_fill(slide.background.fill)
            if fill_result is not None:
                content["background"] = {"fill": fill_result}
    except (AttributeError, TypeError):
        pass

    # Extract speaker notes (include empty string when notes slide exists)
    try:
        if slide.has_notes_slide:
            notes = slide.notes_slide.notes_text_frame.text.strip()
            content["speaker_notes"] = notes
    except (AttributeError, TypeError):
        pass

    img_count = 0

    for z_index, shape in enumerate(slide.shapes):
        shape_type = shape.shape_type

        # Only top-level pictures advance the counter used for file names.
        if shape_type == 13:
            img_count += 1

        # Placeholders are extracted as text boxes with a marker; those
        # without a text frame are skipped.
        if shape_type == 14:
            if not shape.has_text_frame:
                continue
            elem = extract_textbox(shape)
            elem["_placeholder"] = True
            elem["z_order"] = z_index
            content["elements"].append(elem)
            continue

        # Use shared dispatcher for all other shape types
        elem = _extract_shape_by_type(shape, slide_num, slide_dir, img_count)
        if elem is not None:
            elem["z_order"] = z_index
            content["elements"].append(elem)

            # The first short text box within 1.5" of the top becomes the title.
            if (
                shape_type == 17
                and not content["title"]
                and emu_to_inches(shape.top) < 1.5
            ):
                text = shape.text_frame.text.strip() if shape.has_text_frame else ""
                if text and len(text) < 100:
                    content["title"] = text
            continue

        # Fallback for unrecognized shape types
        elem_data = {
            "type": "shape",
            "shape": "rectangle",
            "left": emu_to_inches(shape.left),
            "top": emu_to_inches(shape.top),
            "width": emu_to_inches(shape.width),
            "height": emu_to_inches(shape.height),
            "name": shape.name,
            "z_order": z_index,
        }
        if shape_type is not None:
            elem_data["_unrecognized_shape_type"] = int(shape_type)
        content["elements"].append(elem_data)

    return content, slide_dir
926
927
def _resolve_theme_colors(prs) -> dict:
    """Map theme color names to hex strings using the first master's theme.

    The theme part related to ``prs.slide_masters[0]`` is parsed and each
    known child of its ``clrScheme`` is stored as ``#<hex>`` under a readable
    name (``dark_1``, ``accent_3``, ``hyperlink`` and so on). The value comes
    from ``srgbClr@val`` or, failing that, ``sysClr@lastClr``. The dark/light
    entries are also stored under ``text_N`` / ``background_N`` aliases.

    Returns:
        The name-to-hex dict; whatever was collected so far (possibly empty)
        when an ``AttributeError``, ``TypeError`` or ``IndexError`` occurs.
    """
    slot_names = {
        "dk1": "dark_1",
        "dk2": "dark_2",
        "lt1": "light_1",
        "lt2": "light_2",
        "accent1": "accent_1",
        "accent2": "accent_2",
        "accent3": "accent_3",
        "accent4": "accent_4",
        "accent5": "accent_5",
        "accent6": "accent_6",
        "hlink": "hyperlink",
        "folHlink": "followed_hyperlink",
    }
    alias_for = {
        "dark_1": "text_1",
        "dark_2": "text_2",
        "light_1": "background_1",
        "light_2": "background_2",
    }
    drawingml = "http://schemas.openxmlformats.org/drawingml/2006/main"
    resolved = {}

    try:
        master = prs.slide_masters[0]
        # The theme is reached through the master's relationships and parsed
        # from the related part's raw bytes; only the first match is used.
        theme_root = next(
            (
                etree.fromstring(rel.target_part.blob)
                for rel in master.part.rels.values()
                if "theme" in rel.reltype
            ),
            None,
        )
        scheme = None
        if theme_root is not None:
            scheme = theme_root.find(f".//{{{drawingml}}}clrScheme")

        for node in scheme if scheme is not None else ():
            local = node.tag.split("}")[-1] if "}" in node.tag else node.tag
            name = slot_names.get(local)
            if name is None:
                continue

            srgb = node.find(f"{{{drawingml}}}srgbClr")
            if srgb is not None:
                resolved[name] = f"#{srgb.get('val', '000000')}"
            else:
                system = node.find(f"{{{drawingml}}}sysClr")
                if system is not None:
                    resolved[name] = f"#{system.get('lastClr', '000000')}"

            if name in alias_for and name in resolved:
                resolved[alias_for[name]] = resolved[name]
    except (AttributeError, TypeError, IndexError):
        pass
    return resolved
993
994
def _resolve_theme_refs_in_content(content: dict, theme_colors: dict) -> dict:
    """Return a copy of *content* with ``@name`` strings swapped for theme colors.

    Dicts and lists are rebuilt recursively. A string starting with ``@`` is
    replaced by ``theme_colors[name]`` when the name is known and left as is
    otherwise. All other values pass through untouched.
    """

    def substitute(node):
        if isinstance(node, dict):
            return {key: substitute(child) for key, child in node.items()}
        if isinstance(node, list):
            return [substitute(child) for child in node]
        if isinstance(node, str) and node.startswith("@"):
            return theme_colors.get(node[1:], node)
        return node

    return substitute(content)
1009
1010
def main():
    """CLI entry point for extracting PPTX content into YAML.

    Writes ``global/style.yaml`` under the output directory, then one
    ``content.yaml`` per extracted slide. ``--slides`` limits extraction to
    the given 1-based slide numbers; ``--resolve-themes`` replaces ``@theme``
    references with hex values read from the deck's theme.
    """
    parser = argparse.ArgumentParser(
        description="Extract content from a PPTX into YAML"
    )
    parser.add_argument("--input", required=True, help="Input PPTX file path")
    parser.add_argument("--output-dir", required=True, help="Output content directory")
    parser.add_argument(
        "--slides", help="Comma-separated slide numbers to extract (default: all)"
    )
    parser.add_argument(
        "--resolve-themes",
        action="store_true",
        help="Resolve @theme references to actual hex RGB values from the deck's theme",
    )
    args = parser.parse_args()

    # Parse --slides before touching the file system so that bad input is a
    # normal usage error rather than a traceback. Empty tokens (for example
    # from a trailing comma) are ignored.
    slide_filter = None
    if args.slides:
        tokens = [tok.strip() for tok in args.slides.split(",")]
        try:
            slide_filter = {int(tok) for tok in tokens if tok}
        except ValueError:
            parser.error(
                f"--slides expects comma-separated integers, got: {args.slides!r}"
            )
        if not slide_filter:
            parser.error("--slides did not contain any slide numbers")

    pptx_path = Path(args.input)
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    prs = Presentation(str(pptx_path))
    print(f"Extracting from: {pptx_path}")
    print(f"Slides: {len(prs.slides)}")
    w = emu_to_inches(prs.slide_width)
    h = emu_to_inches(prs.slide_height)
    print(f'Dimensions: {w}" x {h}"')

    # Detect and save global style
    global_style = detect_global_style(prs)

    # Resolve theme colors when requested
    theme_colors = {}
    if args.resolve_themes:
        theme_colors = _resolve_theme_colors(prs)
        if theme_colors:
            global_style["theme_colors"] = theme_colors
            global_style = _resolve_theme_refs_in_content(global_style, theme_colors)
            print(f"Resolved {len(theme_colors)} theme colors")

    global_dir = output_dir / "global"
    global_dir.mkdir(parents=True, exist_ok=True)
    style_path = global_dir / "style.yaml"
    with open(style_path, "w", encoding="utf-8") as f:
        yaml.dump(
            global_style,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
    print(f"Global style saved to {style_path}")

    # Extract slides (filtered or all)
    slide_dims = (w, h)
    extracted = 0
    seen = set()
    for slide_num, slide in enumerate(prs.slides, start=1):
        if slide_filter and slide_num not in slide_filter:
            continue
        seen.add(slide_num)
        content, slide_dir = extract_slide(
            slide, slide_num, output_dir, slide_dims=slide_dims
        )

        # Resolve @theme references to hex values when --resolve-themes is set
        if args.resolve_themes and theme_colors:
            content = _resolve_theme_refs_in_content(content, theme_colors)

        content_path = slide_dir / "content.yaml"
        with open(content_path, "w", encoding="utf-8") as f:
            yaml.dump(
                content,
                f,
                default_flow_style=False,
                sort_keys=False,
                allow_unicode=True,
            )
        # "title" is always present (possibly empty), so fall back on
        # falsiness rather than on a missing key.
        title = content.get("title") or "Untitled"
        print(f"Slide {slide_num}: {title} -> {content_path}")
        extracted += 1

    # Tell the user about requested slides that the deck does not contain.
    if slide_filter:
        missing = sorted(slide_filter - seen)
        if missing:
            listed = ", ".join(str(num) for num in missing)
            print(f"Warning: requested slide(s) not found in deck: {listed}")

    print(f"\nExtraction complete. {extracted} slide(s) extracted to {output_dir}")


if __name__ == "__main__":
    main()
1102

microsoft/hve-core

Branches

Tags

Clone