microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgent · Available

Watch0 Fork0 Star0

Code Commits Issues Pull requests Actions Insights Security

ecec6489db2ad85ccd1be99c0860dc3b057af2b1

Find a branch or tag

Branches

ecec6489db2ad85ccd1be99c0860dc3b057af2b1

Clone

HTTPS

Download ZIP

TypeAgent/python/nprData

python/nprData/generate_chunks.py

77 lines · mode: code

Raw Download

Latest commit unavailable.

unknown

1	`# Copyright (c) Microsoft Corporation and Henry Lucco.`
2	`# Licensed under the MIT License.`
3
4	`import json`
5	`from structs import Episode, Chunk, Turn, Section`
6	`from embedding import Embedding`
7	`from tqdm import tqdm`
8	`from dotenv import load_dotenv`
9	`from concurrent.futures import ThreadPoolExecutor, as_completed`
10	`from llm_util import LLMChat`
11	`from prompts import typeagent_entity_extraction_system_full, generic_chunk_prompt`
12
def generate_chunk_content(content: str) -> str:
    """Ask the LLM to rewrite ``content`` and return the reply text.

    A new ``LLMChat`` is created on every call. The prompt is built by
    ``generic_chunk_prompt`` and sent as a single ``"user"`` message.

    Args:
        content: Raw text to hand to the prompt builder.

    Returns:
        The ``content`` attribute of the turn returned by the chat.
    """
    session = LLMChat()
    # Alternative prompt, kept for experimenting with how an embedding of a
    # JSON string performs:
    # prompt = typeagent_entity_extraction_system_full(content)
    # The prompt in use is the more typical, state-of-the-art prose prompt.
    reply = session.send_message("user", generic_chunk_prompt(content))
    return reply.content
21
def process_turn(
    episode_id: str,
    section: Section,
    turn: Turn,
    use_llm: bool = False
) -> Chunk:
    """Build an embedded ``Chunk`` from a single transcript turn.

    Args:
        episode_id: Identifier of the episode the turn belongs to.
        section: Section containing the turn; supplies ``title`` and ``id``.
        turn: The turn to convert; supplies ``id``, ``speaker``,
            ``speaker_role`` and ``content``.
        use_llm: When true, the turn content is first passed through
            ``generate_chunk_content`` and the result is what gets embedded
            and stored.

    Returns:
        A ``Chunk`` carrying the (possibly rewritten) content and the
        embedding produced by ``Embedding.from_text`` for that content.
    """
    # Keep only what follows the last "<" in the title (the whole title if
    # there is none), with surrounding whitespace removed.
    title_suffix = section.title.rpartition("<")[2].strip()

    raw_text = turn.content
    text = generate_chunk_content(raw_text) if use_llm else raw_text

    vector = Embedding.from_text(text)

    return Chunk(
        id=turn.id,
        episode_id=episode_id,
        section_title=title_suffix,
        section_id=section.id,
        speaker=turn.speaker,
        content=text,
        speaker_role=turn.speaker_role,
        embedding=vector,
    )
49
def generate_chunks(in_file: str, out_file: str, *, use_llm: bool = False) -> None:
    """Convert every transcript turn in ``in_file`` into an embedded chunk.

    The input JSON is loaded as a list of episode dicts, each turn of each
    section is processed by ``process_turn`` on a thread pool, and the
    resulting chunks are written to ``out_file`` as a JSON array.

    Args:
        in_file: Path of the JSON file holding the list of episodes.
        out_file: Path of the JSON file to write the chunk dicts to.
        use_llm: Forwarded to ``process_turn``; defaults to ``False``, which
            matches the previous behaviour of never rewriting content.

    Raises:
        Any exception raised by ``process_turn`` in a worker is re-raised
        here by ``future.result()``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading it.
    with open(in_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(len(data))
    episodes = [Episode.from_dict(episode) for episode in data]

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(process_turn, episode.id, section, turn, use_llm)
            for episode in tqdm(episodes)
            for section in episode.sections
            for turn in section.transcript
        ]

        # as_completed() yields futures in completion order, which varies
        # from run to run. Store each result at its submission index so the
        # output follows the transcript order and is reproducible.
        position = {future: index for index, future in enumerate(futures)}
        chunks = [None] * len(futures)
        for future in tqdm(as_completed(futures), total=len(futures)):
            chunks[position[future]] = future.result()

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump([chunk.to_dict() for chunk in chunks], f, indent=4)
71
if __name__ == "__main__":
    # Load environment variables from the local "env_vars" file before any
    # chunk processing starts.
    load_dotenv("./env_vars")

    source_path = "npr.json"
    target_path = "npr_chunks.json"
    generate_chunks(in_file=source_path, out_file=target_path)

microsoft/TypeAgent

Branches

Tags

Clone