microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgent (Available)

Code · Commits · Issues · Pull requests · Actions · Insights · Security
ecec6489db2ad85ccd1be99c0860dc3b057af2b1

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

python/nprData/generate_chunks.py

77lines · modecode

1# Copyright (c) Microsoft Corporation and Henry Lucco.
2# Licensed under the MIT License.
3
4import json
5from structs import Episode, Chunk, Turn, Section
6from embedding import Embedding
7from tqdm import tqdm
8from dotenv import load_dotenv
9from concurrent.futures import ThreadPoolExecutor, as_completed
10from llm_util import LLMChat
11from prompts import typeagent_entity_extraction_system_full, generic_chunk_prompt
12
def generate_chunk_content(content: str) -> str:
    """Return the chat model's reply for ``content`` wrapped in the chunk prompt.

    A fresh ``LLMChat`` is created per call; the prompt built by
    ``generic_chunk_prompt`` is sent as a single ``"user"`` message and the
    ``content`` attribute of the returned turn is handed back.
    """
    session = LLMChat()
    # Experimental alternative (embedding a JSON string instead of prose):
    #   typeagent_entity_extraction_system_full(content)
    # The prose prompt below is the more typical, state-of-the-art choice.
    prose_prompt = generic_chunk_prompt(content)
    return session.send_message("user", prose_prompt).content
21
def process_turn(
    episode_id: str,
    section: Section,
    turn: Turn,
    use_llm: bool = False,
) -> Chunk:
    """Build the embedded ``Chunk`` for a single transcript turn.

    Args:
        episode_id: Identifier of the episode the turn belongs to.
        section: Section containing the turn; supplies ``id`` and ``title``.
        turn: Turn whose content is embedded.
        use_llm: When true, the turn content is first passed through
            ``generate_chunk_content`` and that result is what gets embedded
            and stored on the chunk.

    Returns:
        A ``Chunk`` carrying the turn's id, speaker and speaker role, the
        section id and cleaned title, the (possibly rewritten) content, and
        the embedding computed from that content.
    """
    # Keep only the text after the last "<" in the title (the whole title
    # when there is no "<"), with surrounding whitespace removed.
    title_tail = section.title.rpartition("<")[2].strip()

    text = generate_chunk_content(turn.content) if use_llm else turn.content
    vector = Embedding.from_text(text)

    return Chunk(
        id=turn.id,
        episode_id=episode_id,
        section_title=title_tail,
        section_id=section.id,
        speaker=turn.speaker,
        content=text,
        speaker_role=turn.speaker_role,
        embedding=vector,
    )
49
def generate_chunks(in_file: str, out_file: str, use_llm: bool = False) -> None:
    """Embed every transcript turn found in ``in_file`` and save the chunks.

    The input is parsed as a JSON list of episode dicts, each converted with
    ``Episode.from_dict``. Every turn of every section is handed to
    ``process_turn`` on a thread pool, and the resulting chunks are written to
    ``out_file`` as an indented JSON list of ``chunk.to_dict()`` values.

    Args:
        in_file: Path of the JSON file containing the list of episodes.
        out_file: Path the JSON list of chunk dicts is written to.
        use_llm: Forwarded to ``process_turn``. Defaults to ``False``, which
            matches the behavior before this parameter existed.

    Raises:
        Any exception raised while processing a turn propagates from
        ``future.result()``; nothing is written to ``out_file`` in that case.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(in_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(len(data))
    episodes = [Episode.from_dict(episode) for episode in data]

    with ThreadPoolExecutor() as executor:
        futures = []
        for episode in tqdm(episodes):
            for section in episode.sections:
                for turn in section.transcript:
                    futures.append(
                        executor.submit(
                            process_turn,
                            episode.id,
                            section,
                            turn,
                            use_llm=use_llm,
                        )
                    )

        # Collect in submission order rather than completion order so the
        # output keeps the episode/section/turn order and is identical from
        # run to run. The work still runs concurrently; only the waiting is
        # sequential.
        chunks = [future.result() for future in tqdm(futures)]

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump([chunk.to_dict() for chunk in chunks], f, indent=4)
71
if __name__ == "__main__":
    # Script entry point: load the dotenv file at "./env_vars" first, then
    # convert npr.json into npr_chunks.json.
    load_dotenv("./env_vars")
    generate_chunks("npr.json", "npr_chunks.json")