microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
python/nprData/generate_chunks.py
77lines · modecode
| 1 | # Copyright (c) Microsoft Corporation and Henry Lucco. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | import json |
| 5 | from structs import Episode, Chunk, Turn, Section |
| 6 | from embedding import Embedding |
| 7 | from tqdm import tqdm |
| 8 | from dotenv import load_dotenv |
| 9 | from concurrent.futures import ThreadPoolExecutor, as_completed |
| 10 | from llm_util import LLMChat |
| 11 | from prompts import typeagent_entity_extraction_system_full, generic_chunk_prompt |
| 12 | |
def generate_chunk_content(content: str) -> str:
    """Return the LLM's reply to the generic chunk prompt built from ``content``.

    A fresh ``LLMChat`` is created per call; the prompt is sent as a single
    "user" message and the ``content`` of the returned turn is handed back.
    """
    session = LLMChat()
    # Alternative kept for experiments on how an embedding of a JSON string
    # performs:
    #     typeagent_entity_extraction_system_full(content)
    # The generic prompt used below is the more typical prose-style prompt.
    reply = session.send_message("user", generic_chunk_prompt(content))
    return reply.content
| 21 | |
def process_turn(
    episode_id: str,
    section: Section,
    turn: Turn,
    use_llm: bool = False
) -> Chunk:
    """Build the embedded ``Chunk`` for a single transcript turn.

    The chunk text is ``turn.content``, or the result of
    ``generate_chunk_content`` on it when ``use_llm`` is true. The embedding
    is computed from that same text. The section title stored on the chunk is
    whatever follows the last "<" in ``section.title``, with surrounding
    whitespace stripped.
    """
    # Keep only the part of the title after the last "<" (the whole title
    # when there is none).
    _, _, title_tail = section.title.rpartition("<")
    title = title_tail.strip()

    text = generate_chunk_content(turn.content) if use_llm else turn.content
    vector = Embedding.from_text(text)

    # Debug aid:
    # print(f"Generated embedding of size {vector.dimension} for {turn.id}")

    return Chunk(
        id=turn.id,
        episode_id=episode_id,
        section_title=title,
        section_id=section.id,
        speaker=turn.speaker,
        content=text,
        speaker_role=turn.speaker_role,
        embedding=vector,
    )
| 49 | |
def generate_chunks(in_file: str, out_file: str, use_llm: bool = False):
    """Turn every transcript turn in ``in_file`` into a chunk and save them.

    Args:
        in_file: Path to a JSON file whose top-level entries are each passed
            to ``Episode.from_dict``.
        out_file: Path the list of ``chunk.to_dict()`` results is written to
            as indented JSON.
        use_llm: Forwarded unchanged to ``process_turn`` for every turn.
            Defaults to ``False``, which matches the previous behaviour.

    Turns are processed concurrently on a thread pool, but the chunks are
    written in submission order (episode, then section, then turn) so the
    output file is the same from run to run regardless of which worker
    finishes first.

    Raises:
        Whatever ``process_turn`` raises for a turn; the exception surfaces
        as soon as the failing task is seen to complete.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(in_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(len(data))
    episodes = [Episode.from_dict(episode) for episode in data]

    with ThreadPoolExecutor() as executor:
        futures = []

        for episode in tqdm(episodes):
            for section in episode.sections:
                for turn in section.transcript:
                    futures.append(
                        executor.submit(
                            process_turn, episode.id, section, turn, use_llm
                        )
                    )

        # as_completed drives the progress bar and surfaces worker errors
        # early; it is deliberately NOT used to collect results, because it
        # yields in completion order.
        for future in tqdm(as_completed(futures), total=len(futures)):
            future.result()

        # Every future is done at this point; read them back in the order
        # they were submitted to keep the transcript order.
        chunks = [future.result() for future in futures]

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump([chunk.to_dict() for chunk in chunks], f, indent=4)
| 71 | |
if __name__ == "__main__":
    # Load environment variables from the local env file before any work
    # starts, then convert the NPR episode dump into embedded chunks.
    load_dotenv("./env_vars")
    generate_chunks("npr.json", "npr_chunks.json")