microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
python/fineTuning/dataset_convert.py
30 lines · mode: code
| 1 | # Copyright (c) Microsoft Corporation and Henry Lucco. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | import json |
| 5 | |
| 6 | from chaparral.models.data import ChapparalDataset |
| 7 | from chaparral.prompts.knowledge import get_knowledge_prompt |
| 8 | |
def main(
    dataset_file="./gpt4o_train_200.json",
    output_file="gpt4o_train_200_converted.json",
):
    """Convert a Chaparral dataset file into instruction/input/output records.

    Loads ``dataset_file`` as JSON, builds a ``ChapparalDataset`` from it,
    and emits one record per entry of ``dataset.info_pairs`` with the keys
    ``"instruction"``, ``"input"`` (always the empty string) and ``"output"``.
    The resulting list is written to ``output_file`` as JSON.

    Args:
        dataset_file: Path of the JSON dataset to read.
        output_file: Path of the JSON file to write the records to.
    """
    # JSON text is UTF-8 by convention; do not depend on the platform default.
    with open(dataset_file, "r", encoding="utf-8") as in_file:
        data = json.load(in_file)

    dataset = ChapparalDataset.from_list(data)

    items = [
        {
            "instruction": get_knowledge_prompt(pair.message),
            "input": "",
            "output": pair.knowledge.to_str(),
        }
        for pair in dataset.info_pairs
    ]

    # Diagnostic: sizes (via len) of the first record's instruction and
    # output, and their sum. Skipped for an empty dataset, where indexing
    # items[0] would raise IndexError.
    if items:
        first = items[0]
        instruction_len = len(first["instruction"])
        output_len = len(first["output"])
        print(instruction_len, output_len, instruction_len + output_len)

    # The original script called exit() right after the diagnostic print,
    # which left this write unreachable; the converted file is now produced.
    with open(output_file, "w", encoding="utf-8") as out_file:
        json.dump(items, out_file)


if __name__ == "__main__":
    main()
| 31 | |