microsoft/TypeAgent
Public · mirrored from https://github.com/microsoft/TypeAgent · Available
python/nprData/generateDataset.py
34 lines · mode: code
| 1 | # Copyright (c) Microsoft Corporation and Henry Lucco. |
| 2 | # Licensed under the MIT License. |
| 3 | |
| 4 | import json |
| 5 | import random |
| 6 | from structs import Chunk |
| 7 | from typing import List |
| 8 | |
# Stem shared by the input file (<stem>.json) and the three output files
# (<stem>_train.json, <stem>_val.json, <stem>_test.json).
filenameBase = "npr_chunks_no_embedding"

# Fractions of the sampled chunks assigned to each split. The splitting code
# further down slices by pctTrain and pctVal and gives the test split whatever
# remains, so pctTest records the intended remainder.
pctTrain, pctVal, pctTest = 0.8, 0.1, 0.1
| 13 | |
def createRandomIndexList(chunks: List["Chunk"], samplesTotal: int = 5000) -> List[int]:
    """Pick ``samplesTotal`` distinct random positions into ``chunks``.

    Only the length of ``chunks`` is consulted; the result holds indices,
    not the chunk objects themselves.

    Args:
        chunks: The collection whose positions are sampled.
        samplesTotal: How many distinct indices to draw.

    Returns:
        A list of ``samplesTotal`` unique indices drawn from
        ``range(len(chunks))``, in random order.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``samplesTotal``
            is negative or larger than ``len(chunks)``.
    """
    population = range(len(chunks))
    return random.sample(population, samplesTotal)
| 18 | |
# Load every chunk from the source JSON file. The encoding is given
# explicitly so the read does not depend on the platform's locale default.
with open(filenameBase + '.json', encoding='utf-8') as f:
    chunks = json.load(f)

# Sample at most 5000 chunks. Clamping to the dataset size lets a smaller
# dataset be split in full instead of failing inside random.sample with
# "Sample larger than population".
samplesTotal = min(5000, len(chunks))
randomList = createRandomIndexList(chunks, samplesTotal=samplesTotal)

# Compute the split boundaries once. The test split is whatever remains after
# train and val, so the three splits always cover every sampled index.
trainEnd = int(pctTrain * samplesTotal)
valEnd = int((pctTrain + pctVal) * samplesTotal)
trainList = randomList[:trainEnd]
valList = randomList[trainEnd:valEnd]
testList = randomList[valEnd:]

# write train, val, and test files with the corresponding chunks
for splitName, indexList in (('train', trainList), ('val', valList), ('test', testList)):
    with open(filenameBase + '_' + splitName + '.json', 'w', encoding='utf-8') as fout:
        json.dump([chunks[i] for i in indexList], fout, indent=4)

print("Train, val, and test files created successfully!")