microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgent Available

CodeCommitsIssuesPull requestsActionsInsightsSecurity
f46fff4e5103217703b51e27ba3f6405ac000e21

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

python/nprData/generateDataset.py

34lines · modecode

1# Copyright (c) Microsoft Corporation and Henry Lucco.
2# Licensed under the MIT License.
3
4import json
5import random
6from structs import Chunk
7from typing import List
8
# Base name (without extension) of the input JSON file; the output files
# reuse it with a `_train` / `_val` / `_test` suffix.
filenameBase = 'npr_chunks_no_embedding'
# Fraction of the sampled chunks written to the training split.
pctTrain = 0.8
# Fraction of the sampled chunks written to the validation split.
pctVal = 0.1
# Nominal fraction for the test split. The visible script never reads this
# value: the test split simply receives whatever remains after train and val.
pctTest = 0.1
13
def createRandomIndexList(chunks: "List[Chunk]", samplesTotal: int = 5000) -> List[int]:
    """Return `samplesTotal` distinct random indices into `chunks`.

    Note that the result is a list of *positions*, not the chunks themselves;
    callers index back into `chunks` to obtain the sampled items.

    Args:
        chunks: The collection to sample positions from. Only its length is
            used, so any sized sequence works.
        samplesTotal: Number of unique indices to draw.

    Returns:
        A list of `samplesTotal` unique integers drawn from
        `range(len(chunks))`, in random order.

    Raises:
        ValueError: If `samplesTotal` is negative or larger than
            `len(chunks)` (sampling is without replacement).
    """
    population = len(chunks)
    # random.sample would raise ValueError here as well, but without saying
    # how many items were requested or available; fail early with both numbers.
    if not 0 <= samplesTotal <= population:
        raise ValueError(
            f"cannot draw {samplesTotal} unique indices from {population} chunks"
        )
    return random.sample(range(population), samplesTotal)
18
# Load every chunk from the input file. The handle is closed as soon as the
# JSON is parsed; nothing below needs it. JSON text is UTF-8, so do not rely
# on the platform's default encoding.
with open(filenameBase + '.json', encoding='utf-8') as f:
    chunks = json.load(f)

# Sampling is without replacement, so never ask for more indices than the
# dataset holds (a smaller dataset would otherwise raise ValueError).
samplesTotal = min(5000, len(chunks))
if samplesTotal < 5000:
    print(f"Only {samplesTotal} chunks available; sampling all of them.")
randomList = createRandomIndexList(chunks, samplesTotal=samplesTotal)

# Split boundaries inside the shuffled index list. round() rather than int()
# so that float noise in the products (e.g. 7.999999999999999) cannot drop an
# element from a split; for 5000 samples both give 4000 and 4500.
trainEnd = round(pctTrain * samplesTotal)
valEnd = round((pctTrain + pctVal) * samplesTotal)
splits = {
    'train': randomList[:trainEnd],
    'val': randomList[trainEnd:valEnd],
    # The test split is whatever remains after train and val.
    'test': randomList[valEnd:],
}

# Write train, val, and test files with the corresponding chunks.
for splitName, indexList in splits.items():
    with open(filenameBase + '_' + splitName + '.json', 'w', encoding='utf-8') as fout:
        json.dump([chunks[i] for i in indexList], fout, indent=4)

print("Train, val, and test files created successfully!")