dataset_statistics.py (forked from floft/codats)

#!/usr/bin/env python3
"""
Check the dataset statistics for each dataset (from .tfrecord files)

Note: sets CUDA_VISIBLE_DEVICES= so that it doesn't use the GPU.

Run something like the following to save the result:

    ./dataset_statistics.py | tee dataset_statistics.txt
"""
import os
import numpy as np

from absl import app
from absl import flags

from datasets import datasets
from load_datasets import load_da
from print_dictionary import print_dictionary

FLAGS = flags.FLAGS

def get_data(dataset):
    """ Get all the data in a dataset (i.e. must not use tf.data's repeat()) """
    xs = []
    ys = []

    for x, y in dataset:
        # Convert each modality/epoch to a NumPy array before adding
        xs.append(x.numpy())
        ys.append(y.numpy())

    # Stack each modality
    x = np.vstack(xs)
    y = np.hstack(ys)

    return x, y

def calc_individual_stats(x, only_total=False):
    """ Compute min/max/mean/std per feature (and over all values) of one array """
    original_shape = x.shape

    # Reshape from [num_examples, time_steps, num_features] to
    # [num_examples*time_steps, num_features]
    if len(x.shape) > 1:  # i.e. this is x, not y
        x = x.reshape(-1, x.shape[-1])

    # Then, compute statistics along axis 0, leaving us with an array of
    # length num_features for each
    results = {}

    if not only_total:
        results["min"] = np.min(x, axis=0)
        results["max"] = np.max(x, axis=0)
        results["mean"] = np.mean(x, axis=0)
        results["std"] = np.std(x, axis=0)

    results["total_min"] = np.min(x)
    results["total_max"] = np.max(x)
    results["total_mean"] = np.mean(x)
    results["total_std"] = np.std(x)
    results["shape"] = original_shape
    results["dtype"] = x.dtype

    return results
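
# For example (hypothetical shapes): if x has shape (100, 50, 9), then
# results["min"], results["max"], results["mean"], and results["std"] each
# have shape (9,) -- one value per feature -- while the "total_*" entries are
# scalars computed over every value, and results["shape"] is (100, 50, 9).
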
def calc_stats(data):
    x, y = data
    assert len(x) == len(y)

    results = {}
    results["x"] = calc_individual_stats(x)
    results["y"] = calc_individual_stats(y, only_total=True)

    return results

def print_stats(dataset_name, dataset, test=False):
    if test:
        test_name = "test"
    else:
        test_name = "valid"

    print_dictionary(calc_stats(get_data(dataset.train_evaluation)),
        dataset_name + "_" + "train")
    print_dictionary(calc_stats(get_data(dataset.test_evaluation)),
        dataset_name + "_" + test_name)
    print()

def main(argv):
    # Don't bother using the GPU for this
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    for dataset_name in datasets.list_datasets():
        for user in datasets.get_dataset_users(dataset_name):
            # Note: test=False so we only look at the training samples, where
            # train=80% of training set, test=20% of training set, i.e. the
            # validation set
            test = False
            sources, _ = load_da(dataset_name, str(user), "", test=test)
            assert len(sources) == 1
            dataset = sources[0]

            print_stats(dataset_name + "_" + str(user), dataset, test=test)


if __name__ == "__main__":
    app.run(main)
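
# A minimal sketch (hypothetical) of computing the same statistics for a single
# dataset/user from an interactive session, assuming this module is importable
# as `dataset_statistics`; the dataset name "ucihar" and user "1" below are
# placeholders, not guaranteed to exist:
#
#     from dataset_statistics import get_data, calc_stats
#     from load_datasets import load_da
#     from print_dictionary import print_dictionary
#
#     sources, _ = load_da("ucihar", "1", "", test=False)
#     x, y = get_data(sources[0].train_evaluation)
#     print_dictionary(calc_stats((x, y)), "ucihar_1_train")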