make release-tag: Merge branch 'master' into stable

sdv-dev · Aug 20, 2021 · 7e1fe2e · 7e1fe2e
2 parents 4b4413a + af340be
commit 7e1fe2e
Show file tree

Hide file tree

Showing 18 changed files with 248 additions and 126 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,10 @@
-FROM python:3.8-buster
-RUN apt-get update && apt-get install -y build-essential
+FROM nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04
+CMD nvidia-smi
+
+RUN apt-get update && apt-get install -y build-essential && apt-get -y install curl
+RUN apt-get -y install python3.8 python3-distutils && ln -s /usr/bin/python3.8 /usr/bin/python
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+    python get-pip.py && ln -s /usr/bin/pip3 /usr/bin/pip
 
 RUN mkdir /SDGym && \
     mkdir /SDGym/sdgym && \
@@ -13,7 +18,8 @@ COPY /privbayes/ /SDGym/privbayes
 WORKDIR /SDGym
 
 # Install project
-RUN make install-ydata compile
+RUN make install-all compile
+RUN pip install -U numpy==1.20
 ENV PRIVBAYES_BIN /SDGym/privbayes/privBayes.bin
 ENV TF_CPP_MIN_LOG_LEVEL 2
 

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,12 +1,29 @@
 # History
 
+## v0.4.1 - 2021-08-20
+This release fixes a bug where passing a `json` file as configuration for a multi-table synthesizer crashed the model.
+It also adds a number of fixes and enhancements, including: (1) a function and CLI command to list the available synthesizer names,
+(2) a curated set of dependencies, with `Gretel` made an optional dependency, (3) updating `Gretel` to use temp directories,
+(4) using `nvidia-smi` to get the number of GPUs and (5) multiple `Dockerfile` updates to improve functionality.
+
+### Issues closed
+
+* Bug when using JSON configuration for multiple multi-table evaluation - [Issue #115](https://github.com/sdv-dev/SDGym/issues/115) by @pvk-developer
+* Use nvidia-smi to get number of gpus - [PR #113](https://github.com/sdv-dev/SDGym/issues/113) by @katxiao
+* List synthesizer names - [Issue #82](https://github.com/sdv-dev/SDGym/issues/82) by @fealho
+* Use nvidia base for dockerfile - [PR #108](https://github.com/sdv-dev/SDGym/issues/108) by @katxiao
+* Add Makefile target to install gretel and ydata - [PR #107](https://github.com/sdv-dev/SDGym/issues/107) by @katxiao
+* Curate dependencies and make Gretel optional - [PR #106](https://github.com/sdv-dev/SDGym/issues/106) by @csala
+* Update gretel checkpoints to use temp directory - [PR #105](https://github.com/sdv-dev/SDGym/issues/105) by @katxiao
+* Initialize variable before reference - [PR #104](https://github.com/sdv-dev/SDGym/issues/104) by @katxiao
+
 ## v0.4.0 - 2021-06-17
 
 This release adds new synthesizers for Gretel and ydata, and creates a Docker image for SDGym.
 It also includes enhancements to the accepted SDGym arguments, adds a summary command to aggregate
 metrics, and adds the normalized score to the benchmark results.
 
-## New Features
+### New Features
 
 * Add normalized score to benchmark results - [Issue #102](https://github.com/sdv-dev/SDGym/issues/102) by @katxiao
 * Add max rows and max columns args - [Issue #96](https://github.com/sdv-dev/SDGym/issues/96) by @katxiao

diff --git a/Makefile b/Makefile
@@ -100,6 +100,19 @@ install-ydata-develop: clean-build clean-compile clean-pyc compile ## install th
 	pip install 'ydata-synthetic>=0.3.0,<0.4'
 	pip install -e .[dev]
 
+.PHONY: install-gretel
+install-gretel: clean-build clean-compile clean-pyc compile ## install the package with gretel
+	pip install .[gretel]
+
+.PHONY: install-gretel-develop
+install-gretel-develop: clean-build clean-compile clean-pyc compile ## install the package with gretel and dependencies for development
+	pip install -e .[dev,gretel]
+
+.PHONY: install-all
+install-all: clean-build clean-compile clean-pyc compile ## install the package with gretel and ydata
+	pip install 'ydata-synthetic>=0.3.0,<0.4'
+	pip install .[gretel]
+
 # LINT TARGETS
 
 .PHONY: lint
@@ -126,12 +139,8 @@ test-readme: ## run the readme snippets
 	cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md
 	rm -rf tests/readme_test
 
-# .PHONY: test-tutorials
-# test-tutorials: ## run the tutorial notebooks
-# 	jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 tutorials/*.ipynb --stdout > /dev/null
-
 .PHONY: test
-test: test-unit test-readme # test-tutorials ## test everything that needs test dependencies
+test: test-unit test-readme ## test everything that needs test dependencies
 
 .PHONY: test-devel
 test-devel: lint ## test everything that needs development dependencies
@@ -187,26 +196,31 @@ publish-test: dist publish-confirm ## package and upload a release on TestPyPI
 publish: dist publish-confirm ## package and upload a release
 	twine upload dist/*
 
-.PHONY: bumpversion-release
-bumpversion-release: ## Merge master to stable and bumpversion release
+.PHONY: git-merge-master-stable
+git-merge-master-stable: ## Merge master into stable
 	git checkout stable || git checkout -b stable
 	git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
-	bumpversion release
+
+.PHONY: git-merge-stable-master
+git-merge-stable-master: ## Merge stable into master
+	git checkout master
+	git merge stable
+
+.PHONY: git-push
+git-push: ## Simply push the repository to github
+	git push
+
+.PHONY: git-push-tags-stable
+git-push-tags-stable: ## Push tags and stable to github
 	git push --tags origin stable
 
-.PHONY: bumpversion-release-test
-bumpversion-release-test: ## Merge master to stable and bumpversion release
-	git checkout stable || git checkout -b stable
-	git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
-	bumpversion release --no-tag
-	@echo git push --tags origin stable
+.PHONY: bumpversion-release
+bumpversion-release: ## Bump the version to the next release
+	bumpversion release
 
 .PHONY: bumpversion-patch
-bumpversion-patch: ## Merge stable to master and bumpversion patch
-	git checkout master
-	git merge stable
+bumpversion-patch: ## Bump the version to the next patch
 	bumpversion --no-tag patch
-	git push
 
 .PHONY: bumpversion-candidate
 bumpversion-candidate: ## Bump the version to the next candidate
@@ -222,12 +236,13 @@ bumpversion-major: ## Bump the version the next major skipping the release
 
 .PHONY: bumpversion-revert
 bumpversion-revert: ## Undo a previous bumpversion-release
+	git tag --delete $(shell git tag --points-at HEAD)
 	git checkout master
 	git branch -D stable
 
-CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*")
 CLEAN_DIR := $(shell git status --short | grep -v ??)
 CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*")
 CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l)
 
 .PHONY: check-clean
@@ -236,36 +251,37 @@ ifneq ($(CLEAN_DIR),)
 	$(error There are uncommitted changes)
 endif
 
-.PHONY: check-candidate
-check-candidate: ## Check if a release candidate has been made
-ifeq ($(CURRENT_VERSION),dev0)
-	$(error Please make a release candidate and test it before atempting a release)
-endif
-
 .PHONY: check-master
 check-master: ## Check if we are in master branch
 ifneq ($(CURRENT_BRANCH),master)
 	$(error Please make the release from master branch\n)
 endif
 
+.PHONY: check-candidate
+check-candidate: ## Check if a release candidate has been made
+ifeq ($(CURRENT_VERSION),dev0)
+	$(error Please make a release candidate and test it before atempting a release)
+endif
+
 .PHONY: check-history
 check-history: ## Check if HISTORY.md has been modified
 ifeq ($(CHANGELOG_LINES),0)
 	$(error Please insert the release notes in HISTORY.md before releasing)
 endif
 
 .PHONY: check-release
-check-release: check-candidate check-clean check-master check-history ## Check if the release can be made
+check-release: check-clean check-candidate check-master check-history ## Check if the release can be made
 	@echo "A new release can be made"
 
 .PHONY: release
-release: check-release bumpversion-release publish bumpversion-patch
+release: check-release git-merge-master-stable bumpversion-release git-push-tags-stable \
+	publish git-merge-stable-master bumpversion-patch git-push
 
 .PHONY: release-test
-release-test: check-release bumpversion-release-test publish-test bumpversion-revert
+release-test: check-release git-merge-master-stable bumpversion-release bumpversion-revert
 
 .PHONY: release-candidate
-release-candidate: check-master publish bumpversion-candidate
+release-candidate: check-master publish bumpversion-candidate git-push
 
 .PHONY: release-candidate-test
 release-candidate-test: check-clean check-master publish-test
diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = 'sdgym' %}
-{% set version = '0.4.0' %}
+{% set version = '0.4.1.dev3' %}
 
 package:
   name: "{{ name|lower }}"
@@ -17,52 +17,55 @@ build:
 
 requirements:
   host:
-    - ctgan >=0.2.2.dev1,<0.3
-    - gretel-synthetics >=0.15.4,<0.16
-    - humanfriendly >=8.2,<9
-    - numpy >=1.15.4,<2
-    - pandas >=0.23.4,<2
     - pip
-    - pomegranate >=0.13.0,<0.13.5
+    - pytest-runner
+    - graphviz
+    - python >=3.6,<3.9
+    - appdirs >=1.1.4,<2
+    - boto3 >=1.15.0,<2
+    - botocore >=1.20,<2
+    - compress-pickle >=1.2.0,<2
+    - humanfriendly >=8.2,<9
+    - numpy >=1.18.0,<2
+    - pandas >=1.1,<1.1.5
+    - pomegranate >=0.13.4,<0.14.2
     - psutil >=5.7,<6
-    - python
-    - scikit-learn >=0.20,<0.24
-    - scipy >=1.3.0,<2
-    - sdv >=0.4.4.dev0,<0.6
+    - rdt >=0.4.1
+    - sdmetrics >=0.3.0
+    - sdv >=0.9.0
+    - scikit-learn >=0.23,<1
     - tabulate >=0.8.3,<0.9
-    - pytorch >=1.1.0,<2
-    - tensorflow ==2.4.0rc1
-    - torchvision >=0.3.0
-    - tqdm >=4,<5
-    - xlsxwriter >=1.2.8,<1.3
-    - pytest-runner
+    - torch >=1.4,<2
+    - tqdm >=4.14,<5
+    - XlsxWriter >=1.2.8,<1.3
   run:
-    - ctgan >=0.2.2.dev1,<0.3
-    - gretel-synthetics >=0.15.4,<0.16
+    - python >=3.6,<3.9
+    - appdirs >=1.1.4,<2
+    - boto3 >=1.15.0,<2
+    - botocore >=1.20,<2
+    - compress-pickle >=1.2.0,<2
     - humanfriendly >=8.2,<9
-    - numpy >=1.15.4,<2
-    - pandas >=0.23.4,<2
-    - pomegranate >=0.13.0,<0.13.5
+    - numpy >=1.18.0,<2
+    - pandas >=1.1,<1.1.5
+    - pomegranate >=0.13.4,<0.14.2
     - psutil >=5.7,<6
-    - python
-    - scikit-learn >=0.20,<0.24
-    - scipy >=1.3.0,<2
-    - sdv >=0.4.4.dev0,<0.6
+    - rdt >=0.4.1
+    - sdmetrics >=0.3.0
+    - sdv >=0.9.0
+    - scikit-learn >=0.23,<1
     - tabulate >=0.8.3,<0.9
-    - pytorch >=1.1.0,<2
-    - tensorflow ==2.4.0rc1
-    - torchvision >=0.3.0
-    - tqdm >=4,<5
-    - xlsxwriter >=1.2.8,<1.3
+    - torch >=1.4,<2
+    - tqdm >=4.14,<5
+    - XlsxWriter >=1.2.8,<1.3
 
 about:
   home: "https://github.com/sdv-dev/SDGym"
   license: MIT
   license_family: MIT
-  license_file: 
+  license_file:
   summary: "A framework to benchmark the performance of synthetic data generators for non-temporal tabular data"
-  doc_url: 
-  dev_url: 
+  doc_url:
+  dev_url:
 
 extra:
   recipe-maintainers:

diff --git a/sdgym/__init__.py b/sdgym/__init__.py
@@ -8,7 +8,7 @@
 __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
 __email__ = '[email protected]'
 __license__ = 'MIT'
-__version__ = '0.4.0'
+__version__ = '0.4.1.dev3'
 
 from sdgym import benchmark, synthesizers
 from sdgym.benchmark import run

diff --git a/sdgym/__main__.py b/sdgym/__main__.py
@@ -13,6 +13,8 @@
 import tqdm
 
 import sdgym
+from sdgym.synthesizers.base import Baseline
+from sdgym.utils import get_synthesizers
 
 
 def _env_setup(logfile, verbosity):
@@ -134,6 +136,11 @@ def _list_available(args):
     _print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})
 
 
+def _list_synthesizers(args):
+    synthesizers = Baseline.get_baselines()
+    _print_table(pd.DataFrame(get_synthesizers(list(synthesizers))))
+
+
 def _collect(args):
     sdgym.collect.collect_results(args.input_path, args.output_file, args.aws_key, args.aws_secret)
 
@@ -241,6 +248,11 @@ def _get_parser():
     list_available.add_argument('-as', '--aws-secret', type=str, required=False,
                                 help='Aws secret access key to use when reading datasets.')
 
+    # list-synthesizers
+    list_available = action.add_parser('list-synthesizers',
+                                       help='List synthesizers available for use.')
+    list_available.set_defaults(action=_list_synthesizers)
+
     # collect
     collect = action.add_parser('collect', help='Collect sdgym results.')
     collect.set_defaults(action=_collect)

diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py
@@ -11,7 +11,6 @@
 import compress_pickle
 import numpy as np
 import pandas as pd
-import torch
 import tqdm
 
 from sdgym.datasets import get_dataset_paths, load_dataset, load_tables
@@ -20,6 +19,7 @@
 from sdgym.progress import TqdmLogger, progress
 from sdgym.s3 import is_s3_path, write_csv, write_file
 from sdgym.synthesizers.base import Baseline
+from sdgym.synthesizers.utils import get_num_gpus
 from sdgym.utils import (
     build_synthesizer, format_exception, get_synthesizers, import_object, used_memory)
 
@@ -72,6 +72,7 @@ def _compute_scores(metrics, real_data, synthetic_data, metadata, output):
 
         error = None
         score = None
+        normalized_score = None
         start = datetime.utcnow()
         try:
             LOGGER.info('Computing %s on dataset %s', metric_name, metadata._metadata['name'])
@@ -309,8 +310,9 @@ def run(synthesizers=None, datasets=None, datasets_path=None, modalities=None, b
     run_id = os.getenv('RUN_ID') or str(uuid.uuid4())[:10]
 
     if workers == -1:
-        if torch.cuda.is_available():
-            workers = torch.cuda.device_count()
+        num_gpus = get_num_gpus()
+        if num_gpus > 0:
+            workers = num_gpus
         else:
             workers = multiprocessing.cpu_count()
 

diff --git a/sdgym/synthesizers/__init__.py b/sdgym/synthesizers/__init__.py
@@ -23,7 +23,6 @@
     'CTGAN',
     'Uniform',
     'VEEGAN',
-    'CTGAN',
     'CopulaGAN',
     'GaussianCopulaCategorical',
     'GaussianCopulaCategoricalFuzzy',