diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 00000000..e4db0356
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,2 @@
+# Global rule:
+* @sdv-dev/core-contributors
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 00000000..22f701b4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,33 @@
+---
+name: Bug report
+about: Report an error that you found when using SDGym
+title: ''
+labels: bug, pending review
+assignees: ''
+
+---
+
+### Environment Details
+
+Please indicate the following details about the environment in which you found the bug:
+
+* SDGym version:
+* Python version:
+* Operating System:
+
+### Error Description
+
+
+
+### Steps to reproduce
+
+
+
+```
+Paste the command(s) you ran and the output.
+If there was a crash, please include the traceback here.
+```
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 00000000..75f16bca
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,24 @@
+---
+name: Feature request
+about: Request a new feature that you would like to see implemented in SDGym
+title: ''
+labels: new feature, pending review
+assignees: ''
+
+---
+
+### Problem Description
+
+
+
+### Expected behavior
+
+
+
+### Additional context
+
+
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md
new file mode 100644
index 00000000..222d3f44
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -0,0 +1,33 @@
+---
+name: Question
+about: Doubts about SDGym usage
+title: ''
+labels: question, pending review
+assignees: ''
+
+---
+
+### Environment details
+
+If you are already running SDGym, please indicate the following details about the environment in
+which you are running it:
+
+* SDGym version:
+* Python version:
+* Operating System:
+
+### Problem description
+
+
+
+### What I already tried
+
+
+
+```
+Paste the command(s) you ran and the output.
+If there was a crash, please include the traceback here.
+```
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
new file mode 100644
index 00000000..c3c2e8ac
--- /dev/null
+++ b/.github/workflows/integration.yml
@@ -0,0 +1,31 @@
+name: Integration Tests
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ unit:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8, 3.9]
+ os: [ubuntu-latest, macos-10.15, windows-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - if: matrix.os == 'windows-latest'
+ name: Install dependencies - Windows
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install 'torch==1.8.0' -f https://download.pytorch.org/whl/cpu/torch/
+ python -m pip install 'torchvision==0.9.0' -f https://download.pytorch.org/whl/cpu/torchvision/
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install invoke .[test]
+ - name: Run integration tests
+ run: invoke integration
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 00000000..0630c219
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,21 @@
+name: Style Checks
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ lint:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python 3.8
+ uses: actions/setup-python@v1
+ with:
+ python-version: 3.8
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install invoke .[dev]
+ - name: Run lint checks
+ run: invoke lint
diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml
new file mode 100644
index 00000000..cb2f3af5
--- /dev/null
+++ b/.github/workflows/minimum.yml
@@ -0,0 +1,31 @@
+name: Unit Tests Minimum Versions
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ minimum:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8, 3.9]
+ os: [ubuntu-latest, macos-10.15, windows-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - if: matrix.os == 'windows-latest'
+ name: Install dependencies - Windows
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install 'torch==1.8.0' -f https://download.pytorch.org/whl/cpu/torch/
+ python -m pip install 'torchvision==0.9.0' -f https://download.pytorch.org/whl/cpu/torchvision/
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install invoke .[test]
+ - name: Test with minimum versions
+ run: invoke minimum
diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml
new file mode 100644
index 00000000..2fe4b64c
--- /dev/null
+++ b/.github/workflows/readme.yml
@@ -0,0 +1,25 @@
+name: Test README
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ readme:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8, 3.9]
+ os: [ubuntu-latest, macos-10.15] # skip windows bc rundoc fails
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install invoke rundoc .
+ - name: Run the README.md
+ run: invoke readme
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
deleted file mode 100644
index 6a0e7c08..00000000
--- a/.github/workflows/tests.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-name: Run Tests
-
-on:
- push:
- branches: [ '*' ]
- pull_request:
- branches: [ master ]
-
-jobs:
- build:
- runs-on: ${{ matrix.os }}
- strategy:
- matrix:
- python-version: [3.6, 3.7, 3.8]
- os: [ubuntu-latest, macos-latest]
-
- steps:
- - uses: actions/checkout@v1
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install tox tox-gh-actions
-
- - name: Test with tox
- run: tox
diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml
new file mode 100644
index 00000000..18558731
--- /dev/null
+++ b/.github/workflows/unit.yml
@@ -0,0 +1,34 @@
+name: Unit Tests
+
+on:
+ - push
+ - pull_request
+
+jobs:
+ unit:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ matrix:
+ python-version: [3.6, 3.7, 3.8, 3.9]
+ os: [ubuntu-latest, macos-10.15, windows-latest]
+ steps:
+ - uses: actions/checkout@v1
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v1
+ with:
+ python-version: ${{ matrix.python-version }}
+ - if: matrix.os == 'windows-latest'
+ name: Install dependencies - Windows
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install 'torch==1.8.0' -f https://download.pytorch.org/whl/cpu/torch/
+ python -m pip install 'torchvision==0.9.0' -f https://download.pytorch.org/whl/cpu/torchvision/
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install invoke .[test]
+ - name: Run unit tests
+ run: invoke unit
+ - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.8
+ name: Upload codecov report
+ uses: codecov/codecov-action@v2
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 09c9e3e1..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-# Config file for automatic testing at travis-ci.org
-dist: bionic
-language: python
-python:
- - 3.8
- - 3.7
- - 3.6
-
-# Command to install dependencies
-install:
- - pip install -U tox-travis codecov
-
-after_success: codecov
-
-# Command to run tests
-script: tox
diff --git a/Dockerfile b/Dockerfile
index f4685b19..b62959fe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,10 @@
-FROM python:3.8-buster
-RUN apt-get update && apt-get install -y build-essential
+FROM nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04
+CMD nvidia-smi
+
+RUN apt-get update && apt-get install -y build-essential && apt-get -y install curl
+RUN apt-get -y install python3.8 python3-distutils && ln -s /usr/bin/python3.8 /usr/bin/python
+RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \
+ python get-pip.py && ln -s /usr/bin/pip3 /usr/bin/pip
RUN mkdir /SDGym && \
mkdir /SDGym/sdgym && \
@@ -13,7 +18,8 @@ COPY /privbayes/ /SDGym/privbayes
WORKDIR /SDGym
# Install project
-RUN make install-ydata compile
+RUN make install-all compile
+RUN pip install -U numpy==1.20
ENV PRIVBAYES_BIN /SDGym/privbayes/privBayes.bin
ENV TF_CPP_MIN_LOG_LEVEL 2
diff --git a/HISTORY.md b/HISTORY.md
index 2fe72943..fd6af46b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,12 +1,39 @@
# History
+## v0.5.0 - 2021-12-13
+This release adds support for Python 3.9, and updates dependencies to accept the latest versions when possible.
+
+### Issues closed
+
+* Add support for Python 3.9 - [Issue #127](https://github.com/sdv-dev/SDGym/issues/127) by @katxiao
+* Add pip check workflow - [Issue #124](https://github.com/sdv-dev/SDGym/issues/124) by @pvk-developer
+* Fix meta.yaml dependencies - [PR #119](https://github.com/sdv-dev/SDGym/pull/119) by @fealho
+* Upgrade dependency ranges - [Issue #118](https://github.com/sdv-dev/SDGym/issues/118) by @katxiao
+
+## v0.4.1 - 2021-08-20
+This release fixed a bug where passing a `json` file as configuration for a multi-table synthesizer crashed the model.
+It also adds a number of fixes and enhancements, including: (1) a function and CLI command to list the available synthesizer names,
+(2) a curate set of dependencies and making `Gretel` into an optional dependency, (3) updating `Gretel` to use temp directories,
+(4) using `nvidia-smi` to get the number of gpus and (5) multiple `dockerfile` updates to improve functionality.
+
+### Issues closed
+
+* Bug when using JSON configuration for multiple multi-table evaluation - [Issue #115](https://github.com/sdv-dev/SDGym/issues/115) by @pvk-developer
+* Use nvidia-smi to get number of gpus - [PR #113](https://github.com/sdv-dev/SDGym/issues/113) by @katxiao
+* List synthesizer names - [Issue #82](https://github.com/sdv-dev/SDGym/issues/82) by @fealho
+* Use nvidia base for dockerfile - [PR #108](https://github.com/sdv-dev/SDGym/issues/108) by @katxiao
+* Add Makefile target to install gretel and ydata - [PR #107](https://github.com/sdv-dev/SDGym/issues/107) by @katxiao
+* Curate dependencies and make Gretel optional - [PR #106](https://github.com/sdv-dev/SDGym/issues/106) by @csala
+* Update gretel checkpoints to use temp directory - [PR #105](https://github.com/sdv-dev/SDGym/issues/105) by @katxiao
+* Initialize variable before reference - [PR #104](https://github.com/sdv-dev/SDGym/issues/104) by @katxiao
+
## v0.4.0 - 2021-06-17
This release adds new synthesizers for Gretel and ydata, and creates a Docker image for SDGym.
It also includes enhancements to the accepted SDGym arguments, adds a summary command to aggregate
metrics, and adds the normalized score to the benchmark results.
-## New Features
+### New Features
* Add normalized score to benchmark results - [Issue #102](https://github.com/sdv-dev/SDGym/issues/102) by @katxiao
* Add max rows and max columns args - [Issue #96](https://github.com/sdv-dev/SDGym/issues/96) by @katxiao
diff --git a/INSTALL.md b/INSTALL.md
index 0c228a0a..497304d8 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -2,7 +2,7 @@
## Requirements
-**SDGym** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/)
+**SDGym** has been developed and tested on [Python 3.6, 3.7, 3.8 and 3.9](https://www.python.org/downloads/)
Also, although it is not strictly required, the usage of a [virtualenv](
https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid
@@ -94,4 +94,4 @@ export PRIVBAYES_BIN=$(pwd)/privBayes.bin
## Run using Docker
-We support using Docker to run **SDGym**. For more information on how to do so, check the `DOCKER.md` file.
\ No newline at end of file
+We support using Docker to run **SDGym**. For more information on how to do so, check the `DOCKER.md` file.
diff --git a/Makefile b/Makefile
index cd9888bc..d3f74d57 100644
--- a/Makefile
+++ b/Makefile
@@ -47,12 +47,6 @@ clean-pyc: ## remove Python file artifacts
find . -name '*~' -exec rm -f {} +
find . -name '__pycache__' -exec rm -fr {} +
-.PHONY: clean-docs
-clean-docs: ## remove previously built docs
- rm -rf docs/_build
- rm -f docs/api/*.rst
- -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed
-
.PHONY: clean-coverage
clean-coverage: ## remove coverage artifacts
rm -f .coverage
@@ -69,7 +63,7 @@ clean-compile: ## remove compile artifacts
rm -fr __privbn_tmp/
.PHONY: clean
-clean: clean-build clean-compile clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts
+clean: clean-build clean-compile clean-pyc clean-test clean-coverage ## remove all build, test, coverage and Python artifacts
# INSTALL TARGETS
@@ -100,6 +94,19 @@ install-ydata-develop: clean-build clean-compile clean-pyc compile ## install th
pip install 'ydata-synthetic>=0.3.0,<0.4'
pip install -e .[dev]
+.PHONY: install-gretel
+install-gretel: clean-build clean-compile clean-pyc compile ## install the package with gretel
+ pip install .[gretel]
+
+.PHONY: install-gretel-develop
+install-gretel-develop: clean-build clean-compile clean-pyc compile ## install the package with gretel and dependencies for development
+ pip install -e .[dev,gretel]
+
+.PHONY: install-all
+install-all: clean-build clean-compile clean-pyc compile ## install the package with gretel and ydata
+ pip install 'ydata-synthetic>=0.3.0,<0.4'
+ pip install .[gretel]
+
# LINT TARGETS
.PHONY: lint
@@ -126,12 +133,8 @@ test-readme: ## run the readme snippets
cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md
rm -rf tests/readme_test
-# .PHONY: test-tutorials
-# test-tutorials: ## run the tutorial notebooks
-# jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 tutorials/*.ipynb --stdout > /dev/null
-
.PHONY: test
-test: test-unit test-readme # test-tutorials ## test everything that needs test dependencies
+test: test-unit test-readme ## test everything that needs test dependencies
.PHONY: test-devel
test-devel: lint ## test everything that needs development dependencies
@@ -148,22 +151,6 @@ coverage: ## check code coverage quickly with the default Python
$(BROWSER) htmlcov/index.html
-# DOCS TARGETS
-
-.PHONY: docs
-docs: clean-docs ## generate Sphinx HTML documentation, including API docs
- sphinx-apidoc --separate -T -o docs/api/ sdgym
- $(MAKE) -C docs html
-
-.PHONY: view-docs
-view-docs: docs ## view docs in browser
- $(BROWSER) docs/_build/html/index.html
-
-.PHONY: serve-docs
-serve-docs: view-docs ## compile the docs watching for changes
- watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' .
-
-
# RELEASE TARGETS
.PHONY: dist
@@ -187,26 +174,31 @@ publish-test: dist publish-confirm ## package and upload a release on TestPyPI
publish: dist publish-confirm ## package and upload a release
twine upload dist/*
-.PHONY: bumpversion-release
-bumpversion-release: ## Merge master to stable and bumpversion release
+.PHONY: git-merge-master-stable
+git-merge-master-stable: ## Merge master into stable
git checkout stable || git checkout -b stable
git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
- bumpversion release
+
+.PHONY: git-merge-stable-master
+git-merge-stable-master: ## Merge stable into master
+ git checkout master
+ git merge stable
+
+.PHONY: git-push
+git-push: ## Simply push the repository to github
+ git push
+
+.PHONY: git-push-tags-stable
+git-push-tags-stable: ## Push tags and stable to github
git push --tags origin stable
-.PHONY: bumpversion-release-test
-bumpversion-release-test: ## Merge master to stable and bumpversion release
- git checkout stable || git checkout -b stable
- git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable"
- bumpversion release --no-tag
- @echo git push --tags origin stable
+.PHONY: bumpversion-release
+bumpversion-release: ## Bump the version to the next release
+ bumpversion release
.PHONY: bumpversion-patch
-bumpversion-patch: ## Merge stable to master and bumpversion patch
- git checkout master
- git merge stable
+bumpversion-patch: ## Bump the version to the next patch
bumpversion --no-tag patch
- git push
.PHONY: bumpversion-candidate
bumpversion-candidate: ## Bump the version to the next candidate
@@ -222,12 +214,13 @@ bumpversion-major: ## Bump the version the next major skipping the release
.PHONY: bumpversion-revert
bumpversion-revert: ## Undo a previous bumpversion-release
+ git tag --delete $(shell git tag --points-at HEAD)
git checkout master
git branch -D stable
-CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*")
CLEAN_DIR := $(shell git status --short | grep -v ??)
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*")
CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l)
.PHONY: check-clean
@@ -236,18 +229,18 @@ ifneq ($(CLEAN_DIR),)
$(error There are uncommitted changes)
endif
-.PHONY: check-candidate
-check-candidate: ## Check if a release candidate has been made
-ifeq ($(CURRENT_VERSION),dev0)
- $(error Please make a release candidate and test it before atempting a release)
-endif
-
.PHONY: check-master
check-master: ## Check if we are in master branch
ifneq ($(CURRENT_BRANCH),master)
$(error Please make the release from master branch\n)
endif
+.PHONY: check-candidate
+check-candidate: ## Check if a release candidate has been made
+ifeq ($(CURRENT_VERSION),dev0)
+	$(error Please make a release candidate and test it before attempting a release)
+endif
+
.PHONY: check-history
check-history: ## Check if HISTORY.md has been modified
ifeq ($(CHANGELOG_LINES),0)
@@ -255,17 +248,18 @@ ifeq ($(CHANGELOG_LINES),0)
endif
.PHONY: check-release
-check-release: check-candidate check-clean check-master check-history ## Check if the release can be made
+check-release: check-clean check-candidate check-master check-history ## Check if the release can be made
@echo "A new release can be made"
.PHONY: release
-release: check-release bumpversion-release publish bumpversion-patch
+release: check-release git-merge-master-stable bumpversion-release git-push-tags-stable \
+ publish git-merge-stable-master bumpversion-patch git-push
.PHONY: release-test
-release-test: check-release bumpversion-release-test publish-test bumpversion-revert
+release-test: check-release git-merge-master-stable bumpversion-release bumpversion-revert
.PHONY: release-candidate
-release-candidate: check-master publish bumpversion-candidate
+release-candidate: check-master publish bumpversion-candidate git-push
.PHONY: release-candidate-test
release-candidate-test: check-clean check-master publish-test
diff --git a/README.md b/README.md
index 91322824..1653a5a0 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@
[![PyPi Shield](https://img.shields.io/pypi/v/sdgym.svg)](https://pypi.python.org/pypi/sdgym)
[![Downloads](https://pepy.tech/badge/sdgym)](https://pepy.tech/project/sdgym)
-
+
Benchmarking framework for Synthetic Data Generators
diff --git a/conda/meta.yaml b/conda/meta.yaml
index c60d039a..9d43a4d1 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -1,5 +1,5 @@
{% set name = 'sdgym' %}
-{% set version = '0.4.0' %}
+{% set version = '0.5.0.dev1' %}
package:
name: "{{ name|lower }}"
@@ -17,52 +17,53 @@ build:
requirements:
host:
- - ctgan >=0.2.2.dev1,<0.3
- - gretel-synthetics >=0.15.4,<0.16
- - humanfriendly >=8.2,<9
- - numpy >=1.15.4,<2
- - pandas >=0.23.4,<2
- pip
- - pomegranate >=0.13.0,<0.13.5
+ - pytest-runner
+ - graphviz
+    - python >=3.6,<3.10
+ - appdirs >=1.3,<2
+ - boto3 >=1.15.0,<2
+ - botocore >=1.18,<2
+ - humanfriendly >=8.2,<11
+ - numpy >=1.18.0,<2
+ - pandas >=1.1.3,<2
+    - pomegranate >=0.14.1,<0.15
- psutil >=5.7,<6
- - python
- - scikit-learn >=0.20,<0.24
- - scipy >=1.3.0,<2
- - sdv >=0.4.4.dev0,<0.6
+ - scikit-learn >=0.24,<2
- tabulate >=0.8.3,<0.9
- - pytorch >=1.1.0,<2
- - tensorflow ==2.4.0rc1
- - torchvision >=0.3.0
- - tqdm >=4,<5
- - xlsxwriter >=1.2.8,<1.3
- - pytest-runner
+ - pytorch >=1.8.0,<2
+ - tqdm >=4.14,<5
+ - XlsxWriter >=1.2.8,<4
+ - rdt >=0.4.1,<0.6
+ - sdmetrics >=0.4.1,<0.5
+ - sdv >=0.9.0
run:
- - ctgan >=0.2.2.dev1,<0.3
- - gretel-synthetics >=0.15.4,<0.16
- - humanfriendly >=8.2,<9
- - numpy >=1.15.4,<2
- - pandas >=0.23.4,<2
- - pomegranate >=0.13.0,<0.13.5
+    - python >=3.6,<3.10
+ - appdirs >=1.3,<2
+ - boto3 >=1.15.0,<2
+ - botocore >=1.18,<2
+ - humanfriendly >=8.2,<11
+ - numpy >=1.18.0,<2
+ - pandas >=1.1.3,<2
+    - pomegranate >=0.14.1,<0.15
- psutil >=5.7,<6
- - python
- - scikit-learn >=0.20,<0.24
- - scipy >=1.3.0,<2
- - sdv >=0.4.4.dev0,<0.6
+ - scikit-learn >=0.24,<2
- tabulate >=0.8.3,<0.9
- - pytorch >=1.1.0,<2
- - tensorflow ==2.4.0rc1
- - torchvision >=0.3.0
- - tqdm >=4,<5
- - xlsxwriter >=1.2.8,<1.3
+ - pytorch >=1.8.0,<2
+ - tqdm >=4.14,<5
+ - XlsxWriter >=1.2.8,<4
+ - rdt >=0.4.1,<0.6
+ - sdmetrics >=0.4.1,<0.5
+ - sdv >=0.9.0
about:
home: "https://github.com/sdv-dev/SDGym"
license: MIT
license_family: MIT
- license_file:
+ license_file:
summary: "A framework to benchmark the performance of synthetic data generators for non-temporal tabular data"
- doc_url:
- dev_url:
+ doc_url:
+ dev_url:
extra:
recipe-maintainers:
diff --git a/docs/.nojekyll b/docs/.nojekyll
deleted file mode 100644
index e69de29b..00000000
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 5e737062..00000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS =
-SPHINXBUILD = python -msphinx
-SPHINXPROJ = sdgym
-SOURCEDIR = .
-BUILDDIR = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index da9825e6..00000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# -*- coding: utf-8 -*-
-#
-# sdgym documentation build configuration file, created by
-# sphinx-quickstart on Fri Jan 6 13:06:48 2017.
-#
-# This file is execfile()d with the current directory set to its
-# containing dir.
-#
-# Note that not all possible configuration values are present in this
-# autogenerated file.
-#
-# All configuration values have a default; values that are commented out
-# serve to show the default.
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import sphinx_rtd_theme # For read the docs theme
-
-import sdgym
-
-# -- General configuration ---------------------------------------------
-
-# If your documentation needs a minimal Sphinx version, state it here.
-#
-# needs_sphinx = '1.0'
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
-extensions = [
- 'm2r',
- 'sphinx.ext.autodoc',
- 'sphinx.ext.githubpages',
- 'sphinx.ext.viewcode',
- 'sphinx.ext.napoleon',
- 'autodocsumm',
-]
-
-autodoc_default_options = {
- 'autosummary': True,
-}
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The suffix(es) of source filenames.
-# You can specify multiple suffix as a list of string:
-source_suffix = ['.rst', '.md']
-
-# The master toctree document.
-master_doc = 'index'
-
-# General information about the project.
-project = 'SDGym'
-slug = 'sdgym'
-title = project + ' Documentation'
-copyright = '2019, MIT Data to AI Lab'
-author = 'MIT Data To AI Lab'
-description = (
- 'Synthetic Data Gym: A framework to benchmark the performance of synthetic data generators '
- 'for non-temporal tabular data.'
-)
-user = 'sdv-dev'
-
-# The version info for the project you're documenting, acts as replacement
-# for |version| and |release|, also used in various other places throughout
-# the built documents.
-#
-# The short X.Y version.
-version = sdgym.__version__
-# The full version, including alpha/beta/rc tags.
-release = sdgym.__version__
-
-# The language for content autogenerated by Sphinx. Refer to documentation
-# for a list of supported languages.
-#
-# This is also used if you do content translation via gettext catalogs.
-# Usually you set "language" from the command line for these cases.
-language = None
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store',
- '**.ipynb_checkpoints', '**sdgym.synthesizers*.rst']
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# If true, `todo` and `todoList` produce output, else they produce nothing.
-todo_include_todos = False
-
-
-# -- Options for HTML output -------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-html_theme = 'sphinx_rtd_theme'
-html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
-
-# Readthedocs additions
-html_context = {
- 'display_github': True,
- 'github_user': user,
- 'github_repo': project,
- 'github_version': 'master',
- 'conf_py_path': '/docs/',
-}
-
-# Theme options are theme-specific and customize the look and feel of a
-# theme further. For a list of options available for each theme, see the
-# documentation.
-html_theme_options = {
- 'collapse_navigation': False,
- 'display_version': False,
- 'logo_only': True
-}
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-
-# The name of an image file (relative to this directory) to use as a favicon of
-# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
-# pixels large.
-html_favicon = 'images/dai-logo-white.ico'
-
-html_logo = '../resources/header_light.png'
-
-# -- Options for HTMLHelp output ---------------------------------------
-
-# Output file base name for HTML help builder.
-htmlhelp_basename = slug + 'doc'
-
-
-# -- Options for LaTeX output ------------------------------------------
-
-latex_elements = {
-}
-
-# Grouping the document tree into LaTeX files. List of tuples
-# (source start file, target name, title, author, documentclass
-# [howto, manual, or own class]).
-latex_documents = [(
- master_doc,
- slug + '.tex',
- title,
- author,
- 'manual'
-)]
-
-
-# -- Options for manual page output ------------------------------------
-
-# One entry per manual page. List of tuples
-# (source start file, name, description, authors, manual section).
-man_pages = [(
- master_doc,
- slug,
- title,
- [author],
- 1
-)]
-
-
-# -- Options for Texinfo output ----------------------------------------
-
-# Grouping the document tree into Texinfo files. List of tuples
-# (source start file, target name, title, author,
-# dir menu entry, description, category)
-texinfo_documents = [(
- master_doc,
- slug,
- title,
- author,
- slug,
- description,
- 'Miscellaneous'
-)]
diff --git a/docs/contributing.rst b/docs/contributing.rst
deleted file mode 100644
index e582053e..00000000
--- a/docs/contributing.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. include:: ../CONTRIBUTING.rst
diff --git a/docs/history.rst b/docs/history.rst
deleted file mode 100644
index d26e5be8..00000000
--- a/docs/history.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. mdinclude:: ../HISTORY.md
diff --git a/docs/images/dai-logo-white-200.png b/docs/images/dai-logo-white-200.png
deleted file mode 100644
index 58bb9fde..00000000
Binary files a/docs/images/dai-logo-white-200.png and /dev/null differ
diff --git a/docs/images/dai-logo-white.ico b/docs/images/dai-logo-white.ico
deleted file mode 100644
index aab7ae8c..00000000
Binary files a/docs/images/dai-logo-white.ico and /dev/null differ
diff --git a/docs/images/misc/alarm.png b/docs/images/misc/alarm.png
deleted file mode 100644
index c2b92b7b..00000000
Binary files a/docs/images/misc/alarm.png and /dev/null differ
diff --git a/docs/images/misc/asia.png b/docs/images/misc/asia.png
deleted file mode 100644
index add7426e..00000000
Binary files a/docs/images/misc/asia.png and /dev/null differ
diff --git a/docs/images/misc/child.png b/docs/images/misc/child.png
deleted file mode 100644
index b602290e..00000000
Binary files a/docs/images/misc/child.png and /dev/null differ
diff --git a/docs/images/misc/grid.jpg b/docs/images/misc/grid.jpg
deleted file mode 100644
index 42f4daa5..00000000
Binary files a/docs/images/misc/grid.jpg and /dev/null differ
diff --git a/docs/images/misc/gridr.jpg b/docs/images/misc/gridr.jpg
deleted file mode 100644
index b4a844d1..00000000
Binary files a/docs/images/misc/gridr.jpg and /dev/null differ
diff --git a/docs/images/misc/insurance.png b/docs/images/misc/insurance.png
deleted file mode 100644
index 56c89936..00000000
Binary files a/docs/images/misc/insurance.png and /dev/null differ
diff --git a/docs/images/misc/res/1.jpg b/docs/images/misc/res/1.jpg
deleted file mode 100644
index 74e82740..00000000
Binary files a/docs/images/misc/res/1.jpg and /dev/null differ
diff --git a/docs/images/misc/res/10.jpg b/docs/images/misc/res/10.jpg
deleted file mode 100644
index 90922631..00000000
Binary files a/docs/images/misc/res/10.jpg and /dev/null differ
diff --git a/docs/images/misc/res/11.jpg b/docs/images/misc/res/11.jpg
deleted file mode 100644
index f0602e9d..00000000
Binary files a/docs/images/misc/res/11.jpg and /dev/null differ
diff --git a/docs/images/misc/res/12.jpg b/docs/images/misc/res/12.jpg
deleted file mode 100644
index e2931323..00000000
Binary files a/docs/images/misc/res/12.jpg and /dev/null differ
diff --git a/docs/images/misc/res/13.jpg b/docs/images/misc/res/13.jpg
deleted file mode 100644
index 25561032..00000000
Binary files a/docs/images/misc/res/13.jpg and /dev/null differ
diff --git a/docs/images/misc/res/14.jpg b/docs/images/misc/res/14.jpg
deleted file mode 100644
index 1e154588..00000000
Binary files a/docs/images/misc/res/14.jpg and /dev/null differ
diff --git a/docs/images/misc/res/15.jpg b/docs/images/misc/res/15.jpg
deleted file mode 100644
index 52548d8a..00000000
Binary files a/docs/images/misc/res/15.jpg and /dev/null differ
diff --git a/docs/images/misc/res/2.jpg b/docs/images/misc/res/2.jpg
deleted file mode 100644
index 0b29b2ea..00000000
Binary files a/docs/images/misc/res/2.jpg and /dev/null differ
diff --git a/docs/images/misc/res/3.jpg b/docs/images/misc/res/3.jpg
deleted file mode 100644
index d22f0a2f..00000000
Binary files a/docs/images/misc/res/3.jpg and /dev/null differ
diff --git a/docs/images/misc/res/4.jpg b/docs/images/misc/res/4.jpg
deleted file mode 100644
index 365f08a4..00000000
Binary files a/docs/images/misc/res/4.jpg and /dev/null differ
diff --git a/docs/images/misc/res/5.jpg b/docs/images/misc/res/5.jpg
deleted file mode 100644
index 7c050701..00000000
Binary files a/docs/images/misc/res/5.jpg and /dev/null differ
diff --git a/docs/images/misc/res/6.jpg b/docs/images/misc/res/6.jpg
deleted file mode 100644
index 7e89a74d..00000000
Binary files a/docs/images/misc/res/6.jpg and /dev/null differ
diff --git a/docs/images/misc/res/7.jpg b/docs/images/misc/res/7.jpg
deleted file mode 100644
index 7c6fa1d3..00000000
Binary files a/docs/images/misc/res/7.jpg and /dev/null differ
diff --git a/docs/images/misc/res/8.jpg b/docs/images/misc/res/8.jpg
deleted file mode 100644
index 414296b2..00000000
Binary files a/docs/images/misc/res/8.jpg and /dev/null differ
diff --git a/docs/images/misc/res/9.jpg b/docs/images/misc/res/9.jpg
deleted file mode 100644
index aa89bcbe..00000000
Binary files a/docs/images/misc/res/9.jpg and /dev/null differ
diff --git a/docs/images/misc/res/c.jpg b/docs/images/misc/res/c.jpg
deleted file mode 100644
index 6eddb2eb..00000000
Binary files a/docs/images/misc/res/c.jpg and /dev/null differ
diff --git a/docs/images/misc/ring.jpg b/docs/images/misc/ring.jpg
deleted file mode 100644
index e660bb47..00000000
Binary files a/docs/images/misc/ring.jpg and /dev/null differ
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index a17c7634..00000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-.. include:: readme.rst
-
-.. toctree::
- :caption: Getting Started
- :maxdepth: 2
-
- Overview
-
-.. toctree::
- :caption: Resources
-
- API Reference
- contributing
- history
-
-Indices and tables
-==================
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/readme.rst b/docs/readme.rst
deleted file mode 100644
index 97d49585..00000000
--- a/docs/readme.rst
+++ /dev/null
@@ -1 +0,0 @@
-.. mdinclude:: ../README.md
diff --git a/docs/resources/header.png b/docs/resources/header.png
deleted file mode 100644
index 1b900e32..00000000
Binary files a/docs/resources/header.png and /dev/null differ
diff --git a/docs/resources/header_light.png b/docs/resources/header_light.png
deleted file mode 100644
index f92d95e5..00000000
Binary files a/docs/resources/header_light.png and /dev/null differ
diff --git a/sdgym/__init__.py b/sdgym/__init__.py
index c07605a0..559dd9c2 100644
--- a/sdgym/__init__.py
+++ b/sdgym/__init__.py
@@ -8,7 +8,7 @@
__copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab'
__email__ = 'dailabmit@gmail.com'
__license__ = 'MIT'
-__version__ = '0.4.0'
+__version__ = '0.5.0.dev1'
from sdgym import benchmark, synthesizers
from sdgym.benchmark import run
diff --git a/sdgym/__main__.py b/sdgym/__main__.py
index 98e02658..aa31ee3d 100644
--- a/sdgym/__main__.py
+++ b/sdgym/__main__.py
@@ -13,6 +13,8 @@
import tqdm
import sdgym
+from sdgym.synthesizers.base import Baseline
+from sdgym.utils import get_synthesizers
def _env_setup(logfile, verbosity):
@@ -134,6 +136,11 @@ def _list_available(args):
_print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size})
+def _list_synthesizers(args):
+ synthesizers = Baseline.get_baselines()
+ _print_table(pd.DataFrame(get_synthesizers(list(synthesizers))))
+
+
def _collect(args):
sdgym.collect.collect_results(args.input_path, args.output_file, args.aws_key, args.aws_secret)
@@ -241,6 +248,11 @@ def _get_parser():
list_available.add_argument('-as', '--aws-secret', type=str, required=False,
help='Aws secret access key to use when reading datasets.')
+ # list-synthesizers
+ list_available = action.add_parser('list-synthesizers',
+ help='List synthesizers available for use.')
+ list_available.set_defaults(action=_list_synthesizers)
+
# collect
collect = action.add_parser('collect', help='Collect sdgym results.')
collect.set_defaults(action=_collect)
diff --git a/sdgym/benchmark.py b/sdgym/benchmark.py
index 42a85f6c..35e82016 100644
--- a/sdgym/benchmark.py
+++ b/sdgym/benchmark.py
@@ -11,7 +11,6 @@
import compress_pickle
import numpy as np
import pandas as pd
-import torch
import tqdm
from sdgym.datasets import get_dataset_paths, load_dataset, load_tables
@@ -20,6 +19,7 @@
from sdgym.progress import TqdmLogger, progress
from sdgym.s3 import is_s3_path, write_csv, write_file
from sdgym.synthesizers.base import Baseline
+from sdgym.synthesizers.utils import get_num_gpus
from sdgym.utils import (
build_synthesizer, format_exception, get_synthesizers, import_object, used_memory)
@@ -72,6 +72,7 @@ def _compute_scores(metrics, real_data, synthetic_data, metadata, output):
error = None
score = None
+ normalized_score = None
start = datetime.utcnow()
try:
LOGGER.info('Computing %s on dataset %s', metric_name, metadata._metadata['name'])
@@ -309,8 +310,9 @@ def run(synthesizers=None, datasets=None, datasets_path=None, modalities=None, b
run_id = os.getenv('RUN_ID') or str(uuid.uuid4())[:10]
if workers == -1:
- if torch.cuda.is_available():
- workers = torch.cuda.device_count()
+ num_gpus = get_num_gpus()
+ if num_gpus > 0:
+ workers = num_gpus
else:
workers = multiprocessing.cpu_count()
diff --git a/sdgym/synthesizers/__init__.py b/sdgym/synthesizers/__init__.py
index 858a8957..8336ce23 100644
--- a/sdgym/synthesizers/__init__.py
+++ b/sdgym/synthesizers/__init__.py
@@ -23,7 +23,6 @@
'CTGAN',
'Uniform',
'VEEGAN',
- 'CTGAN',
'CopulaGAN',
'GaussianCopulaCategorical',
'GaussianCopulaCategoricalFuzzy',
diff --git a/sdgym/synthesizers/base.py b/sdgym/synthesizers/base.py
index 4cbf94f8..adb70772 100644
--- a/sdgym/synthesizers/base.py
+++ b/sdgym/synthesizers/base.py
@@ -1,3 +1,4 @@
+import abc
import logging
import pandas as pd
@@ -8,7 +9,7 @@
LOGGER = logging.getLogger(__name__)
-class Baseline:
+class Baseline(abc.ABC):
"""Base class for all the ``SDGym`` baselines."""
MODALITIES = ()
@@ -31,11 +32,21 @@ def get_subclasses(cls, include_parents=False):
return subclasses
+ @classmethod
+ def get_baselines(cls):
+ subclasses = cls.get_subclasses(include_parents=True)
+ synthesizers = []
+ for _, subclass in subclasses.items():
+ if abc.ABC not in subclass.__bases__:
+ synthesizers.append(subclass)
+
+ return synthesizers
+
def fit_sample(self, real_data, metadata):
pass
-class SingleTableBaseline(Baseline):
+class SingleTableBaseline(Baseline, abc.ABC):
"""Base class for all the SingleTable Baselines.
Subclasses can choose to implement ``_fit_sample``, which will
@@ -77,7 +88,7 @@ def fit_sample(self, real_data, metadata):
return _fit_sample(real_data, metadata)
-class MultiSingleTableBaseline(Baseline):
+class MultiSingleTableBaseline(Baseline, abc.ABC):
"""Base class for SingleTableBaselines that are used on multi table scenarios.
These classes model and sample each table independently and then just
@@ -111,7 +122,7 @@ def fit_sample(self, real_data, metadata):
return self._fit_sample(real_data, metadata)
-class LegacySingleTableBaseline(SingleTableBaseline):
+class LegacySingleTableBaseline(SingleTableBaseline, abc.ABC):
"""Single table baseline which passes ordinals and categoricals down.
This class exists here to support the legacy baselines which do not operate
@@ -144,8 +155,8 @@ def _fit_sample(self, real_data, table_metadata):
columns, categoricals = self._get_columns(real_data, table_metadata)
real_data = real_data[columns]
- ht = rdt.HyperTransformer(dtype_transformers={
- 'O': 'label_encoding',
+ ht = rdt.HyperTransformer(default_data_type_transformers={
+ 'categorical': 'LabelEncodingTransformer',
})
ht.fit(real_data.iloc[:, categoricals])
model_data = ht.transform(real_data)
diff --git a/sdgym/synthesizers/gretel.py b/sdgym/synthesizers/gretel.py
index 2cdfdfd0..ec3b3dfb 100644
--- a/sdgym/synthesizers/gretel.py
+++ b/sdgym/synthesizers/gretel.py
@@ -1,19 +1,24 @@
-import os
+import tempfile
import numpy as np
-from gretel_synthetics.batch import DataFrameBatch
from sdgym.synthesizers.base import SingleTableBaseline
+try:
+ from gretel_synthetics.batch import DataFrameBatch
+except ImportError:
+ DataFrameBatch = None
+
class Gretel(SingleTableBaseline):
"""Class to represent Gretel's neural network model."""
- DEFAULT_CHECKPOINT_DIR = os.path.join(os.getcwd(), 'checkpoints')
-
def __init__(self, max_lines=0, max_line_len=2048, epochs=None, vocab_size=20000,
gen_lines=None, dp=False, field_delimiter=",", overwrite=True,
- checkpoint_dir=DEFAULT_CHECKPOINT_DIR):
+ checkpoint_dir=None):
+ if DataFrameBatch is None:
+ raise ImportError('Please install gretel-synthetics using `pip install sdgym[gretel]`')
+
self.max_lines = max_lines
self.max_line_len = max_line_len
self.epochs = epochs
@@ -22,7 +27,7 @@ def __init__(self, max_lines=0, max_line_len=2048, epochs=None, vocab_size=20000
self.dp = dp
self.field_delimiter = field_delimiter
self.overwrite = overwrite
- self.checkpoint_dir = checkpoint_dir
+ self.checkpoint_dir = checkpoint_dir or tempfile.TemporaryDirectory().name
def _fit_sample(self, data, metadata):
config = {
diff --git a/sdgym/synthesizers/sdv.py b/sdgym/synthesizers/sdv.py
index af615666..8244b18b 100644
--- a/sdgym/synthesizers/sdv.py
+++ b/sdgym/synthesizers/sdv.py
@@ -1,3 +1,4 @@
+import abc
import logging
import sdv
@@ -9,7 +10,7 @@
LOGGER = logging.getLogger(__name__)
-class SDV(Baseline):
+class SDV(Baseline, abc.ABC):
MODALITIES = ('single-table', 'multi-table')
@@ -22,7 +23,7 @@ def fit_sample(self, data, metadata):
return model.sample_all()
-class SDVTabular(SingleTableBaseline):
+class SDVTabular(SingleTableBaseline, abc.ABC):
MODALITIES = ('single-table', )
_MODEL = None
@@ -58,11 +59,11 @@ class GaussianCopulaOneHot(SDVTabular):
_MODEL = sdv.tabular.GaussianCopula
_MODEL_KWARGS = {
- 'categorical_transformer': 'one_hot_encoding'
+ 'categorical_transformer': 'OneHotEncodingTransformer'
}
-class CUDATabular(SDVTabular):
+class CUDATabular(SDVTabular, abc.ABC):
def _fit_sample(self, data, metadata):
LOGGER.info('Fitting %s', self.__class__.__name__)
@@ -90,7 +91,7 @@ class CopulaGAN(CUDATabular):
_MODEL = sdv.tabular.CopulaGAN
-class SDVRelational(Baseline):
+class SDVRelational(Baseline, abc.ABC):
MODALITIES = ('single-table', 'multi-table')
_MODEL = None
@@ -111,7 +112,7 @@ class HMA1(SDVRelational):
_MODEL = sdv.relational.HMA1
-class SDVTimeseries(SingleTableBaseline):
+class SDVTimeseries(SingleTableBaseline, abc.ABC):
MODALITIES = ('timeseries', )
_MODEL = None
diff --git a/sdgym/synthesizers/utils.py b/sdgym/synthesizers/utils.py
index 9f908b9e..192efd79 100644
--- a/sdgym/synthesizers/utils.py
+++ b/sdgym/synthesizers/utils.py
@@ -2,7 +2,6 @@
import numpy as np
import pandas as pd
-import torch
from sklearn.mixture import BayesianGaussianMixture, GaussianMixture
from sklearn.preprocessing import KBinsDiscretizer
@@ -438,16 +437,21 @@ def inverse_transform(self, data):
return data_t
-def select_device():
- if not torch.cuda.is_available():
- return 'cpu'
+def get_num_gpus():
+ try:
+ command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits']
+ output = subprocess.run(command, stdout=subprocess.PIPE)
+ return len(output.stdout.decode().split())
+ except Exception:
+ return 0
+
+def select_device():
try:
command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits']
output = subprocess.run(command, stdout=subprocess.PIPE)
loads = np.array(output.stdout.decode().split()).astype(float)
device = loads.argmin()
+ return f'cuda:{device}'
except Exception:
- device = np.random.randint(torch.cuda.device_count())
-
- return f'cuda:{device}'
+ return 'cpu'
diff --git a/sdgym/synthesizers/ydata.py b/sdgym/synthesizers/ydata.py
index 7aaeefe8..af8ace7f 100644
--- a/sdgym/synthesizers/ydata.py
+++ b/sdgym/synthesizers/ydata.py
@@ -1,12 +1,14 @@
+import abc
+
+from sdgym.synthesizers.base import SingleTableBaseline
+
try:
import ydata_synthetic.synthesizers.regular as ydata
except ImportError:
ydata = None
-from sdgym.synthesizers.base import SingleTableBaseline
-
-class YData(SingleTableBaseline):
+class YData(SingleTableBaseline, abc.ABC):
def _fit_sample(self, real_data, table_metadata):
if ydata is None:
diff --git a/sdgym/utils.py b/sdgym/utils.py
index 86ba9df8..12fcee12 100644
--- a/sdgym/utils.py
+++ b/sdgym/utils.py
@@ -1,5 +1,6 @@
"""Random utils used by SDGym."""
+import copy
import importlib
import json
import logging
@@ -81,7 +82,7 @@ def _get_synthesizer(synthesizer, name=None):
with open(synthesizer, 'r') as json_file:
return json.load(json_file)
- baselines = Baseline.get_subclasses()
+ baselines = Baseline.get_subclasses(include_parents=True)
if synthesizer in baselines:
LOGGER.info('Trying to import synthesizer by name.')
synthesizer = baselines[synthesizer]
@@ -187,14 +188,17 @@ def build_synthesizer(synthesizer, synthesizer_dict):
callable:
The synthesizer function
"""
+
+ _synthesizer_dict = copy.deepcopy(synthesizer_dict)
+
def _synthesizer_function(real_data, metadata):
- metadata_keyword = synthesizer_dict.get('metadata', '$metadata')
- real_data_keyword = synthesizer_dict.get('real_data', '$real_data')
- device_keyword = synthesizer_dict.get('device', '$device')
- device_attribute = synthesizer_dict.get('device_attribute')
+ metadata_keyword = _synthesizer_dict.get('metadata', '$metadata')
+ real_data_keyword = _synthesizer_dict.get('real_data', '$real_data')
+ device_keyword = _synthesizer_dict.get('device', '$device')
+ device_attribute = _synthesizer_dict.get('device_attribute')
device = select_device()
- multi_table = 'multi-table' in synthesizer_dict['modalities']
+ multi_table = 'multi-table' in _synthesizer_dict['modalities']
if not multi_table:
table = metadata.get_tables()[0]
metadata = metadata.get_table_meta(table)
@@ -206,8 +210,8 @@ def _synthesizer_function(real_data, metadata):
(device_keyword, device),
]
- init_kwargs = _get_kwargs(synthesizer_dict, 'init', replace)
- fit_kwargs = _get_kwargs(synthesizer_dict, 'fit', replace)
+ init_kwargs = _get_kwargs(_synthesizer_dict, 'init', replace)
+ fit_kwargs = _get_kwargs(_synthesizer_dict, 'fit', replace)
instance = synthesizer(**init_kwargs)
if device_attribute:
diff --git a/setup.cfg b/setup.cfg
index 576868f3..b67aba27 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.4.0
+current_version = 0.5.0.dev1
commit = True
tag = True
parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?
diff --git a/setup.py b/setup.py
index c5a67950..21489831 100644
--- a/setup.py
+++ b/setup.py
@@ -12,32 +12,46 @@
history = history_file.read()
install_requires = [
- 'appdirs>1.1.4,<2',
+ 'appdirs>=1.3,<2',
'boto3>=1.15.0,<2',
- 'compress-pickle>=1.2.0,<2',
- 'gretel-synthetics>=0.15.4,<0.16',
- 'humanfriendly>=8.2,<9',
- 'numpy>=1.15.4,<1.20',
- 'pandas<1.1.5,>=1.1',
- 'pomegranate>=0.13.0,<0.13.5',
+ 'botocore>=1.18,<2',
+ 'compress-pickle>=1.2.0,<3',
+ 'humanfriendly>=8.2,<11',
+ "numpy>=1.18.0,<1.20.0;python_version<'3.7'",
+ "numpy>=1.20.0,<2;python_version>='3.7'",
+ 'pandas>=1.1.3,<2',
+ "pomegranate>=0.13.4,<0.14.2;python_version<'3.7'",
+ "pomegranate>=0.14.1,<0.15;python_version>='3.7'",
'psutil>=5.7,<6',
- 'scikit-learn>=0.20,<1',
+ 'scikit-learn>=0.24,<2',
+ 'scipy>=1.5.4,<2',
'tabulate>=0.8.3,<0.9',
- 'torch>=1.1.0,<2',
- 'tqdm>=4,<5',
- 'XlsxWriter>=1.2.8,<1.3',
- 'rdt>=0.4.1',
- 'sdmetrics>=0.3.0',
- 'sdv>=0.9.0',
- 'tensorflow==2.4.0rc1',
- 'wheel~=0.35',
+ 'torch>=1.8.0,<2',
+ 'tqdm>=4.15,<5',
+ 'XlsxWriter>=1.2.8,<4',
+ 'rdt>=0.6.1,<0.7',
+ 'sdmetrics>=0.4.1,<0.5',
+ 'sdv>=0.13.0',
]
+
+dask_requires = [
+ 'dask',
+ 'distributed',
+]
+
+
ydata_requires = [
# preferably install using make install-ydata
'ydata-synthetic>=0.3.0,<0.4',
]
+gretel_requires = [
+ 'gretel-synthetics>=0.15.4,<0.16',
+ 'tensorflow==2.4.0rc1',
+ 'wheel~=0.35',
+]
+
setup_requires = [
'pytest-runner>=2.11.1',
]
@@ -55,12 +69,6 @@
'pip>=9.0.1',
'watchdog>=0.8.3,<0.11',
- # docs
- 'm2r>=0.2.0,<0.3',
- 'Sphinx>=1.7.1,<3',
- 'sphinx_rtd_theme>=0.2.4,<0.5',
- 'autodocsumm>=0.1.10,<0.2',
-
# style check
'flake8>=3.7.7,<4',
'isort>=4.3.4,<5',
@@ -77,6 +85,9 @@
'coverage>=4.5.1,<6',
'tox>=2.9.1,<4',
'importlib-metadata>=3.6',
+
+ # Invoke
+ 'invoke',
]
setup(
@@ -91,6 +102,7 @@
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
],
description=(
@@ -103,8 +115,11 @@
],
},
extras_require={
- 'dev': development_requires + tests_require,
+ 'all': development_requires + tests_require + dask_requires + gretel_requires,
+ 'dev': development_requires + tests_require + dask_requires,
'test': tests_require,
+ 'gretel': gretel_requires,
+ 'dask': dask_requires,
},
include_package_data=True,
install_requires=install_requires,
@@ -114,11 +129,11 @@
keywords='machine learning synthetic data generation benchmark generative models',
name='sdgym',
packages=find_packages(include=['sdgym', 'sdgym.*']),
- python_requires='>=3.6,<3.9',
+ python_requires='>=3.6,<3.10',
setup_requires=setup_requires,
test_suite='tests',
tests_require=tests_require,
url='https://github.com/sdv-dev/SDGym',
- version='0.4.0',
+ version='0.5.0.dev1',
zip_safe=False,
)
diff --git a/tasks.py b/tasks.py
new file mode 100644
index 00000000..75847ab0
--- /dev/null
+++ b/tasks.py
@@ -0,0 +1,121 @@
+import glob
+import operator
+import os
+import re
+import platform
+import shutil
+import stat
+from pathlib import Path
+
+from invoke import task
+
+COMPARISONS = {
+ '>=': operator.ge,
+ '>': operator.gt,
+ '<': operator.lt,
+ '<=': operator.le
+}
+
+
+@task
+def check_dependencies(c):
+ c.run('python -m pip check')
+
+
+@task
+def unit(c):
+ c.run('python -m pytest ./tests/unit --cov=sdgym --cov-report=xml')
+
+
+@task
+def integration(c):
+ c.run('python -m pytest ./tests/integration')
+
+
+@task
+def readme(c):
+ test_path = Path('tests/readme_test')
+ if test_path.exists() and test_path.is_dir():
+ shutil.rmtree(test_path)
+
+ cwd = os.getcwd()
+ os.makedirs(test_path, exist_ok=True)
+ shutil.copy('README.md', test_path / 'README.md')
+ os.chdir(test_path)
+ c.run('rundoc run --single-session python3 -t python3 README.md')
+ os.chdir(cwd)
+ shutil.rmtree(test_path)
+
+
+def _validate_python_version(line):
+ python_version_match = re.search(r"python_version(<=?|>=?)\'(\d\.?)+\'", line)
+ if python_version_match:
+ python_version = python_version_match.group(0)
+ comparison = re.search(r'(>=?|<=?)', python_version).group(0)
+ version_number = python_version.split(comparison)[-1].replace("'", "")
+ comparison_function = COMPARISONS[comparison]
+ return comparison_function(platform.python_version(), version_number)
+
+ return True
+
+
+@task
+def install_minimum(c):
+ with open('setup.py', 'r') as setup_py:
+ lines = setup_py.read().splitlines()
+
+ versions = []
+ started = False
+ for line in lines:
+ if started:
+ if line == ']':
+ started = False
+ continue
+
+ line = line.strip()
+ if _validate_python_version(line):
+ requirement = re.match(r'[^>]*', line).group(0)
+ requirement = re.sub(r"""['",]""", '', requirement)
+ version = re.search(r'>=?[^(,|#)]*', line).group(0)
+ if version:
+ version = re.sub(r'>=?', '==', version)
+ version = re.sub(r"""['",]""", '', version)
+ requirement += version
+
+ versions.append(requirement)
+
+ elif (line.startswith('install_requires = [') or
+ line.startswith('pomegranate_requires = [')):
+ started = True
+
+ c.run(f'python -m pip install {" ".join(versions)}')
+
+
+@task
+def minimum(c):
+ install_minimum(c)
+ check_dependencies(c)
+ unit(c)
+ integration(c)
+
+
+@task
+def lint(c):
+ check_dependencies(c)
+ c.run('flake8 sdgym')
+ c.run('flake8 tests --ignore=D,SFS2')
+ c.run('isort -c --recursive sdgym tests')
+
+
+def remove_readonly(func, path, _):
+ "Clear the readonly bit and reattempt the removal"
+ os.chmod(path, stat.S_IWRITE)
+ func(path)
+
+
+@task
+def rmdir(c, path):
+ try:
+ shutil.rmtree(path, onerror=remove_readonly)
+ except PermissionError:
+ pass
diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py
index c2984724..1b256c69 100644
--- a/tests/integration/test_benchmark.py
+++ b/tests/integration/test_benchmark.py
@@ -39,11 +39,11 @@ def test_identity_jobs():
def test_json_synthesizer():
synthesizer = {
- "name": "synthesizer_name",
- "synthesizer": "sdgym.synthesizers.ydata.PreprocessedVanillaGAN",
- "modalities": ["single-table"],
- "init_kwargs": {"categorical_transformer": "label_encoding"},
- "fit_kwargs": {"data": "$real_data"}
+ 'name': 'synthesizer_name',
+ 'synthesizer': 'sdgym.synthesizers.ydata.PreprocessedVanillaGAN',
+ 'modalities': ['single-table'],
+ 'init_kwargs': {'categorical_transformer': 'label_encoding'},
+ 'fit_kwargs': {'data': '$real_data'}
}
output = sdgym.run(
@@ -52,4 +52,31 @@ def test_json_synthesizer():
iterations=1,
)
- assert set(output['synthesizer']) == {"synthesizer_name"}
+ assert set(output['synthesizer']) == {'synthesizer_name'}
+
+
+def test_json_synthesizer_multi_table():
+ synthesizer = {
+ 'name': 'HMA1',
+ 'synthesizer': 'sdv.relational.HMA1',
+ 'modalities': [
+ 'multi-table'
+ ],
+ 'init_kwargs': {
+ 'metadata': '$metadata'
+ },
+ 'fit_kwargs': {
+ 'tables': '$real_data'
+ }
+ }
+
+ output = sdgym.run(
+ synthesizers=[json.dumps(synthesizer)],
+ datasets=['university_v1', 'trains_v1'],
+ iterations=1,
+ )
+
+ # CSTest for `university_v1` is not valid because there are no categorical columns.
+ valid_out = output.loc[~((output.dataset == 'university_v1') & (output.metric == 'CSTest'))]
+
+ assert not valid_out.error.any()
diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py
index 89d25703..9c55f193 100644
--- a/tests/unit/test_datasets.py
+++ b/tests/unit/test_datasets.py
@@ -9,6 +9,7 @@
class AnyConfigWith:
"""AnyConfigWith matches any s3 config with the specified signature version."""
+
def __init__(self, signature_version):
self.signature_version = signature_version
diff --git a/tests/unit/test_s3.py b/tests/unit/test_s3.py
index ae396c1f..a1357146 100644
--- a/tests/unit/test_s3.py
+++ b/tests/unit/test_s3.py
@@ -204,9 +204,10 @@ def test_write_csv(write_file_mock):
write_csv(data, path, None, None)
# asserts
- expected_content = 'col1,col2\n1,3\n2,4\n'
+ input_data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
+ expected_content = input_data.to_csv(index=False).encode('utf-8')
write_file_mock.assert_called_once_with(
- expected_content.encode('utf-8'),
+ expected_content,
path,
None,
None
diff --git a/tox.ini b/tox.ini
index a63bfd39..7b793aae 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,27 +1,20 @@
[tox]
-envlist = py3{6,7,8}, test-devel
-
-[travis]
-python =
- 3.8: py38, test-devel
- 3.7: py37
- 3.6: py36
-
-[gh-actions]
-python =
- 3.8: py38, test-devel
- 3.7: py37
- 3.6: py36
+envlist = py38-lint, py3{6,7,8,9}-{integration,unit,minimum,readme}
[testenv]
-passenv = CI TRAVIS TRAVIS_*
skipsdist = false
skip_install = false
-extras = test
-commands =
- /usr/bin/env make test
-
-[testenv:test-devel]
-extras = dev
+deps =
+ invoke
+extras =
+ lint: dev
+ unit: test
+ integration: test
+ minimum: test
commands =
- /usr/bin/env make test-devel
+ lint: invoke lint
+ readme: invoke readme
+ unit: invoke unit
+ integration: invoke integration
+ minimum: invoke minimum
+ invoke rmdir --path {envdir}