diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 00000000..e4db0356 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,2 @@ +# Global rule: +* @sdv-dev/core-contributors diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..22f701b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,33 @@ +--- +name: Bug report +about: Report an error that you found when using SDGym +title: '' +labels: bug, pending review +assignees: '' + +--- + +### Environment Details + +Please indicate the following details about the environment in which you found the bug: + +* SDGym version: +* Python version: +* Operating System: + +### Error Description + + + +### Steps to reproduce + + + +``` +Paste the command(s) you ran and the output. +If there was a crash, please include the traceback here. +``` diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..75f16bca --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,24 @@ +--- +name: Feature request +about: Request a new feature that you would like to see implemented in SDGym +title: '' +labels: new feature, pending review +assignees: '' + +--- + +### Problem Description + + + +### Expected behavior + + + +### Additional context + + diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 00000000..222d3f44 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,33 @@ +--- +name: Question +about: Doubts about SDV usage +title: '' +labels: question, pending review +assignees: '' + +--- + +### Environment details + +If you are already running SDGym, please indicate the following details about the environment in +which you are running it: + +* SDGym version: +* Python version: +* Operating System: + +### Problem description + + + +### What I already tried + + + +``` 
+Paste the command(s) you ran and the output. +If there was a crash, please include the traceback here. +``` diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml new file mode 100644 index 00000000..c3c2e8ac --- /dev/null +++ b/.github/workflows/integration.yml @@ -0,0 +1,31 @@ +name: Integration Tests + +on: + - push + - pull_request + +jobs: + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + os: [ubuntu-latest, macos-10.15, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - if: matrix.os == 'windows-latest' + name: Install dependencies - Windows + run: | + python -m pip install --upgrade pip + python -m pip install 'torch==1.8.0' -f https://download.pytorch.org/whl/cpu/torch/ + python -m pip install 'torchvision==0.9.0' -f https://download.pytorch.org/whl/cpu/torchvision/ + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke .[test] + - name: Run integration tests + run: invoke integration diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..0630c219 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,21 @@ +name: Style Checks + +on: + - push + - pull_request + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + - name: Set up Python 3.8 + uses: actions/setup-python@v1 + with: + python-version: 3.8 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke .[dev] + - name: Run lint checks + run: invoke lint diff --git a/.github/workflows/minimum.yml b/.github/workflows/minimum.yml new file mode 100644 index 00000000..cb2f3af5 --- /dev/null +++ b/.github/workflows/minimum.yml @@ -0,0 +1,31 @@ +name: Unit Tests Minimum Versions + +on: + - push + - 
pull_request + +jobs: + minimum: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + os: [ubuntu-latest, macos-10.15, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - if: matrix.os == 'windows-latest' + name: Install dependencies - Windows + run: | + python -m pip install --upgrade pip + python -m pip install 'torch==1.8.0' -f https://download.pytorch.org/whl/cpu/torch/ + python -m pip install 'torchvision==0.9.0' -f https://download.pytorch.org/whl/cpu/torchvision/ + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke .[test] + - name: Test with minimum versions + run: invoke minimum diff --git a/.github/workflows/readme.yml b/.github/workflows/readme.yml new file mode 100644 index 00000000..2fe4b64c --- /dev/null +++ b/.github/workflows/readme.yml @@ -0,0 +1,25 @@ +name: Test README + +on: + - push + - pull_request + +jobs: + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + os: [ubuntu-latest, macos-10.15] # skip windows bc rundoc fails + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke rundoc . 
+ - name: Run the README.md + run: invoke readme diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml deleted file mode 100644 index 6a0e7c08..00000000 --- a/.github/workflows/tests.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Run Tests - -on: - push: - branches: [ '*' ] - pull_request: - branches: [ master ] - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - matrix: - python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-latest] - - steps: - - uses: actions/checkout@v1 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install tox tox-gh-actions - - - name: Test with tox - run: tox diff --git a/.github/workflows/unit.yml b/.github/workflows/unit.yml new file mode 100644 index 00000000..18558731 --- /dev/null +++ b/.github/workflows/unit.yml @@ -0,0 +1,34 @@ +name: Unit Tests + +on: + - push + - pull_request + +jobs: + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8, 3.9] + os: [ubuntu-latest, macos-10.15, windows-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - if: matrix.os == 'windows-latest' + name: Install dependencies - Windows + run: | + python -m pip install --upgrade pip + python -m pip install 'torch==1.8.0' -f https://download.pytorch.org/whl/cpu/torch/ + python -m pip install 'torchvision==0.9.0' -f https://download.pytorch.org/whl/cpu/torchvision/ + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install invoke .[test] + - name: Run unit tests + run: invoke unit + - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.8 + name: Upload codecov report + uses: codecov/codecov-action@v2 diff --git a/.travis.yml 
b/.travis.yml deleted file mode 100644 index 09c9e3e1..00000000 --- a/.travis.yml +++ /dev/null @@ -1,16 +0,0 @@ -# Config file for automatic testing at travis-ci.org -dist: bionic -language: python -python: - - 3.8 - - 3.7 - - 3.6 - -# Command to install dependencies -install: - - pip install -U tox-travis codecov - -after_success: codecov - -# Command to run tests -script: tox diff --git a/Dockerfile b/Dockerfile index f4685b19..b62959fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,10 @@ -FROM python:3.8-buster -RUN apt-get update && apt-get install -y build-essential +FROM nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04 +CMD nvidia-smi + +RUN apt-get update && apt-get install -y build-essential && apt-get -y install curl +RUN apt-get -y install python3.8 python3-distutils && ln -s /usr/bin/python3.8 /usr/bin/python +RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && \ + python get-pip.py && ln -s /usr/bin/pip3 /usr/bin/pip RUN mkdir /SDGym && \ mkdir /SDGym/sdgym && \ @@ -13,7 +18,8 @@ COPY /privbayes/ /SDGym/privbayes WORKDIR /SDGym # Install project -RUN make install-ydata compile +RUN make install-all compile +RUN pip install -U numpy==1.20 ENV PRIVBAYES_BIN /SDGym/privbayes/privBayes.bin ENV TF_CPP_MIN_LOG_LEVEL 2 diff --git a/HISTORY.md b/HISTORY.md index 2fe72943..fd6af46b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,12 +1,39 @@ # History +## v0.5.0 - 2021-12-13 +This release adds support for Python 3.9, and updates dependencies to accept the latest versions when possible. 
+ +### Issues closed + +* Add support for Python 3.9 - [Issue #127](https://github.com/sdv-dev/SDGym/issues/127) by @katxiao +* Add pip check workflow - [Issue #124](https://github.com/sdv-dev/SDGym/issues/124) by @pvk-developer +* Fix meta.yaml dependencies - [PR #119](https://github.com/sdv-dev/SDGym/pull/119) by @fealho +* Upgrade dependency ranges - [Issue #118](https://github.com/sdv-dev/SDGym/issues/118) by @katxiao + +## v0.4.1 - 2021-08-20 +This release fixed a bug where passing a `json` file as configuration for a multi-table synthesizer crashed the model. +It also adds a number of fixes and enhancements, including: (1) a function and CLI command to list the available synthesizer names, +(2) a curated set of dependencies and making `Gretel` into an optional dependency, (3) updating `Gretel` to use temp directories, +(4) using `nvidia-smi` to get the number of gpus and (5) multiple `dockerfile` updates to improve functionality. + +### Issues closed + +* Bug when using JSON configuration for multiple multi-table evaluation - [Issue #115](https://github.com/sdv-dev/SDGym/issues/115) by @pvk-developer +* Use nvidia-smi to get number of gpus - [PR #113](https://github.com/sdv-dev/SDGym/issues/113) by @katxiao +* List synthesizer names - [Issue #82](https://github.com/sdv-dev/SDGym/issues/82) by @fealho +* Use nvidia base for dockerfile - [PR #108](https://github.com/sdv-dev/SDGym/issues/108) by @katxiao +* Add Makefile target to install gretel and ydata - [PR #107](https://github.com/sdv-dev/SDGym/issues/107) by @katxiao +* Curate dependencies and make Gretel optional - [PR #106](https://github.com/sdv-dev/SDGym/issues/106) by @csala +* Update gretel checkpoints to use temp directory - [PR #105](https://github.com/sdv-dev/SDGym/issues/105) by @katxiao +* Initialize variable before reference - [PR #104](https://github.com/sdv-dev/SDGym/issues/104) by @katxiao + ## v0.4.0 - 2021-06-17 This release adds new synthesizers for Gretel and ydata, and creates a Docker
image for SDGym. It also includes enhancements to the accepted SDGym arguments, adds a summary command to aggregate metrics, and adds the normalized score to the benchmark results. -## New Features +### New Features * Add normalized score to benchmark results - [Issue #102](https://github.com/sdv-dev/SDGym/issues/102) by @katxiao * Add max rows and max columns args - [Issue #96](https://github.com/sdv-dev/SDGym/issues/96) by @katxiao diff --git a/INSTALL.md b/INSTALL.md index 0c228a0a..497304d8 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -2,7 +2,7 @@ ## Requirements -**SDGym** has been developed and tested on [Python 3.6, 3.7 and 3.8](https://www.python.org/downloads/) +**SDGym** has been developed and tested on [Python 3.6, 3.7, 3.8 and 3.9](https://www.python.org/downloads/) Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid @@ -94,4 +94,4 @@ export PRIVBAYES_BIN=$(pwd)/privBayes.bin ## Run using Docker -We support using Docker to run **SDGym**. For more information on how to do so, check the `DOCKER.md` file. \ No newline at end of file +We support using Docker to run **SDGym**. For more information on how to do so, check the `DOCKER.md` file. diff --git a/Makefile b/Makefile index cd9888bc..d3f74d57 100644 --- a/Makefile +++ b/Makefile @@ -47,12 +47,6 @@ clean-pyc: ## remove Python file artifacts find . -name '*~' -exec rm -f {} + find . 
-name '__pycache__' -exec rm -fr {} + -.PHONY: clean-docs -clean-docs: ## remove previously built docs - rm -rf docs/_build - rm -f docs/api/*.rst - -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed - .PHONY: clean-coverage clean-coverage: ## remove coverage artifacts rm -f .coverage @@ -69,7 +63,7 @@ clean-compile: ## remove compile artifacts rm -fr __privbn_tmp/ .PHONY: clean -clean: clean-build clean-compile clean-pyc clean-test clean-coverage clean-docs ## remove all build, test, coverage, docs and Python artifacts +clean: clean-build clean-compile clean-pyc clean-test clean-coverage ## remove all build, test, coverage and Python artifacts # INSTALL TARGETS @@ -100,6 +94,19 @@ install-ydata-develop: clean-build clean-compile clean-pyc compile ## install th pip install 'ydata-synthetic>=0.3.0,<0.4' pip install -e .[dev] +.PHONY: install-gretel +install-gretel: clean-build clean-compile clean-pyc compile ## install the package with gretel + pip install .[gretel] + +.PHONY: install-gretel-develop +install-gretel-develop: clean-build clean-compile clean-pyc compile ## install the package with gretel and dependencies for development + pip install -e .[dev,gretel] + +.PHONY: install-all +install-all: clean-build clean-compile clean-pyc compile ## install the package with gretel and ydata + pip install 'ydata-synthetic>=0.3.0,<0.4' + pip install .[gretel] + # LINT TARGETS .PHONY: lint @@ -126,12 +133,8 @@ test-readme: ## run the readme snippets cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md rm -rf tests/readme_test -# .PHONY: test-tutorials -# test-tutorials: ## run the tutorial notebooks -# jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 tutorials/*.ipynb --stdout > /dev/null - .PHONY: test -test: test-unit test-readme # test-tutorials ## test everything that needs test dependencies +test: test-unit test-readme ## test everything that needs test dependencies .PHONY: test-devel 
test-devel: lint ## test everything that needs development dependencies @@ -148,22 +151,6 @@ coverage: ## check code coverage quickly with the default Python $(BROWSER) htmlcov/index.html -# DOCS TARGETS - -.PHONY: docs -docs: clean-docs ## generate Sphinx HTML documentation, including API docs - sphinx-apidoc --separate -T -o docs/api/ sdgym - $(MAKE) -C docs html - -.PHONY: view-docs -view-docs: docs ## view docs in browser - $(BROWSER) docs/_build/html/index.html - -.PHONY: serve-docs -serve-docs: view-docs ## compile the docs watching for changes - watchmedo shell-command -W -R -D -p '*.rst;*.md' -c '$(MAKE) -C docs html' . - - # RELEASE TARGETS .PHONY: dist @@ -187,26 +174,31 @@ publish-test: dist publish-confirm ## package and upload a release on TestPyPI publish: dist publish-confirm ## package and upload a release twine upload dist/* -.PHONY: bumpversion-release -bumpversion-release: ## Merge master to stable and bumpversion release +.PHONY: git-merge-master-stable +git-merge-master-stable: ## Merge master into stable git checkout stable || git checkout -b stable git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" - bumpversion release + +.PHONY: git-merge-stable-master +git-merge-stable-master: ## Merge stable into master + git checkout master + git merge stable + +.PHONY: git-push +git-push: ## Simply push the repository to github + git push + +.PHONY: git-push-tags-stable +git-push-tags-stable: ## Push tags and stable to github git push --tags origin stable -.PHONY: bumpversion-release-test -bumpversion-release-test: ## Merge master to stable and bumpversion release - git checkout stable || git checkout -b stable - git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" - bumpversion release --no-tag - @echo git push --tags origin stable +.PHONY: bumpversion-release +bumpversion-release: ## Bump the version to the next release + bumpversion release .PHONY: bumpversion-patch -bumpversion-patch: ## 
Merge stable to master and bumpversion patch - git checkout master - git merge stable +bumpversion-patch: ## Bump the version to the next patch bumpversion --no-tag patch - git push .PHONY: bumpversion-candidate bumpversion-candidate: ## Bump the version to the next candidate @@ -222,12 +214,13 @@ bumpversion-major: ## Bump the version the next major skipping the release .PHONY: bumpversion-revert bumpversion-revert: ## Undo a previous bumpversion-release + git tag --delete $(shell git tag --points-at HEAD) git checkout master git branch -D stable -CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*") CLEAN_DIR := $(shell git status --short | grep -v ??) CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*") CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) .PHONY: check-clean @@ -236,18 +229,18 @@ ifneq ($(CLEAN_DIR),) $(error There are uncommitted changes) endif -.PHONY: check-candidate -check-candidate: ## Check if a release candidate has been made -ifeq ($(CURRENT_VERSION),dev0) - $(error Please make a release candidate and test it before atempting a release) -endif - .PHONY: check-master check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) $(error Please make the release from master branch\n) endif +.PHONY: check-candidate +check-candidate: ## Check if a release candidate has been made +ifeq ($(CURRENT_VERSION),dev0) + $(error Please make a release candidate and test it before attempting a release) +endif + .PHONY: check-history check-history: ## Check if HISTORY.md has been modified ifeq ($(CHANGELOG_LINES),0) @@ -255,17 +248,18 @@ ifeq ($(CHANGELOG_LINES),0) endif .PHONY: check-release -check-release: check-candidate check-clean check-master check-history ## Check if the release can be made +check-release: check-clean check-candidate check-master check-history ## Check
if the release can be made @echo "A new release can be made" .PHONY: release -release: check-release bumpversion-release publish bumpversion-patch +release: check-release git-merge-master-stable bumpversion-release git-push-tags-stable \ + publish git-merge-stable-master bumpversion-patch git-push .PHONY: release-test -release-test: check-release bumpversion-release-test publish-test bumpversion-revert +release-test: check-release git-merge-master-stable bumpversion-release bumpversion-revert .PHONY: release-candidate -release-candidate: check-master publish bumpversion-candidate +release-candidate: check-master publish bumpversion-candidate git-push .PHONY: release-candidate-test release-candidate-test: check-clean check-master publish-test diff --git a/README.md b/README.md index 91322824..1653a5a0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ [![PyPi Shield](https://img.shields.io/pypi/v/sdgym.svg)](https://pypi.python.org/pypi/sdgym) [![Downloads](https://pepy.tech/badge/sdgym)](https://pepy.tech/project/sdgym) - + Benchmarking framework for Synthetic Data Generators diff --git a/conda/meta.yaml b/conda/meta.yaml index c60d039a..9d43a4d1 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -1,5 +1,5 @@ {% set name = 'sdgym' %} -{% set version = '0.4.0' %} +{% set version = '0.5.0.dev1' %} package: name: "{{ name|lower }}" @@ -17,52 +17,53 @@ build: requirements: host: - - ctgan >=0.2.2.dev1,<0.3 - - gretel-synthetics >=0.15.4,<0.16 - - humanfriendly >=8.2,<9 - - numpy >=1.15.4,<2 - - pandas >=0.23.4,<2 - pip - - pomegranate >=0.13.0,<0.13.5 + - pytest-runner + - graphviz + - python >=3.6,<3.9 + - appdirs >=1.3,<2 + - boto3 >=1.15.0,<2 + - botocore >=1.18,<2 + - humanfriendly >=8.2,<11 + - numpy >=1.18.0,<2 + - pandas >=1.1.3,<2 + - pomegranate >=0.14.1,<15 - psutil >=5.7,<6 - - python - - scikit-learn >=0.20,<0.24 - - scipy >=1.3.0,<2 - - sdv >=0.4.4.dev0,<0.6 + - scikit-learn >=0.24,<2 - tabulate >=0.8.3,<0.9 - - pytorch >=1.1.0,<2 - - tensorflow 
==2.4.0rc1 - - torchvision >=0.3.0 - - tqdm >=4,<5 - - xlsxwriter >=1.2.8,<1.3 - - pytest-runner + - pytorch >=1.8.0,<2 + - tqdm >=4.14,<5 + - XlsxWriter >=1.2.8,<4 + - rdt >=0.4.1,<0.6 + - sdmetrics >=0.4.1,<0.5 + - sdv >=0.9.0 run: - - ctgan >=0.2.2.dev1,<0.3 - - gretel-synthetics >=0.15.4,<0.16 - - humanfriendly >=8.2,<9 - - numpy >=1.15.4,<2 - - pandas >=0.23.4,<2 - - pomegranate >=0.13.0,<0.13.5 + - python >=3.6,<3.9 + - appdirs >=1.3,<2 + - boto3 >=1.15.0,<2 + - botocore >=1.18,<2 + - humanfriendly >=8.2,<11 + - numpy >=1.18.0,<2 + - pandas >=1.1.3,<2 + - pomegranate >=0.14.1,<15 - psutil >=5.7,<6 - - python - - scikit-learn >=0.20,<0.24 - - scipy >=1.3.0,<2 - - sdv >=0.4.4.dev0,<0.6 + - scikit-learn >=0.24,<2 - tabulate >=0.8.3,<0.9 - - pytorch >=1.1.0,<2 - - tensorflow ==2.4.0rc1 - - torchvision >=0.3.0 - - tqdm >=4,<5 - - xlsxwriter >=1.2.8,<1.3 + - pytorch >=1.8.0,<2 + - tqdm >=4.14,<5 + - XlsxWriter >=1.2.8,<4 + - rdt >=0.4.1,<0.6 + - sdmetrics >=0.4.1,<0.5 + - sdv >=0.9.0 about: home: "https://github.com/sdv-dev/SDGym" license: MIT license_family: MIT - license_file: + license_file: summary: "A framework to benchmark the performance of synthetic data generators for non-temporal tabular data" - doc_url: - dev_url: + doc_url: + dev_url: extra: recipe-maintainers: diff --git a/docs/.nojekyll b/docs/.nojekyll deleted file mode 100644 index e69de29b..00000000 diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 5e737062..00000000 --- a/docs/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python -msphinx -SPHINXPROJ = sdgym -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". 
-help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index da9825e6..00000000 --- a/docs/conf.py +++ /dev/null @@ -1,181 +0,0 @@ -# -*- coding: utf-8 -*- -# -# sdgym documentation build configuration file, created by -# sphinx-quickstart on Fri Jan 6 13:06:48 2017. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import sphinx_rtd_theme # For read the docs theme - -import sdgym - -# -- General configuration --------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [ - 'm2r', - 'sphinx.ext.autodoc', - 'sphinx.ext.githubpages', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'autodocsumm', -] - -autodoc_default_options = { - 'autosummary': True, -} - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. 
-# You can specify multiple suffix as a list of string: -source_suffix = ['.rst', '.md'] - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = 'SDGym' -slug = 'sdgym' -title = project + ' Documentation' -copyright = '2019, MIT Data to AI Lab' -author = 'MIT Data To AI Lab' -description = ( - 'Synthetic Data Gym: A framework to benchmark the performance of synthetic data generators ' - 'for non-temporal tabular data.' -) -user = 'sdv-dev' - -# The version info for the project you're documenting, acts as replacement -# for |version| and |release|, also used in various other places throughout -# the built documents. -# -# The short X.Y version. -version = sdgym.__version__ -# The full version, including alpha/beta/rc tags. -release = sdgym.__version__ - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', - '**.ipynb_checkpoints', '**sdgym.synthesizers*.rst'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. 
-# -html_theme = 'sphinx_rtd_theme' -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -# Readthedocs additions -html_context = { - 'display_github': True, - 'github_user': user, - 'github_repo': project, - 'github_version': 'master', - 'conf_py_path': '/docs/', -} - -# Theme options are theme-specific and customize the look and feel of a -# theme further. For a list of options available for each theme, see the -# documentation. -html_theme_options = { - 'collapse_navigation': False, - 'display_version': False, - 'logo_only': True -} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". - -# The name of an image file (relative to this directory) to use as a favicon of -# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -html_favicon = 'images/dai-logo-white.ico' - -html_logo = '../resources/header_light.png' - -# -- Options for HTMLHelp output --------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = slug + 'doc' - - -# -- Options for LaTeX output ------------------------------------------ - -latex_elements = { -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, documentclass -# [howto, manual, or own class]). -latex_documents = [( - master_doc, - slug + '.tex', - title, - author, - 'manual' -)] - - -# -- Options for manual page output ------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [( - master_doc, - slug, - title, - [author], - 1 -)] - - -# -- Options for Texinfo output ---------------------------------------- - -# Grouping the document tree into Texinfo files. 
List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [( - master_doc, - slug, - title, - author, - slug, - description, - 'Miscellaneous' -)] diff --git a/docs/contributing.rst b/docs/contributing.rst deleted file mode 100644 index e582053e..00000000 --- a/docs/contributing.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../CONTRIBUTING.rst diff --git a/docs/history.rst b/docs/history.rst deleted file mode 100644 index d26e5be8..00000000 --- a/docs/history.rst +++ /dev/null @@ -1 +0,0 @@ -.. mdinclude:: ../HISTORY.md diff --git a/docs/images/dai-logo-white-200.png b/docs/images/dai-logo-white-200.png deleted file mode 100644 index 58bb9fde..00000000 Binary files a/docs/images/dai-logo-white-200.png and /dev/null differ diff --git a/docs/images/dai-logo-white.ico b/docs/images/dai-logo-white.ico deleted file mode 100644 index aab7ae8c..00000000 Binary files a/docs/images/dai-logo-white.ico and /dev/null differ diff --git a/docs/images/misc/alarm.png b/docs/images/misc/alarm.png deleted file mode 100644 index c2b92b7b..00000000 Binary files a/docs/images/misc/alarm.png and /dev/null differ diff --git a/docs/images/misc/asia.png b/docs/images/misc/asia.png deleted file mode 100644 index add7426e..00000000 Binary files a/docs/images/misc/asia.png and /dev/null differ diff --git a/docs/images/misc/child.png b/docs/images/misc/child.png deleted file mode 100644 index b602290e..00000000 Binary files a/docs/images/misc/child.png and /dev/null differ diff --git a/docs/images/misc/grid.jpg b/docs/images/misc/grid.jpg deleted file mode 100644 index 42f4daa5..00000000 Binary files a/docs/images/misc/grid.jpg and /dev/null differ diff --git a/docs/images/misc/gridr.jpg b/docs/images/misc/gridr.jpg deleted file mode 100644 index b4a844d1..00000000 Binary files a/docs/images/misc/gridr.jpg and /dev/null differ diff --git a/docs/images/misc/insurance.png b/docs/images/misc/insurance.png deleted file mode 
100644 index 56c89936..00000000 Binary files a/docs/images/misc/insurance.png and /dev/null differ diff --git a/docs/images/misc/res/1.jpg b/docs/images/misc/res/1.jpg deleted file mode 100644 index 74e82740..00000000 Binary files a/docs/images/misc/res/1.jpg and /dev/null differ diff --git a/docs/images/misc/res/10.jpg b/docs/images/misc/res/10.jpg deleted file mode 100644 index 90922631..00000000 Binary files a/docs/images/misc/res/10.jpg and /dev/null differ diff --git a/docs/images/misc/res/11.jpg b/docs/images/misc/res/11.jpg deleted file mode 100644 index f0602e9d..00000000 Binary files a/docs/images/misc/res/11.jpg and /dev/null differ diff --git a/docs/images/misc/res/12.jpg b/docs/images/misc/res/12.jpg deleted file mode 100644 index e2931323..00000000 Binary files a/docs/images/misc/res/12.jpg and /dev/null differ diff --git a/docs/images/misc/res/13.jpg b/docs/images/misc/res/13.jpg deleted file mode 100644 index 25561032..00000000 Binary files a/docs/images/misc/res/13.jpg and /dev/null differ diff --git a/docs/images/misc/res/14.jpg b/docs/images/misc/res/14.jpg deleted file mode 100644 index 1e154588..00000000 Binary files a/docs/images/misc/res/14.jpg and /dev/null differ diff --git a/docs/images/misc/res/15.jpg b/docs/images/misc/res/15.jpg deleted file mode 100644 index 52548d8a..00000000 Binary files a/docs/images/misc/res/15.jpg and /dev/null differ diff --git a/docs/images/misc/res/2.jpg b/docs/images/misc/res/2.jpg deleted file mode 100644 index 0b29b2ea..00000000 Binary files a/docs/images/misc/res/2.jpg and /dev/null differ diff --git a/docs/images/misc/res/3.jpg b/docs/images/misc/res/3.jpg deleted file mode 100644 index d22f0a2f..00000000 Binary files a/docs/images/misc/res/3.jpg and /dev/null differ diff --git a/docs/images/misc/res/4.jpg b/docs/images/misc/res/4.jpg deleted file mode 100644 index 365f08a4..00000000 Binary files a/docs/images/misc/res/4.jpg and /dev/null differ diff --git a/docs/images/misc/res/5.jpg 
b/docs/images/misc/res/5.jpg deleted file mode 100644 index 7c050701..00000000 Binary files a/docs/images/misc/res/5.jpg and /dev/null differ diff --git a/docs/images/misc/res/6.jpg b/docs/images/misc/res/6.jpg deleted file mode 100644 index 7e89a74d..00000000 Binary files a/docs/images/misc/res/6.jpg and /dev/null differ diff --git a/docs/images/misc/res/7.jpg b/docs/images/misc/res/7.jpg deleted file mode 100644 index 7c6fa1d3..00000000 Binary files a/docs/images/misc/res/7.jpg and /dev/null differ diff --git a/docs/images/misc/res/8.jpg b/docs/images/misc/res/8.jpg deleted file mode 100644 index 414296b2..00000000 Binary files a/docs/images/misc/res/8.jpg and /dev/null differ diff --git a/docs/images/misc/res/9.jpg b/docs/images/misc/res/9.jpg deleted file mode 100644 index aa89bcbe..00000000 Binary files a/docs/images/misc/res/9.jpg and /dev/null differ diff --git a/docs/images/misc/res/c.jpg b/docs/images/misc/res/c.jpg deleted file mode 100644 index 6eddb2eb..00000000 Binary files a/docs/images/misc/res/c.jpg and /dev/null differ diff --git a/docs/images/misc/ring.jpg b/docs/images/misc/ring.jpg deleted file mode 100644 index e660bb47..00000000 Binary files a/docs/images/misc/ring.jpg and /dev/null differ diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index a17c7634..00000000 --- a/docs/index.rst +++ /dev/null @@ -1,20 +0,0 @@ -.. include:: readme.rst - -.. toctree:: - :caption: Getting Started - :maxdepth: 2 - - Overview - -.. toctree:: - :caption: Resources - - API Reference - contributing - history - -Indices and tables -================== -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/readme.rst b/docs/readme.rst deleted file mode 100644 index 97d49585..00000000 --- a/docs/readme.rst +++ /dev/null @@ -1 +0,0 @@ -.. 
mdinclude:: ../README.md diff --git a/docs/resources/header.png b/docs/resources/header.png deleted file mode 100644 index 1b900e32..00000000 Binary files a/docs/resources/header.png and /dev/null differ diff --git a/docs/resources/header_light.png b/docs/resources/header_light.png deleted file mode 100644 index f92d95e5..00000000 Binary files a/docs/resources/header_light.png and /dev/null differ diff --git a/sdgym/__init__.py b/sdgym/__init__.py index c07605a0..559dd9c2 100644 --- a/sdgym/__init__.py +++ b/sdgym/__init__.py @@ -8,7 +8,7 @@ __copyright__ = 'Copyright (c) 2018, MIT Data To AI Lab' __email__ = 'dailabmit@gmail.com' __license__ = 'MIT' -__version__ = '0.4.0' +__version__ = '0.5.0.dev1' from sdgym import benchmark, synthesizers from sdgym.benchmark import run diff --git a/sdgym/__main__.py b/sdgym/__main__.py index 98e02658..aa31ee3d 100644 --- a/sdgym/__main__.py +++ b/sdgym/__main__.py @@ -13,6 +13,8 @@ import tqdm import sdgym +from sdgym.synthesizers.base import Baseline +from sdgym.utils import get_synthesizers def _env_setup(logfile, verbosity): @@ -134,6 +136,11 @@ def _list_available(args): _print_table(datasets, args.sort, args.reverse, {'size': humanfriendly.format_size}) +def _list_synthesizers(args): + synthesizers = Baseline.get_baselines() + _print_table(pd.DataFrame(get_synthesizers(list(synthesizers)))) + + def _collect(args): sdgym.collect.collect_results(args.input_path, args.output_file, args.aws_key, args.aws_secret) @@ -241,6 +248,11 @@ def _get_parser(): list_available.add_argument('-as', '--aws-secret', type=str, required=False, help='Aws secret access key to use when reading datasets.') + # list-synthesizers + list_available = action.add_parser('list-synthesizers', + help='List synthesizers available for use.') + list_available.set_defaults(action=_list_synthesizers) + # collect collect = action.add_parser('collect', help='Collect sdgym results.') collect.set_defaults(action=_collect) diff --git a/sdgym/benchmark.py 
b/sdgym/benchmark.py index 42a85f6c..35e82016 100644 --- a/sdgym/benchmark.py +++ b/sdgym/benchmark.py @@ -11,7 +11,6 @@ import compress_pickle import numpy as np import pandas as pd -import torch import tqdm from sdgym.datasets import get_dataset_paths, load_dataset, load_tables @@ -20,6 +19,7 @@ from sdgym.progress import TqdmLogger, progress from sdgym.s3 import is_s3_path, write_csv, write_file from sdgym.synthesizers.base import Baseline +from sdgym.synthesizers.utils import get_num_gpus from sdgym.utils import ( build_synthesizer, format_exception, get_synthesizers, import_object, used_memory) @@ -72,6 +72,7 @@ def _compute_scores(metrics, real_data, synthetic_data, metadata, output): error = None score = None + normalized_score = None start = datetime.utcnow() try: LOGGER.info('Computing %s on dataset %s', metric_name, metadata._metadata['name']) @@ -309,8 +310,9 @@ def run(synthesizers=None, datasets=None, datasets_path=None, modalities=None, b run_id = os.getenv('RUN_ID') or str(uuid.uuid4())[:10] if workers == -1: - if torch.cuda.is_available(): - workers = torch.cuda.device_count() + num_gpus = get_num_gpus() + if num_gpus > 0: + workers = num_gpus else: workers = multiprocessing.cpu_count() diff --git a/sdgym/synthesizers/__init__.py b/sdgym/synthesizers/__init__.py index 858a8957..8336ce23 100644 --- a/sdgym/synthesizers/__init__.py +++ b/sdgym/synthesizers/__init__.py @@ -23,7 +23,6 @@ 'CTGAN', 'Uniform', 'VEEGAN', - 'CTGAN', 'CopulaGAN', 'GaussianCopulaCategorical', 'GaussianCopulaCategoricalFuzzy', diff --git a/sdgym/synthesizers/base.py b/sdgym/synthesizers/base.py index 4cbf94f8..adb70772 100644 --- a/sdgym/synthesizers/base.py +++ b/sdgym/synthesizers/base.py @@ -1,3 +1,4 @@ +import abc import logging import pandas as pd @@ -8,7 +9,7 @@ LOGGER = logging.getLogger(__name__) -class Baseline: +class Baseline(abc.ABC): """Base class for all the ``SDGym`` baselines.""" MODALITIES = () @@ -31,11 +32,21 @@ def get_subclasses(cls, include_parents=False): 
return subclasses + @classmethod + def get_baselines(cls): + subclasses = cls.get_subclasses(include_parents=True) + synthesizers = [] + for _, subclass in subclasses.items(): + if abc.ABC not in subclass.__bases__: + synthesizers.append(subclass) + + return synthesizers + def fit_sample(self, real_data, metadata): pass -class SingleTableBaseline(Baseline): +class SingleTableBaseline(Baseline, abc.ABC): """Base class for all the SingleTable Baselines. Subclasses can choose to implement ``_fit_sample``, which will @@ -77,7 +88,7 @@ def fit_sample(self, real_data, metadata): return _fit_sample(real_data, metadata) -class MultiSingleTableBaseline(Baseline): +class MultiSingleTableBaseline(Baseline, abc.ABC): """Base class for SingleTableBaselines that are used on multi table scenarios. These classes model and sample each table independently and then just @@ -111,7 +122,7 @@ def fit_sample(self, real_data, metadata): return self._fit_sample(real_data, metadata) -class LegacySingleTableBaseline(SingleTableBaseline): +class LegacySingleTableBaseline(SingleTableBaseline, abc.ABC): """Single table baseline which passes ordinals and categoricals down. 
This class exists here to support the legacy baselines which do not operate @@ -144,8 +155,8 @@ def _fit_sample(self, real_data, table_metadata): columns, categoricals = self._get_columns(real_data, table_metadata) real_data = real_data[columns] - ht = rdt.HyperTransformer(dtype_transformers={ - 'O': 'label_encoding', + ht = rdt.HyperTransformer(default_data_type_transformers={ + 'categorical': 'LabelEncodingTransformer', }) ht.fit(real_data.iloc[:, categoricals]) model_data = ht.transform(real_data) diff --git a/sdgym/synthesizers/gretel.py b/sdgym/synthesizers/gretel.py index 2cdfdfd0..ec3b3dfb 100644 --- a/sdgym/synthesizers/gretel.py +++ b/sdgym/synthesizers/gretel.py @@ -1,19 +1,24 @@ -import os +import tempfile import numpy as np -from gretel_synthetics.batch import DataFrameBatch from sdgym.synthesizers.base import SingleTableBaseline +try: + from gretel_synthetics.batch import DataFrameBatch +except ImportError: + DataFrameBatch = None + class Gretel(SingleTableBaseline): """Class to represent Gretel's neural network model.""" - DEFAULT_CHECKPOINT_DIR = os.path.join(os.getcwd(), 'checkpoints') - def __init__(self, max_lines=0, max_line_len=2048, epochs=None, vocab_size=20000, gen_lines=None, dp=False, field_delimiter=",", overwrite=True, - checkpoint_dir=DEFAULT_CHECKPOINT_DIR): + checkpoint_dir=None): + if DataFrameBatch is None: + raise ImportError('Please install gretel-synthetics using `pip install sdgym[gretel]`') + self.max_lines = max_lines self.max_line_len = max_line_len self.epochs = epochs @@ -22,7 +27,7 @@ def __init__(self, max_lines=0, max_line_len=2048, epochs=None, vocab_size=20000 self.dp = dp self.field_delimiter = field_delimiter self.overwrite = overwrite - self.checkpoint_dir = checkpoint_dir + self.checkpoint_dir = checkpoint_dir or tempfile.TemporaryDirectory().name def _fit_sample(self, data, metadata): config = { diff --git a/sdgym/synthesizers/sdv.py b/sdgym/synthesizers/sdv.py index af615666..8244b18b 100644 --- 
a/sdgym/synthesizers/sdv.py +++ b/sdgym/synthesizers/sdv.py @@ -1,3 +1,4 @@ +import abc import logging import sdv @@ -9,7 +10,7 @@ LOGGER = logging.getLogger(__name__) -class SDV(Baseline): +class SDV(Baseline, abc.ABC): MODALITIES = ('single-table', 'multi-table') @@ -22,7 +23,7 @@ def fit_sample(self, data, metadata): return model.sample_all() -class SDVTabular(SingleTableBaseline): +class SDVTabular(SingleTableBaseline, abc.ABC): MODALITIES = ('single-table', ) _MODEL = None @@ -58,11 +59,11 @@ class GaussianCopulaOneHot(SDVTabular): _MODEL = sdv.tabular.GaussianCopula _MODEL_KWARGS = { - 'categorical_transformer': 'one_hot_encoding' + 'categorical_transformer': 'OneHotEncodingTransformer' } -class CUDATabular(SDVTabular): +class CUDATabular(SDVTabular, abc.ABC): def _fit_sample(self, data, metadata): LOGGER.info('Fitting %s', self.__class__.__name__) @@ -90,7 +91,7 @@ class CopulaGAN(CUDATabular): _MODEL = sdv.tabular.CopulaGAN -class SDVRelational(Baseline): +class SDVRelational(Baseline, abc.ABC): MODALITIES = ('single-table', 'multi-table') _MODEL = None @@ -111,7 +112,7 @@ class HMA1(SDVRelational): _MODEL = sdv.relational.HMA1 -class SDVTimeseries(SingleTableBaseline): +class SDVTimeseries(SingleTableBaseline, abc.ABC): MODALITIES = ('timeseries', ) _MODEL = None diff --git a/sdgym/synthesizers/utils.py b/sdgym/synthesizers/utils.py index 9f908b9e..192efd79 100644 --- a/sdgym/synthesizers/utils.py +++ b/sdgym/synthesizers/utils.py @@ -2,7 +2,6 @@ import numpy as np import pandas as pd -import torch from sklearn.mixture import BayesianGaussianMixture, GaussianMixture from sklearn.preprocessing import KBinsDiscretizer @@ -438,16 +437,21 @@ def inverse_transform(self, data): return data_t -def select_device(): - if not torch.cuda.is_available(): - return 'cpu' +def get_num_gpus(): + try: + command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'] + output = subprocess.run(command, stdout=subprocess.PIPE) + return 
len(output.stdout.decode().split()) + except Exception: + return 0 + +def select_device(): try: command = ['nvidia-smi', '--query-gpu=utilization.gpu', '--format=csv,noheader,nounits'] output = subprocess.run(command, stdout=subprocess.PIPE) loads = np.array(output.stdout.decode().split()).astype(float) device = loads.argmin() + return f'cuda:{device}' except Exception: - device = np.random.randint(torch.cuda.device_count()) - - return f'cuda:{device}' + return 'cpu' diff --git a/sdgym/synthesizers/ydata.py b/sdgym/synthesizers/ydata.py index 7aaeefe8..af8ace7f 100644 --- a/sdgym/synthesizers/ydata.py +++ b/sdgym/synthesizers/ydata.py @@ -1,12 +1,14 @@ +import abc + +from sdgym.synthesizers.base import SingleTableBaseline + try: import ydata_synthetic.synthesizers.regular as ydata except ImportError: ydata = None -from sdgym.synthesizers.base import SingleTableBaseline - -class YData(SingleTableBaseline): +class YData(SingleTableBaseline, abc.ABC): def _fit_sample(self, real_data, table_metadata): if ydata is None: diff --git a/sdgym/utils.py b/sdgym/utils.py index 86ba9df8..12fcee12 100644 --- a/sdgym/utils.py +++ b/sdgym/utils.py @@ -1,5 +1,6 @@ """Random utils used by SDGym.""" +import copy import importlib import json import logging @@ -81,7 +82,7 @@ def _get_synthesizer(synthesizer, name=None): with open(synthesizer, 'r') as json_file: return json.load(json_file) - baselines = Baseline.get_subclasses() + baselines = Baseline.get_subclasses(include_parents=True) if synthesizer in baselines: LOGGER.info('Trying to import synthesizer by name.') synthesizer = baselines[synthesizer] @@ -187,14 +188,17 @@ def build_synthesizer(synthesizer, synthesizer_dict): callable: The synthesizer function """ + + _synthesizer_dict = copy.deepcopy(synthesizer_dict) + def _synthesizer_function(real_data, metadata): - metadata_keyword = synthesizer_dict.get('metadata', '$metadata') - real_data_keyword = synthesizer_dict.get('real_data', '$real_data') - device_keyword = 
synthesizer_dict.get('device', '$device') - device_attribute = synthesizer_dict.get('device_attribute') + metadata_keyword = _synthesizer_dict.get('metadata', '$metadata') + real_data_keyword = _synthesizer_dict.get('real_data', '$real_data') + device_keyword = _synthesizer_dict.get('device', '$device') + device_attribute = _synthesizer_dict.get('device_attribute') device = select_device() - multi_table = 'multi-table' in synthesizer_dict['modalities'] + multi_table = 'multi-table' in _synthesizer_dict['modalities'] if not multi_table: table = metadata.get_tables()[0] metadata = metadata.get_table_meta(table) @@ -206,8 +210,8 @@ def _synthesizer_function(real_data, metadata): (device_keyword, device), ] - init_kwargs = _get_kwargs(synthesizer_dict, 'init', replace) - fit_kwargs = _get_kwargs(synthesizer_dict, 'fit', replace) + init_kwargs = _get_kwargs(_synthesizer_dict, 'init', replace) + fit_kwargs = _get_kwargs(_synthesizer_dict, 'fit', replace) instance = synthesizer(**init_kwargs) if device_attribute: diff --git a/setup.cfg b/setup.cfg index 576868f3..b67aba27 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.5.0.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index c5a67950..21489831 100644 --- a/setup.py +++ b/setup.py @@ -12,32 +12,46 @@ history = history_file.read() install_requires = [ - 'appdirs>1.1.4,<2', + 'appdirs>=1.3,<2', 'boto3>=1.15.0,<2', - 'compress-pickle>=1.2.0,<2', - 'gretel-synthetics>=0.15.4,<0.16', - 'humanfriendly>=8.2,<9', - 'numpy>=1.15.4,<1.20', - 'pandas<1.1.5,>=1.1', - 'pomegranate>=0.13.0,<0.13.5', + 'botocore>=1.18,<2', + 'compress-pickle>=1.2.0,<3', + 'humanfriendly>=8.2,<11', + "numpy>=1.18.0,<1.20.0;python_version<'3.7'", + "numpy>=1.20.0,<2;python_version>='3.7'", + 'pandas>=1.1.3,<2', + "pomegranate>=0.13.4,<0.14.2;python_version<'3.7'", + "pomegranate>=0.14.1,<0.15;python_version>='3.7'", 'psutil>=5.7,<6', - 'scikit-learn>=0.20,<1', + 'scikit-learn>=0.24,<2', + 'scipy>=1.5.4,<2', 'tabulate>=0.8.3,<0.9', - 'torch>=1.1.0,<2', - 'tqdm>=4,<5', - 'XlsxWriter>=1.2.8,<1.3', - 'rdt>=0.4.1', - 'sdmetrics>=0.3.0', - 'sdv>=0.9.0', - 'tensorflow==2.4.0rc1', - 'wheel~=0.35', + 'torch>=1.8.0,<2', + 'tqdm>=4.15,<5', + 'XlsxWriter>=1.2.8,<4', + 'rdt>=0.6.1,<0.7', + 'sdmetrics>=0.4.1,<0.5', + 'sdv>=0.13.0', ] + +dask_requires = [ + 'dask', + 'distributed', +] + + ydata_requires = [ # preferably install using make install-ydata 'ydata-synthetic>=0.3.0,<0.4', ] +gretel_requires = [ + 'gretel-synthetics>=0.15.4,<0.16', + 'tensorflow==2.4.0rc1', + 'wheel~=0.35', +] + setup_requires = [ 'pytest-runner>=2.11.1', ] @@ -55,12 +69,6 @@ 'pip>=9.0.1', 'watchdog>=0.8.3,<0.11', - # docs - 'm2r>=0.2.0,<0.3', - 'Sphinx>=1.7.1,<3', - 'sphinx_rtd_theme>=0.2.4,<0.5', - 'autodocsumm>=0.1.10,<0.2', - # style check 'flake8>=3.7.7,<4', 'isort>=4.3.4,<5', @@ -77,6 +85,9 @@ 'coverage>=4.5.1,<6', 'tox>=2.9.1,<4', 'importlib-metadata>=3.6', + + # Invoke + 'invoke', ] setup( @@ -91,6 +102,7 @@ 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', 'Topic :: Scientific/Engineering :: 
Artificial Intelligence', ], description=( @@ -103,8 +115,11 @@ ], }, extras_require={ - 'dev': development_requires + tests_require, + 'all': development_requires + tests_require + dask_requires + gretel_requires, + 'dev': development_requires + tests_require + dask_requires, 'test': tests_require, + 'gretel': gretel_requires, + 'dask': dask_requires, }, include_package_data=True, install_requires=install_requires, @@ -114,11 +129,11 @@ keywords='machine learning synthetic data generation benchmark generative models', name='sdgym', packages=find_packages(include=['sdgym', 'sdgym.*']), - python_requires='>=3.6,<3.9', + python_requires='>=3.6,<3.10', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, url='https://github.com/sdv-dev/SDGym', - version='0.4.0', + version='0.5.0.dev1', zip_safe=False, ) diff --git a/tasks.py b/tasks.py new file mode 100644 index 00000000..75847ab0 --- /dev/null +++ b/tasks.py @@ -0,0 +1,121 @@ +import glob +import operator +import os +import re +import platform +import shutil +import stat +from pathlib import Path + +from invoke import task + +COMPARISONS = { + '>=': operator.ge, + '>': operator.gt, + '<': operator.lt, + '<=': operator.le +} + + +@task +def check_dependencies(c): + c.run('python -m pip check') + + +@task +def unit(c): + c.run('python -m pytest ./tests/unit --cov=sdgym --cov-report=xml') + + +@task +def integration(c): + c.run('python -m pytest ./tests/integration') + + +@task +def readme(c): + test_path = Path('tests/readme_test') + if test_path.exists() and test_path.is_dir(): + shutil.rmtree(test_path) + + cwd = os.getcwd() + os.makedirs(test_path, exist_ok=True) + shutil.copy('README.md', test_path / 'README.md') + os.chdir(test_path) + c.run('rundoc run --single-session python3 -t python3 README.md') + os.chdir(cwd) + shutil.rmtree(test_path) + + +def _validate_python_version(line): + python_version_match = re.search(r"python_version(<=?|>=?)\'(\d\.?)+\'", line) + if 
python_version_match: + python_version = python_version_match.group(0) + comparison = re.search(r'(>=?|<=?)', python_version).group(0) + version_number = python_version.split(comparison)[-1].replace("'", "") + comparison_function = COMPARISONS[comparison] + return comparison_function(platform.python_version(), version_number) + + return True + + +@task +def install_minimum(c): + with open('setup.py', 'r') as setup_py: + lines = setup_py.read().splitlines() + + versions = [] + started = False + for line in lines: + if started: + if line == ']': + started = False + continue + + line = line.strip() + if _validate_python_version(line): + requirement = re.match(r'[^>]*', line).group(0) + requirement = re.sub(r"""['",]""", '', requirement) + version = re.search(r'>=?[^(,|#)]*', line).group(0) + if version: + version = re.sub(r'>=?', '==', version) + version = re.sub(r"""['",]""", '', version) + requirement += version + + versions.append(requirement) + + elif (line.startswith('install_requires = [') or + line.startswith('pomegranate_requires = [')): + started = True + + c.run(f'python -m pip install {" ".join(versions)}') + + +@task +def minimum(c): + install_minimum(c) + check_dependencies(c) + unit(c) + integration(c) + + +@task +def lint(c): + check_dependencies(c) + c.run('flake8 sdgym') + c.run('flake8 tests --ignore=D,SFS2') + c.run('isort -c --recursive sdgym tests') + + +def remove_readonly(func, path, _): + "Clear the readonly bit and reattempt the removal" + os.chmod(path, stat.S_IWRITE) + func(path) + + +@task +def rmdir(c, path): + try: + shutil.rmtree(path, onerror=remove_readonly) + except PermissionError: + pass diff --git a/tests/integration/test_benchmark.py b/tests/integration/test_benchmark.py index c2984724..1b256c69 100644 --- a/tests/integration/test_benchmark.py +++ b/tests/integration/test_benchmark.py @@ -39,11 +39,11 @@ def test_identity_jobs(): def test_json_synthesizer(): synthesizer = { - "name": "synthesizer_name", - "synthesizer": 
"sdgym.synthesizers.ydata.PreprocessedVanillaGAN", - "modalities": ["single-table"], - "init_kwargs": {"categorical_transformer": "label_encoding"}, - "fit_kwargs": {"data": "$real_data"} + 'name': 'synthesizer_name', + 'synthesizer': 'sdgym.synthesizers.ydata.PreprocessedVanillaGAN', + 'modalities': ['single-table'], + 'init_kwargs': {'categorical_transformer': 'label_encoding'}, + 'fit_kwargs': {'data': '$real_data'} } output = sdgym.run( @@ -52,4 +52,31 @@ def test_json_synthesizer(): iterations=1, ) - assert set(output['synthesizer']) == {"synthesizer_name"} + assert set(output['synthesizer']) == {'synthesizer_name'} + + +def test_json_synthesizer_multi_table(): + synthesizer = { + 'name': 'HMA1', + 'synthesizer': 'sdv.relational.HMA1', + 'modalities': [ + 'multi-table' + ], + 'init_kwargs': { + 'metadata': '$metadata' + }, + 'fit_kwargs': { + 'tables': '$real_data' + } + } + + output = sdgym.run( + synthesizers=[json.dumps(synthesizer)], + datasets=['university_v1', 'trains_v1'], + iterations=1, + ) + + # CSTest for `university_v1` is not valid because there are no categorical columns. 
+ valid_out = output.loc[~((output.dataset == 'university_v1') & (output.metric == 'CSTest'))] + + assert not valid_out.error.any() diff --git a/tests/unit/test_datasets.py b/tests/unit/test_datasets.py index 89d25703..9c55f193 100644 --- a/tests/unit/test_datasets.py +++ b/tests/unit/test_datasets.py @@ -9,6 +9,7 @@ class AnyConfigWith: """AnyConfigWith matches any s3 config with the specified signature version.""" + def __init__(self, signature_version): self.signature_version = signature_version diff --git a/tests/unit/test_s3.py b/tests/unit/test_s3.py index ae396c1f..a1357146 100644 --- a/tests/unit/test_s3.py +++ b/tests/unit/test_s3.py @@ -204,9 +204,10 @@ def test_write_csv(write_file_mock): write_csv(data, path, None, None) # asserts - expected_content = 'col1,col2\n1,3\n2,4\n' + input_data = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}) + expected_content = input_data.to_csv(index=False).encode('utf-8') write_file_mock.assert_called_once_with( - expected_content.encode('utf-8'), + expected_content, path, None, None diff --git a/tox.ini b/tox.ini index a63bfd39..7b793aae 100644 --- a/tox.ini +++ b/tox.ini @@ -1,27 +1,20 @@ [tox] -envlist = py3{6,7,8}, test-devel - -[travis] -python = - 3.8: py38, test-devel - 3.7: py37 - 3.6: py36 - -[gh-actions] -python = - 3.8: py38, test-devel - 3.7: py37 - 3.6: py36 +envlist = py38-lint, py3{6,7,8,9}-{integration,unit,minimum,readme} [testenv] -passenv = CI TRAVIS TRAVIS_* skipsdist = false skip_install = false -extras = test -commands = - /usr/bin/env make test - -[testenv:test-devel] -extras = dev +deps = + invoke +extras = + lint: dev + unit: test + integration: test + minimum: test commands = - /usr/bin/env make test-devel + lint: invoke lint + readme: invoke readme + unit: invoke unit + integration: invoke integration + minimum: invoke minimum + invoke rmdir --path {envdir}