From 275b5d7894d1e67adedee482b6f0db26d678b93a Mon Sep 17 00:00:00 2001 From: Samuel Date: Mon, 12 Feb 2024 12:28:33 +0100 Subject: [PATCH] Improved docstrings at a class level (#234) * updates docstrings in main metasyn package * add base and categorical class docstrings * Change configuration from ini to toml * Enable privacy to be set * Add data-free option to generate data. * Update according to discussion * Mostly working implementation * Remove old code * Fix the tests * Fix ruff * Rework VarConfig * Fix pylint issues * Fix mistake in error handling * Update tutorial * Fix pydocstyle issues * Fix mypy issues * Fix cyclic import * Remove mypy errors * Improve tests for the CLI * Add tests for configurations * Quickly fix some linting issues in testing * Update documentation according to changes * There was a reason for the comments * Uncommitted code * Update docs/source/usage/cli.rst Co-authored-by: Samuel * Update metasyn/config.py Co-authored-by: Samuel * Update metasyn/config.py Co-authored-by: Samuel * Update metasyn/util.py Co-authored-by: Samuel * minor changes * fix test errors * fix sphinx * Revert "fix sphinx" This reverts commit da5e7e85372b9c6ff3e0d76c6dbf14206910a62c. 
* Fix sphinx (without irrelevant files) * comment based updates * more comment based changes * Update docs/source/api/metasyn.distribution.rst Co-authored-by: qubixes <44498096+qubixes@users.noreply.github.com> --------- Co-authored-by: Raoul Schram Co-authored-by: qubixes <44498096+qubixes@users.noreply.github.com> --- docs/source/api/metasyn.demo.rst | 7 ++- docs/source/api/metasyn.distribution.rst | 73 ++++++++++++++++-------- docs/source/api/metasyn.schema.rst | 4 +- metasyn/distribution/__init__.py | 3 + metasyn/distribution/base.py | 40 +++++++++---- metasyn/distribution/categorical.py | 16 ++++-- metasyn/distribution/na.py | 2 +- metasyn/metaframe.py | 26 +++++---- metasyn/privacy.py | 14 +++-- metasyn/provider.py | 38 ++++++++---- metasyn/var.py | 4 +- 11 files changed, 154 insertions(+), 73 deletions(-) diff --git a/docs/source/api/metasyn.demo.rst b/docs/source/api/metasyn.demo.rst index 8b04de8f..f19240cf 100644 --- a/docs/source/api/metasyn.demo.rst +++ b/docs/source/api/metasyn.demo.rst @@ -1,7 +1,10 @@ -metasyn.demo package -==================== +Demo package +============ .. automodule:: metasyn.demo :members: :undoc-members: + :imported-members: + :inherited-members: + :private-members: :show-inheritance: diff --git a/docs/source/api/metasyn.distribution.rst b/docs/source/api/metasyn.distribution.rst index 3da07dd3..036e2a64 100644 --- a/docs/source/api/metasyn.distribution.rst +++ b/docs/source/api/metasyn.distribution.rst @@ -1,70 +1,97 @@ -metasyn.distribution package -============================== +Distribution package +==================== -.. automodule:: metasyn.distribution +This package consists of several distribution modules. +It also includes the :mod:`metasyn.distribution.base` module, which forms the basis of all distributions. + +Base module +^^^^^^^^^^^ + +.. 
automodule:: metasyn.distribution.base :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -Submodules ----------- +Categorical module +^^^^^^^^^^^^^^^^^^ -metasyn.distribution.base module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: metasyn.distribution.base +.. automodule:: metasyn.distribution.categorical :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -metasyn.distribution.categorical module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Constant module +^^^^^^^^^^^^^^^ -.. automodule:: metasyn.distribution.categorical +.. automodule:: metasyn.distribution.constant :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -metasyn.distribution.continuous module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Continuous module +^^^^^^^^^^^^^^^^^ .. automodule:: metasyn.distribution.continuous :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -metasyn.distribution.datetime module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Datetime module +^^^^^^^^^^^^^^^ .. automodule:: metasyn.distribution.datetime :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -metasyn.distribution.discrete module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Discrete module +^^^^^^^^^^^^^^^ .. automodule:: metasyn.distribution.discrete :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -metasyn.distribution.faker module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Faker module +^^^^^^^^^^^^ .. automodule:: metasyn.distribution.faker :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: -metasyn.distribution.regex module -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +NA module +^^^^^^^^^ -.. automodule:: metasyn.distribution.regex +.. 
automodule:: metasyn.distribution.na :members: :undoc-members: + :inherited-members: + :private-members: :show-inheritance: +Regex module +^^^^^^^^^^^^ +.. automodule:: metasyn.distribution.regex + :members: + :undoc-members: + :inherited-members: + :private-members: + :show-inheritance: diff --git a/docs/source/api/metasyn.schema.rst b/docs/source/api/metasyn.schema.rst index 3deae866..49a3ea3a 100644 --- a/docs/source/api/metasyn.schema.rst +++ b/docs/source/api/metasyn.schema.rst @@ -1,5 +1,5 @@ -metasyn.schema package -======================== +Schema package +============== .. automodule:: metasyn.schema :members: diff --git a/metasyn/distribution/__init__.py b/metasyn/distribution/__init__.py index b39e01e3..21209e8d 100644 --- a/metasyn/distribution/__init__.py +++ b/metasyn/distribution/__init__.py @@ -1,5 +1,8 @@ """Package providing different distributions used in metasyn. +The package consists of several distribution modules. It also includes the +``base`` module, which forms the basis of all distributions. + Each distribution class provides methods for fitting the distribution to a a series of values, and for generating synthetic data based on the fitted distribution. Each distribution class also provides a way to calculate the diff --git a/metasyn/distribution/base.py b/metasyn/distribution/base.py index 8374286c..fd606c93 100644 --- a/metasyn/distribution/base.py +++ b/metasyn/distribution/base.py @@ -1,17 +1,18 @@ """ Module serving as the basis for all metasyn distributions. -The base module contains the BaseDistribution class, which is the base class -for all distributions. It also contains the ScipyDistribution class, +The base module contains the ``BaseDistribution`` class, +which is the base class for all distributions. +It also contains the ``ScipyDistribution`` class, which is a specialized base class for distributions that are built on top of SciPy's statistical distributions. 
-Additionally it contains the UniqueDistributionMixin class, +Additionally it contains the ``UniqueDistributionMixin`` class, which is a mixin class that can be used to make a distribution unique (i.e., one that does not contain duplicate values). -Finally it contains the metadist() decorator, which is used to set the -class attributes of a distribution. +Finally it contains the ``metadist()`` decorator, +which is used to set the class attributes of a distribution. """ from __future__ import annotations @@ -28,8 +29,14 @@ class attributes of a distribution. class BaseDistribution(ABC): """Abstract base class to define a distribution. - All distributions should be derived from this class, and should implement the following methods: - _fit, draw, _param_dict, _param_schema, default_distribution and __init__. + All distributions should be derived from this class, and should implement + the following methods: + :meth:`~_fit`, + :meth:`~draw`, + :meth:`~_param_dict`, + :meth:`~_param_schema`, + :meth:`~default_distribution` + and ``__init__``. """ implements: str = "unknown" @@ -229,12 +236,11 @@ def _wrap(cls): class ScipyDistribution(BaseDistribution): - """Base class for numerical Scipy distributions. + """Base class for numerical distributions using Scipy. This base class makes it easy to implement new numerical - distributions. One could also use this base class for non-scipy - distributions, in which case the distribution class should implement - logpdf, rvs and fit methods. + distributions. It can also be used for non-Scipy distributions, + provided the distribution implements `logpdf`, `rvs` and `fit` methods. """ @property @@ -286,7 +292,17 @@ def _information_criterion(self, values): @metadist(unique=True) class UniqueDistributionMixin(BaseDistribution): - """Mixin class to make unique version of base distribution.""" + """Mixin class to make unique version of base distributions. 
+ + This mixin class can be used to extend base distribution classes, adding + functionality that ensures generated values are unique. It overrides + the `draw` method of the base class, adding a check to prevent duplicate + values from being drawn. If a duplicate value is drawn, it retries up to + 1e5 times before raising a ValueError. + + The `UniqueDistributionMixin` is used in various unique metasyn distribution + variations, such as `UniqueFakerDistribution` and `UniqueRegexDistribution`. + """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/metasyn/distribution/categorical.py b/metasyn/distribution/categorical.py index 95d95dd6..bf4c1573 100644 --- a/metasyn/distribution/categorical.py +++ b/metasyn/distribution/categorical.py @@ -16,13 +16,19 @@ class MultinoulliDistribution(BaseDistribution): """Categorical distribution that stores category labels and probabilities. + This class represents a multinoulli (categorical) distribution. + It is used in cases where there are multiple potential outcomes, + each with a specified probability. The class stores the labels for each + category and their corresponding probabilities. + Parameters ---------- - labels: list of str - List containing the label belonging to each category. - probs: list of int - List containing the probability of each category. - Probabilities will be normalized, so frequencies are valid too. + labels : list of str + The labels for each category in the distribution, representing + the possible outcomes. + probs : list of int + The probabilities or frequencies of each category. These will be + normalized internally. 
""" def __init__(self, labels: Union[npt.NDArray[np.str_], list[str]], diff --git a/metasyn/distribution/na.py b/metasyn/distribution/na.py index 59d856d1..a0417566 100644 --- a/metasyn/distribution/na.py +++ b/metasyn/distribution/na.py @@ -11,7 +11,7 @@ @metadist(implements="core.na", var_type="string") class NADistribution(BaseDistribution): - """Distribution that will only ever give back NA.""" + """Distribution that always returns NA values (None).""" @classmethod def _fit(cls, values: pl.Series) -> BaseDistribution: diff --git a/metasyn/metaframe.py b/metasyn/metaframe.py index f7f136cd..f43b36bb 100644 --- a/metasyn/metaframe.py +++ b/metasyn/metaframe.py @@ -1,4 +1,4 @@ -"""Module defining the MetaFrame class, used for the conversion of DataFrames to MetaFrames.""" +"""Module defining MetaFrames, used for creating MetaFrames from DataFrames.""" # pylint: disable=invalid-name from __future__ import annotations @@ -19,22 +19,28 @@ class MetaFrame(): - """Metasyn metaframe consisting of variables. + """Container for statistical metadata describing a dataset. - A MetaFrame, short for metadata frame, is a structure that holds statistical metadata - about a dataset. The data contained in a MetaFrame is in line with the - Generative Metadata Format (GMF). It is essentially, a collection of MetaVar objects, - each representing a column in a dataset. + This class is used to fit a MetaFrame to a Polars DataFrame, serialize and + export the MetaFrame to a file, read a MetaFrame from a file, and create + a synthetic Polars DataFrame. - The metaframe is most easily created from a polars dataset with the from_dataframe() - class method. + A MetaFrame represents a metadata frame, which is a structure that holds + statistical metadata about a dataset. The data contained in a MetaFrame + follows the Generative Metadata Format (GMF). + The metadata is contained in a collection of MetaVar objects, + with each MetaVar representing a column (variable). 
+ + A MetaFrame can easily be created using the + ``fit_dataframe`` method, which takes a Polars DataFrame and fits a + MetaFrame to it. Parameters ---------- meta_vars: - List of variables representing columns in a dataframe. + List of variables representing columns in a DataFrame. n_rows: - Number of rows in the original dataframe. + Number of rows in the original DataFrame. privacy_package: Package that supplies the distributions. """ diff --git a/metasyn/privacy.py b/metasyn/privacy.py index ba355cbc..29b3730e 100644 --- a/metasyn/privacy.py +++ b/metasyn/privacy.py @@ -12,10 +12,12 @@ class BasePrivacy(ABC): - """Base class for privacy level. + """Abstract base class for privacy levels. - Derived classes should at least set the class variable - name and implement the to_dict method. + This class serves as a blueprint for privacy classes. Derived classes + should at least set the class variable `name` and implement the `to_dict` + method, which should return a dictionary that gives the privacy type and + its parameters. """ name = "unknown_privacy" @@ -51,7 +53,11 @@ def fit_kwargs(self): class BasicPrivacy(BasePrivacy): - """No privacy class, which uses statistically optimal distributions.""" + """Class representing no privacy level. + + This class uses statistically optimal distributions. It inherits from the + `BasePrivacy` class and sets the `name` attribute to "none". + """ name = "none" diff --git a/metasyn/provider.py b/metasyn/provider.py index cce92757..bdb4af3c 100644 --- a/metasyn/provider.py +++ b/metasyn/provider.py @@ -61,9 +61,14 @@ from metasyn.config import VarConfig, VarConfigAccess class BaseDistributionProvider(ABC): - """Class that encapsulates a set of distributions. - - It has a property {var_type}_distributions for every var_type. + """Base class for all distribution providers. + + A distribution provider is a class that provides a set of distributions + that can be used by metasyn to generate synthetic data. 
+ This class acts as a base class for creating specific distribution + providers. It also contains a list of the available distributions and + legacy distributions. A list of distributions for a specific type can be + accessed with ``get_dist_list``. """ name = "" @@ -124,7 +129,11 @@ def all_var_types(self) -> List[str]: class BuiltinDistributionProvider(BaseDistributionProvider): - """Distribution tree that includes the builtin distributions.""" + """Distribution tree that includes the builtin distributions. + + This class inherits from BaseDistributionProvider and provides + the built-in metasyn distributions. + """ name = "builtin" version = "1.2" @@ -152,15 +161,20 @@ class BuiltinDistributionProvider(BaseDistributionProvider): class DistributionProviderList(): """List of DistributionProviders with functionality to fit distributions. - Arguments - --------- + This class is responsible for managing and providing access to + different distribution providers. It allows for fitting distributions, + as well as retrieving distributions based on certain constraints + such as privacy level, variable type, and uniqueness. + + Parameters + ---------- dist_providers: - One or more distribution providers, that are denoted either with a string ("builtin") - , DistributionProvider (BuiltinDistributionProvider()) or DistributionProvider type - (BuiltinDistributionProvider). - The order in which distribution providers are included matters. If a provider implements - the same distribution at the same privacy level, then only the first will be taken into - account. + One or more distribution providers, that are denoted either with a string ("builtin"), + DistributionProvider (BuiltinDistributionProvider()) + or DistributionProvider type (BuiltinDistributionProvider). + The order in which distribution providers are included matters. + If a provider implements the same distribution at the same privacy level, + then only the first will be taken into account. 
""" def __init__( diff --git a/metasyn/var.py b/metasyn/var.py index f4a57c48..00dbabc0 100644 --- a/metasyn/var.py +++ b/metasyn/var.py @@ -14,7 +14,7 @@ class MetaVar(): - """Metadata variable. + """Metadata variable describing a column in a MetaFrame. MetaVar is a structure that holds all metadata needed to generate a synthetic column for it. This is the variable level building block for the @@ -42,7 +42,7 @@ class MetaVar(): Proportion of the series that are missing/NA. dtype: Type of the original values, e.g. int64, float, etc. Used for type-casting - back. + back. The default value is "unknown". description: User-provided description of the variable. """