Commit

Merge branch 'feature-postfix-fields'
elcorto committed Apr 24, 2024
2 parents 6762b5e + c6de150 commit 1b76805
Showing 21 changed files with 269 additions and 69 deletions.
58 changes: 51 additions & 7 deletions doc/source/written/manual.md
Original file line number Diff line number Diff line change
@@ -43,7 +43,7 @@ calculate and store the result of a calculation for each parameter combination.


>>> def func(pset):
... return {"result": random.random() * pset["a"] * pset["b"]}
... return {"result_": random.random() * pset["a"] * pset["b"]}

>>> a = ps.plist("a", [1,2,3])
>>> b = ps.plist("b", [88,99])
@@ -95,7 +95,7 @@ pickled file `calc/database.pk` by default:
4 calc 2023-01-20 19:54:09.546977043 4 0 deskbot
5 calc 2023-01-20 19:54:09.548082113 5 0 deskbot

result _pset_runtime
result_ _pset_runtime
0 3.629665 0.000004
1 59.093600 0.000002
2 84.056801 0.000002
@@ -104,7 +104,7 @@ pickled file `calc/database.pk` by default:
5 37.220296 0.000002
```

You see the columns `a` and `b`, the column `result` (returned by
You see the columns `a` and `b`, the column `result_` (returned by
`func`) and a number of reserved fields for book-keeping such as

```
@@ -157,13 +157,13 @@ and runs the workload for that pset. `func` must return a
dict, for example:

```py
{'result': 1.234}
{'result_': 1.234}
```

or an updated 'pset':

```py
{'a': 1, 'b': 88, 'result': 1.234}
{'a': 1, 'b': 88, 'result_': 1.234}
```

We always merge (`dict.update()`) the result of `func` with the pset, which gives
@@ -180,6 +180,19 @@ columns such as `_run_id` (once per `ps.run()` call) or `_pset_id` (once
per pset). Using `ps.run(... poolsize=...)` runs `func` in parallel on
`params` using `multiprocessing.Pool`.
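
As a minimal sketch of what this merging amounts to (plain dicts only, independent of psweep):

```python
pset = {"a": 1, "b": 88}
# func() may return only new fields, or a whole updated pset; either way
# the returned dict is merged into the pset, with func()'s values winning
# on key collisions.
ret = {"result_": 1.234}
row = dict(pset)
row.update(ret)
assert row == {"a": 1, "b": 88, "result_": 1.234}
```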

## Naming of database fields

`ps.run()` adds book-keeping fields with a leading underscore prefix (e.g.
`_pset_id`), which distinguishes them from `pset` fields such as `a` and `b`.
We *recommend*, but do not require, that you name all fields (dict keys)
generated in `func()` with a trailing or *postfix* underscore, as in
`result_`. That way you can clearly distinguish in the database between
book-keeping fields (`_foo`), `pset` fields (`a`, `b`) and result-type fields
(`bar_`). Again, this is only a suggestion; you can name the fields in a
`pset` and the ones created in `func()` any way you like. See [this section
for more details](s:more-on-db-field-names).
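
With this convention in place, the three kinds of fields can be told apart purely by name. A small sketch (the field values are made up):

```python
# Hypothetical database row containing all three kinds of fields
row = {"a": 1, "b": 88, "result_": 1.2, "_pset_id": "abc", "_pset_hash": "def"}

book_keeping = sorted(k for k in row if k.startswith("_"))
result_fields = sorted(k for k in row if k.endswith("_") and not k.startswith("_"))
pset_fields = sorted(k for k in row if not (k.startswith("_") or k.endswith("_")))

assert book_keeping == ["_pset_hash", "_pset_id"]
assert result_fields == ["result_"]
assert pset_fields == ["a", "b"]
```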


## Building parameter grids

This package offers some very simple helper functions which assist in creating
@@ -438,7 +451,7 @@ have as above two unique `_run_id`s, unique `_pset_id`s, but *two sets of the
same* `_pset_hash`.

```
_run_id _pset_id _run_seq _pset_seq _pset_hash a result
_run_id _pset_id _run_seq _pset_seq _pset_hash a result_
8543fdad-4426-41cb-ab42-8a80b1bebbe2 08cb5f7c-8ce8-451f-846d-db5ac3bcc746 0 0 e4ad4daad53a2eec0313386ada88211e50d693bd 1 0.381589
8543fdad-4426-41cb-ab42-8a80b1bebbe2 18da3840-d00e-4bdd-b29c-68be2adb164e 0 1 7b7ee754248759adcee9e62a4c1477ed1a8bb1ab 2 1.935220
8543fdad-4426-41cb-ab42-8a80b1bebbe2 bcc47205-0919-4084-9f07-072eb56ed5fd 0 2 9e0e6d8a99c72daf40337183358cbef91bba7311 3 2.187107
@@ -471,6 +484,36 @@ This will skip all `pset`s already in the database based on their hash and
only add calculations for new `pset`s.


(s:more-on-db-field-names)=
### More details on naming database fields

At the moment, the convention of ignoring fields that start or end with an
underscore is implemented only internally, in `ps.pset_hash()`, to ensure that
the hash covers `pset` variables only. However, when `ps.run()` is called, the
hash is calculated *before* book-keeping fields like `_pset_id` are added and
before `func()` is called to, for instance, return `{'result_': 1.234}` and
update the `pset`. In that case the convention is therefore not actually
needed. It only takes effect should you ever want to re-calculate the hash, as
in

```py
>>> for idx, row in df.iterrows():
... df.at[idx, "_pset_hash_new"] = ps.pset_hash(row.to_dict())

>>> df
a _pset_hash _pset_id ... result_ _pset_hash_new
0 1 64846e128be5c974d6194f77557d0511542835a8 61f899a8-314b-4a19-a359-3502e3e2d009 ... 0.880328 64846e128be5c974d6194f77557d0511542835a8
1 2 e746f91e51f09064bd7f1e516701ba7d0d908653 cd1dc05b-0fab-4e09-9798-9de94a5b3cd3 ... 0.815945 e746f91e51f09064bd7f1e516701ba7d0d908653
2 3 96da150761c9d66b31975f11ef44bfb75c2fdc11 6612eab6-5d5a-4fbf-ae18-fdb4846fd459 ... 0.096946 96da150761c9d66b31975f11ef44bfb75c2fdc11
3 4 79ba178b3895a603bf9d84dea82e034791e8fe30 bf5bf881-3273-4932-a3f3-9c117bca921b ... 2.606486 79ba178b3895a603bf9d84dea82e034791e8fe30
```

Here the hash is computed only over the `a` field, so `_pset_hash` and
`_pset_hash_new` must be the same.
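
The filtering that makes this work can be written as a one-line dict comprehension; `ps.pset_hash()` is assumed to do the equivalent internally (the field values below are made up):

```python
# Strip book-keeping (_foo) and result-type (bar_) fields before re-hashing,
# keeping only pset variables
row = {"a": 1, "_pset_id": "abc", "_pset_hash": "def", "result_": 0.88}
pset_only = {
    k: v for k, v in row.items()
    if not (k.startswith("_") or k.endswith("_"))
}
assert pset_only == {"a": 1}
```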

We may provide tooling for this in the future. See also
https://github.com/elcorto/psweep/issues/15.


## Best practices

The following workflows and practices come from experience. They are, if
@@ -816,7 +859,7 @@ This will add a bool field `_failed` to the database, as well as a text field
`_exc_txt` which stores the exception's traceback message.

We don't implement this as a feature and only provide examples, which keeps
things fexible. Maybe you want `_failed` to be called `_crashed` instead, or you want
things flexible. Maybe you want `_failed` to be called `_crashed` instead, or you want
to log more data.
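
For instance, a wrapper that uses `_crashed` instead of `_failed` could look like this (a sketch along the lines of the `safe_func` pattern; the failing `func` here is made up):

```python
import traceback

def safe_func(pset, *, func):
    # Same pattern as the _failed wrapper, just with a different field name
    ret = {}
    try:
        ret.update(func(pset))
        ret.update(_crashed=False, _exc_txt=None)
    except Exception:
        ret.update(_crashed=True, _exc_txt=traceback.format_exc())
    return ret

def func(pset):
    # Hypothetical workload that fails for a == 0
    return {"result_": 1 / pset["a"]}

assert safe_func({"a": 2}, func=func)["_crashed"] is False
assert safe_func({"a": 0}, func=func)["_crashed"] is True
```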

For post-processing, you would then do something like:
@@ -1484,6 +1527,7 @@ $ cat _pics/foo.png
will bring the content back to the working dir.
## Scope and related projects
This project aims to be agnostic to the field of study. We target problems that
4 changes: 2 additions & 2 deletions examples/batch_templates/20eval.py
@@ -10,9 +10,9 @@
[np.load(f"calc/{pset_id}/out.npy") for pset_id in df._pset_id.values]
)

df["mean"] = arr.mean(axis=1)
df["mean_"] = arr.mean(axis=1)

cols = ["param_a", "param_b", "mean"]
cols = ["param_a", "param_b", "mean_"]
ps.df_print(df[cols])

ps.df_write("calc/database_eval.pk", df)
2 changes: 1 addition & 1 deletion examples/batch_templates/run_example.sh
@@ -22,4 +22,4 @@ sh run_local.sh
cd ..
./20eval.py

psweep-db2table calc/database_eval.pk param_a param_b mean _run_seq _pset_seq _run_id _pset_id
psweep-db2table calc/database_eval.pk param_a param_b mean_ _run_seq _pset_seq _run_id _pset_id
4 changes: 2 additions & 2 deletions examples/batch_templates_git/20eval.py
@@ -10,9 +10,9 @@
[np.load(f"calc/{pset_id}/out.npy") for pset_id in df._pset_id.values]
)

df["mean"] = arr.mean(axis=1)
df["mean_"] = arr.mean(axis=1)

cols = ["param_a", "param_b", "mean"]
cols = ["param_a", "param_b", "mean_"]
ps.df_print(df[cols])

ps.df_write("calc/database_eval.pk", df)
2 changes: 1 addition & 1 deletion examples/batch_templates_git/run_example.sh
@@ -49,4 +49,4 @@ sh run_local.sh
cd ..
./20eval.py

psweep-db2table calc/database_eval.pk param_a param_b mean _run_seq _pset_seq _run_id _pset_id
psweep-db2table calc/database_eval.pk param_a param_b mean_ _run_seq _pset_seq _run_id _pset_id
2 changes: 1 addition & 1 deletion examples/benchmark_shell_command.py
@@ -29,7 +29,7 @@ def func(pset):
timing = min(
timeit.repeat(lambda: run(cmd, shell=True), repeat=3, number=1)
)
return {"timing": timing}
return {"timing_": timing}


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions examples/capture_logs.py
@@ -37,7 +37,7 @@ def func_no_exc(pset):

# stderr
print("text on stderr", file=sys.stderr)
return {"result": random.random() * pset["a"]}
return {"result_": random.random() * pset["a"]}


def func_with_exc(pset):
@@ -53,7 +53,7 @@ def func_with_exc(pset):
raise_error=True,
)
)
return {"result": random.random() * pset["a"]}
return {"result_": random.random() * pset["a"]}


def safe_func(pset):
4 changes: 2 additions & 2 deletions examples/multiple_local_1d_scans/10run.py
@@ -12,7 +12,7 @@


def func(pset):
return {"result": random.random() * pset["a"] * pset["b"]}
return {"result_": random.random() * pset["a"] * pset["b"]}


if __name__ == "__main__":
@@ -43,4 +43,4 @@ def func(pset):

disp_cols += ["_run_id", "study"]
df = ps.run(func, params, verbose=disp_cols)
print(df[disp_cols + ["result"]])
print(df[disp_cols + ["result_"]])
4 changes: 2 additions & 2 deletions examples/multiple_local_1d_scans/20run.py
@@ -5,7 +5,7 @@


def func(pset):
return {"result": random.random() * pset["a"] * pset["b"]}
return {"result_": random.random() * pset["a"] * pset["b"]}


if __name__ == "__main__":
@@ -27,4 +27,4 @@ def func(pset):

disp_cols += ["_run_id", "study"]
df = ps.run(func, params, verbose=disp_cols, backup=True)
print(df[disp_cols + ["result"]])
print(df[disp_cols + ["result_"]])
106 changes: 106 additions & 0 deletions examples/repeat_failed.py
@@ -0,0 +1,106 @@
#!/usr/bin/env python3

import random
import traceback
from functools import partial

import numpy as np

import psweep as ps


def safe_func(pset, *, func):
    ret = dict()
    try:
        ret.update(func(pset))
        ret.update(_failed=False, _exc_txt=None)
    except:
        txt = traceback.format_exc()
        print(f"failed, traceback:\n{txt}")
        ret.update(_failed=True, _exc_txt=txt)
    finally:
        # some cleanup here if needed
        pass
    return ret


def func_with_fail(pset):
    a = pset["a"]
    # Some fake reason to fail
    if a % 2 == 0:
        raise ValueError("a is even, fail here")
    return {"result_": random.random() * a}


def func_fixed(pset):
    a = pset["a"]
    return {"result_": random.random() * a}


def pset_col_filter(c: str):
    """Filter field names that belong to a pset, i.e. everything *not*
    starting or ending with a "_".

    Example
    -------
    >>> df.columns
    Index(['a', 'b', '_pset_hash', '_pset_id', '_run_seq', '_pset_seq', '_run_id',
           '_calc_dir', '_time_utc', '_exec_host', 'result_', '_pset_runtime'],
          dtype='object')
    >>> list(filter(pset_col_filter, df.columns))
    ['a', 'b']
    """
    return not (c.startswith("_") or c.endswith("_"))


if __name__ == "__main__":
    # Pass a list of ints (i.e. type(1) == int). If we use np.arange(10), then
    # the type of each entry is np.int64 and that leads to different
    # _pset_hash values, since our hashes are (luckily, sadly?) type-specific
    # due to the usage of joblib.hash().
    params = ps.plist("a", list(range(10)))
    assert isinstance(params[0]["a"], int)
    assert not isinstance(params[0]["a"], np.int64)

    # First run. Don't write df to disk. Pass it on here to the second run
    # since this is one script. But just using save=False and letting the
    # second run read it from disk also works, of course.
    df = ps.run(
        partial(safe_func, func=func_with_fail),
        params,
        capture_logs="db",
        save=False,
    )
    ps.df_print(df, cols=["a", "result_", "_failed", "_pset_hash", "_run_id"])

    n_failed = len(df[df._failed])
    print(f"{n_failed=}")
    run_id_0 = df._run_id.unique()[0]

    # Repeat failed
    pset_cols = list(filter(pset_col_filter, df.columns))
    print(f"{pset_cols=}")
    params_repeat = [
        row.to_dict() for _, row in df[df._failed][pset_cols].iterrows()
    ]
    print(f"{params_repeat=}")
    df = ps.run(
        partial(safe_func, func=func_fixed),
        params_repeat,
        capture_logs="db",
        df=df,
        save=False,
    )
    ps.df_print(df, cols=["a", "result_", "_failed", "_pset_hash", "_run_id"])
    run_id_1 = df._run_id.unique()[-1]

    assert (
        df[df._run_id == run_id_1].a.values == np.array([0, 2, 4, 6, 8])
    ).all()

    assert (
        df[df._run_id == run_id_1]._pset_hash.values
        == df[df._failed & (df._run_id == run_id_0)]._pset_hash.values
    ).all()
2 changes: 1 addition & 1 deletion examples/save_data_on_disk/10run.py
@@ -12,7 +12,7 @@ def func(pset):
f"echo {pset['a']} {pset['a']*2} {pset['a']*4} > {fn}"
)
subprocess.run(cmd, shell=True)
return {"cmd": cmd}
return {"_cmd": cmd}


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions examples/save_data_on_disk/20eval.py
@@ -17,9 +17,9 @@
]
)

df["mean"] = arr.mean(axis=1)
df["mean_"] = arr.mean(axis=1)

cols = ["a", "mean", "_pset_id"]
cols = ["a", "mean_", "_pset_id"]
ps.df_print(df[cols])

ps.df_write("calc/database_eval.pk", df)
2 changes: 1 addition & 1 deletion examples/vary_1_param.py
@@ -5,7 +5,7 @@


def func(pset):
return {"result": random.random() * pset["a"]}
return {"result_": random.random() * pset["a"]}


if __name__ == "__main__":
2 changes: 1 addition & 1 deletion examples/vary_1_param_parallel.py
@@ -7,7 +7,7 @@

def func(pset):
print(mp.current_process().name)
return {"result": random.random() * pset["a"]}
return {"result_": random.random() * pset["a"]}


if __name__ == "__main__":
4 changes: 2 additions & 2 deletions examples/vary_1_param_repeat_same.py
@@ -5,7 +5,7 @@


def func(pset):
return {"result": random.random() * pset["a"]}
return {"result_": random.random() * pset["a"]}


if __name__ == "__main__":
@@ -24,6 +24,6 @@ def func(pset):
"_pset_seq",
"_pset_hash",
"a",
"result",
"result_",
]
ps.df_print(df[cols])
4 changes: 2 additions & 2 deletions examples/vary_1_param_simulate.py
@@ -5,11 +5,11 @@


def func(pset):
return {"result": random.random() * pset["a"] * pset["b"]}
return {"result_": random.random() * pset["a"] * pset["b"]}


if __name__ == "__main__":
sel = ["_calc_dir", "a", "b", "result"]
sel = ["_calc_dir", "a", "b", "result_"]

# 1st real run: produce some data, vary a, b constant
params = ps.pgrid(ps.plist("a", [1, 2, 3, 4]), ps.plist("b", [100]))