diff --git a/0.1/.buildinfo b/0.1/.buildinfo index 42e6e1fd..c06196d0 100644 --- a/0.1/.buildinfo +++ b/0.1/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 55bb307912a4f2d9d48195c7e170ecb8 +config: 8f283ab8d9420186222c16786ac862f9 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/0.1/_images/sphx_glr_01_encodings_001.png b/0.1/_images/sphx_glr_01_encodings_001.png index 49ac3f42..41af42fb 100644 Binary files a/0.1/_images/sphx_glr_01_encodings_001.png and b/0.1/_images/sphx_glr_01_encodings_001.png differ diff --git a/0.1/_images/sphx_glr_01_encodings_thumb.png b/0.1/_images/sphx_glr_01_encodings_thumb.png index f290b54c..2f2503be 100644 Binary files a/0.1/_images/sphx_glr_01_encodings_thumb.png and b/0.1/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/0.1/_images/sphx_glr_08_join_aggregation_003.png b/0.1/_images/sphx_glr_08_join_aggregation_003.png index 22f957b0..a6275541 100644 Binary files a/0.1/_images/sphx_glr_08_join_aggregation_003.png and b/0.1/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_001.png b/0.1/_images/sphx_glr_09_interpolation_join_001.png index eac5513a..6f06c9e0 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_001.png and b/0.1/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_002.png b/0.1/_images/sphx_glr_09_interpolation_join_002.png index ddf09d4d..079b2cd0 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_002.png and b/0.1/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_003.png b/0.1/_images/sphx_glr_09_interpolation_join_003.png index a8dc7a84..c7346677 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_003.png and b/0.1/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_thumb.png b/0.1/_images/sphx_glr_09_interpolation_join_thumb.png index 5c3ec005..b9e28afc 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_thumb.png and b/0.1/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/0.1/_sources/auto_examples/01_encodings.rst.txt b/0.1/_sources/auto_examples/01_encodings.rst.txt index 6a0c38f2..fd68e6f8 100644 --- a/0.1/_sources/auto_examples/01_encodings.rst.txt +++ b/0.1/_sources/auto_examples/01_encodings.rst.txt @@ -440,47 +440,42 @@ corresponding columns: .. code-block:: none - array(['accountability, accounts, community', - 'assessment, protective, treatment', - 'communications, communication, telecommunications', - 'station, state, estate', 'security, mccf, unit', - 'gaithersburg, clarksburg, the', 'traffic, safety, alcohol', - 'investigative, investigations, criminal', - 'training, recruit, director', 'inspections, inspection, special', - 'district, squad, 3rd', 'services, highway, service', - 'behavioral, health, school', 'silver, spring, ride', - 'nicholson, transit, transport', 'mangement, engineering, parking', - 'welfare, childhood, children', 'collection, recycling, solid', - 'technology, systems, telephone', - 'equipment, management, automotive', - 'facilities, maintenance, supports', - 'rockville, downtown, library', 'warehouse, liquor, stock', - 'development, delivery, montgomery', - 'eligibility, assistance, emergency', - 'programs, program, officers', 'planning, construction, building', - 'patrol, 5th, 4th', 'family, crimes, sexual', - 'toddlers, custody, members', 'coordinator, services, service', - 'operator, bus, operations', 'specialist, special, quality', - 'master, registered, water', 'manager, projects, project', - 'officer, office, police', 'firefighter, rescuer, rescue', - 'supervisor, supervisory, supply', 'assistance, income, client', - 'correctional, correction, corporal', - 'legislative, principal, executive', 'school, room, behavioral', - 'community, security, nurse', - 'communications, telecommunications, safety', - 'lieutenant, captain, chief', - 'information, technology, technologist', + array(['compliance, building, violence', 'gaithersburg, clarksburg, the', + 'station, state, estate', 'development, planning, accounting', + 'patrol, 4th, 5th', 'traffic, safety, alcohol', + 'management, equipment, budget', 'toddlers, custody, members', + 'services, highway, service', 'behavioral, health, school', + 'collection, inspections, operations', 'family, crimes, outreach', + 'welfare, childhood, child', 'security, mccf, unit', + 'supports, support, network', 'emergency, centers, center', + 'district, squad, urban', 'maintenance, facilities, recruit', + 'administration, battalion, admin', 'nicholson, transit, taxicab', + 'warehouse, delivery, cloverly', + 'communications, communication, education', 'spring, silver, king', + 'assessment, protective, projects', + 'technology, telephone, systems', 'rockville, twinbrook, downtown', + 'director, officers, officer', 'assignment, assistance, medical', + 'animal, virtual, regional', + 'investigative, investigations, explosive', + 'firefighter, rescuer, recruit', 'operator, bus, operations', + 'officer, office, security', 'government, employee, budget', + 'liquor, clerk, store', 'information, technology, renovation', + 'manager, engineer, iii', 'income, assistance, client', 'administrative, administration, administrator', - 'enforcement, inspector, abandoned', - 'crossing, purchasing, engineer', - 'warehouse, craftsworker, welfare', 'sergeant, cadet, emergency', - 'liquor, clerk, store', 'assistant, library, librarian', - 'recreation, renovation, paralegal', - 'permitting, planning, resources', - 'equipment, auditor, investment', 'sheriff, deputy, urban', - 'environmental, therapist, enviromental', - 'program, programs, property', 'technician, mechanic, hvac'], - dtype=object) + 'coordinator, coordinating, transit', + 'technician, mechanic, supply', 'accountant, attendant, attorney', + 'corporal, pfc, dietary', 'community, health, nurse', + 'school, room, behavioral', 'services, supervisor, service', + 'enforcement, permitting, inspector', 'lieutenant, captain, chief', + 'assistant, library, librarian', + 'communications, telecommunications, safety', + 'warehouse, welfare, caseworker', 'specialist, special, therapist', + 'crossing, purchasing, planning', 'candidate, sheriff, deputy', + 'legislative, principal, executive', + 'equipment, investment, investigator', + 'program, programs, property', + 'correctional, correction, regional', 'sergeant, police, cadet', + 'master, registered, meter'], dtype=object) @@ -562,7 +557,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.921; std: 0.015 + R2 score: mean: 0.923; std: 0.014 @@ -698,7 +693,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 20.731 seconds) + **Total running time of the script:** (1 minutes 27.261 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index f01d21a6..c64735ae 100644 --- a/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 2.168 seconds) + **Total running time of the script:** (0 minutes 2.163 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt b/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt index 4e339727..057f4c78 100644 --- a/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.849 seconds) + **Total running time of the script:** (0 minutes 5.097 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt b/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt index c9b19e24..e8d5fde5 100644 --- a/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -1711,7 +1711,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 22.159 seconds) + **Total running time of the script:** (0 minutes 22.931 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/0.1/_sources/auto_examples/05_deduplication.rst.txt b/0.1/_sources/auto_examples/05_deduplication.rst.txt index d0e94dec..7236f965 100644 --- a/0.1/_sources/auto_examples/05_deduplication.rst.txt +++ b/0.1/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.151 seconds) + **Total running time of the script:** (0 minutes 6.062 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt b/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt index b6413ed9..3d0cc9ae 100644 --- a/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'companies', 'games', 'movies', 'all_entities', 'schools', 'albums'} + {'companies', 'games', 'all_entities', 'schools', 'movies', 'albums'} @@ -840,7 +840,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 21.562 seconds) + **Total running time of the script:** (13 minutes 4.578 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt b/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt index 12e257f5..835492de 100644 --- a/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -1226,7 +1226,7 @@ The results: /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - 0.5913999999999999 + 0.58445 @@ -1244,7 +1244,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 23.106 seconds) + **Total running time of the script:** (13 minutes 12.928 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/0.1/_sources/auto_examples/08_join_aggregation.rst.txt b/0.1/_sources/auto_examples/08_join_aggregation.rst.txt index b8c01447..c9d19a99 100644 --- a/0.1/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/0.1/_sources/auto_examples/08_join_aggregation.rst.txt @@ -844,75 +844,75 @@ operation maximizing our validation score. split1_test_score - 0.047968 - 0.034920 - 0.059455 - 0.070215 - 0.078351 + 0.033002 + 0.042840 + 0.080098 + 0.061375 + 0.066677 split2_test_score - 0.068097 - 0.083264 - 0.088838 - 0.087972 - 0.095634 + 0.060051 + 0.078515 + 0.093596 + 0.099191 + 0.098471 split3_test_score - 0.035797 - 0.053966 - 0.056003 - 0.061367 - 0.062436 + 0.038842 + 0.064836 + 0.056812 + 0.073599 + 0.077640 split4_test_score - 0.133327 - 0.121540 - 0.158436 - 0.154274 - 0.150200 + 0.142220 + 0.128170 + 0.145539 + 0.142744 + 0.146731 split5_test_score - 0.105519 - 0.111000 - 0.109453 - 0.112274 - 0.118223 + 0.103772 + 0.114905 + 0.109094 + 0.109256 + 0.115901 split6_test_score - 0.079701 - 0.105084 - 0.104736 - 0.111211 - 0.104890 + 0.079807 + 0.095965 + 0.106291 + 0.107266 + 0.107440 split7_test_score - 0.065811 - 0.060532 - 0.068918 - 0.061500 - 0.081844 + 0.073432 + 0.072592 + 0.048308 + 0.067931 + 0.085349 split8_test_score - 0.102426 - 0.115744 - 0.124244 - 0.131354 - 0.125673 + 0.109640 + 0.112492 + 0.117007 + 0.124763 + 0.129425 split9_test_score - 0.115310 - 0.156327 - 0.155995 - 0.164067 - 0.176232 + 0.115726 + 0.158776 + 0.169859 + 0.162961 + 0.181152 @@ -1003,7 +1003,7 @@ exhaustive histogram over all the possible values of ratings .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 18.256 seconds) + **Total running time of the script:** (0 minutes 23.726 seconds) .. _sphx_glr_download_auto_examples_08_join_aggregation.py: diff --git a/0.1/_sources/auto_examples/09_interpolation_join.rst.txt b/0.1/_sources/auto_examples/09_interpolation_join.rst.txt index b5d00f18..8f9426cd 100644 --- a/0.1/_sources/auto_examples/09_interpolation_join.rst.txt +++ b/0.1/_sources/auto_examples/09_interpolation_join.rst.txt @@ -350,9 +350,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 42.6 0.0 NaN - 33.651221 - 56.428523 - -0.166755 + 35.637157 + 82.189197 + -0.039541 1 @@ -362,9 +362,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 26.9 NaN NaN - 27.286608 - 40.865248 - 0.129940 + 26.834254 + 137.171376 + 0.237755 2 @@ -374,9 +374,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 41.6 NaN NaN - 31.019178 - 28.513377 - -0.164530 + 31.871600 + 27.685462 + -0.039896 3 @@ -386,9 +386,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 46.6 0.0 NaN - 33.530323 - 32.307955 - -0.164530 + 34.787195 + 49.989701 + -0.039896 4 @@ -398,9 +398,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 36.1 0.0 NaN - 28.511537 - 17.663136 - -0.128419 + 29.502211 + 33.688246 + -0.039896 @@ -580,7 +580,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 0.926337 + 0.929497 1 @@ -592,7 +592,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 16.187665 + 16.652344 2 @@ -604,7 +604,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 6.140298 + 6.490080 3 @@ -616,7 +616,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 7.192594 + 7.248217 4 @@ -628,7 +628,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 7.371359 + 7.106750 @@ -678,11 +678,11 @@ States with the lowest average predicted temperatures: Alaska, Montana, North Da state - AK -3.590085 - MT 0.062396 - WA 0.147388 - ND 1.152629 - MN 1.409494 + AK -2.597808 + WA 0.834756 + ND 0.899300 + MT 0.927627 + MN 1.461241 Name: TMAX, dtype: float64 @@ -707,11 +707,11 @@ States with the highest predicted temperatures: Puerto Rico, Virgin Islands, Haw state - LA 21.655593 - FL 24.662459 - HI 27.341897 - VI 29.278621 - PR 29.733622 + LA 21.441632 + FL 24.618361 + HI 28.353713 + VI 30.155955 + PR 30.581503 Name: TMAX, dtype: float64 @@ -787,7 +787,7 @@ It is a generalization of the :func:`~skrub.fuzzy_join`, as :func:`~skrub.fuzzy_ .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.497 seconds) + **Total running time of the script:** (0 minutes 7.251 seconds) .. _sphx_glr_download_auto_examples_09_interpolation_join.py: diff --git a/0.1/_sources/auto_examples/sg_execution_times.rst.txt b/0.1/_sources/auto_examples/sg_execution_times.rst.txt index aaec461e..9e70a350 100644 --- a/0.1/_sources/auto_examples/sg_execution_times.rst.txt +++ b/0.1/_sources/auto_examples/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**25:03.479** total execution time for 9 files **from auto_examples**: +**28:51.997** total execution time for 9 files **from auto_examples**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) - - 11:23.106 + - 13:12.928 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) - - 11:21.562 + - 13:04.578 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) - - 01:20.731 - - 0.0 - * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) - - 00:22.159 + - 01:27.261 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) - - 00:18.256 + - 00:23.726 + - 0.0 + * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) + - 00:22.931 - 0.0 * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) - - 00:05.497 + - 00:07.251 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``05_deduplication.py``) - - 00:05.151 + - 00:06.062 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) - - 00:04.849 + - 00:05.097 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``02_feature_interpretation_with_gapencoder.py``) - - 00:02.168 + - 00:02.163 - 0.0 diff --git a/0.1/_sources/install.rst.txt b/0.1/_sources/install.rst.txt index cf572c4f..bfa3114f 100644 --- a/0.1/_sources/install.rst.txt +++ b/0.1/_sources/install.rst.txt @@ -34,7 +34,6 @@ Installing

- This will not work yet! The package is still waiting to be published on conda-forge. You can follow progress here. .. code:: console @@ -45,7 +44,6 @@ Installing

- This will not work yet! The package is still waiting to be published on conda-forge. You can follow progress here. .. code:: console diff --git a/0.1/_sources/sg_execution_times.rst.txt b/0.1/_sources/sg_execution_times.rst.txt index b731eb3f..1b89f500 100644 --- a/0.1/_sources/sg_execution_times.rst.txt +++ b/0.1/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**25:03.479** total execution time for 9 files **from all galleries**: +**28:51.997** total execution time for 9 files **from all galleries**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) - - 11:23.106 + - 13:12.928 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) - - 11:21.562 + - 13:04.578 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) - - 01:20.731 - - 0.0 - * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) - - 00:22.159 + - 01:27.261 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) - - 00:18.256 + - 00:23.726 + - 0.0 + * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) + - 00:22.931 - 0.0 * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) - - 00:05.497 + - 00:07.251 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``../examples/05_deduplication.py``) - - 00:05.151 + - 00:06.062 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) - - 00:04.849 + - 00:05.097 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``../examples/02_feature_interpretation_with_gapencoder.py``) - - 00:02.168 + - 00:02.163 - 0.0 diff --git a/0.1/auto_examples/01_encodings.html b/0.1/auto_examples/01_encodings.html index a93a804a..a441a861 100644 --- a/0.1/auto_examples/01_encodings.html +++ b/0.1/auto_examples/01_encodings.html @@ -797,47 +797,42 @@

A simple prediction pipeline
tv.named_transformers_["high_cardinality"].get_feature_names_out()
 

-
array(['accountability, accounts, community',
-       'assessment, protective, treatment',
-       'communications, communication, telecommunications',
-       'station, state, estate', 'security, mccf, unit',
-       'gaithersburg, clarksburg, the', 'traffic, safety, alcohol',
-       'investigative, investigations, criminal',
-       'training, recruit, director', 'inspections, inspection, special',
-       'district, squad, 3rd', 'services, highway, service',
-       'behavioral, health, school', 'silver, spring, ride',
-       'nicholson, transit, transport', 'mangement, engineering, parking',
-       'welfare, childhood, children', 'collection, recycling, solid',
-       'technology, systems, telephone',
-       'equipment, management, automotive',
-       'facilities, maintenance, supports',
-       'rockville, downtown, library', 'warehouse, liquor, stock',
-       'development, delivery, montgomery',
-       'eligibility, assistance, emergency',
-       'programs, program, officers', 'planning, construction, building',
-       'patrol, 5th, 4th', 'family, crimes, sexual',
-       'toddlers, custody, members', 'coordinator, services, service',
-       'operator, bus, operations', 'specialist, special, quality',
-       'master, registered, water', 'manager, projects, project',
-       'officer, office, police', 'firefighter, rescuer, rescue',
-       'supervisor, supervisory, supply', 'assistance, income, client',
-       'correctional, correction, corporal',
-       'legislative, principal, executive', 'school, room, behavioral',
-       'community, security, nurse',
-       'communications, telecommunications, safety',
-       'lieutenant, captain, chief',
-       'information, technology, technologist',
+
array(['compliance, building, violence', 'gaithersburg, clarksburg, the',
+       'station, state, estate', 'development, planning, accounting',
+       'patrol, 4th, 5th', 'traffic, safety, alcohol',
+       'management, equipment, budget', 'toddlers, custody, members',
+       'services, highway, service', 'behavioral, health, school',
+       'collection, inspections, operations', 'family, crimes, outreach',
+       'welfare, childhood, child', 'security, mccf, unit',
+       'supports, support, network', 'emergency, centers, center',
+       'district, squad, urban', 'maintenance, facilities, recruit',
+       'administration, battalion, admin', 'nicholson, transit, taxicab',
+       'warehouse, delivery, cloverly',
+       'communications, communication, education', 'spring, silver, king',
+       'assessment, protective, projects',
+       'technology, telephone, systems', 'rockville, twinbrook, downtown',
+       'director, officers, officer', 'assignment, assistance, medical',
+       'animal, virtual, regional',
+       'investigative, investigations, explosive',
+       'firefighter, rescuer, recruit', 'operator, bus, operations',
+       'officer, office, security', 'government, employee, budget',
+       'liquor, clerk, store', 'information, technology, renovation',
+       'manager, engineer, iii', 'income, assistance, client',
        'administrative, administration, administrator',
-       'enforcement, inspector, abandoned',
-       'crossing, purchasing, engineer',
-       'warehouse, craftsworker, welfare', 'sergeant, cadet, emergency',
-       'liquor, clerk, store', 'assistant, library, librarian',
-       'recreation, renovation, paralegal',
-       'permitting, planning, resources',
-       'equipment, auditor, investment', 'sheriff, deputy, urban',
-       'environmental, therapist, enviromental',
-       'program, programs, property', 'technician, mechanic, hvac'],
-      dtype=object)
+       'coordinator, coordinating, transit',
+       'technician, mechanic, supply', 'accountant, attendant, attorney',
+       'corporal, pfc, dietary', 'community, health, nurse',
+       'school, room, behavioral', 'services, supervisor, service',
+       'enforcement, permitting, inspector', 'lieutenant, captain, chief',
+       'assistant, library, librarian',
+       'communications, telecommunications, safety',
+       'warehouse, welfare, caseworker', 'specialist, special, therapist',
+       'crossing, purchasing, planning', 'candidate, sheriff, deputy',
+       'legislative, principal, executive',
+       'equipment, investment, investigator',
+       'program, programs, property',
+       'correctional, correction, regional', 'sergeant, police, cadet',
+       'master, registered, meter'], dtype=object)
 
-
R2 score:  mean: 0.921; std: 0.015
+
R2 score:  mean: 0.923; std: 0.014
 

The simple pipeline applied on this complex dataset gave us very good results.

@@ -939,7 +934,7 @@

ConclusionTotal running time of the script: (1 minutes 20.731 seconds)

+

Total running time of the script: (1 minutes 27.261 seconds)

-
{'companies', 'games', 'movies', 'all_entities', 'schools', 'albums'}
+
{'companies', 'games', 'all_entities', 'schools', 'movies', 'albums'}
 

The games table is the most relevant to our case. @@ -968,7 +968,7 @@

Plotting the results

It helped significantly improve the prediction score.

-

Total running time of the script: (11 minutes 21.562 seconds)

+

Total running time of the script: (13 minutes 4.578 seconds)