diff --git a/0.1/.buildinfo b/0.1/.buildinfo index 44587ffe..567ac8bc 100644 --- a/0.1/.buildinfo +++ b/0.1/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 4e757390fbfd429264474b4c216ae132 +config: e25750dadb21b97663017b304cf4de14 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/0.1/_images/sphx_glr_01_encodings_001.png b/0.1/_images/sphx_glr_01_encodings_001.png index 267dfe0b..22ba0cf9 100644 Binary files a/0.1/_images/sphx_glr_01_encodings_001.png and b/0.1/_images/sphx_glr_01_encodings_001.png differ diff --git a/0.1/_images/sphx_glr_01_encodings_thumb.png b/0.1/_images/sphx_glr_01_encodings_thumb.png index 8dc3dd58..f0fff56d 100644 Binary files a/0.1/_images/sphx_glr_01_encodings_thumb.png and b/0.1/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/0.1/_images/sphx_glr_08_join_aggregation_003.png b/0.1/_images/sphx_glr_08_join_aggregation_003.png index e01da23c..fc80fd46 100644 Binary files a/0.1/_images/sphx_glr_08_join_aggregation_003.png and b/0.1/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_001.png b/0.1/_images/sphx_glr_09_interpolation_join_001.png index ce2ea4e5..403ab7ad 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_001.png and b/0.1/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_002.png b/0.1/_images/sphx_glr_09_interpolation_join_002.png index b54ceb67..797dccd1 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_002.png and b/0.1/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_003.png b/0.1/_images/sphx_glr_09_interpolation_join_003.png index ee35c411..cd124a19 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_003.png and b/0.1/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/0.1/_images/sphx_glr_09_interpolation_join_thumb.png b/0.1/_images/sphx_glr_09_interpolation_join_thumb.png index 65994100..c6c7be91 100644 Binary files a/0.1/_images/sphx_glr_09_interpolation_join_thumb.png and b/0.1/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/0.1/_sources/auto_examples/01_encodings.rst.txt b/0.1/_sources/auto_examples/01_encodings.rst.txt index 4305c556..f80b336f 100644 --- a/0.1/_sources/auto_examples/01_encodings.rst.txt +++ b/0.1/_sources/auto_examples/01_encodings.rst.txt @@ -440,41 +440,45 @@ corresponding columns: .. code-block:: none - array(['maintenance, facilities, finance', 'station, state, estate', - 'behavioral, health, school', 'gaithersburg, the, clarksburg', - 'procurement, protective, adult', 'warehouse, delivery, liquor', - 'patrol, 5th, 4th', 'supports, support, network', - 'traffic, safety, alcohol', 'spring, silver, monitoring', - 'family, pedophile, crimes', 'rockville, twinbrook, downtown', - 'administration, battalion, registration', - 'highway, welfare, child', 'technology, inmate, systems', - 'management, equipment, budget', - 'communications, communication, division', - 'nicholson, transit, taxicab', - 'investigative, investigations, criminal', 'security, mc311, mccf', - 'custody, mcdc, quality', 'inspections, inspection, collection', - 'eligibility, assistance, disability', 'services, service, animal', - 'programs, projects, program', 'accounts, receivable, members', - 'planning, training, recruit', 'district, squad, 3rd', - 'emergency, commuter, duplicating', 'director, kingsview, officer', - 'firefighter, recruit, rescuer', - 'paralegal, psychiatrist, employee', - 'income, assistance, compliance', 'librarian, associate, library', - 'candidate, police, sergeant', 'manager, projects, project', + array(['station, state, estate', 'district, patrol, 3rd', + 'silver, spring, urban', 'welfare, children, childhood', + 'services, highway, service', 'traffic, safety, alcohol', + 'gaithersburg, clarksburg, the', 'supports, downtown, support', + 'twinbrook, warehouse, rockville', + 'investigative, criminal, investigations', + 'equipment, automotive, fleet', 'assessment, ombudsman, adult', + 'programs, program, commercial', 'development, delivery, cloverly', + 'nicholson, transit, taxicab', 'abandoned, division, employee', + 'behavioral, health, school', 'maintenance, facilities, finance', + 'technology, inmate, systems', + 'administrative, administration, battalion', + 'toddlers, custody, members', 'building, structural, construction', + 'eligibility, assistance, medical', + 'communications, communication, immunization', + 'training, director, recruit', 'mangement, management, engagement', + 'collection, operations, special', + 'regulatory, environmental, centers', 'assignment, squad, team', + 'security, mc311, mccf', 'representative, legislative, customer', + 'manager, projects, project', 'accountant, assistant, library', + 'candidate, officer, office', 'operator, bus, operations', + 'school, health, room', 'lieutenant, captain, chief', + 'firefighter, rescuer, recruit', 'iii, police, of', + 'program, programs, graphic', + 'information, technology, technologist', + 'sergeant, attendant, police', + 'correctional, correction, corporal', + 'crossing, purchasing, engineer', 'community, nurse, unit', + 'coordinator, services, service', 'assistance, income, client', 'enforcement, permitting, inspector', - 'program, programs, resource', 'operator, bus, operations', - 'administrative, principal, executive', 'captain, rescue, chief', - 'technician, mechanic, supply', 'school, room, behavioral', + 'technician, mechanic, supply', + 'administrative, principal, executive', + 'warehouse, welfare, caseworker', 'transit, public, telephone', + 'sheriff, deputy, therapist', 'specialist, recreation, special', + 'supervisor, supervisory, records', 'master, meter, registered', 'communications, telecommunications, safety', - 'community, health, nurse', 'correctional, correction, corporal', - 'liquor, clerk, store', 'services, president, resident', - 'specialist, special, quality', 'coordinator, coordinating, depot', - 'officer, office, iii', 'master, registered, meter', - 'craftsworker, supervisor, advisor', 'sheriff, deputy, autobody', - 'information, recreation, technology', - 'warehouse, welfare, caseworker', 'crossing, purchasing, engineer', - 'lieutenant, shift, records', 'accountant, assistant, county', - 'equipment, investigator, apprentice'], dtype=object) + 'equipment, investment, investigator', + 'environmental, budget, analyst', 'liquor, clerk, store'], + dtype=object) @@ -556,7 +560,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.922; std: 0.010 + R2 score: mean: 0.921; std: 0.015 @@ -692,7 +696,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 12.266 seconds) + **Total running time of the script:** (1 minutes 15.855 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index 4a1f733c..80f9d55e 100644 --- a/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/0.1/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.752 seconds) + **Total running time of the script:** (0 minutes 1.841 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt b/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt index 7f8acb21..9955839f 100644 --- a/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/0.1/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.439 seconds) + **Total running time of the script:** (0 minutes 4.448 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt b/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt index f36167e1..0282b031 100644 --- a/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/0.1/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -1711,7 +1711,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 22.247 seconds) + **Total running time of the script:** (0 minutes 20.264 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/0.1/_sources/auto_examples/05_deduplication.rst.txt b/0.1/_sources/auto_examples/05_deduplication.rst.txt index d18f9b90..ff1a28c0 100644 --- a/0.1/_sources/auto_examples/05_deduplication.rst.txt +++ b/0.1/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.893 seconds) + **Total running time of the script:** (0 minutes 5.130 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt b/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt index 168425da..4d7e6424 100644 --- a/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/0.1/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'all_entities', 'albums', 'games', 'companies', 'schools', 'movies'} + {'albums', 'companies', 'games', 'all_entities', 'movies', 'schools'} @@ -840,7 +840,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 7.052 seconds) + **Total running time of the script:** (10 minutes 42.099 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt b/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt index 0699c4e3..31a6fd57 100644 --- a/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/0.1/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -1226,7 +1226,7 @@ The results: /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:228: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - 0.5841000000000001 + 0.5906499999999999 @@ -1244,7 +1244,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (11 minutes 58.676 seconds) + **Total running time of the script:** (10 minutes 58.884 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/0.1/_sources/auto_examples/08_join_aggregation.rst.txt b/0.1/_sources/auto_examples/08_join_aggregation.rst.txt index 5fb8d124..b30175dc 100644 --- a/0.1/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/0.1/_sources/auto_examples/08_join_aggregation.rst.txt @@ -844,75 +844,75 @@ operation maximizing our validation score. split1_test_score - 0.047202 - 0.043610 - 0.077519 - 0.085678 - 0.057439 + 0.066934 + 0.047807 + 0.081054 + 0.066640 + 0.073865 split2_test_score - 0.078155 - 0.074853 - 0.093541 - 0.099196 - 0.088418 + 0.087146 + 0.076471 + 0.082633 + 0.089661 + 0.082575 split3_test_score - 0.035982 - 0.061126 - 0.062310 - 0.058060 - 0.076577 + 0.046050 + 0.069368 + 0.064893 + 0.062906 + 0.066695 split4_test_score - 0.134301 - 0.135657 - 0.148712 - 0.154143 - 0.143427 + 0.142628 + 0.122168 + 0.141547 + 0.148026 + 0.141334 split5_test_score - 0.106490 - 0.112103 - 0.101398 - 0.106325 - 0.117235 + 0.112646 + 0.112636 + 0.106074 + 0.110451 + 0.105759 split6_test_score - 0.083866 - 0.108533 - 0.105748 - 0.106346 - 0.109916 + 0.088999 + 0.097707 + 0.102453 + 0.100955 + 0.104426 split7_test_score - 0.068434 - 0.069732 - 0.057040 - 0.070862 - 0.072920 + 0.071672 + 0.074117 + 0.064240 + 0.067340 + 0.072866 split8_test_score - 0.108563 - 0.109040 - 0.116688 - 0.122268 - 0.133081 + 0.109308 + 0.106898 + 0.122343 + 0.120844 + 0.134203 split9_test_score - 0.123125 - 0.148885 - 0.161331 - 0.166520 - 0.173197 + 0.118252 + 0.151829 + 0.169206 + 0.163430 + 0.185400 @@ -1003,7 +1003,7 @@ exhaustive histogram over all the possible values of ratings .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 23.746 seconds) + **Total running time of the script:** (0 minutes 17.626 seconds) .. _sphx_glr_download_auto_examples_08_join_aggregation.py: diff --git a/0.1/_sources/auto_examples/09_interpolation_join.rst.txt b/0.1/_sources/auto_examples/09_interpolation_join.rst.txt index a29f5ff3..bd562c34 100644 --- a/0.1/_sources/auto_examples/09_interpolation_join.rst.txt +++ b/0.1/_sources/auto_examples/09_interpolation_join.rst.txt @@ -350,9 +350,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 42.6 0.0 NaN - 34.114658 - 104.980507 - -0.034032 + 34.781478 + 37.248739 + -0.067377 1 @@ -362,9 +362,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 26.9 NaN NaN - 26.633029 - 90.401068 - -0.163749 + 27.159409 + 73.742964 + 0.177901 2 @@ -374,9 +374,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 41.6 NaN NaN - 31.644945 - 30.102591 - -0.099210 + 31.592443 + 20.690208 + -0.067377 3 @@ -386,9 +386,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 46.6 0.0 NaN - 33.565024 - 95.206328 - -0.099210 + 34.316934 + 59.562043 + -0.067377 4 @@ -398,9 +398,9 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 36.1 0.0 NaN - 30.914017 - 24.199779 - -0.028122 + 29.505032 + 26.683668 + -0.225919 @@ -580,7 +580,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 1.241457 + 2.546299 1 @@ -592,7 +592,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 16.061691 + 16.140838 2 @@ -604,7 +604,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 6.377676 + 6.370342 3 @@ -616,7 +616,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 7.014022 + 7.643229 4 @@ -628,7 +628,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 7.080677 + 8.781918 @@ -678,11 +678,11 @@ States with the lowest average predicted temperatures: Alaska, Montana, North Da state - AK -3.195786 - MT -0.569793 - WA 0.008274 - ND 1.144235 - MN 1.436454 + AK -2.373219 + MT -0.047876 + WA 0.498786 + MN 1.236130 + ND 1.247732 Name: TMAX, dtype: float64 @@ -707,11 +707,11 @@ States with the highest predicted temperatures: Puerto Rico, Virgin Islands, Haw state - LA 21.634984 - FL 24.810720 - HI 26.988138 - VI 30.824549 - PR 31.345255 + LA 21.805534 + FL 24.660732 + HI 26.866774 + VI 31.384267 + PR 31.769181 Name: TMAX, dtype: float64 @@ -787,7 +787,7 @@ It is a generalization of the :func:`~skrub.fuzzy_join`, as :func:`~skrub.fuzzy_ .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 8.311 seconds) + **Total running time of the script:** (0 minutes 5.717 seconds) .. _sphx_glr_download_auto_examples_09_interpolation_join.py: diff --git a/0.1/_sources/auto_examples/sg_execution_times.rst.txt b/0.1/_sources/auto_examples/sg_execution_times.rst.txt index 1f75addf..42545b5c 100644 --- a/0.1/_sources/auto_examples/sg_execution_times.rst.txt +++ b/0.1/_sources/auto_examples/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**25:23.383** total execution time for 9 files **from auto_examples**: +**23:51.862** total execution time for 9 files **from auto_examples**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) - - 11:58.676 + - 10:58.884 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) - - 11:07.052 + - 10:42.099 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) - - 01:12.266 - - 0.0 - * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) - - 00:23.746 + - 01:15.855 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) - - 00:22.247 + - 00:20.264 + - 0.0 + * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) + - 00:17.626 - 0.0 * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) - - 00:08.311 + - 00:05.717 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``05_deduplication.py``) - - 00:04.893 + - 00:05.130 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) - - 00:04.439 + - 00:04.448 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``02_feature_interpretation_with_gapencoder.py``) - - 00:01.752 + - 00:01.841 - 0.0 diff --git a/0.1/_sources/sg_execution_times.rst.txt b/0.1/_sources/sg_execution_times.rst.txt index f24911ae..b5e892ec 100644 --- a/0.1/_sources/sg_execution_times.rst.txt +++ b/0.1/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**25:23.383** total execution time for 9 files **from all galleries**: +**23:51.862** total execution time for 9 files **from all galleries**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) - - 11:58.676 + - 10:58.884 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) - - 11:07.052 + - 10:42.099 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) - - 01:12.266 - - 0.0 - * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) - - 00:23.746 + - 01:15.855 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) - - 00:22.247 + - 00:20.264 + - 0.0 + * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) + - 00:17.626 - 0.0 * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) - - 00:08.311 + - 00:05.717 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``../examples/05_deduplication.py``) - - 00:04.893 + - 00:05.130 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) - - 00:04.439 + - 00:04.448 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``../examples/02_feature_interpretation_with_gapencoder.py``) - - 00:01.752 + - 00:01.841 - 0.0 diff --git a/0.1/auto_examples/01_encodings.html b/0.1/auto_examples/01_encodings.html index 4d9f859f..1a88d230 100644 --- a/0.1/auto_examples/01_encodings.html +++ b/0.1/auto_examples/01_encodings.html @@ -797,41 +797,45 @@

A simple prediction pipeline
tv.named_transformers_["high_cardinality"].get_feature_names_out()
 
-
array(['maintenance, facilities, finance', 'station, state, estate',
-       'behavioral, health, school', 'gaithersburg, the, clarksburg',
-       'procurement, protective, adult', 'warehouse, delivery, liquor',
-       'patrol, 5th, 4th', 'supports, support, network',
-       'traffic, safety, alcohol', 'spring, silver, monitoring',
-       'family, pedophile, crimes', 'rockville, twinbrook, downtown',
-       'administration, battalion, registration',
-       'highway, welfare, child', 'technology, inmate, systems',
-       'management, equipment, budget',
-       'communications, communication, division',
-       'nicholson, transit, taxicab',
-       'investigative, investigations, criminal', 'security, mc311, mccf',
-       'custody, mcdc, quality', 'inspections, inspection, collection',
-       'eligibility, assistance, disability', 'services, service, animal',
-       'programs, projects, program', 'accounts, receivable, members',
-       'planning, training, recruit', 'district, squad, 3rd',
-       'emergency, commuter, duplicating', 'director, kingsview, officer',
-       'firefighter, recruit, rescuer',
-       'paralegal, psychiatrist, employee',
-       'income, assistance, compliance', 'librarian, associate, library',
-       'candidate, police, sergeant', 'manager, projects, project',
+
array(['station, state, estate', 'district, patrol, 3rd',
+       'silver, spring, urban', 'welfare, children, childhood',
+       'services, highway, service', 'traffic, safety, alcohol',
+       'gaithersburg, clarksburg, the', 'supports, downtown, support',
+       'twinbrook, warehouse, rockville',
+       'investigative, criminal, investigations',
+       'equipment, automotive, fleet', 'assessment, ombudsman, adult',
+       'programs, program, commercial', 'development, delivery, cloverly',
+       'nicholson, transit, taxicab', 'abandoned, division, employee',
+       'behavioral, health, school', 'maintenance, facilities, finance',
+       'technology, inmate, systems',
+       'administrative, administration, battalion',
+       'toddlers, custody, members', 'building, structural, construction',
+       'eligibility, assistance, medical',
+       'communications, communication, immunization',
+       'training, director, recruit', 'mangement, management, engagement',
+       'collection, operations, special',
+       'regulatory, environmental, centers', 'assignment, squad, team',
+       'security, mc311, mccf', 'representative, legislative, customer',
+       'manager, projects, project', 'accountant, assistant, library',
+       'candidate, officer, office', 'operator, bus, operations',
+       'school, health, room', 'lieutenant, captain, chief',
+       'firefighter, rescuer, recruit', 'iii, police, of',
+       'program, programs, graphic',
+       'information, technology, technologist',
+       'sergeant, attendant, police',
+       'correctional, correction, corporal',
+       'crossing, purchasing, engineer', 'community, nurse, unit',
+       'coordinator, services, service', 'assistance, income, client',
        'enforcement, permitting, inspector',
-       'program, programs, resource', 'operator, bus, operations',
-       'administrative, principal, executive', 'captain, rescue, chief',
-       'technician, mechanic, supply', 'school, room, behavioral',
+       'technician, mechanic, supply',
+       'administrative, principal, executive',
+       'warehouse, welfare, caseworker', 'transit, public, telephone',
+       'sheriff, deputy, therapist', 'specialist, recreation, special',
+       'supervisor, supervisory, records', 'master, meter, registered',
        'communications, telecommunications, safety',
-       'community, health, nurse', 'correctional, correction, corporal',
-       'liquor, clerk, store', 'services, president, resident',
-       'specialist, special, quality', 'coordinator, coordinating, depot',
-       'officer, office, iii', 'master, registered, meter',
-       'craftsworker, supervisor, advisor', 'sheriff, deputy, autobody',
-       'information, recreation, technology',
-       'warehouse, welfare, caseworker', 'crossing, purchasing, engineer',
-       'lieutenant, shift, records', 'accountant, assistant, county',
-       'equipment, investigator, apprentice'], dtype=object)
+       'equipment, investment, investigator',
+       'environmental, budget, analyst', 'liquor, clerk, store'],
+      dtype=object)
 
-
R2 score:  mean: 0.922; std: 0.010
+
R2 score:  mean: 0.921; std: 0.015
 

The simple pipeline applied on this complex dataset gave us very good results.

@@ -933,7 +937,7 @@

ConclusionTotal running time of the script: (1 minutes 12.266 seconds)

+

Total running time of the script: (1 minutes 15.855 seconds)

-
{'all_entities', 'albums', 'games', 'companies', 'schools', 'movies'}
+
{'albums', 'companies', 'games', 'all_entities', 'movies', 'schools'}
 

The games table is the most relevant to our case. @@ -968,7 +968,7 @@

Plotting the results

It helped significantly improve the prediction score.

-

Total running time of the script: (11 minutes 7.052 seconds)

+

Total running time of the script: (10 minutes 42.099 seconds)