diff --git a/dev/.buildinfo b/dev/.buildinfo index 36775569..84f09b4b 100644 --- a/dev/.buildinfo +++ b/dev/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 171e190e40f18d210c495edd89992fa6 +config: 494786c4ccb31c03eae44be6d784ab41 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/dev/_images/sphx_glr_01_encodings_001.png b/dev/_images/sphx_glr_01_encodings_001.png index 1b773801..e0f1d166 100644 Binary files a/dev/_images/sphx_glr_01_encodings_001.png and b/dev/_images/sphx_glr_01_encodings_001.png differ diff --git a/dev/_images/sphx_glr_01_encodings_thumb.png b/dev/_images/sphx_glr_01_encodings_thumb.png index 8b11c774..0c0e19eb 100644 Binary files a/dev/_images/sphx_glr_01_encodings_thumb.png and b/dev/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/dev/_images/sphx_glr_08_join_aggregation_003.png b/dev/_images/sphx_glr_08_join_aggregation_003.png index 873cc9dc..ca53a11b 100644 Binary files a/dev/_images/sphx_glr_08_join_aggregation_003.png and b/dev/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_001.png b/dev/_images/sphx_glr_09_interpolation_join_001.png index b1a9b3df..efb90c69 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_001.png and b/dev/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_002.png b/dev/_images/sphx_glr_09_interpolation_join_002.png index 3a1f3fab..456367c9 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_002.png and b/dev/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_003.png b/dev/_images/sphx_glr_09_interpolation_join_003.png index 54898afa..82f6ab0e 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_003.png and b/dev/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_thumb.png b/dev/_images/sphx_glr_09_interpolation_join_thumb.png index b0f2b704..2a5f8f3c 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_thumb.png and b/dev/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/dev/_sources/auto_examples/01_encodings.rst.txt b/dev/_sources/auto_examples/01_encodings.rst.txt index 8ff5560e..c1d97835 100644 --- a/dev/_sources/auto_examples/01_encodings.rst.txt +++ b/dev/_sources/auto_examples/01_encodings.rst.txt @@ -847,46 +847,45 @@ corresponding columns: .. code-block:: none - array(['district, patrol, 3rd', 'behavioral, health, school', - 'station, state, estate', 'custody, toddlers, members', - 'gaithersburg, clarksburg, the', - 'resources, resource, neighborhood', 'welfare, children, child', - 'services, highway, service', 'safety, collision, traffic', - 'technology, systems, telephone', 'supports, support, sports', - 'planning, construction, building', + array(['behavioral, centers, seniors', 'silver, spring, ride', + 'station, state, estate', 'equipment, automotive, mangement', + 'patrol, 4th, 6th', 'traffic, safety, alcohol', + 'management, fleet, protective', 'delivery, warehouse, operations', + 'custody, mcdc, customer', 'division, family, animal', + 'technology, systems, telephone', 'rockville, twinbrook, downtown', + 'services, highway, service', 'investigative, investigations, criminal', - 'rockville, downtown, twinbrook', 'nicholson, transit, taxicab', - 'silver, spring, urban', 'security, mccf, unit', - 'management, budget, fleet', 'operations, delivery, warehouse', - 'development, stormwater, residential', - 'office, enforcement, officer', - 'eligibility, assistance, assisted', - 'automotive, protective, equipment', - 'communications, communication, immunization', - 'family, crimes, crime', - 'administration, battalion, administrative', - 'maintenance, facilities, council', 'training, recruit, director', - 'division, animal, fiscal', 'programs, personnel, background', - 'legislative, principal, executive', - 'firefighter, rescuer, master', 'officer, office, traffic', - 'operator, bus, operations', 'school, room, behavioral', - 'manager, projects, project', 'income, assistance, client', - 'coordinator, coordinating, depot', - 'information, technology, recreation', 'sergeant, police, cadet', + 'gaithersburg, nicholson, transit', 'security, mc311, mccf', + 'special, assignment, medical', 'welfare, child, childhood', + 'communications, communication, commuter', 'school, health, based', + 'supports, support, network', 'enforcement, emergency, crossing', + 'maintenance, facilities, facility', + 'administrative, administration, battalion', + 'development, planning, monitoring', 'training, director, recruit', + 'environmental, regulatory, adolescent', + 'accounts, programs, program', 'transport, design, building', + 'district, payroll, squad', 'officer, office, security', + 'firefighter, rescuer, recruit', 'operator, equipment, bus', + 'therapist, plumber, member', + 'administrative, administration, administrator', + 'technician, mechanic, supply', 'worker, social, works', + 'specialist, special, quality', + 'information, renovation, technology', + 'program, programs, projects', 'liquor, clerk, store', 'enforcement, permitting, inspector', - 'planning, specialist, special', - 'correctional, correction, records', - 'assistant, library, librarian', 'community, health, nurse', - 'communications, telecommunications, safety', - 'captain, chief, rescue', 'technician, mechanic, supply', - 'crossing, parking, guard', 'liquor, clerk, store', - 'warehouse, craftsworker, worker', 'program, programs, graphic', - 'lieutenant, shift, maintenance', - 'equipment, investigator, investment', - 'supervisor, supervisory, welfare', 'services, service, urban', - 'candidate, sheriff, deputy', 'corporal, behavioral, erp', - 'environmental, budget, ombudsman', - 'administration, administrator, administrative'], dtype=object) + 'manager, investigator, management', 'candidate, police, master', + 'school, health, room', 'correctional, correction, corporal', + 'coordinator, services, service', 'community, nurse, unit', + 'captain, rescue, chief', 'income, assistance, client', + 'crossing, engineer, parking', + 'telecommunications, communications, safety', + 'sheriff, deputy, autobody', 'lieutenant, maintenance, crew', + 'librarian, psychiatric, accountant', + 'warehouse, welfare, representative', + 'supervisor, sergeant, supervisory', + 'assistant, library, attorney', + 'legislative, principal, executive', 'planning, budget, senior'], + dtype=object) @@ -968,7 +967,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.923; std: 0.012 + R2 score: mean: 0.919; std: 0.016 @@ -1508,7 +1507,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 14.213 seconds) + **Total running time of the script:** (1 minutes 11.426 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index b66e46f6..c9db9182 100644 --- a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.653 seconds) + **Total running time of the script:** (0 minutes 1.570 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt index 440c2b2d..59b9320e 100644 --- a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.266 seconds) + **Total running time of the script:** (0 minutes 4.131 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt index ee47690b..c0e29e89 100644 --- a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -239,13 +239,6 @@ We extract the table containing GDP per capita by country: -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'NY.GDP.PCAP.CD' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/NY.GDP.PCAP.CD.csv. - info = _fetch_world_bank_data(dataset_id, data_directory) - .. raw:: html @@ -310,13 +303,6 @@ Then another table, with life expectancy by country: -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'SP.DYN.LE00.IN' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/SP.DYN.LE00.IN.csv. - info = _fetch_world_bank_data(dataset_id, data_directory) - .. raw:: html @@ -381,13 +367,6 @@ And a table with legal rights strength by country: -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'IC.LGL.CRED.XQ' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/IC.LGL.CRED.XQ.csv. - info = _fetch_world_bank_data(dataset_id, data_directory) - .. raw:: html @@ -1732,7 +1711,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 17.162 seconds) + **Total running time of the script:** (0 minutes 14.152 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/dev/_sources/auto_examples/05_deduplication.rst.txt b/dev/_sources/auto_examples/05_deduplication.rst.txt index dce47615..db3aebc4 100644 --- a/dev/_sources/auto_examples/05_deduplication.rst.txt +++ b/dev/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.206 seconds) + **Total running time of the script:** (0 minutes 5.023 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt index e717f108..7ceb69b9 100644 --- a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'games', 'movies', 'schools', 'companies', 'albums', 'all_entities'} + {'movies', 'all_entities', 'companies', 'games', 'schools', 'albums'} @@ -327,13 +327,6 @@ Let's see what kind of types we can find in it with the function -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset 40019788 locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_40019788.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -460,15 +453,6 @@ and exclude those with type name "companies" or "developer". -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset 39266678 locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_39266678.parquet. - info = _fetch_figshare(dataset_id, data_directory) - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset 39254360 locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_39254360.parquet. - info = _fetch_figshare(dataset_id, data_directory) - @@ -856,7 +840,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 12.585 seconds) + **Total running time of the script:** (1 minutes 4.578 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt index 88e1a205..8b9d03af 100644 --- a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -80,13 +80,6 @@ The main table: flights dataset -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41771418' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41771418.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -259,13 +252,6 @@ Airport data: an auxiliary table from the same database -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41710257' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41710257.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -377,13 +363,6 @@ Weather data: auxiliary tables from external sources -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41771457' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41771457.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -416,9 +395,9 @@ Weather data: auxiliary tables from external sources 0 - GME00127822 - 2008-08-29 - 206.0 + ASN00041037 + 2008-06-18 + NaN 0.0 NaN @@ -432,25 +411,25 @@ Weather data: auxiliary tables from external sources 2 - MXN00015282 - 2008-10-30 - 211.0 - 0.0 + US1ILSP0008 + 2008-08-19 NaN + 0.0 + 0.0 3 - EN000026038 - 2008-12-17 - -19.0 - 3.0 + USC00164931 + 2008-10-05 NaN + 0.0 + 0.0 4 - ASN00086351 - 2008-10-29 - 229.0 + NOE00111309 + 2008-08-18 + NaN 0.0 NaN @@ -478,13 +457,6 @@ Weather data: auxiliary tables from external sources -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41710524' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41710524.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -644,33 +616,33 @@ First we join the stations with weather on the ID (exact join): 0 - AEM00041194 - 25.255 - 55.364 - 10.4 - DUBAI INTL - None + AE000041196 + 25.333 + 55.517 + 34.0 + SHARJAH INTER. AIRP None - 41194.0 + GSN + 41196.0 NaN - 2008-07-24 - 453.0 + 2008-08-15 + 422.0 0.0 NaN 1 - AEM00041194 - 25.255 - 55.364 - 10.4 - DUBAI INTL - None + AE000041196 + 25.333 + 55.517 + 34.0 + SHARJAH INTER. AIRP None - 41194.0 + GSN + 41196.0 NaN - 2008-01-21 - 221.0 + 2008-05-08 + 374.0 0.0 NaN @@ -685,41 +657,41 @@ First we join the stations with weather on the ID (exact join): None 41194.0 NaN - 2008-06-21 - 393.0 + 2008-04-04 + 288.0 0.0 NaN 3 - AEM00041217 - 24.433 - 54.651 - 26.8 - ABU DHABI INTL + AEM00041194 + 25.255 + 55.364 + 10.4 + DUBAI INTL None None - 41217.0 - NaN - 2008-03-03 - 305.0 + 41194.0 NaN + 2008-05-21 + 365.0 + 0.0 NaN 4 - AEM00041217 - 24.433 - 54.651 - 26.8 - ABU DHABI INTL + AEM00041194 + 25.255 + 55.364 + 10.4 + DUBAI INTL None None - 41217.0 - NaN - 2008-07-18 - 419.0 + 41194.0 NaN + 2008-01-21 + 221.0 + 0.0 NaN @@ -801,17 +773,17 @@ tables into one. 0 - AEM00041194 - 25.255 - 55.364 - 10.4 - DUBAI INTL - None + AE000041196 + 25.333 + 55.517 + 34.0 + SHARJAH INTER. AIRP None - 41194.0 + GSN + 41196.0 NaN - 2008-07-24 - 453.0 + 2008-08-15 + 422.0 0.0 NaN ROP @@ -821,23 +793,23 @@ tables into one. Thailand 14.078333 101.378334 - 2.418780 - 3.314851 + 2.418437 + 3.314381 True 1 - AEM00041194 - 25.255 - 55.364 - 10.4 - DUBAI INTL - None + AE000041196 + 25.333 + 55.517 + 34.0 + SHARJAH INTER. AIRP None - 41194.0 + GSN + 41196.0 NaN - 2008-01-21 - 221.0 + 2008-05-08 + 374.0 0.0 NaN ROP @@ -847,8 +819,8 @@ tables into one. Thailand 14.078333 101.378334 - 2.418780 - 3.314851 + 2.418437 + 3.314381 True @@ -862,8 +834,8 @@ tables into one. None 41194.0 NaN - 2008-06-21 - 393.0 + 2008-04-04 + 288.0 0.0 NaN ROP @@ -879,18 +851,18 @@ tables into one. 3 - AEM00041217 - 24.433 - 54.651 - 26.8 - ABU DHABI INTL + AEM00041194 + 25.255 + 55.364 + 10.4 + DUBAI INTL None None - 41217.0 - NaN - 2008-03-03 - 305.0 + 41194.0 NaN + 2008-05-21 + 365.0 + 0.0 NaN ROP Prachinburi @@ -899,24 +871,24 @@ tables into one. Thailand 14.078333 101.378334 - 2.392028 - 3.278188 + 2.418780 + 3.314851 True 4 - AEM00041217 - 24.433 - 54.651 - 26.8 - ABU DHABI INTL + AEM00041194 + 25.255 + 55.364 + 10.4 + DUBAI INTL None None - 41217.0 - NaN - 2008-07-18 - 419.0 + 41194.0 NaN + 2008-01-21 + 221.0 + 0.0 NaN ROP Prachinburi @@ -925,8 +897,8 @@ tables into one. Thailand 14.078333 101.378334 - 2.392028 - 3.278188 + 2.418780 + 3.314851 True @@ -1253,10 +1225,8 @@ The results: warnings.warn( /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:241: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros warnings.warn( - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/sklearn/preprocessing/_encoders.py:241: UserWarning: Found unknown categories in columns [1] during transform. These unknown categories will be encoded as all zeros - warnings.warn( - 0.5890000000000001 + 0.584 @@ -1274,7 +1244,7 @@ Our final cross-validated accuracy score is 0.58. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (6 minutes 36.403 seconds) + **Total running time of the script:** (6 minutes 15.839 seconds) .. _sphx_glr_download_auto_examples_07_multiple_key_join.py: diff --git a/dev/_sources/auto_examples/08_join_aggregation.rst.txt b/dev/_sources/auto_examples/08_join_aggregation.rst.txt index 9ead9aa5..62294d76 100644 --- a/dev/_sources/auto_examples/08_join_aggregation.rst.txt +++ b/dev/_sources/auto_examples/08_join_aggregation.rst.txt @@ -106,8 +106,6 @@ Note that we use the light version (100k rows). .. code-block:: none - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:682: UserWarning: Could not find the dataset 'ratings' locally. Downloading it from MovieLens; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data. - info = _fetch_movielens(dataset_id, data_directory) ((100836, 3), (100836,)) @@ -1250,75 +1248,75 @@ operation maximizing our validation score. split1_test_score - 0.052844 - 0.068819 - 0.091119 - 0.073400 - 0.077463 + 0.036788 + 0.042861 + 0.047756 + 0.092186 + 0.074721 split2_test_score - 0.067998 - 0.073335 - 0.099403 - 0.093315 - 0.083894 + 0.082080 + 0.076600 + 0.088298 + 0.098839 + 0.092330 split3_test_score - 0.038866 - 0.059366 - 0.066177 - 0.063048 - 0.077090 + 0.040455 + 0.060279 + 0.059554 + 0.069920 + 0.076664 split4_test_score - 0.127047 - 0.137670 - 0.134789 - 0.146312 - 0.150864 + 0.144695 + 0.126360 + 0.139479 + 0.147547 + 0.150157 split5_test_score - 0.104822 - 0.111108 - 0.107033 - 0.107686 - 0.112994 + 0.109618 + 0.109127 + 0.108129 + 0.110375 + 0.115823 split6_test_score - 0.082263 - 0.100981 - 0.105808 - 0.112339 - 0.108530 + 0.082864 + 0.097725 + 0.107884 + 0.107061 + 0.108110 split7_test_score - 0.061744 - 0.070645 - 0.059122 - 0.070789 - 0.078973 + 0.065931 + 0.065383 + 0.066993 + 0.068702 + 0.077875 split8_test_score - 0.106238 - 0.116092 - 0.115282 - 0.123013 - 0.125354 + 0.105591 + 0.107371 + 0.127966 + 0.118582 + 0.126476 split9_test_score - 0.121766 - 0.148285 - 0.162742 - 0.164243 - 0.182996 + 0.129446 + 0.155033 + 0.160510 + 0.169458 + 0.177739 @@ -1409,7 +1407,7 @@ exhaustive histogram over all the possible values of ratings .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 18.990 seconds) + **Total running time of the script:** (0 minutes 17.345 seconds) .. _sphx_glr_download_auto_examples_08_join_aggregation.py: diff --git a/dev/_sources/auto_examples/09_interpolation_join.rst.txt b/dev/_sources/auto_examples/09_interpolation_join.rst.txt index a11f1133..7276c164 100644 --- a/dev/_sources/auto_examples/09_interpolation_join.rst.txt +++ b/dev/_sources/auto_examples/09_interpolation_join.rst.txt @@ -139,8 +139,8 @@ Thus, the values from the right side table of the join are inferred, whereas the 0 25.333 55.517 - 2008-09-16 - 40.1 + 2008-07-26 + 42.6 0.0 NaN @@ -148,36 +148,36 @@ Thus, the values from the right side table of the join are inferred, whereas the 1 25.333 55.517 - 2008-04-27 - 39.6 - 0.0 + 2008-03-07 + 26.9 + NaN NaN 2 25.333 55.517 - 2008-02-07 - 20.5 + 2008-09-13 + 41.6 NaN NaN 3 - 25.333 - 55.517 - 2008-05-24 - 39.4 + 25.255 + 55.364 + 2008-07-14 + 46.6 0.0 NaN 4 - 25.333 - 55.517 - 2008-03-28 - 37.3 - NaN + 25.255 + 55.364 + 2008-10-10 + 36.1 + 0.0 NaN @@ -232,45 +232,45 @@ Thus, the values from the right side table of the join are inferred, whereas the 50000 - 39.6429 - 104.9972 - 2008-11-08 + 39.3745 + 104.8145 + 2008-08-19 NaN - 0.0 + 3.0 0.0 50001 - 39.6429 - 104.9972 - 2008-10-09 + 39.3745 + 104.8145 + 2008-12-12 NaN 0.0 0.0 50002 - 39.6429 - 104.9972 - 2008-01-16 + 39.4850 + 104.9089 + 2008-10-08 NaN 0.0 0.0 50003 - 39.6429 - 104.9972 - 2008-08-16 + 39.4850 + 104.9089 + 2008-07-06 NaN - 173.0 + 0.0 0.0 50004 - 39.6429 - 104.9972 - 2008-09-02 + 39.4850 + 104.9089 + 2008-08-06 NaN 0.0 0.0 @@ -346,61 +346,61 @@ To avoid clashes in the column names, we use the ``suffix`` parameter to append 0 25.333 55.517 - 2008-09-16 - 40.1 + 2008-07-26 + 42.6 0.0 NaN - 28.793065 - 36.405488 - 0.028923 + 33.574041 + 34.947159 + -0.044366 1 25.333 55.517 - 2008-04-27 - 39.6 - 0.0 + 2008-03-07 + 26.9 NaN - 29.934053 - 23.814175 - 0.056715 + NaN + 27.628380 + 82.634901 + 0.283787 2 25.333 55.517 - 2008-02-07 - 20.5 + 2008-09-13 + 41.6 NaN NaN - 25.691471 - 61.294862 - 0.036142 + 31.799569 + 15.854745 + -0.044366 3 - 25.333 - 55.517 - 2008-05-24 - 39.4 + 25.255 + 55.364 + 2008-07-14 + 46.6 0.0 NaN - 31.083335 - 18.782720 - -0.056641 + 33.605682 + 44.011747 + -0.044366 4 - 25.333 - 55.517 - 2008-03-28 - 37.3 - NaN + 25.255 + 55.364 + 2008-10-10 + 36.1 + 0.0 NaN - 27.021425 - 15.758833 - -0.048054 + 29.028138 + 14.139990 + -0.044366 @@ -580,7 +580,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co MI 42.212059 -83.348836 - 2.086271 + 0.376282 1 @@ -592,7 +592,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co TX 32.895951 -97.037200 - 19.039737 + 21.193443 2 @@ -604,7 +604,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co NC 36.097747 -79.937297 - 17.402124 + 17.469606 3 @@ -616,7 +616,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co IL 41.979595 -87.904464 - 15.957930 + 15.554276 4 @@ -628,7 +628,7 @@ Then, we use it to transform the flights table -- it adds a ``'TMAX'`` column co TX 32.895951 -97.037200 - 13.279960 + 12.075490 @@ -678,11 +678,11 @@ States with the lowest average predicted temperatures: Alaska, Montana, North Da state - AK -5.434837 - MT 0.747835 - WA 1.276608 - MN 1.695421 - ND 2.126488 + AK -2.341141 + MT 0.159879 + WA 0.429907 + ND 0.763726 + MN 1.216084 Name: TMAX, dtype: float64 @@ -707,11 +707,11 @@ States with the highest predicted temperatures: Puerto Rico, Virgin Islands, Haw state - LA 21.582258 - FL 24.666959 - VI 24.802627 - HI 25.498273 - PR 25.657998 + LA 21.610379 + FL 24.679313 + HI 27.325298 + VI 30.081688 + PR 30.683504 Name: TMAX, dtype: float64 @@ -787,7 +787,7 @@ It is a generalization of the :func:`~skrub.fuzzy_join`, as :func:`~skrub.fuzzy_ .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.573 seconds) + **Total running time of the script:** (0 minutes 5.732 seconds) .. _sphx_glr_download_auto_examples_09_interpolation_join.py: diff --git a/dev/_sources/auto_examples/sg_execution_times.rst.txt b/dev/_sources/auto_examples/sg_execution_times.rst.txt index c73884ec..e1b114db 100644 --- a/dev/_sources/auto_examples/sg_execution_times.rst.txt +++ b/dev/_sources/auto_examples/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**09:56.051** total execution time for 9 files **from auto_examples**: +**09:19.795** total execution time for 9 files **from auto_examples**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``07_multiple_key_join.py``) - - 06:36.403 + - 06:15.839 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``01_encodings.py``) - - 01:14.213 + - 01:11.426 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``06_ken_embeddings.py``) - - 01:12.585 + - 01:04.578 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``08_join_aggregation.py``) - - 00:18.990 + - 00:17.345 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``04_fuzzy_joining.py``) - - 00:17.162 + - 00:14.152 - 0.0 * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``09_interpolation_join.py``) - - 00:05.573 + - 00:05.732 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``05_deduplication.py``) - - 00:05.206 + - 00:05.023 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``03_datetime_encoder.py``) - - 00:04.266 + - 00:04.131 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``02_feature_interpretation_with_gapencoder.py``) - - 00:01.653 + - 00:01.570 - 0.0 diff --git a/dev/_sources/sg_execution_times.rst.txt b/dev/_sources/sg_execution_times.rst.txt index 15e8504e..e74e98bf 100644 --- a/dev/_sources/sg_execution_times.rst.txt +++ b/dev/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**09:56.051** total execution time for 9 files **from all galleries**: +**09:19.795** total execution time for 9 files **from all galleries**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_auto_examples_07_multiple_key_join.py` (``../examples/07_multiple_key_join.py``) - - 06:36.403 + - 06:15.839 - 0.0 * - :ref:`sphx_glr_auto_examples_01_encodings.py` (``../examples/01_encodings.py``) - - 01:14.213 + - 01:11.426 - 0.0 * - :ref:`sphx_glr_auto_examples_06_ken_embeddings.py` (``../examples/06_ken_embeddings.py``) - - 01:12.585 + - 01:04.578 - 0.0 * - :ref:`sphx_glr_auto_examples_08_join_aggregation.py` (``../examples/08_join_aggregation.py``) - - 00:18.990 + - 00:17.345 - 0.0 * - :ref:`sphx_glr_auto_examples_04_fuzzy_joining.py` (``../examples/04_fuzzy_joining.py``) - - 00:17.162 + - 00:14.152 - 0.0 * - :ref:`sphx_glr_auto_examples_09_interpolation_join.py` (``../examples/09_interpolation_join.py``) - - 00:05.573 + - 00:05.732 - 0.0 * - :ref:`sphx_glr_auto_examples_05_deduplication.py` (``../examples/05_deduplication.py``) - - 00:05.206 + - 00:05.023 - 0.0 * - :ref:`sphx_glr_auto_examples_03_datetime_encoder.py` (``../examples/03_datetime_encoder.py``) - - 00:04.266 + - 00:04.131 - 0.0 * - :ref:`sphx_glr_auto_examples_02_feature_interpretation_with_gapencoder.py` (``../examples/02_feature_interpretation_with_gapencoder.py``) - - 00:01.653 + - 00:01.570 - 0.0 diff --git a/dev/auto_examples/01_encodings.html b/dev/auto_examples/01_encodings.html index efb378c5..bdd09ca5 100644 --- a/dev/auto_examples/01_encodings.html +++ b/dev/auto_examples/01_encodings.html @@ -1194,46 +1194,45 @@

A simple prediction pipeline
tv.named_transformers_["high_cardinality"].get_feature_names_out()
 
-
array(['district, patrol, 3rd', 'behavioral, health, school',
-       'station, state, estate', 'custody, toddlers, members',
-       'gaithersburg, clarksburg, the',
-       'resources, resource, neighborhood', 'welfare, children, child',
-       'services, highway, service', 'safety, collision, traffic',
-       'technology, systems, telephone', 'supports, support, sports',
-       'planning, construction, building',
+
array(['behavioral, centers, seniors', 'silver, spring, ride',
+       'station, state, estate', 'equipment, automotive, mangement',
+       'patrol, 4th, 6th', 'traffic, safety, alcohol',
+       'management, fleet, protective', 'delivery, warehouse, operations',
+       'custody, mcdc, customer', 'division, family, animal',
+       'technology, systems, telephone', 'rockville, twinbrook, downtown',
+       'services, highway, service',
        'investigative, investigations, criminal',
-       'rockville, downtown, twinbrook', 'nicholson, transit, taxicab',
-       'silver, spring, urban', 'security, mccf, unit',
-       'management, budget, fleet', 'operations, delivery, warehouse',
-       'development, stormwater, residential',
-       'office, enforcement, officer',
-       'eligibility, assistance, assisted',
-       'automotive, protective, equipment',
-       'communications, communication, immunization',
-       'family, crimes, crime',
-       'administration, battalion, administrative',
-       'maintenance, facilities, council', 'training, recruit, director',
-       'division, animal, fiscal', 'programs, personnel, background',
-       'legislative, principal, executive',
-       'firefighter, rescuer, master', 'officer, office, traffic',
-       'operator, bus, operations', 'school, room, behavioral',
-       'manager, projects, project', 'income, assistance, client',
-       'coordinator, coordinating, depot',
-       'information, technology, recreation', 'sergeant, police, cadet',
+       'gaithersburg, nicholson, transit', 'security, mc311, mccf',
+       'special, assignment, medical', 'welfare, child, childhood',
+       'communications, communication, commuter', 'school, health, based',
+       'supports, support, network', 'enforcement, emergency, crossing',
+       'maintenance, facilities, facility',
+       'administrative, administration, battalion',
+       'development, planning, monitoring', 'training, director, recruit',
+       'environmental, regulatory, adolescent',
+       'accounts, programs, program', 'transport, design, building',
+       'district, payroll, squad', 'officer, office, security',
+       'firefighter, rescuer, recruit', 'operator, equipment, bus',
+       'therapist, plumber, member',
+       'administrative, administration, administrator',
+       'technician, mechanic, supply', 'worker, social, works',
+       'specialist, special, quality',
+       'information, renovation, technology',
+       'program, programs, projects', 'liquor, clerk, store',
        'enforcement, permitting, inspector',
-       'planning, specialist, special',
-       'correctional, correction, records',
-       'assistant, library, librarian', 'community, health, nurse',
-       'communications, telecommunications, safety',
-       'captain, chief, rescue', 'technician, mechanic, supply',
-       'crossing, parking, guard', 'liquor, clerk, store',
-       'warehouse, craftsworker, worker', 'program, programs, graphic',
-       'lieutenant, shift, maintenance',
-       'equipment, investigator, investment',
-       'supervisor, supervisory, welfare', 'services, service, urban',
-       'candidate, sheriff, deputy', 'corporal, behavioral, erp',
-       'environmental, budget, ombudsman',
-       'administration, administrator, administrative'], dtype=object)
+       'manager, investigator, management', 'candidate, police, master',
+       'school, health, room', 'correctional, correction, corporal',
+       'coordinator, services, service', 'community, nurse, unit',
+       'captain, rescue, chief', 'income, assistance, client',
+       'crossing, engineer, parking',
+       'telecommunications, communications, safety',
+       'sheriff, deputy, autobody', 'lieutenant, maintenance, crew',
+       'librarian, psychiatric, accountant',
+       'warehouse, welfare, representative',
+       'supervisor, sergeant, supervisory',
+       'assistant, library, attorney',
+       'legislative, principal, executive', 'planning, budget, senior'],
+      dtype=object)
 
-
R2 score:  mean: 0.923; std: 0.012
+
R2 score:  mean: 0.919; std: 0.016
 

The simple pipeline applied on this complex dataset gave us very good results.

@@ -1739,7 +1738,7 @@

ConclusionTotal running time of the script: (1 minutes 14.213 seconds)

+

Total running time of the script: (1 minutes 11.426 seconds)

-
/home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'NY.GDP.PCAP.CD' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/NY.GDP.PCAP.CD.csv.
-  info = _fetch_world_bank_data(dataset_id, data_directory)
-
-