diff --git a/dev/.buildinfo b/dev/.buildinfo index 36775569..84f09b4b 100644 --- a/dev/.buildinfo +++ b/dev/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 171e190e40f18d210c495edd89992fa6 +config: 494786c4ccb31c03eae44be6d784ab41 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/dev/_images/sphx_glr_01_encodings_001.png b/dev/_images/sphx_glr_01_encodings_001.png index 1b773801..e0f1d166 100644 Binary files a/dev/_images/sphx_glr_01_encodings_001.png and b/dev/_images/sphx_glr_01_encodings_001.png differ diff --git a/dev/_images/sphx_glr_01_encodings_thumb.png b/dev/_images/sphx_glr_01_encodings_thumb.png index 8b11c774..0c0e19eb 100644 Binary files a/dev/_images/sphx_glr_01_encodings_thumb.png and b/dev/_images/sphx_glr_01_encodings_thumb.png differ diff --git a/dev/_images/sphx_glr_08_join_aggregation_003.png b/dev/_images/sphx_glr_08_join_aggregation_003.png index 873cc9dc..ca53a11b 100644 Binary files a/dev/_images/sphx_glr_08_join_aggregation_003.png and b/dev/_images/sphx_glr_08_join_aggregation_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_001.png b/dev/_images/sphx_glr_09_interpolation_join_001.png index b1a9b3df..efb90c69 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_001.png and b/dev/_images/sphx_glr_09_interpolation_join_001.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_002.png b/dev/_images/sphx_glr_09_interpolation_join_002.png index 3a1f3fab..456367c9 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_002.png and b/dev/_images/sphx_glr_09_interpolation_join_002.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_003.png b/dev/_images/sphx_glr_09_interpolation_join_003.png index 54898afa..82f6ab0e 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_003.png and b/dev/_images/sphx_glr_09_interpolation_join_003.png differ diff --git a/dev/_images/sphx_glr_09_interpolation_join_thumb.png b/dev/_images/sphx_glr_09_interpolation_join_thumb.png index b0f2b704..2a5f8f3c 100644 Binary files a/dev/_images/sphx_glr_09_interpolation_join_thumb.png and b/dev/_images/sphx_glr_09_interpolation_join_thumb.png differ diff --git a/dev/_sources/auto_examples/01_encodings.rst.txt b/dev/_sources/auto_examples/01_encodings.rst.txt index 8ff5560e..c1d97835 100644 --- a/dev/_sources/auto_examples/01_encodings.rst.txt +++ b/dev/_sources/auto_examples/01_encodings.rst.txt @@ -847,46 +847,45 @@ corresponding columns: .. code-block:: none - array(['district, patrol, 3rd', 'behavioral, health, school', - 'station, state, estate', 'custody, toddlers, members', - 'gaithersburg, clarksburg, the', - 'resources, resource, neighborhood', 'welfare, children, child', - 'services, highway, service', 'safety, collision, traffic', - 'technology, systems, telephone', 'supports, support, sports', - 'planning, construction, building', + array(['behavioral, centers, seniors', 'silver, spring, ride', + 'station, state, estate', 'equipment, automotive, mangement', + 'patrol, 4th, 6th', 'traffic, safety, alcohol', + 'management, fleet, protective', 'delivery, warehouse, operations', + 'custody, mcdc, customer', 'division, family, animal', + 'technology, systems, telephone', 'rockville, twinbrook, downtown', + 'services, highway, service', 'investigative, investigations, criminal', - 'rockville, downtown, twinbrook', 'nicholson, transit, taxicab', - 'silver, spring, urban', 'security, mccf, unit', - 'management, budget, fleet', 'operations, delivery, warehouse', - 'development, stormwater, residential', - 'office, enforcement, officer', - 'eligibility, assistance, assisted', - 'automotive, protective, equipment', - 'communications, communication, immunization', - 'family, crimes, crime', - 'administration, battalion, administrative', - 'maintenance, facilities, council', 'training, recruit, director', - 'division, animal, fiscal', 'programs, personnel, background', - 'legislative, principal, executive', - 'firefighter, rescuer, master', 'officer, office, traffic', - 'operator, bus, operations', 'school, room, behavioral', - 'manager, projects, project', 'income, assistance, client', - 'coordinator, coordinating, depot', - 'information, technology, recreation', 'sergeant, police, cadet', + 'gaithersburg, nicholson, transit', 'security, mc311, mccf', + 'special, assignment, medical', 'welfare, child, childhood', + 'communications, communication, commuter', 'school, health, based', + 'supports, support, network', 'enforcement, emergency, crossing', + 'maintenance, facilities, facility', + 'administrative, administration, battalion', + 'development, planning, monitoring', 'training, director, recruit', + 'environmental, regulatory, adolescent', + 'accounts, programs, program', 'transport, design, building', + 'district, payroll, squad', 'officer, office, security', + 'firefighter, rescuer, recruit', 'operator, equipment, bus', + 'therapist, plumber, member', + 'administrative, administration, administrator', + 'technician, mechanic, supply', 'worker, social, works', + 'specialist, special, quality', + 'information, renovation, technology', + 'program, programs, projects', 'liquor, clerk, store', 'enforcement, permitting, inspector', - 'planning, specialist, special', - 'correctional, correction, records', - 'assistant, library, librarian', 'community, health, nurse', - 'communications, telecommunications, safety', - 'captain, chief, rescue', 'technician, mechanic, supply', - 'crossing, parking, guard', 'liquor, clerk, store', - 'warehouse, craftsworker, worker', 'program, programs, graphic', - 'lieutenant, shift, maintenance', - 'equipment, investigator, investment', - 'supervisor, supervisory, welfare', 'services, service, urban', - 'candidate, sheriff, deputy', 'corporal, behavioral, erp', - 'environmental, budget, ombudsman', - 'administration, administrator, administrative'], dtype=object) + 'manager, investigator, management', 'candidate, police, master', + 'school, health, room', 'correctional, correction, corporal', + 'coordinator, services, service', 'community, nurse, unit', + 'captain, rescue, chief', 'income, assistance, client', + 'crossing, engineer, parking', + 'telecommunications, communications, safety', + 'sheriff, deputy, autobody', 'lieutenant, maintenance, crew', + 'librarian, psychiatric, accountant', + 'warehouse, welfare, representative', + 'supervisor, sergeant, supervisory', + 'assistant, library, attorney', + 'legislative, principal, executive', 'planning, budget, senior'], + dtype=object) @@ -968,7 +967,7 @@ Let's look at the cross-validated R2 score of our model: .. code-block:: none - R2 score: mean: 0.923; std: 0.012 + R2 score: mean: 0.919; std: 0.016 @@ -1508,7 +1507,7 @@ to plot the feature importances. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 14.213 seconds) + **Total running time of the script:** (1 minutes 11.426 seconds) .. _sphx_glr_download_auto_examples_01_encodings.py: diff --git a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt index b66e46f6..c9db9182 100644 --- a/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt +++ b/dev/_sources/auto_examples/02_feature_interpretation_with_gapencoder.rst.txt @@ -504,7 +504,7 @@ as a set of latent topics. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.653 seconds) + **Total running time of the script:** (0 minutes 1.570 seconds) .. _sphx_glr_download_auto_examples_02_feature_interpretation_with_gapencoder.py: diff --git a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt index 440c2b2d..59b9320e 100644 --- a/dev/_sources/auto_examples/03_datetime_encoder.rst.txt +++ b/dev/_sources/auto_examples/03_datetime_encoder.rst.txt @@ -610,7 +610,7 @@ and transforms datetime columns by default. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 4.266 seconds) + **Total running time of the script:** (0 minutes 4.131 seconds) .. _sphx_glr_download_auto_examples_03_datetime_encoder.py: diff --git a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt index ee47690b..c0e29e89 100644 --- a/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt +++ b/dev/_sources/auto_examples/04_fuzzy_joining.rst.txt @@ -239,13 +239,6 @@ We extract the table containing GDP per capita by country: -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'NY.GDP.PCAP.CD' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/NY.GDP.PCAP.CD.csv. - info = _fetch_world_bank_data(dataset_id, data_directory) - .. raw:: html @@ -310,13 +303,6 @@ Then another table, with life expectancy by country: -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'SP.DYN.LE00.IN' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/SP.DYN.LE00.IN.csv. - info = _fetch_world_bank_data(dataset_id, data_directory) - .. raw:: html @@ -381,13 +367,6 @@ And a table with legal rights strength by country: -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:678: UserWarning: Could not find the dataset 'IC.LGL.CRED.XQ' locally. Downloading it from the World Bank; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/world_bank/IC.LGL.CRED.XQ.csv. - info = _fetch_world_bank_data(dataset_id, data_directory) - .. raw:: html @@ -1732,7 +1711,7 @@ introduced into a grid search: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 17.162 seconds) + **Total running time of the script:** (0 minutes 14.152 seconds) .. _sphx_glr_download_auto_examples_04_fuzzy_joining.py: diff --git a/dev/_sources/auto_examples/05_deduplication.rst.txt b/dev/_sources/auto_examples/05_deduplication.rst.txt index dce47615..db3aebc4 100644 --- a/dev/_sources/auto_examples/05_deduplication.rst.txt +++ b/dev/_sources/auto_examples/05_deduplication.rst.txt @@ -335,7 +335,7 @@ or |MinHash|. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 5.206 seconds) + **Total running time of the script:** (0 minutes 5.023 seconds) .. _sphx_glr_download_auto_examples_05_deduplication.py: diff --git a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt index e717f108..7ceb69b9 100644 --- a/dev/_sources/auto_examples/06_ken_embeddings.rst.txt +++ b/dev/_sources/auto_examples/06_ken_embeddings.rst.txt @@ -305,7 +305,7 @@ We will start by checking out the available tables with .. code-block:: none - {'games', 'movies', 'schools', 'companies', 'albums', 'all_entities'} + {'movies', 'all_entities', 'companies', 'games', 'schools', 'albums'} @@ -327,13 +327,6 @@ Let's see what kind of types we can find in it with the function -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset 40019788 locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_40019788.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -460,15 +453,6 @@ and exclude those with type name "companies" or "developer". -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset 39266678 locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_39266678.parquet. - info = _fetch_figshare(dataset_id, data_directory) - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset 39254360 locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_39254360.parquet. - info = _fetch_figshare(dataset_id, data_directory) - @@ -856,7 +840,7 @@ It helped significantly improve the prediction score. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (1 minutes 12.585 seconds) + **Total running time of the script:** (1 minutes 4.578 seconds) .. _sphx_glr_download_auto_examples_06_ken_embeddings.py: diff --git a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt index 88e1a205..8b9d03af 100644 --- a/dev/_sources/auto_examples/07_multiple_key_join.rst.txt +++ b/dev/_sources/auto_examples/07_multiple_key_join.rst.txt @@ -80,13 +80,6 @@ The main table: flights dataset -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41771418' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41771418.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -259,13 +252,6 @@ Airport data: an auxiliary table from the same database -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41710257' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41710257.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -377,13 +363,6 @@ Weather data: auxiliary tables from external sources -.. rst-class:: sphx-glr-script-out - - .. code-block:: none - - /home/circleci/project/miniconda/envs/testenv/lib/python3.10/site-packages/skrub/datasets/_fetching.py:680: UserWarning: Could not find the dataset '41771457' locally. Downloading it from figshare; this might take a while... If it is interrupted, some files might be invalid/incomplete: if on the following run, the fetching raises errors, you can try fixing this issue by deleting the directory /home/circleci/skrub_data/figshare/figshare_41771457.parquet. - info = _fetch_figshare(dataset_id, data_directory) - .. raw:: html @@ -416,9 +395,9 @@ Weather data: auxiliary tables from external sources
tv.named_transformers_["high_cardinality"].get_feature_names_out()
array(['district, patrol, 3rd', 'behavioral, health, school',
- 'station, state, estate', 'custody, toddlers, members',
- 'gaithersburg, clarksburg, the',
- 'resources, resource, neighborhood', 'welfare, children, child',
- 'services, highway, service', 'safety, collision, traffic',
- 'technology, systems, telephone', 'supports, support, sports',
- 'planning, construction, building',
+array(['behavioral, centers, seniors', 'silver, spring, ride',
+ 'station, state, estate', 'equipment, automotive, mangement',
+ 'patrol, 4th, 6th', 'traffic, safety, alcohol',
+ 'management, fleet, protective', 'delivery, warehouse, operations',
+ 'custody, mcdc, customer', 'division, family, animal',
+ 'technology, systems, telephone', 'rockville, twinbrook, downtown',
+ 'services, highway, service',
'investigative, investigations, criminal',
- 'rockville, downtown, twinbrook', 'nicholson, transit, taxicab',
- 'silver, spring, urban', 'security, mccf, unit',
- 'management, budget, fleet', 'operations, delivery, warehouse',
- 'development, stormwater, residential',
- 'office, enforcement, officer',
- 'eligibility, assistance, assisted',
- 'automotive, protective, equipment',
- 'communications, communication, immunization',
- 'family, crimes, crime',
- 'administration, battalion, administrative',
- 'maintenance, facilities, council', 'training, recruit, director',
- 'division, animal, fiscal', 'programs, personnel, background',
- 'legislative, principal, executive',
- 'firefighter, rescuer, master', 'officer, office, traffic',
- 'operator, bus, operations', 'school, room, behavioral',
- 'manager, projects, project', 'income, assistance, client',
- 'coordinator, coordinating, depot',
- 'information, technology, recreation', 'sergeant, police, cadet',
+ 'gaithersburg, nicholson, transit', 'security, mc311, mccf',
+ 'special, assignment, medical', 'welfare, child, childhood',
+ 'communications, communication, commuter', 'school, health, based',
+ 'supports, support, network', 'enforcement, emergency, crossing',
+ 'maintenance, facilities, facility',
+ 'administrative, administration, battalion',
+ 'development, planning, monitoring', 'training, director, recruit',
+ 'environmental, regulatory, adolescent',
+ 'accounts, programs, program', 'transport, design, building',
+ 'district, payroll, squad', 'officer, office, security',
+ 'firefighter, rescuer, recruit', 'operator, equipment, bus',
+ 'therapist, plumber, member',
+ 'administrative, administration, administrator',
+ 'technician, mechanic, supply', 'worker, social, works',
+ 'specialist, special, quality',
+ 'information, renovation, technology',
+ 'program, programs, projects', 'liquor, clerk, store',
'enforcement, permitting, inspector',
- 'planning, specialist, special',
- 'correctional, correction, records',
- 'assistant, library, librarian', 'community, health, nurse',
- 'communications, telecommunications, safety',
- 'captain, chief, rescue', 'technician, mechanic, supply',
- 'crossing, parking, guard', 'liquor, clerk, store',
- 'warehouse, craftsworker, worker', 'program, programs, graphic',
- 'lieutenant, shift, maintenance',
- 'equipment, investigator, investment',
- 'supervisor, supervisory, welfare', 'services, service, urban',
- 'candidate, sheriff, deputy', 'corporal, behavioral, erp',
- 'environmental, budget, ombudsman',
- 'administration, administrator, administrative'], dtype=object)
+ 'manager, investigator, management', 'candidate, police, master',
+ 'school, health, room', 'correctional, correction, corporal',
+ 'coordinator, services, service', 'community, nurse, unit',
+ 'captain, rescue, chief', 'income, assistance, client',
+ 'crossing, engineer, parking',
+ 'telecommunications, communications, safety',
+ 'sheriff, deputy, autobody', 'lieutenant, maintenance, crew',
+ 'librarian, psychiatric, accountant',
+ 'warehouse, welfare, representative',
+ 'supervisor, sergeant, supervisory',
+ 'assistant, library, attorney',
+ 'legislative, principal, executive', 'planning, budget, senior'],
+ dtype=object)
R2 score: mean: 0.923; std: 0.012
+R2 score: mean: 0.919; std: 0.016
The simple pipeline applied on this complex dataset gave us very good results.
@@ -1739,7 +1738,7 @@ ConclusionTotal running time of the script: (1 minutes 14.213 seconds)
+Total running time of the script: (1 minutes 11.426 seconds)