From 47e33295f1b5f7574ad0e07ef96dd6425b6bf4e8 Mon Sep 17 00:00:00 2001
From: ljeub-pometry <97447091+ljeub-pometry@users.noreply.github.com>
Date: Tue, 3 Sep 2024 19:10:42 +0200
Subject: [PATCH] Parallel loading from parquet and Pandas (#1732)

* start implementing parallel df loading (most of the infrastructure is there but still need to update all the loaders)
* start implementing the parallel loaders
* implement parallel loading from DfView
* PropCols needs to return empty rows if not specified so the zipping doesn't terminate early
* remove the len method from PropCol as it is not used anymore
* fix merge issue
* need to sort test output as order is no longer guaranteed
* add missing feature tags
* GID node state should implement Ord
* make it possible to compare NodeState with dict
* add sort_by_id for NodeState
* clean up error handling and make missing values an error again
* fix all the tests so they do not rely on insertion order which is no longer preserved
* one more order-dependent test
* resolve all nodes first
* try to drop the pair lock earlier
* try chunking by min of src/dst to reduce contention
* pull the edge initialisation out of the node locks
* num_shards exposed
* try to improve the contention
* expose number of shards to python
* try to fix the decontention-sort
* add jemallocator to fix some weirdness
* snmalloc for slightly better performance and hopefully better compatibility
* hopefully fix the python import error
* fix python take 2
* last try
* just on macos for now until we figure out what is going on
* Revert the pre-sorting of the updates as it doesn't seem to help
* clean up the handling of num_shards
* remove unused method and bump the chunk size up in the pandas loader for a bit more speed
* fix merge error and clean up allocator dependency management
* fix dead code warnings
* fix random breakage in async_graphql
* no more debug symbols in the CI to hopefully save some disk space
* fix the nextest invocation
---
 .../test_rust_disk_storage_workflow.yml       |   2 +-
 .github/workflows/test_rust_workflow.yml      |   2 +-
 Cargo.lock                                    | 267 +++---
 Cargo.toml                                    |   7 +-
 .../test_loaders/test_load_from_pandas.py     | 461 +++++------
 .../test_loaders/test_load_from_parquet.py    | 260 +++---
 .../src/model/graph/mutable_graph.rs          |  15 +-
 raphtory/Cargo.toml                           |   5 +-
 .../entities/graph/logical_to_physical.rs     |  57 +-
 raphtory/src/core/entities/graph/tgraph.rs    |  48 +-
 .../src/core/entities/graph/tgraph_storage.rs |  18 +-
 raphtory/src/core/entities/nodes/node_ref.rs  |   2 +-
 raphtory/src/core/storage/mod.rs              |  54 +-
 raphtory/src/core/storage/raw_edges.rs        |  94 ++-
 raphtory/src/core/utils/errors.rs             |  45 +-
 .../internal/internal_addition_ops.rs         |  13 +
 raphtory/src/db/api/state/ops.rs              |  25 +-
 .../storage/graph/storage_ops/additions.rs    |  22 +
 raphtory/src/db/api/storage/storage.rs        |  10 +
 raphtory/src/io/arrow/dataframe.rs            |  80 +-
 raphtory/src/io/arrow/df_loaders.rs           | 757 ++++++------
 raphtory/src/io/arrow/layer_col.rs            | 143 ++++
 raphtory/src/io/arrow/mod.rs                  |  12 +-
 raphtory/src/io/arrow/node_col.rs             | 146 ++++
 raphtory/src/io/arrow/prop_handler.rs         | 449 ++++++-----
 raphtory/src/lib.rs                           |   9 +
 raphtory/src/python/graph/graph.rs            |   8 +-
 .../src/python/graph/io/pandas_loaders.rs     |  22 +-
 .../types/macros/trait_impl/node_state.rs     |  16 +-
 raphtory/src/search/mod.rs                    |  11 +
 raphtory/src/serialise/serialise.rs           |   6 +-
 31 files changed, 1657 insertions(+), 1409 deletions(-)
 create mode 100644 raphtory/src/io/arrow/layer_col.rs
 create mode 100644 raphtory/src/io/arrow/node_col.rs

diff --git
a/.github/workflows/test_rust_disk_storage_workflow.yml b/.github/workflows/test_rust_disk_storage_workflow.yml index c27ffb2340..4e9c1cac7b 100644 --- a/.github/workflows/test_rust_disk_storage_workflow.yml +++ b/.github/workflows/test_rust_disk_storage_workflow.yml @@ -67,7 +67,7 @@ jobs: RUSTFLAGS: -Awarnings ${{ matrix.flags }} TEMPDIR: ${{ runner.temp }} run: | - cargo nextest run --all --no-default-features --features "storage" + cargo nextest run --all --no-default-features --features "storage" --cargo-profile test-ci - name: Check all features env: RUSTFLAGS: -Awarnings diff --git a/.github/workflows/test_rust_workflow.yml b/.github/workflows/test_rust_workflow.yml index e0547924bc..126e31a1cc 100644 --- a/.github/workflows/test_rust_workflow.yml +++ b/.github/workflows/test_rust_workflow.yml @@ -55,7 +55,7 @@ jobs: RUSTFLAGS: -Awarnings TEMPDIR: ${{ runner.temp }} run: | - cargo nextest run --all --no-default-features + cargo nextest run --all --no-default-features --cargo-profile test-ci doc-test: if: ${{ !inputs.skip_tests }} name: "Doc tests" diff --git a/Cargo.lock b/Cargo.lock index a6e0226d6d..d2d9838f63 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -513,26 +513,26 @@ dependencies = [ [[package]] name = "async-graphql-derive" -version = "7.0.7" +version = "7.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e2e26a6b44bc61df3ca8546402cf9204c28e30c06084cc8e75cd5e34d4f150" +checksum = "f1141703c11c6ad4fa9b3b0e1e476dea01dbd18a44db00f949b804afaab2f344" dependencies = [ "Inflector", "async-graphql-parser", "darling", - "proc-macro-crate 3.1.0", + "proc-macro-crate 3.2.0", "proc-macro2", "quote", "strum", - "syn 2.0.75", + "syn 2.0.77", "thiserror", ] [[package]] name = "async-graphql-parser" -version = "7.0.7" +version = "7.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f801451484b4977d6fe67b29030f81353cabdcbb754e5a064f39493582dac0cf" +checksum = "2f66edcce4c38c18f7eb181fdf561c3d3aa2d644ce7358fc7a928c00a4ffef17" dependencies = [ "async-graphql-value", "pest", @@ -559,9 +559,9 @@ dependencies = [ [[package]] name = "async-graphql-value" -version = "7.0.7" +version = "7.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69117c43c01d81a69890a9f5dd6235f2f027ca8d1ec62d6d3c5e01ca0edb4f2b" +checksum = "3b0206011cad065420c27988f17dd7fe201a0e056b20c262209b7bffcd6fa176" dependencies = [ "bytes", "indexmap", @@ -614,18 +614,18 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "async-trait" -version = "0.1.81" +version = "0.1.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -840,9 +840,9 @@ checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "bytemuck" -version = "1.17.0" +version = "1.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fd4c6dcc3b0aea2f5c0b4b82c2b15fe39ddbc76041a310848f4706edf76bb31" +checksum = "773d90827bc3feecfb67fab12e24de0749aad83c74b9504ecde46237b5cd24e2" dependencies = [ "bytemuck_derive", ] @@ -855,7 +855,7 @@ checksum = "0cc8b54b395f2fcfbb3d90c47b01c7f444d94d05bdeb775811dec868ac3bbc26" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -902,9 +902,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.1.13" +version = "1.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" +checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6" dependencies = [ "jobserver", "libc", @@ -1056,7 +1056,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1065,6 +1065,15 @@ version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.2" @@ -1134,9 +1143,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "convert_case" @@ -1352,7 +1361,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1363,7 +1372,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1759,38 +1768,38 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "derive_builder" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0350b5cb0331628a5916d6c5c0b72e97393b8b6b03b47a9284f4e7f5a405ffd7" +checksum = "cd33f37ee6a119146a1781d3356a7c26028f83d779b2e04ecd45fdc75c76877b" dependencies = [ "derive_builder_macro", ] [[package]] name = "derive_builder_core" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d48cda787f839151732d396ac69e3473923d54312c070ee21e9effcaa8ca0b1d" +checksum = "7431fa049613920234f22c47fdc33e6cf3ee83067091ea4277a3f8c4587aae38" dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "derive_builder_macro" -version = "0.20.0" +version = "0.20.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "206868b8242f27cecce124c19fd88157fbd0dd334df2587f36417bafbc85097b" +checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ "derive_builder_core", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1824,7 +1833,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1882,7 +1891,7 @@ dependencies = [ "proc-macro-crate 1.3.1", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "thiserror", ] @@ -1910,7 +1919,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1996,9 +2005,9 @@ checksum = 
"59668941c55e5c186b8b58c391629af56774ec768f73c08bbcd56f09348eb00b" [[package]] name = "fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fixedbitset" @@ -2018,9 +2027,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.32" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", "miniz_oxide 0.8.0", @@ -2113,7 +2122,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -2510,7 +2519,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.0", "tower-service", - "webpki-roots 0.26.3", + "webpki-roots 0.26.5", ] [[package]] @@ -2574,9 +2583,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown 0.14.5", @@ -3158,7 +3167,7 @@ dependencies = [ "tokio", "tokio-rustls 0.26.0", "url", - "webpki-roots 0.26.3", + "webpki-roots 0.26.5", ] [[package]] @@ -3168,7 +3177,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53a0d57c55d2d1dc62a2b1d16a0a1079eb78d67c36bdf468d582ab4482ec7002" dependencies = [ "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3315,9 +3324,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.3" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "memchr", ] @@ -3481,7 +3490,7 @@ dependencies = [ "proc-macro2", "proc-macro2-diagnostics", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3642,7 +3651,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3721,7 +3730,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3823,10 +3832,10 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a62fea1692d80a000126f9b28d865012a160b80000abb53ccf152b428222c155" dependencies = [ - "proc-macro-crate 3.1.0", + "proc-macro-crate 3.2.0", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3977,12 +3986,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.20" +version = "0.2.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f12335488a2f3b0a83b14edad48dca9879ce89b2edd10e80237e4e852dd645e" +checksum = "479cf940fbbb3426c32c5d5176f62ad57549a0bb84773423ba8be9d089f5faba" dependencies = [ "proc-macro2", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3997,11 +4006,11 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.1.0" +version = "3.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +checksum = "8ecf48c7ca261d60b74ab1a7b20da18bede46776b2e55535cb958eb595c5fa7b" dependencies = [ - "toml_edit 0.21.1", + "toml_edit 0.22.20", ] [[package]] @@ -4021,7 +4030,7 @@ checksum = "af066a9c399a26e020ada66a034357a868728e72cd426f3adcd35f80d88d88c8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "version_check", "yansi 1.0.1", ] @@ -4048,9 +4057,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13db3d3fde688c61e2446b4d843bc27a7e8af269a69440c0308021dc92333cc" +checksum = "3b2ecbe40f08db5c006b5764a2645f7f3f141ce756412ac9e1dd6087e6d32995" dependencies = [ "bytes", "prost-derive", @@ -4058,9 +4067,9 @@ dependencies = [ [[package]] name = "prost-build" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb182580f71dd070f88d01ce3de9f4da5021db7115d2e1c3605a754153b77c1" +checksum = "f8650aabb6c35b860610e9cff5dc1af886c9e25073b7b1712a68972af4281302" dependencies = [ "bytes", "heck 0.5.0", @@ -4073,37 +4082,37 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.75", + "syn 2.0.77", "tempfile", ] [[package]] name = "prost-derive" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18bec9b0adc4eba778b33684b7ba3e7137789434769ee3ce3930463ef904cfca" +checksum = "acf0c195eebb4af52c752bec4f52f645da98b6e92077a04110c7f349477ae5ac" dependencies = [ "anyhow", "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "prost-types" -version = "0.13.1" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cee5168b05f49d4b0ca581206eb14a7b22fafd963efe729ac48eb03266e25cc2" +checksum = "60caa6738c7369b940c3d49246a8d1749323674c65cb13010134f5c9bad5b519" dependencies = [ "prost", ] [[package]] name = "psm" -version = "0.1.21" +version = "0.1.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5787f7cda34e3033a72192c018bc5883100330f362ef279a8cbccfce8bb4e874" +checksum = "3b1f9bf148c15500d44581654fb9260bc9d82970f3ef777a79a40534f6aa784f" dependencies = [ "cc", ] @@ -4157,7 +4166,7 @@ dependencies = [ "proc-macro2", "pyo3-macros-backend", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -4170,7 +4179,7 @@ dependencies = [ "proc-macro2", "pyo3-build-config", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -4224,9 +4233,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.3" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" +checksum = "a2d2fb862b7ba45e615c1429def928f2e15f815bdf933b27a2d3824e224c1f46" dependencies = [ "bytes", "pin-project-lite", @@ -4242,9 +4251,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.6" +version = "0.11.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" +checksum = "ea0a9b3a42929fad8a7c3de7f86ce0814cfa893328157672680e9fb1145549c5" dependencies = [ "bytes", "rand", @@ -4272,9 +4281,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -4379,6 +4388,7 @@ dependencies = [ "rustc-hash 2.0.0", "serde", "serde_json", + "snmalloc-rs", "sorted_vector_map", "streaming-stats", "tantivy", @@ -4683,7 +4693,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 0.26.3", + "webpki-roots 0.26.5", "windows-registry", ] @@ -4785,18 +4795,18 @@ checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f" dependencies = [ "bitflags 2.6.0", "errno", @@ -4826,16 +4836,16 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.6", + "rustls-webpki 0.102.7", "subtle", "zeroize", ] [[package]] name = "rustls-native-certs" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" +checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -4881,9 +4891,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.102.6" +version = "0.102.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" +checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56" dependencies = [ "ring", "rustls-pki-types", @@ -5001,9 +5011,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.208" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" dependencies = [ "serde_derive", ] @@ -5021,20 +5031,20 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.208" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "serde_json" -version = "1.0.125" +version = "1.0.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" +checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" dependencies = [ "itoa", "memchr", @@ -5212,6 +5222,24 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "snmalloc-rs" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2504c9edd7ca7a1cfe637296dc0d263ce1e9975c4ec43f3652616ebce9d1df1c" +dependencies = [ + "snmalloc-sys", +] + +[[package]] +name = "snmalloc-sys" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d448599db5c3263b35d67ab26a2399e74ca0265211f5f5dd4cb9f4c3ccada6a" +dependencies = [ + "cmake", +] + [[package]] name = "socket2" version = "0.5.7" @@ -5256,7 +5284,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -5267,15 +5295,15 @@ checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" [[package]] name = "stacker" -version = "0.1.15" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +checksum = "799c883d55abdb5e98af1a7b3f23b9b6de8ecada0ecac058672d7635eb48ca7b" dependencies = [ "cc", "cfg-if", "libc", "psm", - "winapi", + "windows-sys 0.59.0", ] [[package]] @@ -5345,7 +5373,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -5367,9 +5395,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.75" +version = "2.0.77" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -5611,7 +5639,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -5713,9 +5741,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.3" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" +checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" dependencies = [ "backtrace", "bytes", @@ -5737,7 +5765,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -5830,17 +5858,6 @@ dependencies = [ "winnow 0.5.40", ] -[[package]] -name = "toml_edit" -version = "0.21.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" -dependencies = [ - "indexmap", - "toml_datetime", - "winnow 0.5.40", -] - [[package]] name = "toml_edit" version = "0.22.20" @@ -5900,7 +5917,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -6200,7 +6217,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "wasm-bindgen-shared", ] @@ -6234,7 +6251,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6268,7 +6285,7 @@ checksum = 
"4b8220be1fa9e4c889b30fd207d4906657e7e90b12e0e6b0c8b8d8709f5de021" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -6312,9 +6329,9 @@ checksum = "5f20c57d8d7db6d3b86154206ae5d8fba62dd39573114de97c2cb0578251f8e1" [[package]] name = "webpki-roots" -version = "0.26.3" +version = "0.26.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd7c23921eeb1713a4e851530e9b9756e4fb0e89978582942612524cf09f01cd" +checksum = "0bd24728e5af82c6c4ec1b66ac4844bdf8156257fccda846ec58b42cd0cdbe6a" dependencies = [ "rustls-pki-types", ] @@ -6619,7 +6636,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -6639,7 +6656,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index ec4ff40a96..ab769e1364 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,13 +32,18 @@ split-debuginfo = "unpacked" inherits = "release" debug = true +[profile.test-ci] +inherits = "test" +debug = 0 + [workspace.dependencies] #[public-storage] pometry-storage = { version = ">=0.8.1", path = "pometry-storage" } #[private-storage] # pometry-storage = { path = "pometry-storage-private", package = "pometry-storage-private" } -async-graphql = { version = "7.0.5", features = ["dynamic-schema"] } +async-graphql = { version = ">=7.0.5, <7.0.8", features = ["dynamic-schema"] } # 7.0.8+ is borked, see https://github.com/async-graphql/async-graphql/issues/1586 bincode = "1.3.3" +snmalloc-rs = { version = "0.3.6" } async-graphql-poem = "7.0.5" dynamic-graphql = "0.9.0" reqwest = { version = "0.12.7", default-features = false, features = ["rustls-tls", "multipart", "json"] } diff --git a/python/tests/test_loaders/test_load_from_pandas.py b/python/tests/test_loaders/test_load_from_pandas.py index 6df04d40e9..55d9da81e4 100644 --- a/python/tests/test_loaders/test_load_from_pandas.py +++ b/python/tests/test_loaders/test_load_from_pandas.py @@ -43,7 +43,9 @@ def assertions(g): marbles = e["marbles"] edges.append((e.src.id, e.dst.id, weight, marbles)) - assert g.nodes.id.collect() == expected_nodes + edges.sort() + + assert g.nodes.id.sorted() == expected_nodes assert edges == expected_edges g = Graph() @@ -68,11 +70,10 @@ def test_load_from_pandas_with_invalid_data(): ) def assertions(exc_info): - assert "Failed to load graph" in str(exc_info.value) assert "ArrowInvalid" in str(exc_info.value) assert ( - "Could not convert '3.0 KG' with type str: tried to convert to double" - in str(exc_info.value) + "Could not convert '3.0 KG' with type str: tried to convert to double" + in str(exc_info.value) ) # Use pytest.raises to expect an exception @@ -87,11 +88,10 @@ def assertions(exc_info): assertions(exc_info) # Optionally, you can check the exception message or type - assert "Failed to load graph" in str(exc_info.value) assert "ArrowInvalid" in str(exc_info.value) assert ( - "Could not convert '3.0 KG' with type str: tried to convert to double" - in str(exc_info.value) + "Could not convert '3.0 KG' with type str: tried to convert to double" + in str(exc_info.value) ) @@ -138,12 +138,15 @@ def assertions(g): weight = e["weight"] marbles = e["marbles"] edges.append((e.src.id, e.dst.id, weight, marbles)) + edges.sort() + nodes = [] for v in g.nodes: name = v["name"] nodes.append((v.id, name)) + nodes.sort() - assert 
g.nodes.id.collect() == expected_node_ids + assert g.nodes.id.sorted() == expected_node_ids assert edges == expected_edges assert nodes == expected_nodes @@ -204,14 +207,16 @@ def assertions(g): weight = e["weight"] marbles = e["marbles"] edges.append((e.src.id, e.dst.id, weight, marbles)) + edges.sort() nodes = [] for v in g.nodes: name = v["name"] nodes.append((v.id, name)) + nodes.sort() assert nodes == expected_nodes - assert g.nodes.id.collect() == expected_node_ids + assert g.nodes.id.sorted() == expected_node_ids assert edges == expected_edges g = Graph() @@ -333,14 +338,14 @@ def assertions1(g): assertions1(g) def assertions2(g): - assert g.nodes.properties.constant.get("type").collect() == [ - "Person 1", - "Person 2", - "Person 3", - "Person 4", - "Person 5", - "Person 6", - ] + assert dict(zip(g.nodes.id, g.nodes.properties.constant.get("type"))) == { + 1: "Person 1", + 2: "Person 2", + 3: "Person 3", + 4: "Person 4", + 5: "Person 5", + 6: "Person 6", + } g = Graph() g.load_nodes_from_pandas( @@ -366,7 +371,7 @@ def assertions2(g): def assertions3(g): assert g.unique_layers == ["_default", "test_layer"] - assert g.layers(["test_layer"]).edges.src.id.collect() == [1, 2, 3, 4, 5] + assert set(g.layers(["test_layer"]).edges.src.id) == {1, 2, 3, 4, 5} assert g.edges.properties.constant.get("type").collect() == [ {"test_layer": "Edge"}, {"test_layer": "Edge"}, @@ -381,13 +386,13 @@ def assertions3(g): {"test_layer": "test_tag"}, {"test_layer": "test_tag"}, ] - assert g.edges.properties.constant.get("marbles_const").collect() == [ - {"test_layer": "red"}, - {"test_layer": "blue"}, - {"test_layer": "green"}, - {"test_layer": "yellow"}, - {"test_layer": "purple"}, - ] + assert dict(zip(g.edges.id, g.edges.properties.constant.get("marbles_const"))) == { + (1, 2): {"test_layer": "red"}, + (2, 3): {"test_layer": "blue"}, + (3, 4): {"test_layer": "green"}, + (4, 5): {"test_layer": "yellow"}, + (5, 6): {"test_layer": "purple"}, + } g = Graph() g.load_edges_from_pandas( @@ -416,18 +421,18 @@ def assertions3(g): assertions3(g) def assertions4(g): - assert g.layers(["layer 1"]).edges.src.id.collect() == [1] - assert g.layers(["layer 1", "layer 2"]).edges.src.id.collect() == [1, 2] - assert g.layers(["layer 1", "layer 2", "layer 3"]).edges.src.id.collect() == [ - 1, - 2, - 3, - ] - assert g.layers(["layer 1", "layer 4", "layer 5"]).edges.src.id.collect() == [ - 1, - 4, - 5, - ] + assert g.layers(["layer 1"]).edges.id.collect() == [(1, 2)] + assert set(g.layers(["layer 1", "layer 2"]).edges.id) == {(1, 2), (2, 3)} + assert set(g.layers(["layer 1", "layer 2", "layer 3"]).edges.id) == { + (1, 2), + (2, 3), + (3, 4), + } + assert set(g.layers(["layer 1", "layer 4", "layer 5"]).edges.id) == { + (1, 2), + (4, 5), + (5, 6), + } g = Graph() g.load_edges_from_pandas( @@ -450,7 +455,7 @@ def assertions5(g): "Person", "Person", ] - assert g.layers(["test_layer"]).edges.src.id.collect() == [1, 2, 3, 4, 5] + assert set(g.layers(["test_layer"]).edges.id) == {(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)} g = Graph() g.load_edges_from_pandas( @@ -487,26 +492,26 @@ def assertions5(g): assertions5(g) def assertions6(g): - assert g.nodes.properties.constant.get("type").collect() == [ - "Person 1", - "Person 2", - "Person 3", - "Person 4", - "Person 5", - "Person 6", - ] - assert g.layers(["layer 1"]).edges.src.id.collect() == [1] - assert g.layers(["layer 1", "layer 2"]).edges.src.id.collect() == [1, 2] - assert g.layers(["layer 1", "layer 2", "layer 3"]).edges.src.id.collect() == [ - 1, - 2, - 3, - ] - assert 
g.layers(["layer 1", "layer 4", "layer 5"]).edges.src.id.collect() == [ - 1, - 4, - 5, - ] + assert dict(zip(g.nodes.id, g.nodes.properties.constant.get("type"))) == { + 1: "Person 1", + 2: "Person 2", + 3: "Person 3", + 4: "Person 4", + 5: "Person 5", + 6: "Person 6", + } + assert g.layers(["layer 1"]).edges.id.collect() == [(1, 2)] + assert set(g.layers(["layer 1", "layer 2"]).edges.id) == {(1, 2), (2, 3)} + assert set(g.layers(["layer 1", "layer 2", "layer 3"]).edges.id) == { + (1, 2), + (2, 3), + (3, 4), + } + assert set(g.layers(["layer 1", "layer 4", "layer 5"]).edges.id) == { + (1, 2), + (4, 5), + (5, 6), + } g = Graph() g.load_edges_from_pandas(edges_df, "time", "src", "dst", layer_col="layers") @@ -531,14 +536,14 @@ def assertions6(g): assertions6(g) def assertions7(g): - assert g.nodes.properties.constant.get("type").collect() == [ - "Person 1", - "Person 2", - "Person 3", - "Person 4", - "Person 5", - "Person 6", - ] + assert dict(zip(g.nodes.id, g.nodes.properties.constant.get("type"))) == { + 1: "Person 1", + 2: "Person 2", + 3: "Person 3", + 4: "Person 4", + 5: "Person 5", + 6: "Person 6", + } assert g.nodes.properties.constant.get("tag").collect() == [ "test_tag", "test_tag", @@ -567,20 +572,22 @@ def assertions7(g): assertions7(g) def assertions8(g): - assert g.layers( + assert dict(zip(g.layers( ["layer 1", "layer 2", "layer 3"] - ).edges.properties.constant.get("marbles_const").collect() == [ - {"layer 1": "red"}, - {"layer 2": "blue"}, - {"layer 3": "green"}, - ] - assert g.edges.properties.constant.get("tag").collect() == [ - {"layer 1": "test_tag"}, - {"layer 2": "test_tag"}, - {"layer 3": "test_tag"}, - {"layer 4": "test_tag"}, - {"layer 5": "test_tag"}, - ] + ).edges.id, g.layers( + ["layer 1", "layer 2", "layer 3"] + ).edges.properties.constant.get("marbles_const"))) == { + (1, 2): {"layer 1": "red"}, + (2, 3): {"layer 2": "blue"}, + (3, 4): {"layer 3": "green"}, + } + assert dict(zip(g.edges.id, g.edges.properties.constant.get("tag"))) == { + (1, 2): {"layer 1": "test_tag"}, + (2, 3): {"layer 2": "test_tag"}, + (3, 4): {"layer 3": "test_tag"}, + (4, 5): {"layer 4": "test_tag"}, + (5, 6): {"layer 5": "test_tag"}, + } g.load_edge_props_from_pandas( edges_df, @@ -622,21 +629,21 @@ def assertions8(g): assertions8(g) def assertions_layers_in_df(g): - assert g.unique_layers == [ + assert set(g.unique_layers) == { "_default", "layer 1", "layer 2", "layer 3", "layer 4", "layer 5", - ] + } assert g.layers(["layer 1"]).edges.src.id.collect() == [1] assert g.layers(["layer 3"]).edges.src.id.collect() == [3] with pytest.raises( - Exception, - match=re.escape( - "Invalid layer: test_layer. 
Valid layers: _default, layer 1, layer 2, layer 3, layer 4, layer 5" - ), + Exception, + match=re.escape( + "Invalid layer: test_layer" + ), ): g.layers(["test_layer"]) @@ -687,10 +694,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: not_src, not_dst, not_time" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: not_src, not_dst, not_time" + ), ): g = Graph() g.load_edges_from_pandas( @@ -701,10 +708,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: not_src, not_dst, not_time" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: not_src, not_dst, not_time" + ), ): g = PersistentGraph() g.load_edges_from_pandas( @@ -715,10 +722,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: not_weight, bleep_bloop" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: not_weight, bleep_bloop" + ), ): g = Graph() g.load_edges_from_pandas( @@ -732,10 +739,10 @@ def test_missing_columns(): g.load_nodes_from_pandas(df=nodes_df, time="time", id="id", properties=["name"]) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: not_weight, bleep_bloop" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: not_weight, bleep_bloop" + ), ): g = PersistentGraph() g.load_edges_from_pandas( @@ -749,10 +756,10 @@ def test_missing_columns(): g.load_nodes_from_pandas(df=nodes_df, time="time", id="id", properties=["name"]) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: not_id, not_time, not_name" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: not_id, not_time, not_name" + ), ): g = Graph() g.load_edges_from_pandas( @@ -767,10 +774,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: not_id, not_time, not_name" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: not_id, not_time, not_name" + ), ): g = PersistentGraph() g.load_edges_from_pandas( @@ -785,10 +792,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: sauce, dist, wait, marples" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: sauce, dist, wait, marples" + ), ): g = Graph() g.load_edge_props_from_pandas( @@ -799,10 +806,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: sauce, dist, wait, marples" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: sauce, dist, wait, marples" + ), ): g = PersistentGraph() g.load_edge_props_from_pandas( @@ -813,10 +820,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not present within the dataframe: sauce, wait, marples" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: sauce, wait, marples" + ), ): g = Graph() g.load_node_props_from_pandas( @@ -826,10 +833,10 @@ def test_missing_columns(): ) with pytest.raises( - Exception, - match=re.escape( - "columns are not 
present within the dataframe: sauce, wait, marples" - ), + Exception, + match=re.escape( + "columns are not present within the dataframe: sauce, wait, marples" + ), ): g = PersistentGraph() g.load_node_props_from_pandas( @@ -844,13 +851,13 @@ def test_none_columns_edges(): {"src": [1, None, 3, 4, 5], "dst": [2, 3, 4, 5, 6], "time": [1, 2, 3, 4, 5]} ) with pytest.raises( - Exception, match=re.escape("Ensure these contain no NaN, Null or None values.") + Exception, match=re.escape("Float64 not supported as node id type") ): g = Graph() g.load_edges_from_pandas(edges_df, "time", "src", "dst") with pytest.raises( - Exception, match=re.escape("Ensure these contain no NaN, Null or None values.") + Exception, match=re.escape("Float64 not supported as node id type") ): PersistentGraph().load_edges_from_pandas(edges_df, "time", "src", "dst") @@ -858,11 +865,11 @@ def test_none_columns_edges(): {"src": [1, 2, 3, 4, 5], "dst": [2, 3, 4, None, 6], "time": [1, 2, 3, 4, 5]} ) with pytest.raises( - Exception, match=re.escape("Ensure these contain no NaN, Null or None values.") + Exception, match=re.escape("Float64 not supported as node id type") ): Graph().load_edges_from_pandas(edges_df, "time", "src", "dst") with pytest.raises( - Exception, match=re.escape("Ensure these contain no NaN, Null or None values.") + Exception, match=re.escape("Float64 not supported as node id type") ): PersistentGraph().load_edges_from_pandas(edges_df, "time", "src", "dst") @@ -870,11 +877,11 @@ def test_none_columns_edges(): {"src": [1, 2, 3, 4, 5], "dst": [2, 3, 4, 5, 6], "time": [1, 2, None, 4, 5]} ) with pytest.raises( - Exception, match=re.escape("Ensure these contain no NaN, Null or None values.") + Exception, match=re.escape("Float64 not supported for time column") ): Graph().load_edges_from_pandas(edges_df, "time", "src", "dst") with pytest.raises( - Exception, match=re.escape("Ensure these contain no NaN, Null or None values.") + Exception, match=re.escape("Float64 not supported for time column") ): PersistentGraph().load_edges_from_pandas(edges_df, "time", "src", "dst") @@ -932,10 +939,10 @@ def test_unparsable_props(): ) with pytest.raises( - Exception, - match=re.escape( - """"Could not convert '2.0' with type str: tried to convert to double", 'Conversion failed for column weight with type object'""" - ), + Exception, + match=re.escape( + """"Could not convert '2.0' with type str: tried to convert to double", 'Conversion failed for column weight with type object'""" + ), ): Graph().load_edges_from_pandas( edges_df, @@ -945,10 +952,10 @@ def test_unparsable_props(): properties=["weight"], ) with pytest.raises( - Exception, - match=re.escape( - """"Could not convert '2.0' with type str: tried to convert to double", 'Conversion failed for column weight with type object'""" - ), + Exception, + match=re.escape( + """"Could not convert '2.0' with type str: tried to convert to double", 'Conversion failed for column weight with type object'""" + ), ): PersistentGraph().load_edges_from_pandas( edges_df, @@ -1058,9 +1065,9 @@ def test_load_edge_deletions_from_pandas(): g = PersistentGraph() g.load_edges_from_pandas(edges_df, "time", "src", "dst") - assert g.window(10, 12).edges.src.id.collect() == [1, 2, 3, 4, 5] + assert set(g.window(10, 12).edges.id) == {(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)} g.load_edge_deletions_from_pandas(edge_dels_df, "time", "src", "dst") - assert g.window(10, 12).edges.src.id.collect() == [1, 2, 5] + assert set(g.window(10, 12).edges.src.id) == {1, 2, 5} def 
test_edge_both_option_failures_pandas(): @@ -1076,16 +1083,16 @@ def test_edge_both_option_failures_pandas(): # CHECK ALL EDGE FUNCTIONS ON GRAPH FAIL WITH BOTH LAYER AND LAYER_COL g = Graph() with pytest.raises( - Exception, - match=r"GraphLoadException\('WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)'\)", + Exception, + match=r"You cannot set ‘layer_name’ and ‘layer_col’ at the same time. Please pick one or the other.", ): g.load_edges_from_pandas( edges_df, "time", "src", "dst", layer="blah", layer_col="marbles" ) with pytest.raises( - Exception, - match=r"GraphLoadException\('WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)'\)", + Exception, + match=r"You cannot set ‘layer_name’ and ‘layer_col’ at the same time. Please pick one or the other.", ): g.load_edge_props_from_pandas( edges_df, "src", "dst", layer="blah", layer_col="marbles" @@ -1116,67 +1123,67 @@ def test_edge_both_option_failures_pandas(): ["blah"], ] assert g.unique_layers == ["_default", "blah"] - assert g.layer("blah").edges.properties.get("marbles") == [ - "red", - "blue", - "green", - "yellow", - "purple", - ] + assert dict(zip(g.layer("blah").edges.id, g.layer("blah").edges.properties.get("marbles"))) == { + (1, 2): "red", + (2, 3): "blue", + (3, 4): "green", + (4, 5): "yellow", + (5, 6): "purple", + } # CHECK IF JUST LAYER_COL WORKS g = Graph() g.load_edges_from_pandas(edges_df, "time", "src", "dst", layer_col="marbles") - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} g = Graph() g.load_edges_from_pandas(edges_df, "time", "src", "dst", layer_col="marbles") g.load_edge_props_from_pandas( edges_df, "src", "dst", layer_col="marbles", constant_properties=["marbles"] ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] - assert g.edges.properties.get("marbles").collect() == [ - {"red": "red"}, - {"blue": "blue"}, - {"green": "green"}, - {"yellow": "yellow"}, - {"purple": "purple"}, - ] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} + assert dict(zip(g.edges.id, g.edges.properties.get("marbles"))) == { + (1, 2): {"red": "red"}, + (2, 3): {"blue": "blue"}, + (3, 4): {"green": "green"}, + (4, 5): {"yellow": "yellow"}, + (5, 6): {"purple": "purple"}, + } g = PersistentGraph() with pytest.raises( - Exception, - match=r"GraphLoadException\('WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)'\)", + Exception, + match=r"You cannot set ‘layer_name’ and ‘layer_col’ at the same time. Please pick one or the other.", ): g.load_edges_from_pandas( edges_df, "time", "src", "dst", layer="blah", layer_col="marbles" ) with pytest.raises( - Exception, - match=r"GraphLoadException\('WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)'\)", + Exception, + match=r"You cannot set ‘layer_name’ and ‘layer_col’ at the same time. 
Please pick one or the other.", ): g.load_edge_props_from_pandas( edges_df, "src", "dst", layer="blah", layer_col="marbles" ) with pytest.raises( - Exception, - match=r"GraphLoadException\('WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)'\)", + Exception, + match=r"You cannot set ‘layer_name’ and ‘layer_col’ at the same time. Please pick one or the other.", ): g.load_edge_deletions_from_pandas( edges_df, "time", "src", "dst", layer="blah", layer_col="marbles" @@ -1207,13 +1214,13 @@ def test_edge_both_option_failures_pandas(): ["blah"], ] assert g.unique_layers == ["_default", "blah"] - assert g.layer("blah").edges.properties.get("marbles") == [ - "red", - "blue", - "green", - "yellow", - "purple", - ] + assert dict(zip(g.layer("blah").edges.id, g.layer("blah").edges.properties.get("marbles"))) == { + (1, 2): "red", + (2, 3): "blue", + (3, 4): "green", + (4, 5): "yellow", + (5, 6): "purple", + } g = PersistentGraph() g.load_edge_deletions_from_pandas(edges_df, "time", "src", "dst", layer="blah") @@ -1229,48 +1236,48 @@ def test_edge_both_option_failures_pandas(): # CHECK IF JUST LAYER_COL WORKS g = PersistentGraph() g.load_edges_from_pandas(edges_df, "time", "src", "dst", layer_col="marbles") - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} g = PersistentGraph() g.load_edges_from_pandas(edges_df, "time", "src", "dst", layer_col="marbles") g.load_edge_props_from_pandas( edges_df, "src", "dst", layer_col="marbles", constant_properties=["marbles"] ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] - assert g.edges.properties.get("marbles").collect() == [ - {"red": "red"}, - {"blue": "blue"}, - {"green": "green"}, - {"yellow": "yellow"}, - {"purple": "purple"}, - ] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} + assert dict(zip(g.edges.id, g.edges.properties.get("marbles"))) == { + (1, 2): {"red": "red"}, + (2, 3): {"blue": "blue"}, + (3, 4): {"green": "green"}, + (4, 5): {"yellow": "yellow"}, + (5, 6): {"purple": "purple"}, + } g = PersistentGraph() g.load_edge_deletions_from_pandas( edges_df, "time", "src", "dst", layer_col="marbles" ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} def test_node_both_option_failures_pandas(): @@ -1284,8 +1291,8 @@ def test_node_both_option_failures_pandas(): ) # CHECK ALL NODE FUNCTIONS ON GRAPH FAIL WITH BOTH NODE_TYPE AND NODE_TYPE_COL with pytest.raises( - Exception, - 
match=r"GraphLoadException\('WrongNumOfArgs\(\"node_type\", \"node_type_col\"\)'\)", + Exception, + match=r"You cannot set ‘node_type_name’ and ‘node_type_col’ at the same time. Please pick one or the other.", ): g = Graph() g.load_nodes_from_pandas( @@ -1293,8 +1300,8 @@ def test_node_both_option_failures_pandas(): ) with pytest.raises( - Exception, - match=r"GraphLoadException\('WrongNumOfArgs\(\"node_type\", \"node_type_col\"\)'\)", + Exception, + match=r"You cannot set ‘node_type_name’ and ‘node_type_col’ at the same time. Please pick one or the other.", ): g = Graph() g.load_node_props_from_pandas( @@ -1304,7 +1311,7 @@ def test_node_both_option_failures_pandas(): # CHECK IF JUST NODE_TYPE WORKS g = Graph() g.load_nodes_from_pandas(nodes_df, "time", "id", node_type="node_type") - assert g.nodes.node_type.collect() == [ + assert g.nodes.node_type == [ "node_type", "node_type", "node_type", @@ -1315,7 +1322,7 @@ def test_node_both_option_failures_pandas(): g = Graph() g.load_nodes_from_pandas(nodes_df, "time", "id") g.load_node_props_from_pandas(nodes_df, "id", node_type="node_type") - assert g.nodes.node_type.collect() == [ + assert g.nodes.node_type == [ "node_type", "node_type", "node_type", @@ -1327,8 +1334,8 @@ def test_node_both_option_failures_pandas(): # CHECK IF JUST NODE_TYPE_COL WORKS g = Graph() g.load_nodes_from_pandas(nodes_df, "time", "id", node_type_col="node_type") - assert g.nodes.node_type.collect() == ["P1", "P2", "P3", "P4", "P5", "P6"] + assert g.nodes.node_type == {1: "P1", 2: "P2", 3: "P3", 4: "P4", 5: "P5", 6: "P6"} g = Graph() g.load_nodes_from_pandas(nodes_df, "time", "id") g.load_node_props_from_pandas(nodes_df, "id", node_type_col="node_type") - assert g.nodes.node_type.collect() == ["P1", "P2", "P3", "P4", "P5", "P6"] + assert g.nodes.node_type == {1: "P1", 2: "P2", 3: "P3", 4: "P4", 5: "P5", 6: "P6"} diff --git a/python/tests/test_loaders/test_load_from_parquet.py b/python/tests/test_loaders/test_load_from_parquet.py index 5b8b62e48a..e419153d42 100644 --- a/python/tests/test_loaders/test_load_from_parquet.py +++ b/python/tests/test_loaders/test_load_from_parquet.py @@ -87,7 +87,9 @@ def assert_expected_nodes(g): for v in g.nodes: name = v["name"] nodes.append((v.id, name)) - assert g.nodes.id.collect() == expected_node_ids + nodes.sort() + + assert g.nodes.id.sorted() == expected_node_ids assert nodes == expected_nodes @@ -104,18 +106,19 @@ def assert_expected_edges(g): weight = e["weight"] marbles = e["marbles"] edges.append((e.src.id, e.dst.id, weight, marbles)) + edges.sort() assert edges == expected_edges def assert_expected_node_types(g): - assert g.nodes.node_type == [ - "p1", - "p2", - "p3", - "p4", - "p5", - "p6", - ] + assert g.nodes.node_type == { + 1: "p1", + 2: "p2", + 3: "p3", + 4: "p4", + 5: "p5", + 6: "p6", + } def assert_expected_node_property_tag(g): @@ -130,14 +133,14 @@ def assert_expected_node_property_tag(g): def assert_expected_node_property_type(g): - assert g.nodes.properties.constant.get("type").collect() == [ - "Person 1", - "Person 2", - "Person 3", - "Person 4", - "Person 5", - "Person 6", - ] + assert dict(zip(g.nodes.id, g.nodes.properties.constant.get("type"))) == { + 1: "Person 1", + 2: "Person 2", + 3: "Person 3", + 4: "Person 4", + 5: "Person 5", + 6: "Person 6", + } def assert_expected_node_property_dept(g): @@ -152,20 +155,21 @@ def assert_expected_node_property_dept(g): def assert_expected_edge_properties(g): - assert g.layers(["layer 1", "layer 2", "layer 3"]).edges.properties.constant.get( - "marbles_const" - 
).collect() == [ - {"layer 1": "red"}, - {"layer 2": "blue"}, - {"layer 3": "green"}, - ] - assert g.edges.properties.constant.get("tag").collect() == [ - {"layer 1": "test_tag"}, - {"layer 2": "test_tag"}, - {"layer 3": "test_tag"}, - {"layer 4": "test_tag"}, - {"layer 5": "test_tag"}, - ] + assert dict(zip(g.layers(["layer 1", "layer 2", "layer 3"]).edges.id, + g.layers(["layer 1", "layer 2", "layer 3"]).edges.properties.constant.get( + "marbles_const" + ))) == { + (1, 2): {"layer 1": "red"}, + (2, 3): {"layer 2": "blue"}, + (3, 4): {"layer 3": "green"}, + } + assert dict(zip(g.edges.id, g.edges.properties.constant.get("tag"))) == { + (1, 2): {"layer 1": "test_tag"}, + (2, 3): {"layer 2": "test_tag"}, + (3, 4): {"layer 3": "test_tag"}, + (4, 5): {"layer 4": "test_tag"}, + (5, 6): {"layer 5": "test_tag"}, + } def assert_expected_edge_properties_test_layer(g): @@ -193,38 +197,38 @@ def assert_expected_edge_properties_test_layer(g): def assert_expected_layers(g): - assert g.unique_layers == [ + assert set(g.unique_layers) == { "_default", "layer 1", "layer 2", "layer 3", "layer 4", "layer 5", - ] + } assert g.layers(["layer 1"]).edges.src.id.collect() == [1] - assert g.layers(["layer 1", "layer 2"]).edges.src.id.collect() == [1, 2] - assert g.layers(["layer 1", "layer 2", "layer 3"]).edges.src.id.collect() == [ + assert sorted(g.layers(["layer 1", "layer 2"]).edges.src.id) == [1, 2] + assert sorted(g.layers(["layer 1", "layer 2", "layer 3"]).edges.src.id) == [ 1, 2, 3, ] - assert g.layers(["layer 1", "layer 4", "layer 5"]).edges.src.id.collect() == [ + assert sorted(g.layers(["layer 1", "layer 4", "layer 5"]).edges.src.id) == [ 1, 4, 5, ] with pytest.raises( - Exception, - match=re.escape( - "Invalid layer: test_layer. Valid layers: _default, layer 1, layer 2, layer 3, layer 4, layer 5" - ), + Exception, + match=re.escape( + "Invalid layer: test_layer" + ), ): g.layers(["test_layer"]) def assert_expected_test_layer(g): assert g.unique_layers == ["_default", "test_layer"] - assert g.layers(["test_layer"]).edges.src.id.collect() == [1, 2, 3, 4, 5] + assert sorted(g.layers(["test_layer"]).edges.src.id) == [1, 2, 3, 4, 5] def test_load_from_parquet_graphs(parquet_files): @@ -489,14 +493,14 @@ def test_load_from_parquet_persistent_graphs(parquet_files): src="src", dst="dst", ) - assert g.window(10, 12).edges.src.id.collect() == [1, 2, 3, 4, 5] + assert set(g.window(10, 12).edges.id) == {(1, 2), (2, 3), (3, 4), (4, 5), (5, 6)} g.load_edge_deletions_from_parquet( parquet_path=edges_deletions_parquet_file_path, time="time", src="src", dst="dst", ) - assert g.window(10, 12).edges.src.id.collect() == [1, 2, 5] + assert set(g.window(10, 12).edges.id) == {(1, 2), (2, 3), (5, 6)} def test_edge_both_option_failures_parquet(parquet_files): @@ -508,8 +512,8 @@ def test_edge_both_option_failures_parquet(parquet_files): # CHECK ALL EDGE FUNCTIONS ON GRAPH FAIL WITH BOTH LAYER AND LAYER_COL g = Graph() with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", + Exception, + match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", ): g.load_edges_from_parquet( edges_parquet_file_path, @@ -521,8 +525,8 @@ def test_edge_both_option_failures_parquet(parquet_files): ) with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", + Exception, + match=r"Failed to load graph: Failed to load graph 
WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", ): g.load_edge_props_from_parquet( edges_parquet_file_path, "src", "dst", layer="blah", layer_col="marbles" @@ -561,27 +565,27 @@ def test_edge_both_option_failures_parquet(parquet_files): ["blah"], ] assert g.unique_layers == ["_default", "blah"] - assert g.layer("blah").edges.properties.get("marbles") == [ - "red", - "blue", - "green", - "yellow", - "purple", - ] + assert dict(zip(g.layer("blah").edges.id, g.layer("blah").edges.properties.get("marbles"))) == { + (1, 2): "red", + (2, 3): "blue", + (3, 4): "green", + (4, 5): "yellow", + (5, 6): "purple", + } # CHECK IF JUST LAYER_COL WORKS g = Graph() g.load_edges_from_parquet( edges_parquet_file_path, "time", "src", "dst", layer_col="marbles" ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} g = Graph() g.load_edges_from_parquet( @@ -594,26 +598,26 @@ def test_edge_both_option_failures_parquet(parquet_files): layer_col="marbles", constant_properties=["marbles"], ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] - assert g.edges.properties.get("marbles").collect() == [ - {"red": "red"}, - {"blue": "blue"}, - {"green": "green"}, - {"yellow": "yellow"}, - {"purple": "purple"}, - ] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} + assert dict(zip(g.edges.id, g.edges.properties.get("marbles"))) == { + (1, 2): {"red": "red"}, + (2, 3): {"blue": "blue"}, + (3, 4): {"green": "green"}, + (4, 5): {"yellow": "yellow"}, + (5, 6): {"purple": "purple"}, + } g = PersistentGraph() with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", + Exception, + match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", ): g.load_edges_from_parquet( edges_parquet_file_path, @@ -625,16 +629,16 @@ def test_edge_both_option_failures_parquet(parquet_files): ) with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", + Exception, + match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", ): g.load_edge_props_from_parquet( edges_parquet_file_path, "src", "dst", layer="blah", layer_col="marbles" ) with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", + Exception, + match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"layer_name\", \"layer_col\"\)", ): g.load_edge_deletions_from_parquet( edges_parquet_file_path, @@ -678,13 +682,13 @@ def test_edge_both_option_failures_parquet(parquet_files): ["blah"], ] assert g.unique_layers == ["_default", "blah"] - assert g.layer("blah").edges.properties.get("marbles") == [ - "red", - "blue", - "green", - 
"yellow", - "purple", - ] + assert dict(zip(g.layer("blah").edges.id, g.layer("blah").edges.properties.get("marbles"))) == { + (1, 2): "red", + (2, 3): "blue", + (3, 4): "green", + (4, 5): "yellow", + (5, 6): "purple", + } g = PersistentGraph() g.load_edge_deletions_from_parquet( @@ -704,14 +708,14 @@ def test_edge_both_option_failures_parquet(parquet_files): g.load_edges_from_parquet( edges_parquet_file_path, "time", "src", "dst", layer_col="marbles" ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} g = PersistentGraph() g.load_edges_from_parquet( @@ -724,34 +728,34 @@ def test_edge_both_option_failures_parquet(parquet_files): layer_col="marbles", constant_properties=["marbles"], ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] - assert g.edges.properties.get("marbles").collect() == [ - {"red": "red"}, - {"blue": "blue"}, - {"green": "green"}, - {"yellow": "yellow"}, - {"purple": "purple"}, - ] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} + assert dict(zip(g.edges.id, g.edges.properties.get("marbles"))) == { + (1, 2): {"red": "red"}, + (2, 3): {"blue": "blue"}, + (3, 4): {"green": "green"}, + (4, 5): {"yellow": "yellow"}, + (5, 6): {"purple": "purple"}, + } g = PersistentGraph() g.load_edge_deletions_from_parquet( edges_parquet_file_path, "time", "src", "dst", layer_col="marbles" ) - assert g.edges.layer_names.collect() == [ - ["red"], - ["blue"], - ["green"], - ["yellow"], - ["purple"], - ] - assert g.unique_layers == ["_default", "red", "blue", "green", "yellow", "purple"] + assert dict(zip(g.edges.id, g.edges.layer_names)) == { + (1, 2): ["red"], + (2, 3): ["blue"], + (3, 4): ["green"], + (4, 5): ["yellow"], + (5, 6): ["purple"], + } + assert set(g.unique_layers) == {"_default", "red", "blue", "green", "yellow", "purple"} def test_node_both_option_failures_parquet(parquet_files): @@ -763,8 +767,9 @@ def test_node_both_option_failures_parquet(parquet_files): # CHECK ALL NODE FUNCTIONS ON GRAPH FAIL WITH BOTH NODE_TYPE AND NODE_TYPE_COL with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"node_type\", \"node_type_col\"\)", + Exception, + match=re.escape( + r'Failed to load graph: Failed to load graph WrongNumOfArgs("node_type_name", "node_type_col")'), ): g = Graph() g.load_nodes_from_parquet( @@ -776,8 +781,9 @@ def test_node_both_option_failures_parquet(parquet_files): ) with pytest.raises( - Exception, - match=r"Failed to load graph: Failed to load graph WrongNumOfArgs\(\"node_type\", \"node_type_col\"\)", + Exception, + match=re.escape( + r'Failed to load graph: Failed to load graph WrongNumOfArgs("node_type_name", "node_type_col")'), ): g = Graph() g.load_node_props_from_parquet( @@ -817,10 +823,10 @@ def test_node_both_option_failures_parquet(parquet_files): 
g.load_nodes_from_parquet( nodes_parquet_file_path, "time", "id", node_type_col="node_type" ) - assert g.nodes.node_type.collect() == ["p1", "p2", "p3", "p4", "p5", "p6"] + assert g.nodes.node_type.sorted_by_id() == ["p1", "p2", "p3", "p4", "p5", "p6"] g = Graph() g.load_nodes_from_parquet(nodes_parquet_file_path, "time", "id") g.load_node_props_from_parquet( nodes_parquet_file_path, "id", node_type_col="node_type" ) - assert g.nodes.node_type.collect() == ["p1", "p2", "p3", "p4", "p5", "p6"] + assert g.nodes.node_type.sorted_by_id() == ["p1", "p2", "p3", "p4", "p5", "p6"] diff --git a/raphtory-graphql/src/model/graph/mutable_graph.rs b/raphtory-graphql/src/model/graph/mutable_graph.rs index 35cb1c0bcc..2cab726e57 100644 --- a/raphtory-graphql/src/model/graph/mutable_graph.rs +++ b/raphtory-graphql/src/model/graph/mutable_graph.rs @@ -107,7 +107,7 @@ impl GqlMutableGraph { let node_view = self .graph .node(name) - .ok_or(GraphError::NodeNameError(node.name.clone()))?; + .ok_or_else(|| GraphError::NodeMissingError(GID::Str(node.name.clone())))?; node_view.set_node_type(node_type)?; } let constant_props = node.constant_properties.unwrap_or(vec![]); @@ -115,7 +115,7 @@ impl GqlMutableGraph { let node_view = self .graph .node(name) - .ok_or(GraphError::NodeNameError(node.name))?; + .ok_or(GraphError::NodeMissingError(GID::Str(node.name)))?; node_view.add_constant_properties(as_properties(constant_props))?; } } @@ -165,10 +165,13 @@ impl GqlMutableGraph { } let constant_props = edge.constant_properties.unwrap_or(vec![]); if !constant_props.is_empty() { - let edge_view = self.graph.edge(src, dst).ok_or(GraphError::EdgeNameError { - src: edge.src, - dst: edge.dst, - })?; + let edge_view = self + .graph + .edge(src, dst) + .ok_or(GraphError::EdgeMissingError { + src: GID::Str(edge.src), + dst: GID::Str(edge.dst), + })?; edge_view.add_constant_properties(as_properties(constant_props), layer)?; } } diff --git a/raphtory/Cargo.toml b/raphtory/Cargo.toml index c27facfd0d..45b74c11b8 100644 --- a/raphtory/Cargo.toml +++ b/raphtory/Cargo.toml @@ -73,6 +73,9 @@ pometry-storage = { workspace = true, optional = true } prost = { workspace = true, optional = true } prost-types = { workspace = true, optional = true } +[target.'cfg(target_os = "macos")'.dependencies] +snmalloc-rs = { workspace = true } + [dev-dependencies] csv = { workspace = true } pretty_assertions = { workspace = true } @@ -118,7 +121,7 @@ python = [ "dep:display-error-chain", "polars-arrow?/compute", "raphtory-api/python", - "kdam/notebook" + "kdam/notebook", ] # storage diff --git a/raphtory/src/core/entities/graph/logical_to_physical.rs b/raphtory/src/core/entities/graph/logical_to_physical.rs index 559baa3e1d..16b7c4a867 100644 --- a/raphtory/src/core/entities/graph/logical_to_physical.rs +++ b/raphtory/src/core/entities/graph/logical_to_physical.rs @@ -1,5 +1,10 @@ -use crate::core::utils::errors::{GraphError, MutateGraphError}; +use crate::core::{ + entities::nodes::node_store::NodeStore, + storage::UninitialisedEntry, + utils::errors::{GraphError, MutateGraphError}, +}; use dashmap::mapref::entry::Entry; +use either::Either; use once_cell::sync::OnceCell; use raphtory_api::core::{ entities::{GidRef, VID}, @@ -48,10 +53,26 @@ impl Mapping { } } - pub fn get_or_init( + pub fn set(&self, gid: GidRef, vid: VID) -> Result<(), GraphError> { + let map = self.map.get_or_init(|| match gid { + GidRef::U64(_) => Map::U64(FxDashMap::default()), + GidRef::Str(_) => Map::Str(FxDashMap::default()), + }); + match gid { + GidRef::U64(id) => 
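// NOTE: the loaders resolve ids through this mapping from many threads at once, so node
// creation is split into two phases: `get_or_new` below publishes the new VID to the map
// while the target storage shard stays write-locked inside an `UninitialisedEntry`, and
// the entry only becomes visible once `init()` is called. Sketch of the calling side
// (constructor name illustrative, not the real API):
//     let vid = mapping.get_or_init(gid, || storage.push_node(new_node_store(gid)))?;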
map.as_u64().map(|map| {
+                map.insert(id, vid);
+            }),
+            GidRef::Str(id) => map.as_str().map(|map| {
+                map.insert(id.to_owned(), vid);
+            }),
+        }
+        .ok_or_else(|| MutateGraphError::InvalidNodeId(gid.into()).into())
+    }
+
+    pub fn get_or_init<'a>(
         &self,
         gid: GidRef,
-        f_init: impl FnOnce() -> VID,
+        f_init: impl FnOnce() -> UninitialisedEntry<'a, NodeStore>,
     ) -> Result<MaybeNew<VID>, GraphError> {
         let map = self.map.get_or_init(|| match &gid {
             GidRef::U64(_) => Map::U64(FxDashMap::default()),
@@ -61,7 +82,7 @@ impl Mapping {
             GidRef::U64(id) => map.as_u64().map(|m| get_or_new(m, id, f_init)),
             GidRef::Str(id) => map.as_str().map(|m| optim_get_or_insert(m, id, f_init)),
         };
-        vid.ok_or(GraphError::FailedToMutateGraph {
+        vid.ok_or_else(|| GraphError::FailedToMutateGraph {
             source: MutateGraphError::InvalidNodeId(gid.into()),
         })
     }
@@ -78,10 +99,10 @@ impl Mapping {
 }
 
 #[inline]
-fn optim_get_or_insert(
+fn optim_get_or_insert<'a>(
     m: &FxDashMap<String, VID>,
     id: &str,
-    f_init: impl FnOnce() -> VID,
+    f_init: impl FnOnce() -> UninitialisedEntry<'a, NodeStore>,
 ) -> MaybeNew<VID> {
     m.get(id)
         .map(|vid| MaybeNew::Existing(*vid))
@@ -89,17 +110,27 @@
 }
 
 #[inline]
-fn get_or_new(
+fn get_or_new<'a, K: Eq + Hash>(
     m: &FxDashMap<K, VID>,
     id: K,
-    f_init: impl FnOnce() -> VID,
+    f_init: impl FnOnce() -> UninitialisedEntry<'a, NodeStore>,
 ) -> MaybeNew<VID> {
-    match m.entry(id) {
-        Entry::Occupied(entry) => MaybeNew::Existing(*entry.get()),
+    let entry = match m.entry(id) {
+        Entry::Occupied(entry) => Either::Left(*entry.get()),
         Entry::Vacant(entry) => {
-            let id = f_init();
-            entry.insert(id);
-            MaybeNew::New(id)
+            // This keeps the underlying storage shard locked for deferred initialisation but
+            // allows unlocking the map again.
+            let node = f_init();
+            entry.insert(node.value().vid);
+            Either::Right(node)
+        }
+    };
+    match entry {
+        Either::Left(vid) => MaybeNew::Existing(vid),
+        Either::Right(node_entry) => {
+            let vid = node_entry.value().vid;
+            node_entry.init();
+            MaybeNew::New(vid)
         }
     }
 }
diff --git a/raphtory/src/core/entities/graph/tgraph.rs b/raphtory/src/core/entities/graph/tgraph.rs
index e10434c3c1..82694a68e6 100644
--- a/raphtory/src/core/entities/graph/tgraph.rs
+++ b/raphtory/src/core/entities/graph/tgraph.rs
@@ -20,8 +20,10 @@ use crate::{
         Direction, Prop,
     },
     db::api::{storage::graph::edges::edge_storage_ops::EdgeStorageOps, view::Layer},
+    DEFAULT_NUM_SHARDS,
 };
 use dashmap::DashSet;
+use either::Either;
 use itertools::Itertools;
 use raphtory_api::core::{
     entities::{edges::edge_ref::EdgeRef, GidRef},
@@ -79,7 +81,7 @@ impl std::fmt::Display for TemporalGraph {
 
 impl Default for TemporalGraph {
     fn default() -> Self {
-        Self::new(rayon::current_num_threads())
+        Self::new(DEFAULT_NUM_SHARDS)
     }
 }
@@ -321,16 +323,13 @@ impl TemporalGraph {
     pub(crate) fn link_nodes_inner(
         &self,
         node_pair: &mut PairEntryMut<NodeStore>,
-        edge: &mut EdgeWGuard,
+        edge_id: EID,
         t: TimeIndexEntry,
         layer: usize,
-        edge_fn: impl FnOnce(&mut EdgeWGuard) -> Result<(), GraphError>,
     ) -> Result<(), GraphError> {
-        edge_fn(edge)?;
         self.update_time(t);
         let src_id = node_pair.get_i().vid;
         let dst_id = node_pair.get_j().vid;
-        let edge_id = edge.edge_store().eid;
         let src = node_pair.get_mut_i();
         src.add_edge(dst_id, Direction::OUT, layer, edge_id);
         src.update_time(t);
@@ -353,9 +352,12 @@
             (edge_r.src(), edge_r.dst())
         };
         // need to get the node pair first to avoid deadlocks with link_nodes
-        let mut node_pair = self.storage.pair_node_mut(src, dst);
+        {
+            let mut node_pair = self.storage.pair_node_mut(src, dst);
+            self.link_nodes_inner(&mut node_pair, eid, t, layer)?;
+        }
         let mut edge_w = self.storage.edges.get_edge_mut(eid);
-        self.link_nodes_inner(&mut node_pair, &mut edge_w, t, layer, edge_fn)
+        edge_fn(&mut edge_w)
     }
 
     pub(crate) fn link_nodes<F: FnOnce(&mut EdgeWGuard) -> Result<(), GraphError>>(
         &self,
         src_id: VID,
         dst_id: VID,
         t: TimeIndexEntry,
         layer: usize,
         edge_fn: F,
     ) -> Result<MaybeNew<EID>, GraphError> {
-        let mut node_pair = self.storage.pair_node_mut(src_id, dst_id);
-        let src = node_pair.get_i();
-        let mut edge = match src.find_edge_eid(dst_id, &LayerIds::All) {
-            Some(edge_id) => MaybeNew::Existing(self.storage.get_edge_mut(edge_id)),
-            None => MaybeNew::New(self.storage.push_edge(EdgeStore::new(src_id, dst_id))),
+        let edge = {
+            let mut node_pair = self.storage.pair_node_mut(src_id, dst_id);
+            let src = node_pair.get_i();
+            let edge = match src.find_edge_eid(dst_id, &LayerIds::All) {
+                Some(edge_id) => Either::Left(self.storage.get_edge_mut(edge_id)),
+                None => Either::Right(self.storage.push_edge(EdgeStore::new(src_id, dst_id))),
+            };
+            let eid = match edge.as_ref() {
+                Either::Left(edge) => edge.edge_store().eid,
+                Either::Right(edge) => edge.value().eid,
+            };
+            self.link_nodes_inner(&mut node_pair, eid, t, layer)?;
+            edge
         };
-        self.link_nodes_inner(&mut node_pair, edge.as_mut().inner(), t, layer, edge_fn)?;
-        Ok(edge.map(|e| e.edge_store().eid))
+
+        match edge {
+            Either::Left(mut edge) => {
+                edge_fn(&mut edge)?;
+                Ok(MaybeNew::Existing(edge.edge_store().eid))
+            }
+            Either::Right(edge) => {
+                let mut edge = edge.init();
+                edge_fn(&mut edge)?;
+                Ok(MaybeNew::New(edge.edge_store().eid))
+            }
+        }
     }
 
     pub(crate) fn resolve_node_ref(&self, v: NodeRef) -> Option<VID> {
diff --git a/raphtory/src/core/entities/graph/tgraph_storage.rs b/raphtory/src/core/entities/graph/tgraph_storage.rs
index 69903fd755..ae911a89f7 100644
--- a/raphtory/src/core/entities/graph/tgraph_storage.rs
+++ b/raphtory/src/core/entities/graph/tgraph_storage.rs
@@ -2,8 +2,10 @@ use crate::core::{
     entities::{edges::edge_store::EdgeStore, nodes::node_store::NodeStore, EID, VID},
     storage::{
         self,
-        raw_edges::{EdgeArcGuard, EdgeRGuard, EdgeWGuard, EdgesStorage, LockedEdges},
-        Entry, EntryMut, PairEntryMut,
+        raw_edges::{
+            EdgeArcGuard, EdgeRGuard, EdgeWGuard, EdgesStorage, LockedEdges, UninitialisedEdge,
+        },
+        Entry, EntryMut, PairEntryMut, UninitialisedEntry,
     },
 };
 use serde::{Deserialize, Serialize};
@@ -20,7 +22,7 @@ impl GraphStorage {
     pub(crate) fn new(num_locks: usize) -> Self {
         Self {
             nodes: storage::RawStorage::new(num_locks),
-            edges: EdgesStorage::new(),
+            edges: EdgesStorage::new(num_locks),
         }
     }
 
@@ -45,14 +47,12 @@
     }
 
     #[inline]
-    pub(crate) fn push_node(&self, node: NodeStore) -> VID {
-        self.nodes
-            .push(node, |vid, node| node.vid = vid.into())
-            .into()
+    pub(crate) fn push_node(&self, node: NodeStore) -> UninitialisedEntry<NodeStore> {
+        self.nodes.push(node, |vid, node| node.vid = vid.into())
     }
 
     #[inline]
-    pub(crate) fn push_edge(&self, edge: EdgeStore) -> EdgeWGuard {
-        self.edges.push_edge(edge)
+    pub(crate) fn push_edge(&self, edge: EdgeStore) -> UninitialisedEdge {
+        self.edges.push(edge)
     }
 
     #[inline]
diff --git a/raphtory/src/core/entities/nodes/node_ref.rs b/raphtory/src/core/entities/nodes/node_ref.rs
index a6d7280ee4..2248a667a4 100644
--- a/raphtory/src/core/entities/nodes/node_ref.rs
+++ b/raphtory/src/core/entities/nodes/node_ref.rs
@@ -2,7 +2,7 @@ use crate::core::entities::VID;
 use either::Either;
 use raphtory_api::core::entities::{GidRef, GID};
 
-#[derive(Copy, Clone, PartialOrd, PartialEq, Debug)]
+#[derive(Copy, Clone, PartialOrd, PartialEq, Debug, Eq, Hash, Ord)]
 pub enum NodeRef<'a> {
     Internal(VID),
     External(GidRef<'a>),
diff --git a/raphtory/src/core/storage/mod.rs b/raphtory/src/core/storage/mod.rs
index 084556dd8c..83962a493e 100644
--- a/raphtory/src/core/storage/mod.rs
+++ b/raphtory/src/core/storage/mod.rs
@@ -1,5 +1,5 @@
 use lock_api;
-use parking_lot::{RwLock, RwLockReadGuard};
+use parking_lot::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use std::{
@@ -20,6 +20,25 @@ pub mod timeindex;
 
 type ArcRwLockReadGuard<R, T> = lock_api::ArcRwLockReadGuard<R, T>;
 
+#[must_use]
+pub struct UninitialisedEntry<'a, T> {
+    offset: usize,
+    guard: RwLockWriteGuard<'a, Vec<T>>,
+    value: T,
+}
+
+impl<'a, T: Default> UninitialisedEntry<'a, T> {
+    pub fn init(mut self) {
+        if self.offset >= self.guard.len() {
+            self.guard.resize_with(self.offset + 1, Default::default);
+        }
+        self.guard[self.offset] = self.value;
+    }
+    pub fn value(&self) -> &T {
+        &self.value
+    }
+}
+
 #[inline]
 fn resolve(index: usize, num_buckets: usize) -> (usize, usize) {
     let bucket = index % num_buckets;
@@ -191,27 +210,28 @@ where
         }
     }
 
-    pub fn push(&self, mut value: T, f: F) -> usize {
+    pub fn push(&self, mut value: T, f: F) -> UninitialisedEntry<T> {
         let index = self.len.fetch_add(1, Ordering::Relaxed);
+        f(index, &mut value);
         let (bucket, offset) = self.resolve(index);
-        let mut vec = self.data[bucket].data.write();
-        if offset >= vec.len() {
-            vec.resize_with(offset + 1, || Default::default());
+        let guard = self.data[bucket].data.write();
+        UninitialisedEntry {
+            offset,
+            guard,
+            value,
         }
-        f(index, &mut value);
-        vec[offset] = value;
-        index
     }
 
-    pub fn set(&self, index: Index, value: T) {
+    pub fn set(&self, index: Index, value: T) -> UninitialisedEntry<T> {
         let index = index.into();
         self.len.fetch_max(index + 1, Ordering::Relaxed);
         let (bucket, offset) = self.resolve(index);
-        let mut vec = self.data[bucket].data.write();
-        if offset >= vec.len() {
-            vec.resize_with(offset + 1, || Default::default());
+        let guard = self.data[bucket].data.write();
+        UninitialisedEntry {
+            offset,
+            guard,
+            value,
         }
-        vec[offset] = value;
     }
 
     #[inline]
@@ -391,7 +411,7 @@ mod test {
         let storage = RawStorage::<String, usize>::new(2);
 
         for i in 0..5 {
-            storage.push(i.to_string(), |_, _| {});
+            storage.push(i.to_string(), |_, _| {}).init();
         }
 
         assert_eq!(storage.len(), 5);
@@ -413,7 +433,7 @@ mod test {
         let storage = RawStorage::<String, usize>::new(2);
 
         for i in 0..5 {
-            storage.push(i.to_string(), |_, _| {});
+            storage.push(i.to_string(), |_, _| {}).init();
         }
         let locked = storage.read_lock();
         let actual: Vec<_> = (0..5).map(|i| (i, locked.get(i).as_str())).collect();
@@ -428,7 +448,7 @@ mod test {
         let storage = RawStorage::<String, usize>::new(2);
 
         for i in 0..5 {
-            storage.push(i.to_string(), |_, _| {});
+            storage.push(i.to_string(), |_, _| {}).init();
         }
 
         for i in 0..5 {
@@ -443,7 +463,7 @@
         let mut expected = v
             .into_par_iter()
             .map(|v| {
-                storage.push(v, |_, _| {});
+                storage.push(v, |_, _| {}).init();
                 v
             })
            .collect::<Vec<_>>();
diff --git a/raphtory/src/core/storage/raw_edges.rs b/raphtory/src/core/storage/raw_edges.rs
index b2edcf5e77..0d84ed174b 100644
--- a/raphtory/src/core/storage/raw_edges.rs
+++ b/raphtory/src/core/storage/raw_edges.rs
@@ -1,28 +1,25 @@
-use std::{
-    ops::Deref,
-    sync::{
-        atomic::{self, AtomicUsize},
-        Arc,
+use super::{resolve, timeindex::TimeIndex};
+use crate::{
+    core::entities::{
+        edges::edge_store::{EdgeDataLike, EdgeLayer, EdgeStore},
+        LayerIds,
     },
+    db::api::storage::graph::edges::edge_storage_ops::{EdgeStorageOps, MemEdge},
+
DEFAULT_NUM_SHARDS, }; - use lock_api::ArcRwLockReadGuard; use parking_lot::{RwLock, RwLockReadGuard, RwLockWriteGuard}; +use raphtory_api::core::{entities::EID, storage::timeindex::TimeIndexEntry}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; - -use raphtory_api::core::{entities::EID, storage::timeindex::TimeIndexEntry}; - -use crate::{ - core::entities::{ - edges::edge_store::{EdgeDataLike, EdgeLayer, EdgeStore}, - LayerIds, +use std::{ + ops::Deref, + sync::{ + atomic::{self, AtomicUsize}, + Arc, }, - db::api::storage::graph::edges::edge_storage_ops::{EdgeStorageOps, MemEdge}, }; -use super::{resolve, timeindex::TimeIndex}; - #[derive(Debug, Serialize, Deserialize, PartialEq)] pub struct EdgeShard { edge_ids: Vec, @@ -31,6 +28,31 @@ pub struct EdgeShard { deletions: Vec>>, } +#[must_use] +pub struct UninitialisedEdge<'a> { + guard: RwLockWriteGuard<'a, EdgeShard>, + offset: usize, + value: EdgeStore, +} + +impl<'a> UninitialisedEdge<'a> { + pub fn init(mut self) -> EdgeWGuard<'a> { + self.guard.insert(self.offset, self.value); + EdgeWGuard { + guard: self.guard, + i: self.offset, + } + } + + pub fn value(&self) -> &EdgeStore { + &self.value + } + + pub fn value_mut(&mut self) -> &mut EdgeStore { + &mut self.value + } +} + impl EdgeShard { pub fn insert(&mut self, index: usize, value: EdgeStore) { if index >= self.edge_ids.len() { @@ -67,8 +89,6 @@ impl EdgeShard { } } -pub const SHARD_SIZE: usize = 64; - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct EdgesStorage { shards: Arc<[Arc>]>, @@ -88,13 +108,13 @@ impl PartialEq for EdgesStorage { impl Default for EdgesStorage { fn default() -> Self { - Self::new() + Self::new(DEFAULT_NUM_SHARDS) } } impl EdgesStorage { - pub fn new() -> Self { - let shards = (0..SHARD_SIZE).map(|_| { + pub fn new(num_shards: usize) -> Self { + let shards = (0..num_shards).map(|_| { Arc::new(RwLock::new(EdgeShard { edge_ids: vec![], props: Vec::with_capacity(0), @@ -113,12 +133,6 @@ impl EdgesStorage { self.len.load(atomic::Ordering::SeqCst) } - pub(crate) fn push_edge(&self, edge: EdgeStore) -> EdgeWGuard { - let (eid, mut edge) = self.push(edge); - edge.edge_store_mut().eid = eid; - edge - } - pub fn read_lock(&self) -> LockedEdges { LockedEdges { shards: self @@ -135,24 +149,28 @@ impl EdgesStorage { resolve(index, self.shards.len()) } - fn push(&self, value: EdgeStore) -> (EID, EdgeWGuard) { + pub(crate) fn push(&self, mut value: EdgeStore) -> UninitialisedEdge { let index = self.len.fetch_add(1, atomic::Ordering::Relaxed); + value.eid = EID(index); let (bucket, offset) = self.resolve(index); - let mut shard = self.shards[bucket].write(); - shard.insert(offset, value); - let guard = EdgeWGuard { - guard: shard, - i: offset, - }; - (index.into(), guard) + let guard = self.shards[bucket].write(); + UninitialisedEdge { + guard, + offset, + value, + } } - pub(crate) fn set(&self, value: EdgeStore) { + pub(crate) fn set(&self, value: EdgeStore) -> UninitialisedEdge { let EID(index) = value.eid; self.len.fetch_max(index + 1, atomic::Ordering::Relaxed); let (bucket, offset) = self.resolve(index); - let mut shard = self.shards[bucket].write(); - shard.insert(offset, value); + let guard = self.shards[bucket].write(); + UninitialisedEdge { + guard, + offset, + value, + } } pub fn get_edge_mut(&self, eid: EID) -> EdgeWGuard { diff --git a/raphtory/src/core/utils/errors.rs b/raphtory/src/core/utils/errors.rs index 33427a09eb..34afa73761 100644 --- a/raphtory/src/core/utils/errors.rs +++ b/raphtory/src/core/utils/errors.rs @@ -1,6 +1,6 @@ use 
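// NOTE: the typed `LoadError` enum added below replaces the old catch-all
// `GraphError::UnsupportedDataType`, so each parallel row loader can fail fast with a
// precise cause. A minimal sketch of how a caller might match on it, relying only on
// the `#[from]` conversion declared further down:
//     match result {
//         Err(GraphError::LoadError { source: LoadError::MissingSrcError }) => {
//             eprintln!("a source id was null in the dataframe");
//         }
//         other => other.expect("load failed"),
//     }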
crate::core::{utils::time::error::ParseTimeError, Prop, PropType};
 #[cfg(feature = "arrow")]
-use polars_arrow::legacy::error;
+use polars_arrow::{datatypes::ArrowDataType, legacy::error};
 #[cfg(feature = "storage")]
 use pometry_storage::RAError;
 #[cfg(feature = "python")]
@@ -38,6 +38,29 @@ pub enum InvalidPathReason {
     PathIsDirectory(PathBuf),
 }
 
+#[cfg(feature = "arrow")]
+#[derive(thiserror::Error, Debug)]
+pub enum LoadError {
+    #[error("Only str columns are supported for layers, got {0:?}")]
+    InvalidLayerType(ArrowDataType),
+    #[error("Only str columns are supported for node type, got {0:?}")]
+    InvalidNodeType(ArrowDataType),
+    #[error("{0:?} not supported as property type")]
+    InvalidPropertyType(ArrowDataType),
+    #[error("{0:?} not supported as node id type")]
+    InvalidNodeIdType(ArrowDataType),
+    #[error("{0:?} not supported for time column")]
+    InvalidTimestamp(ArrowDataType),
+    #[error("Missing value for src id")]
+    MissingSrcError,
+    #[error("Missing value for dst id")]
+    MissingDstError,
+    #[error("Missing value for node id")]
+    MissingNodeError,
+    #[error("Missing value for timestamp")]
+    MissingTimeError,
+}
+
 #[derive(thiserror::Error, Debug)]
 pub enum GraphError {
     #[error("You cannot set ‘{0}’ and ‘{1}’ at the same time. Please pick one or the other.")]
@@ -50,8 +73,12 @@ pub enum GraphError {
         #[from]
         source: InvalidPathReason,
     },
-    #[error("Graph error occurred")]
-    UnsupportedDataType,
+    #[cfg(feature = "arrow")]
+    #[error("{source}")]
+    LoadError {
+        #[from]
+        source: LoadError,
+    },
     #[error("Disk graph not found")]
     DiskGraphNotFound,
     #[error("Disk Graph is immutable")]
@@ -96,20 +123,14 @@ pub enum GraphError {
     #[error("Edge already exists for nodes {0:?} {1:?}")]
     EdgeExistsError(GID, GID),
 
-    #[error("No Node with ID {0}")]
-    NodeIdError(u64),
-
-    #[error("No Node with name {0}")]
-    NodeNameError(String),
+    #[error("Node {0} does not exist")]
+    NodeMissingError(GID),
 
     #[error("Node Type Error {0}")]
     NodeTypeError(String),
 
     #[error("No Edge between {src} and {dst}")]
-    EdgeIdError { src: u64, dst: u64 },
-
-    #[error("No Edge between {src} and {dst}")]
-    EdgeNameError { src: String, dst: String },
+    EdgeMissingError { src: GID, dst: GID },
 
     // wasm
     #[error("Node is not String or Number")]
     NodeIdNotStringOrNumber,
diff --git a/raphtory/src/db/api/mutation/internal/internal_addition_ops.rs b/raphtory/src/db/api/mutation/internal/internal_addition_ops.rs
index e5ec60bbda..8c3346d259 100644
--- a/raphtory/src/db/api/mutation/internal/internal_addition_ops.rs
+++ b/raphtory/src/db/api/mutation/internal/internal_addition_ops.rs
@@ -12,9 +12,12 @@ use raphtory_api::core::storage::dict_mapper::MaybeNew;
 
 #[enum_dispatch]
 pub trait InternalAdditionOps {
+    fn num_shards(&self) -> Result<usize, GraphError>;
     /// get the sequence id for the next event
     fn next_event_id(&self) -> Result<usize, GraphError>;
 
+    fn reserve_event_ids(&self, num_ids: usize) -> Result<usize, GraphError>;
+
     /// map layer name to id and allocate a new layer if needed
     fn resolve_layer(&self, layer: Option<&str>) -> Result<MaybeNew<usize>, GraphError>;
 
@@ -99,11 +102,21 @@ pub trait DelegateAdditionOps {
 }
 
 impl<G: DelegateAdditionOps> InternalAdditionOps for G {
+    #[inline]
+    fn num_shards(&self) -> Result<usize, GraphError> {
+        self.graph().num_shards()
+    }
+
     #[inline(always)]
     fn next_event_id(&self) -> Result<usize, GraphError> {
         self.graph().next_event_id()
     }
 
+    #[inline]
+    fn reserve_event_ids(&self, num_ids: usize) -> Result<usize, GraphError> {
+        self.graph().reserve_event_ids(num_ids)
+    }
+
     #[inline]
     fn resolve_layer(&self, layer: Option<&str>) -> Result<MaybeNew<usize>, GraphError> {
         self.graph().resolve_layer(layer)
diff --git a/raphtory/src/db/api/state/ops.rs b/raphtory/src/db/api/state/ops.rs
index bef3046e86..9d488bd43c 100644
--- a/raphtory/src/db/api/state/ops.rs
+++ b/raphtory/src/db/api/state/ops.rs
@@ -4,7 +4,7 @@ use crate::{
         api::state::{node_state::NodeState, ord_ops, Index},
         graph::node::NodeView,
     },
-    prelude::GraphViewOps,
+    prelude::{GraphViewOps, NodeViewOps},
 };
 use num_traits::AsPrimitive;
 use rayon::{
@@ -115,6 +115,29 @@ pub trait NodeStateOps<'graph>: IntoIterator {
         }
     }
 
+    /// Sort the results by global node id
+    fn sort_by_id(&self) -> NodeState<'graph, Self::OwnedValue, Self::BaseGraph, Self::Graph> {
+        let mut state: Vec<_> = self
+            .par_iter()
+            .map(|(n, v)| (n.id(), n.node, v.borrow().clone()))
+            .collect();
+        state.par_sort_by(|(l_id, l_n, _), (r_id, r_n, _)| (l_id, l_n).cmp(&(r_id, r_n)));
+
+        let mut keys = Vec::with_capacity(state.len());
+        let mut values = Vec::with_capacity(state.len());
+        state
+            .into_par_iter()
+            .map(|(_, n, v)| (n, v))
+            .unzip_into_vecs(&mut keys, &mut values);
+
+        NodeState::new(
+            self.base_graph().clone(),
+            self.graph().clone(),
+            values,
+            Some(Index::from(keys)),
+        )
+    }
+
     /// Retrieves the top-k elements from the `AlgorithmResult` based on its values.
     ///
     /// Arguments:
diff --git a/raphtory/src/db/api/storage/graph/storage_ops/additions.rs b/raphtory/src/db/api/storage/graph/storage_ops/additions.rs
index 4372e6ab85..9128594be8 100644
--- a/raphtory/src/db/api/storage/graph/storage_ops/additions.rs
+++ b/raphtory/src/db/api/storage/graph/storage_ops/additions.rs
@@ -19,10 +19,18 @@ use raphtory_api::core::{
 use std::sync::atomic::Ordering;
 
 impl InternalAdditionOps for TemporalGraph {
+    fn num_shards(&self) -> Result<usize, GraphError> {
+        Ok(self.storage.nodes.data.len())
+    }
+
     fn next_event_id(&self) -> Result<usize, GraphError> {
         Ok(self.event_counter.fetch_add(1, Ordering::Relaxed))
     }
 
+    fn reserve_event_ids(&self, num_ids: usize) -> Result<usize, GraphError> {
+        Ok(self.event_counter.fetch_add(num_ids, Ordering::Relaxed))
+    }
+
     fn resolve_layer(&self, layer: Option<&str>) -> Result<MaybeNew<usize>, GraphError> {
         Ok(layer
             .map(|name| self.edge_meta.get_or_create_layer_id(name))
@@ -160,6 +168,13 @@
 }
 
 impl InternalAdditionOps for GraphStorage {
+    fn num_shards(&self) -> Result<usize, GraphError> {
+        match self {
+            GraphStorage::Unlocked(storage) => storage.num_shards(),
+            _ => Err(GraphError::AttemptToMutateImmutableGraph),
+        }
+    }
+
     fn next_event_id(&self) -> Result<usize, GraphError> {
         match self {
             GraphStorage::Unlocked(storage) => storage.next_event_id(),
@@ -167,6 +182,13 @@
         }
     }
 
+    fn reserve_event_ids(&self, num_ids: usize) -> Result<usize, GraphError> {
+        match self {
+            GraphStorage::Unlocked(storage) => storage.reserve_event_ids(num_ids),
+            _ => Err(GraphError::AttemptToMutateImmutableGraph),
+        }
+    }
+
     fn resolve_layer(&self, layer: Option<&str>) -> Result<MaybeNew<usize>, GraphError> {
         match self {
             GraphStorage::Unlocked(storage) => storage.resolve_layer(layer),
diff --git a/raphtory/src/db/api/storage/storage.rs b/raphtory/src/db/api/storage/storage.rs
index 8c289f5827..8610530da0 100644
--- a/raphtory/src/db/api/storage/storage.rs
+++ b/raphtory/src/db/api/storage/storage.rs
@@ -80,11 +80,21 @@ impl Storage {
 impl InheritViewOps for Storage {}
 
 impl InternalAdditionOps for Storage {
+    #[inline]
+    fn num_shards(&self) -> Result<usize, GraphError> {
+        self.graph.num_shards()
+    }
+
     #[inline]
     fn next_event_id(&self) -> Result<usize, GraphError> {
         self.graph.next_event_id()
     }
 
+    #[inline]
+    fn reserve_event_ids(&self, num_ids: usize) -> Result<usize, GraphError> {
+        self.graph.reserve_event_ids(num_ids)
+    }
+
     fn resolve_layer(&self, layer: Option<&str>) -> Result<MaybeNew<usize>, GraphError> {
         let id =
self.graph.resolve_layer(layer)?; diff --git a/raphtory/src/io/arrow/dataframe.rs b/raphtory/src/io/arrow/dataframe.rs index 76b5a6ed0b..d3c95fd2af 100644 --- a/raphtory/src/io/arrow/dataframe.rs +++ b/raphtory/src/io/arrow/dataframe.rs @@ -1,14 +1,14 @@ -use crate::core::utils::errors::GraphError; - +use crate::{ + core::utils::errors::{GraphError, LoadError}, + io::arrow::node_col::{lift_node_col, NodeCol}, +}; +use itertools::Itertools; use polars_arrow::{ - array::{Array, PrimitiveArray, Utf8Array}, + array::{Array, PrimitiveArray, StaticArray}, compute::cast::{self, CastOptions}, datatypes::{ArrowDataType as DataType, TimeUnit}, - offset::Offset, - types::NativeType, }; - -use itertools::Itertools; +use rayon::prelude::*; pub(crate) struct DFView { pub names: Vec, @@ -40,37 +40,17 @@ where } } -#[derive(Clone)] -pub(crate) struct DFChunk { - pub(crate) chunk: Vec>, -} +pub struct TimeCol(PrimitiveArray); -impl DFChunk { - pub(crate) fn iter_col( - &self, - idx: usize, - ) -> Option> + '_> { - let col_arr = (&self.chunk)[idx] - .as_any() - .downcast_ref::>()?; - Some(col_arr.iter()) - } - - pub fn utf8(&self, idx: usize) -> Option> + '_> { - // test that it's actually a utf8 array - let col_arr = (&self.chunk)[idx].as_any().downcast_ref::>()?; - - Some(col_arr.iter()) - } - - pub fn time_iter_col(&self, idx: usize) -> Option> + '_> { - let col_arr = (&self.chunk)[idx] +impl TimeCol { + fn new(arr: &dyn Array) -> Result { + let arr = arr .as_any() - .downcast_ref::>()?; - - let arr = if let DataType::Timestamp(_, _) = col_arr.data_type() { + .downcast_ref::>() + .ok_or_else(|| LoadError::InvalidTimestamp(arr.data_type().clone()))?; + let arr = if let DataType::Timestamp(_, _) = arr.data_type() { let array = cast::cast( - col_arr, + arr, &DataType::Timestamp(TimeUnit::Millisecond, Some("UTC".to_string())), CastOptions::default(), ) @@ -81,9 +61,35 @@ impl DFChunk { .unwrap() .clone() } else { - col_arr.clone() + arr.clone() }; + Ok(Self(arr)) + } + + pub fn par_iter(&self) -> impl IndexedParallelIterator> + '_ { + (0..self.0.len()).into_par_iter().map(|i| self.get(i)) + } + + pub fn get(&self, i: usize) -> Option { + self.0.get(i) + } +} + +#[derive(Clone)] +pub(crate) struct DFChunk { + pub(crate) chunk: Vec>, +} + +impl DFChunk { + pub fn len(&self) -> usize { + self.chunk.first().map(|c| c.len()).unwrap_or(0) + } + + pub fn node_col(&self, index: usize) -> Result { + lift_node_col(index, self) + } - Some(arr.into_iter()) + pub fn time_col(&self, index: usize) -> Result { + TimeCol::new(self.chunk[index].as_ref()) } } diff --git a/raphtory/src/io/arrow/df_loaders.rs b/raphtory/src/io/arrow/df_loaders.rs index a7aeeca5ca..72573865bb 100644 --- a/raphtory/src/io/arrow/df_loaders.rs +++ b/raphtory/src/io/arrow/df_loaders.rs @@ -1,18 +1,21 @@ use crate::{ - core::utils::errors::GraphError, - db::api::{ - mutation::{internal::*, AdditionOps}, - view::StaticGraphViewOps, + core::{ + utils::errors::{GraphError, LoadError}, + PropType, }, + db::api::{mutation::internal::*, view::StaticGraphViewOps}, io::arrow::{ dataframe::{DFChunk, DFView}, + layer_col::{lift_layer_col, lift_node_type_col}, + node_col::lift_node_col, prop_handler::*, }, prelude::*, }; - use kdam::{Bar, BarBuilder, BarExt}; -use std::{collections::HashMap, iter}; +use raphtory_api::core::storage::{dict_mapper::MaybeNew, timeindex::TimeIndexEntry}; +use rayon::prelude::*; +use std::collections::HashMap; fn build_progress_bar(des: String, num_rows: usize) -> Result { BarBuilder::default() @@ -23,11 +26,17 @@ fn 
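// NOTE: the rewritten loaders below all follow the same chunked pattern: each `DFChunk`
// is lifted into typed column views once, then its rows are written in parallel with
// rayon. A rough sketch of the shape (helper names as in dataframe.rs, control flow
// simplified):
//     for chunk in df_view.chunks {
//         let df = chunk?;                         // one arrow chunk
//         let nodes = df.node_col(node_id_index)?; // typed, thread-safe column views
//         let times = df.time_col(time_index)?;
//         nodes
//             .par_iter()
//             .zip(times.par_iter())
//             .try_for_each(|(node, time)| write_row(node, time))?; // illustrative
//     }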
build_progress_bar(des: String, num_rows: usize) -> Result { .build() .map_err(|_| GraphError::TqdmError) } -fn extract_out_default_type(n_t: Option<&str>) -> Option<&str> { - if n_t == Some("_default") { - None - } else { - n_t + +fn process_shared_properties( + props: Option<&HashMap>, + resolver: impl Fn(&str, PropType) -> Result, GraphError>, +) -> Result, GraphError> { + match props { + None => Ok(vec![]), + Some(props) => props + .iter() + .map(|(key, prop)| Ok((resolver(key, prop.dtype())?.inner(), prop.clone()))) + .collect(), } } @@ -66,120 +75,61 @@ pub(crate) fn load_nodes_from_df< let node_id_index = df_view.get_index(node_id)?; let time_index = df_view.get_index(time)?; + + let shared_constant_properties = + process_shared_properties(shared_constant_properties, |key, dtype| { + graph.resolve_node_property(key, dtype, true) + })?; + let mut pb = build_progress_bar("Loading nodes".to_string(), df_view.num_rows)?; + let mut start_id = graph.reserve_event_ids(df_view.num_rows)?; for chunk in df_view.chunks { let df = chunk?; - let prop_iter = combine_properties(properties, &properties_indices, &df)?; - let const_prop_iter = - combine_properties(constant_properties, &constant_properties_indices, &df)?; - - let node_type: Result>>, GraphError> = - match (node_type, node_type_index) { - (None, None) => Ok(Box::new(iter::repeat(None))), - (Some(node_type), None) => Ok(Box::new(iter::repeat(Some(node_type)))), - (None, Some(node_type_index)) => { - let iter_res: Result>>, GraphError> = - if let Some(node_types) = df.utf8::(node_type_index) { - Ok(Box::new(node_types)) - } else if let Some(node_types) = df.utf8::(node_type_index) { - Ok(Box::new(node_types)) - } else { - Err(GraphError::LoadFailure( - "Unable to convert / find node_type column in dataframe." 
- .to_string(), - )) - }; - iter_res - } - _ => Err(GraphError::WrongNumOfArgs( - "node_type".to_string(), - "node_type_col".to_string(), - )), - }; - let node_type = node_type?; - - if let (Some(node_id), Some(time)) = ( - df.iter_col::(node_id_index), - df.time_iter_col(time_index), - ) { - let iter = node_id - .map(|i| i.copied()) - .zip(time) - .zip(node_type) - .map(|((node_id, time), n_t)| (node_id, time, n_t)); - load_nodes_from_num_iter( - graph, - &mut pb, - iter, - prop_iter, - const_prop_iter, - shared_constant_properties, - )?; - } else if let (Some(node_id), Some(time)) = ( - df.iter_col::(node_id_index), - df.time_iter_col(time_index), - ) { - let iter = node_id.map(i64_opt_into_u64_opt).zip(time); - let iter = iter - .zip(node_type) - .map(|((node_id, time), n_t)| (node_id, time, n_t)); - - load_nodes_from_num_iter( - graph, - &mut pb, - iter, - prop_iter, - const_prop_iter, - shared_constant_properties, - )?; - } else if let (Some(node_id), Some(time)) = - (df.utf8::(node_id_index), df.time_iter_col(time_index)) - { - let iter = node_id.into_iter().zip(time); - let iter = iter - .zip(node_type) - .map(|((node_id, time), n_t)| (node_id, time, n_t)); - - for (((node_id, time, n_t), props), const_props) in - iter.zip(prop_iter).zip(const_prop_iter) - { - if let (Some(node_id), Some(time), n_t) = (node_id, time, n_t) { - let actual_type = extract_out_default_type(n_t); - let v = graph.add_node(time, node_id, props, actual_type)?; - v.add_constant_properties(const_props)?; - if let Some(shared_const_props) = &shared_constant_properties { - v.add_constant_properties(shared_const_props.iter())?; - } + let prop_cols = combine_properties(properties, &properties_indices, &df, |key, dtype| { + graph.resolve_node_property(key, dtype, false) + })?; + let const_prop_cols = combine_properties( + constant_properties, + &constant_properties_indices, + &df, + |key, dtype| graph.resolve_node_property(key, dtype, true), + )?; + let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + let time_col = df.time_col(time_index)?; + let node_col = df.node_col(node_id_index)?; + + node_col + .par_iter() + .zip(time_col.par_iter()) + .zip(node_type_col.par_iter()) + .zip(prop_cols.par_rows()) + .zip(const_prop_cols.par_rows()) + .enumerate() + .try_for_each(|(id, ((((node, time), node_type), t_props), c_props))| { + let node = node.ok_or(LoadError::MissingNodeError)?; + let time = time.ok_or(LoadError::MissingTimeError)?; + let node_id = match node_type { + None => graph.resolve_node(node)?.inner(), + Some(node_type) => graph + .resolve_node_and_type(node, node_type)? 
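// NOTE: `start_id` was obtained from `reserve_event_ids(df_view.num_rows)` above, so row
// `id` of the chunk owns the unique secondary index `start_id + id`. This is what keeps
// parallel loading deterministic: `TimeIndexEntry(time, start_id + id)` reproduces the
// sequential insertion order no matter which worker thread commits the row first. For
// example, with start_id = 100, row 3 always becomes `TimeIndexEntry(t, 103)`.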
+ .inner() + .0 + .inner(), + }; + let t = TimeIndexEntry(time, start_id + id); + let t_props: Vec<_> = t_props.collect(); + graph.internal_add_node(t, node_id, &t_props)?; + let c_props: Vec<_> = c_props + .chain(shared_constant_properties.iter().cloned()) + .collect(); + if !c_props.is_empty() { + graph.internal_add_constant_node_properties(node_id, &c_props)?; } - let _ = pb.update(1); - } - } else if let (Some(node_id), Some(time)) = - (df.utf8::(node_id_index), df.time_iter_col(time_index)) - { - let iter = node_id.into_iter().zip(time); - let iter = iter - .zip(node_type) - .map(|((node_id, time), n_t)| (node_id, time, n_t)); - - for (((node_id, time, n_t), props), const_props) in - iter.zip(prop_iter).zip(const_prop_iter) - { - let actual_type = extract_out_default_type(n_t); - if let (Some(node_id), Some(time), n_t) = (node_id, time, actual_type) { - let v = graph.add_node(time, node_id, props, n_t)?; - v.add_constant_properties(const_props)?; - if let Some(shared_const_props) = shared_constant_properties { - v.add_constant_properties(shared_const_props)?; - } - } - let _ = pb.update(1); - } - } else { - return Err(GraphError::LoadFailure( - "node id column must be either u64 or text, time column must be i64. Ensure these contain no NaN, Null or None values.".to_string(), - )); - }; + Ok::<(), GraphError>(()) + })?; + let _ = pb.update(df.len()); + start_id += df.len(); } Ok(()) } @@ -215,100 +165,64 @@ pub(crate) fn load_edges_from_df< let dst_index = df_view.get_index(dst)?; let time_index = df_view.get_index(time)?; let layer_index = if let Some(layer_col) = layer_col { - Some(df_view.get_index(layer_col.as_ref())) + Some(df_view.get_index(layer_col.as_ref())?) } else { None }; - let layer_index = layer_index.transpose()?; + let shared_constant_properties = + process_shared_properties(shared_constant_properties, |key, dtype| { + graph.resolve_edge_property(key, dtype, true) + })?; + let mut pb = build_progress_bar("Loading edges".to_string(), df_view.num_rows)?; + let _ = pb.update(0); + let mut start_idx = graph.reserve_event_ids(df_view.num_rows)?; for chunk in df_view.chunks { let df = chunk?; - let prop_iter = combine_properties(properties, &properties_indices, &df)?; - let const_prop_iter = - combine_properties(constant_properties, &constant_properties_indices, &df)?; - - let layer = lift_layer(layer, layer_index, &df)?; - - if let (Some(src), Some(dst), Some(time)) = ( - df.iter_col::(src_index), - df.iter_col::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src - .map(|i| i.copied()) - .zip(dst.map(|i| i.copied())) - .zip(time); - load_edges_from_num_iter( - graph, - &mut pb, - triplets, - prop_iter, - const_prop_iter, - shared_constant_properties, - layer, - )?; - } else if let (Some(src), Some(dst), Some(time)) = ( - df.iter_col::(src_index), - df.iter_col::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src - .map(i64_opt_into_u64_opt) - .zip(dst.map(i64_opt_into_u64_opt)) - .zip(time); - load_edges_from_num_iter( - graph, - &mut pb, - triplets, - prop_iter, - const_prop_iter, - shared_constant_properties, - layer, - )?; - } else if let (Some(src), Some(dst), Some(time)) = ( - df.utf8::(src_index), - df.utf8::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src.into_iter().zip(dst.into_iter()).zip(time.into_iter()); - - for (((((src, dst), time), props), const_props), layer) in - triplets.zip(prop_iter).zip(const_prop_iter).zip(layer) - { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - let e 
= graph.add_edge(time, src, dst, props, layer.as_deref())?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst), Some(time)) = ( - df.utf8::(src_index), - df.utf8::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src.into_iter().zip(dst.into_iter()).zip(time.into_iter()); - for (((((src, dst), time), props), const_props), layer) in - triplets.zip(prop_iter).zip(const_prop_iter).zip(layer) - { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - let e = graph.add_edge(time, src, dst, props, layer.as_deref())?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } + let prop_cols = combine_properties(properties, &properties_indices, &df, |key, dtype| { + graph.resolve_edge_property(key, dtype, false) + })?; + let const_prop_cols = combine_properties( + constant_properties, + &constant_properties_indices, + &df, + |key, dtype| graph.resolve_edge_property(key, dtype, true), + )?; + let layer = lift_layer_col(layer, layer_index, &df)?; + let src_col = df.node_col(src_index)?; + let dst_col = df.node_col(dst_index)?; + let time_col = df.time_col(time_index)?; + src_col + .par_iter() + .zip(dst_col.par_iter()) + .zip(time_col.par_iter()) + .zip(layer.par_iter()) + .zip(prop_cols.par_rows()) + .zip(const_prop_cols.par_rows()) + .enumerate() + .try_for_each(|(idx, (((((src, dst), time), layer), t_props), c_props))| { + let src = src.ok_or(LoadError::MissingSrcError)?; + let dst = dst.ok_or(LoadError::MissingDstError)?; + let time = time.ok_or(LoadError::MissingTimeError)?; + let time_idx = TimeIndexEntry(time, start_idx + idx); + let src = graph.resolve_node(src)?.inner(); + let dst = graph.resolve_node(dst)?.inner(); + let layer = graph.resolve_layer(layer)?.inner(); + let t_props: Vec<_> = t_props.collect(); + let c_props: Vec<_> = c_props + .chain(shared_constant_properties.iter().cloned()) + .collect(); + let eid = graph + .internal_add_edge(time_idx, src, dst, &t_props, layer)? + .inner(); + if !c_props.is_empty() { + graph.internal_add_constant_edge_properties(eid, layer, &c_props)?; } - let _ = pb.update(1); - } - } else { - return Err(GraphError::LoadFailure( - "Source and Target columns must be either u64 or text, Time column must be i64. Ensure these contain no NaN, Null or None values." 
- .to_string(), - )); - }; + Ok::<(), GraphError>(()) + })?; + start_idx += df.len(); + let _ = pb.update(df.len()); } Ok(()) } @@ -335,74 +249,29 @@ pub(crate) fn load_edge_deletions_from_df< }; let layer_index = layer_index.transpose()?; let mut pb = build_progress_bar("Loading edge deletions".to_string(), df_view.num_rows)?; + let mut start_idx = graph.reserve_event_ids(df_view.num_rows)?; for chunk in df_view.chunks { let df = chunk?; - let layer = lift_layer(layer, layer_index, &df)?; - - if let (Some(src), Some(dst), Some(time)) = ( - df.iter_col::(src_index), - df.iter_col::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src - .map(|i| i.copied()) - .zip(dst.map(|i| i.copied())) - .zip(time); - - for (((src, dst), time), layer) in triplets.zip(layer) { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - graph.delete_edge(time, src, dst, layer.as_deref())?; - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst), Some(time)) = ( - df.iter_col::(src_index), - df.iter_col::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src - .map(i64_opt_into_u64_opt) - .zip(dst.map(i64_opt_into_u64_opt)) - .zip(time); - - for (((src, dst), time), layer) in triplets.zip(layer) { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - graph.delete_edge(time, src, dst, layer.as_deref())?; - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst), Some(time)) = ( - df.utf8::(src_index), - df.utf8::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src.into_iter().zip(dst.into_iter()).zip(time.into_iter()); - for (((src, dst), time), layer) in triplets.zip(layer) { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - graph.delete_edge(time, src, dst, layer.as_deref())?; - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst), Some(time)) = ( - df.utf8::(src_index), - df.utf8::(dst_index), - df.time_iter_col(time_index), - ) { - let triplets = src.into_iter().zip(dst.into_iter()).zip(time.into_iter()); - - for (((src, dst), time), layer) in triplets.zip(layer) { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - graph.delete_edge(time, src, dst, layer.as_deref())?; - } - let _ = pb.update(1); - } - } else { - return Err(GraphError::LoadFailure( - "Source and Target columns must be either u64 or text, Time column must be i64. Ensure these contain no NaN, Null or None values." - .to_string(), - )); - }; + let layer = lift_layer_col(layer, layer_index, &df)?; + let src_col = df.node_col(src_index)?; + let dst_col = df.node_col(dst_index)?; + let time_col = df.time_col(time_index)?; + src_col + .par_iter() + .zip(dst_col.par_iter()) + .zip(time_col.par_iter()) + .zip(layer.par_iter()) + .enumerate() + .try_for_each(|(idx, (((src, dst), time), layer))| { + let src = src.ok_or(LoadError::MissingSrcError)?; + let dst = dst.ok_or(LoadError::MissingDstError)?; + let time = time.ok_or(LoadError::MissingTimeError)?; + graph.delete_edge((time, start_idx + idx), src, dst, layer)?; + Ok::<(), GraphError>(()) + })?; + let _ = pb.update(df.len()); + start_idx += df.len(); } Ok(()) @@ -427,115 +296,57 @@ pub(crate) fn load_node_props_from_df< .collect::, GraphError>>()?; let node_id_index = df_view.get_index(node_id)?; let node_type_index = if let Some(node_type_col) = node_type_col { - Some(df_view.get_index(node_type_col.as_ref())) + Some(df_view.get_index(node_type_col.as_ref())?) 
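// NOTE: as with `layer`/`layer_col`, only one of `node_type`/`node_type_col` may be
// given; `lift_node_type_col` (defined in layer_col.rs further down) rejects the
// ambiguous case with `GraphError::WrongNumOfArgs`. Conceptually the lifting is:
//     match (node_type, node_type_index) {
//         (name, None) => /* constant column repeating `name` */,
//         (None, Some(i)) => /* per-row strings from column `i` */,
//         _ => /* Err(WrongNumOfArgs("node_type_name", "node_type_col")) */,
//     }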
} else { None }; - let node_type_index = node_type_index.transpose()?; + let shared_constant_properties = match shared_constant_properties { + Some(props) => props + .iter() + .map(|(name, prop)| { + Ok(( + graph + .resolve_node_property(name, prop.dtype(), true)? + .inner(), + prop.clone(), + )) + }) + .collect::, GraphError>>()?, + None => vec![], + }; let mut pb = build_progress_bar("Loading node properties".to_string(), df_view.num_rows)?; for chunk in df_view.chunks { let df = chunk?; - let const_prop_iter = - combine_properties(constant_properties, &constant_properties_indices, &df)?; - - let node_type: Result>>, GraphError> = - match (node_type, node_type_index) { - (None, None) => Ok(Box::new(iter::repeat(None))), - (Some(node_type), None) => Ok(Box::new(iter::repeat(Some(node_type)))), - (None, Some(node_type_index)) => { - let iter_res: Result>>, GraphError> = - if let Some(node_types) = df.utf8::(node_type_index) { - Ok(Box::new(node_types)) - } else if let Some(node_types) = df.utf8::(node_type_index) { - Ok(Box::new(node_types)) - } else { - Err(GraphError::LoadFailure( - "Unable to convert / find node_type column in dataframe." - .to_string(), - )) - }; - iter_res - } - _ => Err(GraphError::WrongNumOfArgs( - "node_type".to_string(), - "node_type_col".to_string(), - )), - }; - let node_type = node_type?; - - if let Some(node_id) = df.iter_col::(node_id_index) { - let iter = node_id.map(|i| i.copied()); - for ((node_id, const_props), node_type) in iter.zip(const_prop_iter).zip(node_type) { - if let Some(node_id) = node_id { - let v = graph - .node(node_id) - .ok_or(GraphError::NodeIdError(node_id))?; - v.add_constant_properties(const_props)?; - if let Some(shared_const_props) = &shared_constant_properties { - v.add_constant_properties(shared_const_props.iter())?; - } - if let Some(node_type) = node_type { - v.set_node_type(node_type)?; - } - } - let _ = pb.update(1); - } - } else if let Some(node_id) = df.iter_col::(node_id_index) { - let iter = node_id.map(i64_opt_into_u64_opt); - for ((node_id, const_props), node_type) in iter.zip(const_prop_iter).zip(node_type) { - if let Some(node_id) = node_id { - let v = graph - .node(node_id) - .ok_or(GraphError::NodeIdError(node_id))?; - v.add_constant_properties(const_props)?; - if let Some(shared_const_props) = &shared_constant_properties { - v.add_constant_properties(shared_const_props.iter())?; - } - if let Some(node_type) = node_type { - v.set_node_type(node_type)?; - } - } - let _ = pb.update(1); - } - } else if let Some(node_id) = df.utf8::(node_id_index) { - let iter = node_id.into_iter(); - for ((node_id, const_props), node_type) in iter.zip(const_prop_iter).zip(node_type) { - if let Some(node_id) = node_id { - let v = graph - .node(node_id) - .ok_or_else(|| GraphError::NodeNameError(node_id.to_owned()))?; - v.add_constant_properties(const_props)?; - if let Some(shared_const_props) = &shared_constant_properties { - v.add_constant_properties(shared_const_props.iter())?; - } - if let Some(node_type) = node_type { - v.set_node_type(node_type)?; - } + let const_props = combine_properties( + constant_properties, + &constant_properties_indices, + &df, + |name, dtype| graph.resolve_node_property(name, dtype, true), + )?; + let node_col = df.node_col(node_id_index)?; + let node_type_col = lift_node_type_col(node_type, node_type_index, &df)?; + + node_col + .par_iter() + .zip(node_type_col.par_iter()) + .zip(const_props.par_rows()) + .try_for_each(|((node_id, node_type), cprops)| { + let node_id = 
node_id.ok_or(LoadError::MissingNodeError)?; + let node = graph + .node(node_id) + .ok_or_else(|| GraphError::NodeMissingError(node_id.to_owned()))?; + if let Some(node_type) = node_type { + node.set_node_type(node_type)?; } - let _ = pb.update(1); - } - } else if let Some(node_id) = df.utf8::(node_id_index) { - let iter = node_id.into_iter(); - for ((node_id, const_props), node_type) in iter.zip(const_prop_iter).zip(node_type) { - if let Some(node_id) = node_id { - let v = graph - .node(node_id) - .ok_or_else(|| GraphError::NodeNameError(node_id.to_owned()))?; - v.add_constant_properties(const_props)?; - if let Some(shared_const_props) = &shared_constant_properties { - v.add_constant_properties(shared_const_props.iter())?; - } - if let Some(node_type) = node_type { - v.set_node_type(node_type)?; - } + let props = cprops + .chain(shared_constant_properties.iter().cloned()) + .collect::>(); + if !props.is_empty() { + graph.internal_add_constant_node_properties(node.node, &props)?; } - let _ = pb.update(1); - } - } else { - return Err(GraphError::LoadFailure( - "node id column must be either u64 or text, time column must be i64. Ensure these contain no NaN, Null or None values.".to_string(), - )); - }; + Ok::<(), GraphError>(()) + })?; + let _ = pb.update(df.len()); } Ok(()) } @@ -567,163 +378,59 @@ pub(crate) fn load_edges_props_from_df< }; let layer_index = layer_index.transpose()?; let mut pb = build_progress_bar("Loading edge properties".to_string(), df_view.num_rows)?; + let shared_constant_properties = match shared_constant_properties { + None => { + vec![] + } + Some(props) => props + .iter() + .map(|(key, prop)| { + Ok(( + graph + .resolve_edge_property(key, prop.dtype(), true)? + .inner(), + prop.clone(), + )) + }) + .collect::, GraphError>>()?, + }; for chunk in df_view.chunks { let df = chunk?; - let const_prop_iter = - combine_properties(constant_properties, &constant_properties_indices, &df)?; - - let layer = lift_layer(layer, layer_index, &df)?; - - if let (Some(src), Some(dst)) = - (df.iter_col::(src_index), df.iter_col::(dst_index)) - { - let triplets = src.map(|i| i.copied()).zip(dst.map(|i| i.copied())); - - for (((src, dst), const_props), layer) in triplets.zip(const_prop_iter).zip(layer) { - if let (Some(src), Some(dst)) = (src, dst) { - let e = graph - .edge(src, dst) - .ok_or(GraphError::EdgeIdError { src, dst })?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst)) = - (df.iter_col::(src_index), df.iter_col::(dst_index)) - { - let triplets = src - .map(i64_opt_into_u64_opt) - .zip(dst.map(i64_opt_into_u64_opt)); - - for (((src, dst), const_props), layer) in triplets.zip(const_prop_iter).zip(layer) { - if let (Some(src), Some(dst)) = (src, dst) { - let e = graph - .edge(src, dst) - .ok_or(GraphError::EdgeIdError { src, dst })?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst)) = - (df.utf8::(src_index), df.utf8::(dst_index)) - { - let triplets = src.into_iter().zip(dst.into_iter()); - for (((src, dst), const_props), layer) in triplets.zip(const_prop_iter).zip(layer) { - if let (Some(src), Some(dst)) = (src, dst) { - let e = 
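// NOTE: property-only loads never create edges: the edge must already exist, otherwise
// the row fails with the GID-based `EdgeMissingError` from errors.rs (see the new code
// below). A minimal sketch of the failure mode, assuming nodes 1 and 99 were never
// connected:
//     graph
//         .edge(1, 99)
//         .ok_or_else(|| GraphError::EdgeMissingError {
//             src: GID::U64(1),
//             dst: GID::U64(99),
//         })?; // -> "No Edge between 1 and 99"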
graph - .edge(src, dst) - .ok_or_else(|| GraphError::EdgeNameError { - src: src.to_owned(), - dst: dst.to_owned(), - })?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } - } - let _ = pb.update(1); - } - } else if let (Some(src), Some(dst)) = - (df.utf8::(src_index), df.utf8::(dst_index)) - { - let triplets = src.into_iter().zip(dst.into_iter()); - - for (((src, dst), const_props), layer) in triplets.zip(const_prop_iter).zip(layer) { - if let (Some(src), Some(dst)) = (src, dst) { - let e = graph - .edge(src, dst) - .ok_or_else(|| GraphError::EdgeNameError { - src: src.to_owned(), - dst: dst.to_owned(), - })?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } + let const_prop_iter = combine_properties( + constant_properties, + &constant_properties_indices, + &df, + |name, dtype| graph.resolve_edge_property(name, dtype, true), + )?; + + let layer = lift_layer_col(layer, layer_index, &df)?; + let src_col = lift_node_col(src_index, &df)?; + let dst_col = lift_node_col(dst_index, &df)?; + src_col + .par_iter() + .zip(dst_col.par_iter()) + .zip(layer.par_iter()) + .zip(const_prop_iter.par_rows()) + .try_for_each(|(((src, dst), layer), cprops)| { + let src = src.ok_or(LoadError::MissingSrcError)?; + let dst = dst.ok_or(LoadError::MissingDstError)?; + let e = graph + .edge(src, dst) + .ok_or_else(|| GraphError::EdgeMissingError { + src: src.to_owned(), + dst: dst.to_owned(), + })?; + let layer_id = graph.resolve_layer(layer)?.inner(); + let props = cprops + .chain(shared_constant_properties.iter().cloned()) + .collect::>(); + if !props.is_empty() { + graph.internal_add_constant_edge_properties(e.edge.pid(), layer_id, &props)?; } - let _ = pb.update(1); - } - } else { - return Err(GraphError::LoadFailure( - "Source and Target columns must be either u64 or text, Time column must be i64. Ensure these contain no NaN, Null or None values." 
- .to_string(), - )); - }; - } - Ok(()) -} - -fn i64_opt_into_u64_opt(x: Option<&i64>) -> Option { - x.map(|x| (*x).try_into().unwrap()) -} - -fn load_edges_from_num_iter< - 'a, - S: AsRef, - I: Iterator, Option), Option)>, - PI: Iterator>, - IL: Iterator>, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, ->( - graph: &G, - pb: &mut Bar, - edges: I, - properties: PI, - constant_properties: PI, - shared_constant_properties: Option<&HashMap>, - layer: IL, -) -> Result<(), GraphError> { - for (((((src, dst), time), edge_props), const_props), layer) in - edges.zip(properties).zip(constant_properties).zip(layer) - { - if let (Some(src), Some(dst), Some(time)) = (src, dst, time) { - let e = graph.add_edge(time, src, dst, edge_props, layer.as_deref())?; - e.add_constant_properties(const_props, layer.as_deref())?; - if let Some(shared_const_props) = &shared_constant_properties { - e.add_constant_properties(shared_const_props.iter(), layer.as_deref())?; - } - } - let _ = pb.update(1); - } - Ok(()) -} - -fn load_nodes_from_num_iter< - 'a, - S: AsRef, - I: Iterator, Option, Option<&'a str>)>, - PI: Iterator>, - G: StaticGraphViewOps + InternalPropertyAdditionOps + InternalAdditionOps, ->( - graph: &G, - pb: &mut Bar, - nodes: I, - properties: PI, - constant_properties: PI, - shared_constant_properties: Option<&HashMap>, -) -> Result<(), GraphError> { - for (((node, time, node_type), props), const_props) in - nodes.zip(properties).zip(constant_properties) - { - if let (Some(v), Some(t), n_t, props, const_props) = - (node, time, node_type, props, const_props) - { - let actual_node_type = extract_out_default_type(n_t); - let v = graph.add_node(t, v, props, actual_node_type)?; - v.add_constant_properties(const_props)?; - - if let Some(shared_const_props) = &shared_constant_properties { - v.add_constant_properties(shared_const_props.iter())?; - } - let _ = pb.update(1); - } + Ok::<(), GraphError>(()) + })?; + let _ = pb.update(df.len()); } Ok(()) } diff --git a/raphtory/src/io/arrow/layer_col.rs b/raphtory/src/io/arrow/layer_col.rs new file mode 100644 index 0000000000..3c0846bf1c --- /dev/null +++ b/raphtory/src/io/arrow/layer_col.rs @@ -0,0 +1,143 @@ +use crate::{ + core::utils::errors::{GraphError, LoadError}, + io::arrow::dataframe::DFChunk, +}; +use polars_arrow::array::Utf8Array; +use rayon::iter::{ + plumbing::{Consumer, ProducerCallback, UnindexedConsumer}, + IndexedParallelIterator, IntoParallelIterator, ParallelIterator, +}; + +#[derive(Copy, Clone)] +pub(crate) enum LayerCol<'a> { + Name { name: Option<&'a str>, len: usize }, + Utf8 { col: &'a Utf8Array }, + LargeUtf8 { col: &'a Utf8Array }, +} + +pub enum LayerColVariants { + Name(Name), + Utf8(Utf8), + LargeUtf8(LargeUtf8), +} + +macro_rules! 
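// NOTE: `LayerColVariants` above is an enum over three concrete iterator types, and the
// `for_all!` macro defined here forwards every trait method to whichever variant is
// live, giving one static type that implements `(Indexed)ParallelIterator` without
// boxing. The expansion is a plain match, e.g.
//     for_all!(self, iter => iter.len())
// becomes
//     match self {
//         LayerColVariants::Name(iter) => iter.len(),
//         LayerColVariants::Utf8(iter) => iter.len(),
//         LayerColVariants::LargeUtf8(iter) => iter.len(),
//     }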
for_all { + ($value:expr, $pattern:pat => $result:expr) => { + match $value { + LayerColVariants::Name($pattern) => $result, + LayerColVariants::Utf8($pattern) => $result, + LayerColVariants::LargeUtf8($pattern) => $result, + } + }; +} + +impl< + V: Send, + Name: ParallelIterator, + Utf8: ParallelIterator, + LargeUtf8: ParallelIterator, + > ParallelIterator for LayerColVariants +{ + type Item = V; + + fn drive_unindexed(self, consumer: C) -> C::Result + where + C: UnindexedConsumer, + { + for_all!(self, iter => iter.drive_unindexed(consumer)) + } + + fn opt_len(&self) -> Option { + for_all!(self, iter => iter.opt_len()) + } +} + +impl< + V: Send, + Name: IndexedParallelIterator, + Utf8: IndexedParallelIterator, + LargeUtf8: IndexedParallelIterator, + > IndexedParallelIterator for LayerColVariants +{ + fn len(&self) -> usize { + for_all!(self, iter => iter.len()) + } + + fn drive>(self, consumer: C) -> C::Result { + for_all!(self, iter => iter.drive(consumer)) + } + + fn with_producer>(self, callback: CB) -> CB::Output { + for_all!(self, iter => iter.with_producer(callback)) + } +} + +impl<'a> LayerCol<'a> { + pub fn par_iter(self) -> impl IndexedParallelIterator> { + match self { + LayerCol::Name { name, len } => { + LayerColVariants::Name((0..len).into_par_iter().map(move |_| name)) + } + LayerCol::Utf8 { col } => { + LayerColVariants::Utf8((0..col.len()).into_par_iter().map(|i| col.get(i))) + } + LayerCol::LargeUtf8 { col } => { + LayerColVariants::LargeUtf8((0..col.len()).into_par_iter().map(|i| col.get(i))) + } + } + } +} + +pub(crate) fn lift_layer_col<'a>( + layer_name: Option<&'a str>, + layer_index: Option, + df: &'a DFChunk, +) -> Result, GraphError> { + match (layer_name, layer_index) { + (name, None) => Ok(LayerCol::Name { + name, + len: df.len(), + }), + (None, Some(layer_index)) => { + let col = &df.chunk[layer_index]; + if let Some(col) = col.as_any().downcast_ref::>() { + Ok(LayerCol::Utf8 { col }) + } else if let Some(col) = col.as_any().downcast_ref::>() { + Ok(LayerCol::LargeUtf8 { col }) + } else { + Err(LoadError::InvalidLayerType(col.data_type().clone()).into()) + } + } + _ => Err(GraphError::WrongNumOfArgs( + "layer_name".to_string(), + "layer_col".to_string(), + )), + } +} + +pub(crate) fn lift_node_type_col<'a>( + node_type_name: Option<&'a str>, + node_type_index: Option, + df: &'a DFChunk, +) -> Result, GraphError> { + match (node_type_name, node_type_index) { + (name, None) => Ok(LayerCol::Name { + name, + len: df.len(), + }), + (None, Some(layer_index)) => { + let col = &df.chunk[layer_index]; + if let Some(col) = col.as_any().downcast_ref::>() { + Ok(LayerCol::Utf8 { col }) + } else if let Some(col) = col.as_any().downcast_ref::>() { + Ok(LayerCol::LargeUtf8 { col }) + } else { + Err(LoadError::InvalidNodeType(col.data_type().clone()).into()) + } + } + _ => Err(GraphError::WrongNumOfArgs( + "node_type_name".to_string(), + "node_type_col".to_string(), + )), + } +} diff --git a/raphtory/src/io/arrow/mod.rs b/raphtory/src/io/arrow/mod.rs index 64a1a95e9f..89597f5281 100644 --- a/raphtory/src/io/arrow/mod.rs +++ b/raphtory/src/io/arrow/mod.rs @@ -1,5 +1,7 @@ pub mod dataframe; pub mod df_loaders; +mod layer_col; +mod node_col; mod prop_handler; #[cfg(test)] @@ -61,7 +63,7 @@ mod test { ) .expect("failed to load edges from pretend df"); - let actual = graph + let mut actual = graph .edges() .iter() .map(|e| { @@ -81,6 +83,10 @@ mod test { }) .collect::>(); + actual.sort_by(|(l_src, l_dst, l_t, ..), (r_src, r_dst, r_t, ..)| { + (l_src, l_dst, l_t).cmp(&(r_src, r_dst, 
r_t)) + }); + assert_eq!( actual, vec![ @@ -152,7 +158,7 @@ mod test { ) .expect("failed to load nodes from pretend df"); - let actual = graph + let mut actual = graph .nodes() .iter() .map(|v| { @@ -168,6 +174,8 @@ mod test { }) .collect::>(); + actual.sort_by(|(l_n, l_t, ..), (r_n, r_t, ..)| (l_n, l_t).cmp(&(r_n, r_t))); + assert_eq!( actual, vec![ diff --git a/raphtory/src/io/arrow/node_col.rs b/raphtory/src/io/arrow/node_col.rs new file mode 100644 index 0000000000..0f589cc4e6 --- /dev/null +++ b/raphtory/src/io/arrow/node_col.rs @@ -0,0 +1,146 @@ +use crate::{core::utils::errors::LoadError, io::arrow::dataframe::DFChunk}; +use polars_arrow::{ + array::{Array, PrimitiveArray, StaticArray, Utf8Array}, + datatypes::ArrowDataType, + offset::Offset, +}; +use raphtory_api::core::entities::GidRef; +use rayon::prelude::{IndexedParallelIterator, *}; + +trait NodeColOps: Send + Sync { + fn get(&self, i: usize) -> Option; + + fn len(&self) -> usize; +} + +impl NodeColOps for PrimitiveArray { + fn get(&self, i: usize) -> Option { + StaticArray::get(self, i).map(GidRef::U64) + } + + fn len(&self) -> usize { + self.len() + } +} + +impl NodeColOps for PrimitiveArray { + fn get(&self, i: usize) -> Option { + StaticArray::get(self, i).map(|v| GidRef::U64(v as u64)) + } + + fn len(&self) -> usize { + self.len() + } +} + +impl NodeColOps for PrimitiveArray { + fn get(&self, i: usize) -> Option { + StaticArray::get(self, i).map(|v| GidRef::U64(v as u64)) + } + + fn len(&self) -> usize { + self.len() + } +} + +impl NodeColOps for PrimitiveArray { + fn get(&self, i: usize) -> Option { + StaticArray::get(self, i).map(|v| GidRef::U64(v as u64)) + } + + fn len(&self) -> usize { + self.len() + } +} + +impl NodeColOps for Utf8Array { + fn get(&self, i: usize) -> Option { + if i >= self.len() { + None + } else { + // safety: bounds checked above + unsafe { + if self.is_null_unchecked(i) { + None + } else { + let value = self.value_unchecked(i); + Some(GidRef::Str(value)) + } + } + } + } + + fn len(&self) -> usize { + self.len() + } +} + +pub struct NodeCol(Box); + +impl<'a> TryFrom<&'a dyn Array> for NodeCol { + type Error = LoadError; + + fn try_from(value: &'a dyn Array) -> Result { + match value.data_type() { + ArrowDataType::Int32 => { + let col = value + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Ok(NodeCol(Box::new(col))) + } + ArrowDataType::Int64 => { + let col = value + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Ok(NodeCol(Box::new(col))) + } + ArrowDataType::UInt32 => { + let col = value + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Ok(NodeCol(Box::new(col))) + } + ArrowDataType::UInt64 => { + let col = value + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Ok(NodeCol(Box::new(col))) + } + ArrowDataType::Utf8 => { + let col = value + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Ok(NodeCol(Box::new(col))) + } + ArrowDataType::LargeUtf8 => { + let col = value + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); + Ok(NodeCol(Box::new(col))) + } + dtype => Err(LoadError::InvalidNodeIdType(dtype.clone())), + } + } +} + +impl NodeCol { + pub fn par_iter(&self) -> impl IndexedParallelIterator>> + '_ { + (0..self.0.len()).into_par_iter().map(|i| self.0.get(i)) + } +} + +pub fn lift_node_col(index: usize, df: &DFChunk) -> Result { + (df.chunk[index].as_ref()).try_into() +} diff --git a/raphtory/src/io/arrow/prop_handler.rs b/raphtory/src/io/arrow/prop_handler.rs index 2d91f6e542..54f5507ea0 100644 --- a/raphtory/src/io/arrow/prop_handler.rs +++ 
b/raphtory/src/io/arrow/prop_handler.rs @@ -1,49 +1,71 @@ -use chrono::{DateTime, Utc}; -use polars_arrow::{ - array::{Array, BooleanArray, FixedSizeListArray, ListArray, PrimitiveArray, Utf8Array}, - datatypes::{ArrowDataType as DataType, TimeUnit}, -}; - use crate::{ - core::{utils::errors::GraphError, IntoPropList}, + core::{ + utils::errors::{GraphError, LoadError}, + IntoPropList, PropType, + }, io::arrow::dataframe::DFChunk, prelude::Prop, }; +use chrono::{DateTime, Utc}; +use polars_arrow::{ + array::{ + Array, BooleanArray, FixedSizeListArray, ListArray, PrimitiveArray, StaticArray, Utf8Array, + }, + datatypes::{ArrowDataType as DataType, TimeUnit}, + offset::Offset, +}; +use raphtory_api::core::storage::dict_mapper::MaybeNew; +use rayon::prelude::*; -pub struct PropIter<'a> { - inner: Vec> + 'a>>, +pub struct PropCols { + prop_ids: Vec, + cols: Vec>, + len: usize, } -impl<'a> Iterator for PropIter<'a> { - type Item = Vec<(&'a str, Prop)>; +impl PropCols { + pub fn iter_row(&self, i: usize) -> impl Iterator + '_ { + self.prop_ids + .iter() + .zip(self.cols.iter()) + .filter_map(move |(id, col)| col.get(i).map(|v| (*id, v))) + } - fn next(&mut self) -> Option { - self.inner - .iter_mut() - .map(|v| v.next()) - .filter_map(|r| match r { - Some(r1) => match r1 { - Some(r2) => Some(Some(r2)), - None => None, - }, - None => Some(None), - }) - .collect() + pub fn len(&self) -> usize { + self.len } -} -pub(crate) fn combine_properties<'a>( - props: &'a [&str], - indices: &'a [usize], - df: &'a DFChunk, -) -> Result, GraphError> { - for idx in indices { - is_data_type_supported(df.chunk[*idx].data_type())?; + pub fn par_rows( + &self, + ) -> impl IndexedParallelIterator + '_> + '_ { + (0..self.len()).into_par_iter().map(|i| self.iter_row(i)) } - let zipped = props.iter().zip(indices.iter()); - let iter = zipped.map(|(name, idx)| lift_property(*idx, name, df)); - Ok(PropIter { - inner: iter.collect(), +} + +pub(crate) fn combine_properties( + props: &[&str], + indices: &[usize], + df: &DFChunk, + prop_id_resolver: impl Fn(&str, PropType) -> Result, GraphError>, +) -> Result { + let dtypes = indices + .iter() + .map(|idx| data_type_as_prop_type(df.chunk[*idx].data_type())) + .collect::, _>>()?; + let cols = indices + .iter() + .map(|idx| lift_property_col(df.chunk[*idx].as_ref())) + .collect::>(); + let prop_ids = props + .iter() + .zip(dtypes.into_iter()) + .map(|(name, dtype)| Ok(prop_id_resolver(name, dtype)?.inner())) + .collect::, GraphError>>()?; + + Ok(PropCols { + prop_ids, + cols, + len: df.len(), }) } @@ -118,6 +140,32 @@ fn arr_as_prop(arr: Box) -> Prop { } } +fn data_type_as_prop_type(dt: &DataType) -> Result { + match dt { + DataType::Boolean => Ok(PropType::Bool), + DataType::Int32 => Ok(PropType::I32), + DataType::Int64 => Ok(PropType::I64), + DataType::UInt8 => Ok(PropType::U8), + DataType::UInt16 => Ok(PropType::U16), + DataType::UInt32 => Ok(PropType::U32), + DataType::UInt64 => Ok(PropType::U64), + DataType::Float32 => Ok(PropType::F32), + DataType::Float64 => Ok(PropType::F64), + DataType::Utf8 => Ok(PropType::Str), + DataType::LargeUtf8 => Ok(PropType::Str), + DataType::List(v) => is_data_type_supported(v.data_type()).map(|_| PropType::List), + DataType::FixedSizeList(v, _) => { + is_data_type_supported(v.data_type()).map(|_| PropType::List) + } + DataType::LargeList(v) => is_data_type_supported(v.data_type()).map(|_| PropType::List), + DataType::Timestamp(_, v) => match v { + None => Ok(PropType::NDTime), + Some(_) => Ok(PropType::DTime), + }, + _ => 
Err(LoadError::InvalidPropertyType(dt.clone()).into()), + } +} + fn is_data_type_supported(dt: &DataType) -> Result<(), GraphError> { match dt { DataType::Boolean => {} @@ -135,247 +183,204 @@ fn is_data_type_supported(dt: &DataType) -> Result<(), GraphError> { DataType::FixedSizeList(v, _) => is_data_type_supported(v.data_type())?, DataType::LargeList(v) => is_data_type_supported(v.data_type())?, DataType::Timestamp(_, _) => {} - _ => Err(GraphError::UnsupportedDataType)?, + _ => return Err(LoadError::InvalidPropertyType(dt.clone()).into()), } Ok(()) } -pub(crate) fn lift_property<'a: 'b, 'b>( - idx: usize, - name: &'a str, - df: &'b DFChunk, -) -> Box> + 'b> { - let arr = &df.chunk[idx]; - let r = match arr.data_type() { +trait PropCol: Send + Sync { + fn get(&self, i: usize) -> Option; +} + +impl PropCol for A +where + A: StaticArray, + for<'a> A::ValueT<'a>: Into, +{ + #[inline] + fn get(&self, i: usize) -> Option { + StaticArray::get(self, i).map(|v| v.into()) + } +} + +struct Wrap(A); + +impl PropCol for Wrap> { + fn get(&self, i: usize) -> Option { + self.0.get(i).map(Prop::str) + } +} + +impl PropCol for Wrap> { + fn get(&self, i: usize) -> Option { + if i >= self.0.len() { + None + } else { + // safety: bounds checked above + unsafe { + if self.0.is_null_unchecked(i) { + None + } else { + let value = self.0.value_unchecked(i); + Some(arr_as_prop(value)) + } + } + } + } +} + +impl PropCol for Wrap { + fn get(&self, i: usize) -> Option { + self.0.get(i).map(arr_as_prop) + } +} + +struct DTimeCol { + arr: PrimitiveArray, + map: fn(i64) -> Prop, +} + +impl PropCol for DTimeCol { + fn get(&self, i: usize) -> Option { + StaticArray::get(&self.arr, i).map(self.map) + } +} +fn lift_property_col(arr: &dyn Array) -> Box { + match arr.data_type() { DataType::Boolean => { let arr = arr.as_any().downcast_ref::().unwrap(); - iter_as_prop(name, arr.iter()) + Box::new(arr.clone()) } DataType::Int32 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::Int64 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::UInt8 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::UInt16 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::UInt32 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::UInt64 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::Float32 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::Float64 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter().map(|i| i.copied())) + Box::new(arr.clone()) } DataType::Utf8 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter()) + Box::new(Wrap(arr.clone())) } DataType::LargeUtf8 => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_prop(name, arr.iter()) + Box::new(arr.clone()) } DataType::List(_) => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_arr_prop(name, arr.iter()) + Box::new(Wrap(arr.clone())) } 
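
lift_property_col (continuing below) returns boxed trait objects and leans on two coherence tricks visible above: a blanket impl of PropCol for arrays whose values convert straight into Prop, plus the Wrap newtype for arrays that need bespoke conversion, because a second overlapping impl would be rejected by the compiler. A compact sketch of that pattern; Col, Wrap and lift are invented names, not the patch's types:

use std::fmt::Display;

// Row accessor in the spirit of PropCol.
trait Col: Send + Sync {
    fn get(&self, i: usize) -> Option<String>;
}

// Blanket-style impl: any Vec of Display values can be read row-wise.
impl<T: Display + Send + Sync> Col for Vec<T> {
    fn get(&self, i: usize) -> Option<String> {
        self.as_slice().get(i).map(|v| v.to_string())
    }
}

// A direct `impl Col for Vec<Vec<u8>>` would be rejected as overlapping the
// impl above (E0119), so the bespoke conversion lives behind a newtype,
// which is the role Wrap plays in the patch.
struct Wrap(Vec<Vec<u8>>);

impl Col for Wrap {
    fn get(&self, i: usize) -> Option<String> {
        self.0.get(i).map(|bytes| format!("{} bytes", bytes.len()))
    }
}

// Runtime dispatch on the column's dtype, as lift_property_col does.
fn lift(numeric: bool) -> Box<dyn Col> {
    if numeric {
        Box::new(vec![1i64, 2, 3])
    } else {
        Box::new(Wrap(vec![vec![0u8; 4]]))
    }
}

PropCols then stores the boxed columns side by side and zips them per row in iter_row.
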
DataType::FixedSizeList(_, _) => { let arr = arr.as_any().downcast_ref::().unwrap(); - iter_as_arr_prop(name, arr.iter()) + Box::new(Wrap(arr.clone())) } DataType::LargeList(_) => { let arr = arr.as_any().downcast_ref::>().unwrap(); - iter_as_arr_prop(name, arr.iter()) + Box::new(Wrap(arr.clone())) } DataType::Timestamp(timeunit, timezone) => { - let arr = arr.as_any().downcast_ref::>().unwrap(); + let arr = arr + .as_any() + .downcast_ref::>() + .unwrap() + .clone(); match timezone { Some(_) => match timeunit { - TimeUnit::Second => { - println!("Timestamp(Second, Some({:?})); ", timezone); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::DTime( - DateTime::::from_timestamp(*v, 0) - .expect("DateTime conversion failed"), - ), - ) - }) - })); - r - } - TimeUnit::Millisecond => { - println!("Timestamp(Millisecond, Some({:?})); ", timezone); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::DTime( - DateTime::::from_timestamp_millis(*v) - .expect("DateTime conversion failed"), - ), - ) - }) - })); - r - } - TimeUnit::Microsecond => { - println!("Timestamp(Microsecond, Some({:?})); ", timezone); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::DTime( - DateTime::::from_timestamp_micros(*v) - .expect("DateTime conversion failed"), - ), - ) - }) - })); - r - } - TimeUnit::Nanosecond => { - println!("Timestamp(Nanosecond, Some({:?})); ", timezone); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - (name, Prop::DTime(DateTime::::from_timestamp_nanos(*v))) - }) - })); - r - } + TimeUnit::Second => Box::new(DTimeCol { + arr, + map: |v| { + Prop::DTime( + DateTime::::from_timestamp(v, 0) + .expect("DateTime conversion failed"), + ) + }, + }), + TimeUnit::Millisecond => Box::new(DTimeCol { + arr, + map: |v| { + Prop::DTime( + DateTime::::from_timestamp_millis(v) + .expect("DateTime conversion failed"), + ) + }, + }), + TimeUnit::Microsecond => Box::new(DTimeCol { + arr, + map: |v| { + Prop::DTime( + DateTime::::from_timestamp_micros(v) + .expect("DateTime conversion failed"), + ) + }, + }), + TimeUnit::Nanosecond => Box::new(DTimeCol { + arr, + map: |v| Prop::DTime(DateTime::::from_timestamp_nanos(v)), + }), }, None => match timeunit { - TimeUnit::Second => { - println!("Timestamp(Second, None); "); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::NDTime( - DateTime::from_timestamp(*v, 0) - .expect("DateTime conversion failed") - .naive_utc(), - ), - ) - }) - })); - r - } - TimeUnit::Millisecond => { - println!("Timestamp(Millisecond, None); "); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::NDTime( - DateTime::from_timestamp_millis(*v) - .expect("DateTime conversion failed") - .naive_utc(), - ), - ) - }) - })); - r - } - TimeUnit::Microsecond => { - println!("Timestamp(Microsecond, None); "); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::NDTime( - DateTime::from_timestamp_micros(*v) - .expect("DateTime conversion failed") - .naive_utc(), - ), - ) - }) - })); - r - } - TimeUnit::Nanosecond => { - println!("Timestamp(Nanosecond, None); "); - let r: Box> + 'b> = - Box::new(arr.iter().map(move |val| { - val.map(|v| { - ( - name, - Prop::NDTime( - DateTime::from_timestamp_nanos(*v).naive_utc(), - ), - ) - }) - })); - r - } + TimeUnit::Second => Box::new(DTimeCol { + arr, + map: |v| { + 
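// Note: this arm begins the timezone-naive branch. Each unit arm builds the
// chrono value from the raw i64 at the stated resolution and stores it as
// Prop::NDTime via naive_utc(), where the Some(timezone) arms above
// produced Prop::DTime in Utc.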
Prop::NDTime( + DateTime::from_timestamp(v, 0) + .expect("DateTime conversion failed") + .naive_utc(), + ) + }, + }), + TimeUnit::Millisecond => Box::new(DTimeCol { + arr, + map: |v| { + Prop::NDTime( + DateTime::from_timestamp_millis(v) + .expect("DateTime conversion failed") + .naive_utc(), + ) + }, + }), + TimeUnit::Microsecond => Box::new(DTimeCol { + arr, + map: |v| { + Prop::NDTime( + DateTime::from_timestamp_micros(v) + .expect("DateTime conversion failed") + .naive_utc(), + ) + }, + }), + TimeUnit::Nanosecond => Box::new(DTimeCol { + arr, + map: |v| Prop::NDTime(DateTime::from_timestamp_nanos(v).naive_utc()), + }), }, } } unsupported => panic!("Data type not supported: {:?}", unsupported), - }; - - r -} - -pub(crate) fn lift_layer<'a>( - layer_name: Option<&str>, - layer_index: Option, - df: &'a DFChunk, -) -> Result> + 'a>, GraphError> { - match (layer_name, layer_index) { - (None, None) => Ok(Box::new(std::iter::repeat(None))), - (Some(layer_name), None) => Ok(Box::new(std::iter::repeat(Some(layer_name.to_string())))), - (None, Some(layer_index)) => { - if let Some(col) = df.utf8::(layer_index) { - Ok(Box::new(col.map(|v| v.map(|v| v.to_string())))) - } else if let Some(col) = df.utf8::(layer_index) { - Ok(Box::new(col.map(|v| v.map(|v| v.to_string())))) - } else { - Ok(Box::new(std::iter::repeat(None))) - } - } - _ => Err(GraphError::WrongNumOfArgs( - "layer_name".to_string(), - "layer_col".to_string(), - )), } } - -fn iter_as_prop<'a, T: Into + 'a, I: Iterator> + 'a>( - name: &'a str, - is: I, -) -> Box> + 'a> { - Box::new(is.map(move |val| val.map(|v| (name, v.into())))) -} - -fn iter_as_arr_prop<'a, I: Iterator>> + 'a>( - name: &'a str, - is: I, -) -> Box> + 'a> { - Box::new(is.map(move |val| val.map(|v| (name, arr_as_prop(v))))) -} diff --git a/raphtory/src/lib.rs b/raphtory/src/lib.rs index 3acb134bf7..3668b1b840 100644 --- a/raphtory/src/lib.rs +++ b/raphtory/src/lib.rs @@ -88,6 +88,15 @@ pub mod core; pub mod db; pub mod graphgen; +#[cfg(target_os = "macos")] +use snmalloc_rs; + +pub const DEFAULT_NUM_SHARDS: usize = 128; + +#[cfg(target_os = "macos")] +#[global_allocator] +static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; + #[cfg(feature = "storage")] pub mod disk_graph; diff --git a/raphtory/src/python/graph/graph.rs b/raphtory/src/python/graph/graph.rs index 0b27e9bb5e..b9310ad4ff 100644 --- a/raphtory/src/python/graph/graph.rs +++ b/raphtory/src/python/graph/graph.rs @@ -135,8 +135,12 @@ impl PyGraphEncoder { #[pymethods] impl PyGraph { #[new] - pub fn py_new() -> (Self, PyGraphView) { - let graph = Graph::new(); + #[pyo3(signature=(num_shards=None))] + pub fn py_new(num_shards: Option) -> (Self, PyGraphView) { + let graph = match num_shards { + None => Graph::new(), + Some(num_shards) => Graph::new_with_shards(num_shards), + }; ( Self { graph: graph.clone(), diff --git a/raphtory/src/python/graph/io/pandas_loaders.rs b/raphtory/src/python/graph/io/pandas_loaders.rs index 38c37af890..a7fac2e196 100644 --- a/raphtory/src/python/graph/io/pandas_loaders.rs +++ b/raphtory/src/python/graph/io/pandas_loaders.rs @@ -44,11 +44,7 @@ pub fn load_nodes_from_pandas( node_type_col, graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; - Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - Ok(()) } pub fn load_edges_from_pandas( @@ -85,11 +81,7 @@ pub fn load_edges_from_pandas( layer_col, graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; - Ok::<(), PyErr>(()) }) - 
.map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - Ok(()) } pub fn load_node_props_from_pandas( @@ -118,11 +110,7 @@ pub fn load_node_props_from_pandas( shared_constant_properties, graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; - Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - Ok(()) } pub fn load_edge_props_from_pandas( @@ -153,11 +141,7 @@ pub fn load_edge_props_from_pandas( layer_col, graph, ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; - Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - Ok(()) } pub fn load_edge_deletions_from_pandas( @@ -186,11 +170,7 @@ pub fn load_edge_deletions_from_pandas( layer_col, graph.core_graph(), ) - .map_err(|e| GraphLoadException::new_err(format!("{:?}", e)))?; - Ok::<(), PyErr>(()) }) - .map_err(|e| GraphError::LoadFailure(format!("Failed to load graph {e:?}")))?; - Ok(()) } pub(crate) fn process_pandas_py_df<'a>( @@ -221,7 +201,7 @@ pub(crate) fn process_pandas_py_df<'a>( let table = pa_table.call_method("from_pandas", (dropped_df,), None)?; let kwargs = PyDict::new(py); - kwargs.set_item("max_chunksize", 100000)?; + kwargs.set_item("max_chunksize", 1000000)?; let rb = table .call_method("to_batches", (), Some(kwargs))? .extract::>()?; diff --git a/raphtory/src/python/types/macros/trait_impl/node_state.rs b/raphtory/src/python/types/macros/trait_impl/node_state.rs index bab1b3c9ef..45fbe2eb36 100644 --- a/raphtory/src/python/types/macros/trait_impl/node_state.rs +++ b/raphtory/src/python/types/macros/trait_impl/node_state.rs @@ -17,7 +17,7 @@ use pyo3::{ types::PyNotImplemented, }; use raphtory_api::core::{entities::GID, storage::arc_str::ArcStr}; -use std::sync::Arc; +use std::{collections::HashMap, sync::Arc}; macro_rules! impl_node_state_ops { ($name:ident<$value:ty>, $inner_t:ty, $to_owned:expr) => { @@ -75,6 +75,10 @@ macro_rules! impl_node_state_ops { self.__iter__() } + fn sorted_by_id(&self) -> NodeState<'static, $value, DynamicGraph> { + self.inner.sort_by_id() + } + fn __repr__(&self) -> String { self.inner.repr() } @@ -137,8 +141,14 @@ macro_rules! 
impl_node_state_ord_ops { .inner .values() .map($to_owned) - .eq(other.iter().cloned()) + .eq(other.into_iter()) .into_py(py); + } else if let Ok(other) = other.extract::>() { + return (self.inner.len() == other.len() + && other.into_iter().all(|(node, value)| { + self.inner.get_by_node(node).map($to_owned) == Some(value) + })) + .into_py(py); } PyNotImplemented::get(py).into_py(py) } @@ -262,7 +272,7 @@ impl_node_state_num!(NodeStateUsize); impl_lazy_node_state_num!(LazyNodeStateU64); impl_node_state_num!(NodeStateU64); -impl_node_state!(NodeStateGID); +impl_node_state_ord!(NodeStateGID); impl_lazy_node_state_ord!(LazyNodeStateGID); impl_lazy_node_state_ord!(LazyNodeStateOptionI64>); diff --git a/raphtory/src/search/mod.rs b/raphtory/src/search/mod.rs index c5170ebd9c..ddc0ae3615 100644 --- a/raphtory/src/search/mod.rs +++ b/raphtory/src/search/mod.rs @@ -756,10 +756,21 @@ impl<'graph, G: GraphViewOps<'graph>> IndexedGraph { } impl InternalAdditionOps for IndexedGraph { + #[inline] + fn num_shards(&self) -> Result { + self.graph.num_shards() + } + #[inline] fn next_event_id(&self) -> Result { self.graph.next_event_id() } + + #[inline] + fn reserve_event_ids(&self, num_ids: usize) -> Result { + self.graph.reserve_event_ids(num_ids) + } + #[inline] fn resolve_layer(&self, layer: Option<&str>) -> Result, GraphError> { self.graph.resolve_layer(layer) diff --git a/raphtory/src/serialise/serialise.rs b/raphtory/src/serialise/serialise.rs index 7a36156fef..26b9be0add 100644 --- a/raphtory/src/serialise/serialise.rs +++ b/raphtory/src/serialise/serialise.rs @@ -708,11 +708,11 @@ impl StableDecode for TemporalGraph { Gid::GidU64(gid) => GidRef::U64(*gid), }; let vid = VID(node.vid as usize); - storage.logical_to_physical.get_or_init(gid, || vid)?; + storage.logical_to_physical.set(gid, vid)?; let mut node_store = NodeStore::empty(gid.to_owned()); node_store.vid = vid; node_store.node_type = node.type_id as usize; - storage.storage.nodes.set(vid, node_store); + storage.storage.nodes.set(vid, node_store).init(); Ok::<(), GraphError>(()) })?; graph.edges.par_iter().for_each(|edge| { @@ -721,7 +721,7 @@ impl StableDecode for TemporalGraph { let dst = VID(edge.dst as usize); let mut edge = EdgeStore::new(src, dst); edge.eid = eid; - storage.storage.edges.set(edge); + storage.storage.edges.set(edge).init(); }); graph.updates.par_iter().try_for_each(|update| { if let Some(update) = update.update.as_ref() {
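
Taken together with the loaders, the InternalAdditionOps methods forwarded above (num_shards, reserve_event_ids) let each parallel chunk claim a disjoint block of event ids up front instead of contending on a counter per update. The core idea reduces to an atomic counter; EventIds and reserve below are invented names for illustration, not the crate's implementation:

use std::sync::atomic::{AtomicUsize, Ordering};

struct EventIds(AtomicUsize);

impl EventIds {
    // fetch_add returns the start of the reserved block, so concurrent
    // callers always receive disjoint ranges.
    fn reserve(&self, num_ids: usize) -> usize {
        self.0.fetch_add(num_ids, Ordering::Relaxed)
    }
}

fn main() {
    let ids = EventIds(AtomicUsize::new(0));
    let a = ids.reserve(3); // ids a..a + 3
    let b = ids.reserve(2); // ids b..b + 2
    assert_eq!((a, b), (0, 3));
}

The set(...).init() calls in the serialise hunk read as the same two-phase idea applied to storage slots: claim the slot first, then mark it initialised once its contents are in place.
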