Skip to content

Commit

Permalink
Disk merge (#1678)
Browse files Browse the repository at this point in the history
* less string clones in node lookups

* add public interface for disk merge and start testing

* more tests

* actually write all the benchmark results again

* clean up debug output and fix merging bugs

* move DiskGraph struct up one level

* Expose the merge method to python

* update storage

* fix the features

* add graph_dir method to DiskGraph

* update submodule

* minor fixes

* try to stop the 143 failures

* fix the workflow

* more workflow tweaks

* verbose output

* does older ubuntu have the same problem?

* update submodule

* update submodule

* update submodule

* update submodule

* update submodule

* try to fix linker running out of memory

* install lld

* try to get protoc on readthedocs
  • Loading branch information
ljeub-pometry authored Jul 15, 2024
1 parent 0d45309 commit a9fe913
Show file tree
Hide file tree
Showing 40 changed files with 721 additions and 507 deletions.
174 changes: 87 additions & 87 deletions .environment.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@

name: py39
channels:
- conda-forge
- defaults
dependencies:
- protobuf>=4.25.3
- ca-certificates>=2021.10.8
- certifi>=2021.10.8
- ipykernel
Expand All @@ -27,89 +27,89 @@ dependencies:
- xz>=5.2.5
- zlib>=1.2.12
- pip:
- aiohttp==3.9.2
- aiosignal==1.3.1
- appnope==0.1.3
- asttokens==2.4.0
- async-timeout==4.0.3
- attrs==23.1.0
- autodocsumm==0.2.11
- backcall==0.2.0
- backoff==2.2.1
- bleach==6.0.0
- botocore==1.31.53
- certifi==2023.7.22
- contourpy==1.1.1
- cycler==0.11.0
- decorator==5.1.1
- defusedxml==0.7.1
- docopt==0.6.2
- exceptiongroup==1.1.3
- executing==1.2.0
- fastjsonschema==2.18.0
- fonttools==4.42.1
- frozenlist==1.4.0
- gql==3.4.1
- graphql-core==3.2.3
- ipython==8.15.0
- jedi==0.19.0
- jmespath==1.0.1
- jsonpickle==3.0.2
- jsonschema==4.19.1
- jsonschema-specifications==2023.7.1
- jupyter_client==8.3.1
- jupyter_core==5.3.1
- jupyterlab-pygments==0.2.2
- kiwisolver==1.4.5
- matplotlib==3.8.0
- matplotlib-inline==0.1.6
- maturin==1.2.3
- mistune==3.0.1
- multidict==6.0.4
- nbclient==0.8.0
- nbconvert==7.8.0
- nbformat==5.9.2
- nbsphinx==0.9.3
- networkx==3.1
- numpy==1.26.0
- numpydoc==1.5.0
- pandas==2.1.1
- pandocfilters==1.5.0
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==10.0.1
- pipreqs==0.4.13
- platformdirs==3.10.0
- prompt-toolkit==3.0.39
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pyarrow==13.0.0
- pydata-sphinx-theme==0.14.1
- pyparsing==3.1.1
- python-dateutil==2.8.2
- pyvis==0.3.2
- pyzmq==25.1.1
- referencing==0.30.2
- requests-toolbelt==0.10.1
- rpds-py==0.10.3
- six==1.16.0
- sphinx-automodapi==0.16.0
- sphinx-autosummary-accessors==2023.4.0
- sphinx-copybutton==0.5.2
- sphinx-favicon==1.0.1
- sphinx_design==0.5.0
- sphinx-notfound-page==1.0.0
- stack-data==0.6.2
- tinycss2==1.2.1
- tomli==2.0.1
- tornado==6.3.3
- traitlets==5.10.0
- typing_extensions==4.8.0
- tzdata==2023.3
- urllib3==1.26.16
- wcwidth==0.2.6
- webencodings==0.5.1
- websockets==10.4
- yarg==0.1.9
- yarl==1.9.2
- aiohttp==3.9.2
- aiosignal==1.3.1
- appnope==0.1.3
- asttokens==2.4.0
- async-timeout==4.0.3
- attrs==23.1.0
- autodocsumm==0.2.11
- backcall==0.2.0
- backoff==2.2.1
- bleach==6.0.0
- botocore==1.31.53
- certifi==2023.7.22
- contourpy==1.1.1
- cycler==0.11.0
- decorator==5.1.1
- defusedxml==0.7.1
- docopt==0.6.2
- exceptiongroup==1.1.3
- executing==1.2.0
- fastjsonschema==2.18.0
- fonttools==4.42.1
- frozenlist==1.4.0
- gql==3.4.1
- graphql-core==3.2.3
- ipython==8.15.0
- jedi==0.19.0
- jmespath==1.0.1
- jsonpickle==3.0.2
- jsonschema==4.19.1
- jsonschema-specifications==2023.7.1
- jupyter_client==8.3.1
- jupyter_core==5.3.1
- jupyterlab-pygments==0.2.2
- kiwisolver==1.4.5
- matplotlib==3.8.0
- matplotlib-inline==0.1.6
- maturin==1.2.3
- mistune==3.0.1
- multidict==6.0.4
- nbclient==0.8.0
- nbconvert==7.8.0
- nbformat==5.9.2
- nbsphinx==0.9.3
- networkx==3.1
- numpy==1.26.0
- numpydoc==1.5.0
- pandas==2.1.1
- pandocfilters==1.5.0
- parso==0.8.3
- pexpect==4.8.0
- pickleshare==0.7.5
- Pillow==10.0.1
- pipreqs==0.4.13
- platformdirs==3.10.0
- prompt-toolkit==3.0.39
- ptyprocess==0.7.0
- pure-eval==0.2.2
- pyarrow==13.0.0
- pydata-sphinx-theme==0.14.1
- pyparsing==3.1.1
- python-dateutil==2.8.2
- pyvis==0.3.2
- pyzmq==25.1.1
- referencing==0.30.2
- requests-toolbelt==0.10.1
- rpds-py==0.10.3
- six==1.16.0
- sphinx-automodapi==0.16.0
- sphinx-autosummary-accessors==2023.4.0
- sphinx-copybutton==0.5.2
- sphinx-favicon==1.0.1
- sphinx_design==0.5.0
- sphinx-notfound-page==1.0.0
- stack-data==0.6.2
- tinycss2==1.2.1
- tomli==2.0.1
- tornado==6.3.3
- traitlets==5.10.0
- typing_extensions==4.8.0
- tzdata==2023.3
- urllib3==1.26.16
- wcwidth==0.2.6
- webencodings==0.5.1
- websockets==10.4
- yarg==0.1.9
- yarl==1.9.2
3 changes: 1 addition & 2 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,7 @@ jobs:
- name: Run benchmark (Unix)
run: |
set -o pipefail
cargo bench --bench base -p raphtory-benchmark -- --output-format=bencher | tee benchmark-result.txt
cargo bench --bench algobench -p raphtory-benchmark -- --output-format=bencher | tee benchmark-result.txt
cargo bench --bench base --bench algobench -p raphtory-benchmark -- --output-format=bencher | tee benchmark-result.txt
- name: Delete cargo.lock if it exists
run: |
rm -f Cargo.lock
Expand Down
14 changes: 9 additions & 5 deletions .github/workflows/test_rust_disk_storage_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ jobs:
strategy:
matrix:
include:
- os: macos-latest
- os: ubuntu-20.04
- os: windows-latest
- { os: macos-latest, flags: "" }
- { os: ubuntu-20.04, flags: "-C link-arg=-fuse-ld=lld" }
- { os: windows-latest, flags: "" }
steps:
- uses: maxim-lobanov/setup-xcode@v1
name: Xcode version
Expand All @@ -43,6 +43,10 @@ jobs:
sudo rm -rf /usr/local/lib/android
sudo rm -rf /opt/ghc
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Install LLD
if: "contains(matrix.os, 'ubuntu')"
run: |
sudo apt-get install lld
- uses: webfactory/[email protected]
name: Load pometry-storage key
with:
Expand All @@ -56,14 +60,14 @@ jobs:
- name: Install Protoc
uses: arduino/setup-protoc@v3
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
repo-token: ${{ secrets.GITHUB_TOKEN }}
- name: Install nextest
uses: taiki-e/install-action@nextest
- name: Activate pometry-storage in Cargo.toml
run: make pull-storage
- name: Run all Tests (disk_graph)
env:
RUSTFLAGS: -Awarnings
RUSTFLAGS: -Awarnings ${{ matrix.flags }}
TEMPDIR: ${{ runner.temp }}
run: |
cargo nextest run --all --no-default-features --features "storage"
Expand Down
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ display-error-chain = "0.2.0"
polars-arrow = "0.39.2"
polars-parquet = "0.39.2"
polars-utils = "0.39.2"
kdam = { version = "0.5.1", features = ["notebook"] }
kdam = { version = "0.5.1" }
pretty_assertions = "1.4.0"
quickcheck = "1.0.3"
quickcheck_macros = "1.0.0"
Expand Down
7 changes: 0 additions & 7 deletions examples/rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,3 @@ name = "crypto"

[[bin]]
name = "pokec"

[target.x86_64-unknown-linux-gnu]
linker = "/usr/bin/clang"
rustflags = ["-Clink-arg=-fuse-ld=lld", "-Clink-arg=-Wl,--no-rosegment"]

[profile.release]
debug = true
2 changes: 1 addition & 1 deletion pometry-storage-private
2 changes: 0 additions & 2 deletions python/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
#![allow(non_local_definitions)]

extern crate core;
use pyo3::prelude::*;
use raphtory_core::python::packages::base_modules::{
Expand Down
2 changes: 1 addition & 1 deletion raphtory-cypher/examples/raphtory_cypher.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ mod cypher {
use arrow::util::pretty::print_batches;
use clap::Parser;
use futures::{stream, StreamExt};
use raphtory::disk_graph::graph_impl::{DiskGraph, ParquetLayerCols};
use raphtory::disk_graph::{graph_impl::ParquetLayerCols, DiskGraph};
use raphtory_cypher::{run_cypher, run_cypher_to_streams, run_sql};
use serde::{de::DeserializeOwned, Deserialize};

Expand Down
4 changes: 2 additions & 2 deletions raphtory-cypher/src/executor/table_provider/edge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ use datafusion::{
};
use futures::Stream;
use pometry_storage::prelude::*;
use raphtory::disk_graph::graph_impl::DiskGraph;
use raphtory::disk_graph::DiskGraph;

use crate::executor::{arrow2_to_arrow_buf, ExecError};

Expand Down Expand Up @@ -99,7 +99,7 @@ impl EdgeListTableProvider {

fn lift_nested_arrow_schema(graph: &DiskGraph, layer_id: usize) -> Result<Arc<Schema>, ExecError> {
let arrow2_fields = graph.as_ref().layer(layer_id).edges_data_type();
let a2_dt = crate::arrow2::datatypes::ArrowDataType::Struct(arrow2_fields.clone());
let a2_dt = crate::arrow2::datatypes::ArrowDataType::Struct(arrow2_fields.to_vec());
let a_dt: DataType = a2_dt.into();
let schema = match a_dt {
DataType::Struct(fields) => {
Expand Down
2 changes: 1 addition & 1 deletion raphtory-cypher/src/executor/table_provider/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use futures::Stream;
use pometry_storage::properties::ConstProps;
use raphtory::{
core::entities::VID,
disk_graph::{graph_impl::DiskGraph, prelude::*},
disk_graph::{prelude::*, DiskGraph},
};
use std::{any::Any, fmt::Formatter, sync::Arc};

Expand Down
5 changes: 2 additions & 3 deletions raphtory-cypher/src/hop/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,16 @@ use datafusion::{
use datafusion::physical_expr::Partitioning;
use futures::{Stream, StreamExt};

use crate::take_record_batch;
use pometry_storage::graph_fragment::TempColGraphFragment;
use raphtory::{
core::{entities::VID, Direction},
disk_graph::{
graph_impl::DiskGraph,
prelude::{ArrayOps, BaseArrayOps, PrimitiveCol},
DiskGraph,
},
};

use crate::take_record_batch;

use super::operator::HopPlan;

#[derive(Debug)]
Expand Down
2 changes: 1 addition & 1 deletion raphtory-cypher/src/hop/operator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use datafusion::{
logical_expr::{Expr, LogicalPlan, TableScan, UserDefinedLogicalNodeCore},
};

use raphtory::{core::Direction, disk_graph::graph_impl::DiskGraph};
use raphtory::{core::Direction, disk_graph::DiskGraph};

#[derive(Debug, PartialEq, Hash, Eq)]
pub struct HopPlan {
Expand Down
7 changes: 3 additions & 4 deletions raphtory-cypher/src/hop/rule.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::sync::Arc;

use crate::hop::operator::HopPlan;
use async_trait::async_trait;
use datafusion::{
common::Column,
Expand All @@ -10,9 +11,7 @@ use datafusion::{
physical_plan::ExecutionPlan,
physical_planner::{DefaultPhysicalPlanner, ExtensionPlanner, PhysicalPlanner},
};
use raphtory::{core::Direction, disk_graph::graph_impl::DiskGraph};

use crate::hop::operator::HopPlan;
use raphtory::{core::Direction, disk_graph::DiskGraph};

use super::execution::HopExec;

Expand Down Expand Up @@ -141,7 +140,7 @@ impl ExtensionPlanner for HopPlanner {
#[cfg(test)]
mod test {
use arrow::util::pretty::print_batches;
use raphtory::disk_graph::graph_impl::DiskGraph;
use raphtory::disk_graph::DiskGraph;
use tempfile::tempdir;

use crate::prepare_plan;
Expand Down
8 changes: 3 additions & 5 deletions raphtory-cypher/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ mod cypher {
parser::ast::*,
*,
};
use raphtory::disk_graph::graph_impl::DiskGraph;
use raphtory::disk_graph::DiskGraph;

use crate::{
executor::table_provider::node::NodeTableProvider,
Expand Down Expand Up @@ -185,9 +185,8 @@ mod cypher {
use arrow_array::RecordBatch;
use tempfile::tempdir;

use raphtory::{disk_graph::graph_impl::DiskGraph, prelude::*};

use crate::run_cypher;
use raphtory::{disk_graph::DiskGraph, prelude::*};

lazy_static::lazy_static! {
static ref EDGES: Vec<(u64, u64, i64, f64)> = vec![
Expand Down Expand Up @@ -278,10 +277,9 @@ mod cypher {
datatypes::*,
};
use arrow::util::pretty::print_batches;
use raphtory::disk_graph::{graph_impl::ParquetLayerCols, DiskGraph};
use tempfile::tempdir;

use raphtory::disk_graph::graph_impl::{DiskGraph, ParquetLayerCols};

use crate::run_cypher;

fn schema() -> ArrowSchema {
Expand Down
Loading

0 comments on commit a9fe913

Please sign in to comment.