From 62a13b2d246efe39c8884f7c619948da33c2f770 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Wed, 20 Mar 2024 19:21:36 +0100 Subject: [PATCH 1/7] Split into binary and library --- Cargo.toml | 12 ++-- src/bin.rs | 154 +++++++++++++++++++++++++++++++++++++++++++ src/configuration.rs | 81 ----------------------- src/lib.rs | 51 ++------------ src/main.rs | 21 ------ 5 files changed, 165 insertions(+), 154 deletions(-) create mode 100644 src/bin.rs delete mode 100644 src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 0e61d02..2aa631a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,16 @@ [package] name = "speedytree" version = "0.1.0" +authors = ["Curro Campuzano "] edition = "2021" -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lib] +name = "speedytree" +path = "src/lib.rs" +[[bin]] +name = "speedytree" +path = "src/bin.rs" [dependencies] bit-set = "0.5.3" bit-vec = "0.6.3" @@ -17,7 +23,3 @@ petgraph = "0.6.4" rand = "0.8.5" rayon = "1.8.0" rb_tree = "0.5.0" -[profile.release] -lto = true -codegen-units = 1 -panic = "abort" diff --git a/src/bin.rs b/src/bin.rs new file mode 100644 index 0000000..9216931 --- /dev/null +++ b/src/bin.rs @@ -0,0 +1,154 @@ +use clap::Parser; +/// # speedytree +/// `speedytree` is a command line tool for quickly creating a directory tree. +/// It is a Rust implementation of the `tree` command line tool. +/// It is intended to be a drop-in replacement for the `tree` command. +/// It is not intended to be a complete implementation of the `tree` command. +/// It is intended to be a fast implementation of the `tree` command. +use hybrid_nj::neighbor_joining; +use speedytree::hybrid_nj; + +use speedytree::distances::DistanceMatrix; +use speedytree::naive_nj::naive_neighbor_joining; +use speedytree::newick::to_newick; +use speedytree::rapid_nj::rapid_nj; +use std::{ + error, + io::{self, Write}, + process, +}; +type ResultBox = std::result::Result>; + + + +/// Define the configuration of the program +/// It contains the algorithm to use and the number of threads to use +/// +#[derive(Debug)] +pub struct Config { + pub(crate) algo: Algorithm, + pub(crate) threads: usize, + pub(crate) chunk_size: usize, + pub(crate) naive_percentage: usize, +} + +impl Config { + /// Build the configuration from the command line arguments + pub fn build(args: Args) -> ResultBox { + // Let match the algorithm, if not specified, use Naive + let algo = if args.naive { + Algorithm::Naive + } else if args.rapidnj { + Algorithm::RapidNJ + } else { + Algorithm::Hybrid + }; + let cores = args.cores; + let chunk_size = args.chunk_size; + // If chunk size is 0, error + if chunk_size == 0 { + return Err("Chunk size cannot be 0".into()); + } + let naive_percentage = args.naive_percentage; + // If naive percentage is 0, error + if naive_percentage == 0 { + return Err("Naive percentage cannot be 0".into()); + } + // If naive percentage is 100, error + if naive_percentage == 100 { + return Err("Naive percentage cannot be 100".into()); + } + Ok(Config { + algo, + threads: cores, + chunk_size, + naive_percentage, + }) + } +} + +/// Define the command line arguments +#[derive(Parser, Debug)] +#[command(author, version, about, long_about = None)] +pub struct Args { + /// Use the rapidnj heuristic + #[arg(long, conflicts_with = "hybrid", conflicts_with = "naive")] + rapidnj: bool, + /// Use the naive algorithm + #[arg(long, conflicts_with = "hybrid", conflicts_with = "rapidnj")] + naive: bool, + /// Use the hybrid heuristic + #[arg(long, conflicts_with = "rapidnj", conflicts_with = "naive")] + hybrid: bool, + /// Number of cores to use + /// Default: 1 + #[arg(short, long, default_value = "1")] + cores: usize, + /// Chunk size to be handled by each thread + /// Default: 30 + #[arg(long, default_value = "30", conflicts_with = "naive")] + chunk_size: usize, + /// Percentage of the matrix to be handled by the naive algorithm + /// Default: 90 + #[arg( + long, + default_value = "90", + conflicts_with = "naive", + conflicts_with = "rapidnj" + )] + naive_percentage: usize, +} + +/// Available algorithms in the program +#[derive(Debug, Clone)] +pub enum Algorithm { + /// Naive neighbor joining + Naive, + /// Rapid neighbor joining + RapidNJ, + /// Hybrid neighbor joining + Hybrid, +} +/// Main function of the crate +pub fn run(config: Config) { + rayon::ThreadPoolBuilder::new() + .num_threads(config.threads) + .build_global() + .unwrap(); + + let reader = io::stdin().lock(); + let d = DistanceMatrix::build_from_phylip(reader).unwrap_or_else(|err| { + eprintln!("{err}"); + process::exit(1); + }); + + let d = match config.algo { + Algorithm::Naive => naive_neighbor_joining(d), + Algorithm::RapidNJ => rapid_nj(d, config.chunk_size), + Algorithm::Hybrid => { + let naive_steps = d.size() * config.naive_percentage / 100; + dbg!(naive_steps); + neighbor_joining(d, naive_steps, config.chunk_size) + } + }; + let graph = d.unwrap_or_else(|err| { + eprintln!("{err}"); + process::exit(1); + }); + io::stdout() + .write_all(to_newick(&graph).as_bytes()) + .unwrap_or_else(|err| { + eprintln!("{err}"); + process::exit(1); + }); +} + +fn main() { + let args = Args::parse(); + //dbg!(&args); + let config = Config::build(args).unwrap_or_else(|err| { + eprintln!("Problem parsing arguments: {err}"); + process::exit(1); + }); + run(config); +} diff --git a/src/configuration.rs b/src/configuration.rs index 5479032..e69de29 100644 --- a/src/configuration.rs +++ b/src/configuration.rs @@ -1,81 +0,0 @@ -use crate::Algorithm; - -use super::ResultBox; -use clap::Parser; -/// Define the configuration of the program -/// It contains the algorithm to use and the number of threads to use -/// -#[derive(Debug)] -pub struct Config { - pub(crate) algo: Algorithm, - pub(crate) threads: usize, - pub(crate) chunk_size: usize, - pub(crate) naive_percentage: usize, -} - -impl Config { - /// Build the configuration from the command line arguments - pub fn build(args: Args) -> ResultBox { - // Let match the algorithm, if not specified, use Naive - let algo = if args.naive { - Algorithm::Naive - } else if args.rapidnj { - Algorithm::RapidNJ - } else { - Algorithm::Hybrid - }; - let cores = args.cores; - let chunk_size = args.chunk_size; - // If chunk size is 0, error - if chunk_size == 0 { - return Err("Chunk size cannot be 0".into()); - } - let naive_percentage = args.naive_percentage; - // If naive percentage is 0, error - if naive_percentage == 0 { - return Err("Naive percentage cannot be 0".into()); - } - // If naive percentage is 100, error - if naive_percentage == 100 { - return Err("Naive percentage cannot be 100".into()); - } - Ok(Config { - algo, - threads: cores, - chunk_size, - naive_percentage, - }) - } -} - -/// Define the command line arguments -#[derive(Parser, Debug)] -#[command(author, version, about, long_about = None)] -pub struct Args { - /// Use the rapidnj heuristic - #[arg(long, conflicts_with = "hybrid", conflicts_with = "naive")] - rapidnj: bool, - /// Use the naive algorithm - #[arg(long, conflicts_with = "hybrid", conflicts_with = "rapidnj")] - naive: bool, - /// Use the hybrid heuristic - #[arg(long, conflicts_with = "rapidnj", conflicts_with = "naive")] - hybrid: bool, - /// Number of cores to use - /// Default: 1 - #[arg(short, long, default_value = "1")] - cores: usize, - /// Chunk size to be handled by each thread - /// Default: 30 - #[arg(long, default_value = "30", conflicts_with = "naive")] - chunk_size: usize, - /// Percentage of the matrix to be handled by the naive algorithm - /// Default: 90 - #[arg( - long, - default_value = "90", - conflicts_with = "naive", - conflicts_with = "rapidnj" - )] - naive_percentage: usize, -} diff --git a/src/lib.rs b/src/lib.rs index 9859c9f..98529e3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,10 +20,10 @@ pub mod property_tests; /// Configuration of the program pub mod configuration; -mod distances; -mod naive_nj; -mod newick; -mod rapid_nj; +pub mod distances; +pub mod naive_nj; +pub mod newick; +pub mod rapid_nj; use hybrid_nj::neighbor_joining; use crate::distances::DistanceMatrix; @@ -39,46 +39,3 @@ use std::{ type ResultBox = std::result::Result>; type Tree = petgraph::graph::UnGraph; -/// Available algorithms in the program -#[derive(Debug, Clone)] -pub enum Algorithm { - /// Naive neighbor joining - Naive, - /// Rapid neighbor joining - RapidNJ, - /// Hybrid neighbor joining - Hybrid, -} -/// Main function of the crate -pub fn run(config: configuration::Config) { - rayon::ThreadPoolBuilder::new() - .num_threads(config.threads) - .build_global() - .unwrap(); - - let reader = io::stdin().lock(); - let d = DistanceMatrix::build_from_phylip(reader).unwrap_or_else(|err| { - eprintln!("{err}"); - process::exit(1); - }); - - let d = match config.algo { - Algorithm::Naive => naive_neighbor_joining(d), - Algorithm::RapidNJ => rapid_nj(d, config.chunk_size), - Algorithm::Hybrid => { - let naive_steps = d.size() * config.naive_percentage / 100; - dbg!(naive_steps); - neighbor_joining(d, naive_steps, config.chunk_size) - } - }; - let graph = d.unwrap_or_else(|err| { - eprintln!("{err}"); - process::exit(1); - }); - io::stdout() - .write_all(to_newick(&graph).as_bytes()) - .unwrap_or_else(|err| { - eprintln!("{err}"); - process::exit(1); - }); -} diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 862b1e0..0000000 --- a/src/main.rs +++ /dev/null @@ -1,21 +0,0 @@ -use clap::Parser; -/// # speedytree -/// `speedytree` is a command line tool for quickly creating a directory tree. -/// It is a Rust implementation of the `tree` command line tool. -/// It is intended to be a drop-in replacement for the `tree` command. -/// It is not intended to be a complete implementation of the `tree` command. -/// It is intended to be a fast implementation of the `tree` command. -use speedytree::{ - configuration::{Args, Config}, - run, -}; -use std::process; -fn main() { - let args = Args::parse(); - //dbg!(&args); - let config = Config::build(args).unwrap_or_else(|err| { - eprintln!("Problem parsing arguments: {err}"); - process::exit(1); - }); - run(config); -} From 2f5ab03a46200e4609412389e57975874c35a538 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Wed, 20 Mar 2024 20:08:07 +0100 Subject: [PATCH 2/7] Make a minimal functional API --- Cargo.lock | 7 +++++ Cargo.toml | 1 + src/bin.rs | 7 +++-- src/distances.rs | 8 +++--- src/hybrid_nj/algorithm.rs | 5 ++-- src/lib.rs | 56 ++++++++++++++++++------------------- src/naive_nj/algorithm.rs | 6 ++-- src/naive_nj/mod.rs | 10 +++---- src/naive_nj/phylo_tree.rs | 2 +- src/property_tests/tests.rs | 4 +-- src/rapid_nj/mod.rs | 6 ++-- src/rapid_nj/phylo_tree.rs | 4 +-- 12 files changed, 62 insertions(+), 54 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 61f9ade..a126792 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -50,6 +50,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "anyhow" +version = "1.0.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" + [[package]] name = "autocfg" version = "1.1.0" @@ -406,6 +412,7 @@ checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" name = "speedytree" version = "0.1.0" dependencies = [ + "anyhow", "bit-set", "bit-vec", "bitvec", diff --git a/Cargo.toml b/Cargo.toml index 2aa631a..0cf77b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ path = "src/lib.rs" name = "speedytree" path = "src/bin.rs" [dependencies] +anyhow = "1.0.81" bit-set = "0.5.3" bit-vec = "0.6.3" bitvec = "1.0.1" diff --git a/src/bin.rs b/src/bin.rs index 9216931..73965b1 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -1,3 +1,4 @@ +extern crate speedytree; use clap::Parser; /// # speedytree /// `speedytree` is a command line tool for quickly creating a directory tree. @@ -9,7 +10,7 @@ use hybrid_nj::neighbor_joining; use speedytree::hybrid_nj; use speedytree::distances::DistanceMatrix; -use speedytree::naive_nj::naive_neighbor_joining; +use speedytree::naive_nj::canonical_neighbor_joining; use speedytree::newick::to_newick; use speedytree::rapid_nj::rapid_nj; use std::{ @@ -117,13 +118,13 @@ pub fn run(config: Config) { .unwrap(); let reader = io::stdin().lock(); - let d = DistanceMatrix::build_from_phylip(reader).unwrap_or_else(|err| { + let d = DistanceMatrix::read_from_phylip(reader).unwrap_or_else(|err| { eprintln!("{err}"); process::exit(1); }); let d = match config.algo { - Algorithm::Naive => naive_neighbor_joining(d), + Algorithm::Naive => canonical_neighbor_joining(d), Algorithm::RapidNJ => rapid_nj(d, config.chunk_size), Algorithm::Hybrid => { let naive_steps = d.size() * config.naive_percentage / 100; diff --git a/src/distances.rs b/src/distances.rs index 80894a8..7ee6806 100644 --- a/src/distances.rs +++ b/src/distances.rs @@ -13,7 +13,7 @@ pub struct DistanceMatrix { /// Distance matrix from a phylip file impl DistanceMatrix { - pub fn build_from_phylip(mut reader: R) -> ResultBox + pub fn read_from_phylip(mut reader: R) -> ResultBox where R: io::BufRead, { @@ -41,7 +41,7 @@ impl DistanceMatrix { self.matrix.len() } /// Permutate the distance matrix for testing purposes - pub fn permutate(&mut self) { + pub(crate) fn permutate(&mut self) { let mut rng = rand::thread_rng(); let mut perm = (0..self.size()).collect::>(); perm.shuffle(&mut rng); @@ -59,7 +59,7 @@ impl DistanceMatrix { self.names = new_names; } /// Example from Wikipedia, https://en.wikipedia.org/wiki/Neighbor_joining - pub fn wikipedia_example() -> DistanceMatrix { + pub(crate) fn wikipedia_example() -> DistanceMatrix { DistanceMatrix { matrix: vec![ vec![0.0, 5.0, 9.0, 9.0, 8.0], @@ -95,7 +95,7 @@ D 9.0 10.0 8.0 0.0 " .as_bytes(); // run function - let distance_matrix = DistanceMatrix::build_from_phylip::<&[u8]>(input).unwrap(); + let distance_matrix = DistanceMatrix::read_from_phylip::<&[u8]>(input).unwrap(); // check result assert_eq!( distance_matrix.matrix, diff --git a/src/hybrid_nj/algorithm.rs b/src/hybrid_nj/algorithm.rs index 5f9f212..c7ce16c 100644 --- a/src/hybrid_nj/algorithm.rs +++ b/src/hybrid_nj/algorithm.rs @@ -2,7 +2,6 @@ use crate::{ distances::DistanceMatrix, naive_nj::DataNaiveNJ, rapid_nj::DataRapidNJ, ResultBox, Tree, }; -/// Main function of the crate /// This approach is a hybrid between the naive neighbor joining and the rapid neighbor joining. /// If `naive_iters` is greater than n, then this function calls `naive_neighbor_joining` instead. /// If `naive_iters` is less than 4, then this function calls `rapid_nj` instead. @@ -19,10 +18,10 @@ pub fn neighbor_joining( chunk_size: usize, ) -> ResultBox { if dist.size() < 4 || naive_iters >= dist.size() { - return crate::naive_neighbor_joining(dist); + return crate::naive_nj::canonical_neighbor_joining(dist); } if naive_iters < 4 { - return crate::rapid_nj(dist, chunk_size); + return crate::rapid_nj::rapid_nj(dist, chunk_size); } let mut q = crate::rapid_nj::QMatrix::from(&dist); let mut t = crate::rapid_nj::PhyloTree::build(&dist.names); diff --git a/src/lib.rs b/src/lib.rs index 98529e3..cfe4313 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,41 +1,41 @@ -//! speedytree: Command line tool for Neighbor Joining of biological sequences -//! -//! It implements different heuristics for fast Neighbor-Joining. -//! -//! 1. Naive Neighbor-Joining -//! 2. RapidNJ -//! 3. Hybrid +//! Canonical and RapidNJ implementations of Neighbor-joining in Rust //! +//! Provides Rust implementation of the Canonical algorithm and something in the spirit of RapidNJ but with B-trees. Work in progress. +//! A minimal example is provided here. +//! ``` +//! use speedytree::distances::DistanceMatrix; +//! use speedytree::canonical_neighbor_joining; +//! use speedytree::rapid_nj_neighbor_joining; +//! use speedytree::robinson_foulds; +//!// Raw Phylip format +//!let input = "5 +//! a 0 5 9 9 8 +//! b 5 0 10 10 9 +//! c 9 10 0 8 7 +//! d 9 10 8 0 3 +//! e 8 9 7 3 0 +//!".as_bytes();; +//!let distances = DistanceMatrix::read_from_phylip(input).unwrap(); +//! // Use canonical algorithm +//!let tree1 = canonical_neighbor_joining(distances.clone()).unwrap(); +//! // Use RapidNJ with b-trees- +//!let tree2 = rapid_nj_neighbor_joining(distances.clone(), 2).unwrap(); +//! assert_eq!(robinson_foulds(tree1, tree2, 5), 0); +//! ``` -#![warn(missing_docs)] -/// Hybrid neighbor joining algorithm -/// This approach is a hybrid between the naive neighbor joining and the rapid neighbor joining. -/// The idea is to use the rapidnj heuristic first to potentially stop the algorithm early, -/// and then use the naive neighbor joining to finish the algorithm, which is faster -/// in practice, but performs more comparisons in theory. -/// However, both algorithms are O(n^3), so the difference is not that big. pub mod hybrid_nj; /// Property tests for neighbor joining algorithm pub mod property_tests; - -/// Configuration of the program -pub mod configuration; pub mod distances; pub mod naive_nj; pub mod newick; pub mod rapid_nj; -use hybrid_nj::neighbor_joining; - -use crate::distances::DistanceMatrix; -use crate::naive_nj::naive_neighbor_joining; -use crate::newick::to_newick; -use crate::rapid_nj::rapid_nj; -use std::{ - error, - io::{self, Write}, - process, -}; +use std::error; type ResultBox = std::result::Result>; type Tree = petgraph::graph::UnGraph; +pub use naive_nj::canonical_neighbor_joining as canonical_neighbor_joining; +pub use rapid_nj::rapid_nj as rapid_nj_neighbor_joining; +pub use property_tests::tree_distances::robinson_foulds as robinson_foulds; +pub use property_tests::tree_distances::branch_score as branch_score; \ No newline at end of file diff --git a/src/naive_nj/algorithm.rs b/src/naive_nj/algorithm.rs index 046ea8a..d479f83 100644 --- a/src/naive_nj/algorithm.rs +++ b/src/naive_nj/algorithm.rs @@ -2,7 +2,7 @@ use crate::{distances::DistanceMatrix, ResultBox, Tree}; use super::{phylo_tree::PhyloTree, qmatrix::QMatrix}; -pub fn naive_neighbor_joining(dist: DistanceMatrix) -> ResultBox { +pub fn canonical_neighbor_joining(dist: DistanceMatrix) -> ResultBox { let mut t = PhyloTree::build(&dist.names); let mut q = QMatrix::build(dist); while q.n_leaves() > 3 { @@ -16,7 +16,7 @@ pub fn naive_neighbor_joining(dist: DistanceMatrix) -> ResultBox { Ok(terminate_nj(t, q)) } -pub fn terminate_nj(tree: PhyloTree, q: QMatrix) -> Tree { +pub(crate) fn terminate_nj(tree: PhyloTree, q: QMatrix) -> Tree { let (i, j, m) = (tree.nodes[&0], tree.nodes[&1], tree.nodes[&2]); let mut tree = tree.tree; @@ -54,7 +54,7 @@ mod tests { ], }; - let phylo = naive_neighbor_joining(d); + let phylo = canonical_neighbor_joining(d); assert!(phylo.is_ok()); let tree = phylo.unwrap(); diff --git a/src/naive_nj/mod.rs b/src/naive_nj/mod.rs index 44964da..add5d66 100644 --- a/src/naive_nj/mod.rs +++ b/src/naive_nj/mod.rs @@ -5,11 +5,11 @@ mod qmatrix; // PhyloTree is a helper struct for the Naive Neighbor Joining algorithm. mod phylo_tree; // Export the public interface of the Naive Neighbor Joining algorithm. -pub use algorithm::naive_neighbor_joining; -pub use algorithm::terminate_nj; -pub use phylo_tree::PhyloTree; -pub use qmatrix::QMatrix; -pub struct DataNaiveNJ { +pub use algorithm::canonical_neighbor_joining; +pub(crate) use algorithm::terminate_nj; +pub(crate) use phylo_tree::PhyloTree; +pub(crate) use qmatrix::QMatrix; +pub(crate) struct DataNaiveNJ { pub qmatrix: qmatrix::QMatrix, pub phylo_tree: phylo_tree::PhyloTree, } diff --git a/src/naive_nj/phylo_tree.rs b/src/naive_nj/phylo_tree.rs index 8a2055c..3eb1fdd 100644 --- a/src/naive_nj/phylo_tree.rs +++ b/src/naive_nj/phylo_tree.rs @@ -20,7 +20,7 @@ impl PhyloTree { nodes, } } - pub fn build(leafs: &Vec) -> PhyloTree { + pub fn build(leafs: &[String]) -> PhyloTree { let mut tree: petgraph::Graph = UnGraph::new_undirected(); let mut nodes = HashMap::new(); diff --git a/src/property_tests/tests.rs b/src/property_tests/tests.rs index bf5e7f5..c106cc5 100644 --- a/src/property_tests/tests.rs +++ b/src/property_tests/tests.rs @@ -8,7 +8,7 @@ fn assert_equal_tree(a: &crate::Tree, b: &crate::Tree, i: usize) { #[test] fn test_random_additive_binary_trees_naive() { use crate::{ - naive_nj::naive_neighbor_joining, + naive_nj::canonical_neighbor_joining, property_tests::random_additive_tree::{ distance_matrix_from_tree, random_unrooted_binary_tree, }, @@ -16,7 +16,7 @@ fn test_random_additive_binary_trees_naive() { for i in 4..20 { let original_tree = random_unrooted_binary_tree(i); let d = distance_matrix_from_tree(original_tree.clone()); - let tree = naive_neighbor_joining(d).unwrap(); + let tree = canonical_neighbor_joining(d).unwrap(); assert_equal_tree(&original_tree, &tree, i) } } diff --git a/src/rapid_nj/mod.rs b/src/rapid_nj/mod.rs index a79cf8b..b649fe8 100644 --- a/src/rapid_nj/mod.rs +++ b/src/rapid_nj/mod.rs @@ -3,10 +3,10 @@ mod node; mod phylo_tree; mod qmatrix; pub use algorithm::rapid_nj; -pub use phylo_tree::PhyloTree; -pub use qmatrix::QMatrix; +pub(crate) use phylo_tree::PhyloTree; +pub(crate) use qmatrix::QMatrix; -pub struct DataRapidNJ { +pub(crate) struct DataRapidNJ { pub qmatrix: QMatrix, pub phylo_tree: phylo_tree::PhyloTree, } diff --git a/src/rapid_nj/phylo_tree.rs b/src/rapid_nj/phylo_tree.rs index 636d4a4..37b2f17 100644 --- a/src/rapid_nj/phylo_tree.rs +++ b/src/rapid_nj/phylo_tree.rs @@ -3,14 +3,14 @@ use std::collections::HashMap; use petgraph::{graph::UnGraph, stable_graph::NodeIndex}; #[derive(Debug, Clone)] -pub struct PhyloTree { +pub(crate) struct PhyloTree { pub tree: crate::Tree, pub nodes: HashMap, n_nodes: usize, } impl PhyloTree { - pub fn build(leafs: &Vec) -> PhyloTree { + pub fn build(leafs: &[String]) -> PhyloTree { let mut tree: petgraph::Graph = UnGraph::new_undirected(); let mut nodes = HashMap::new(); From b38083c8e4a4c4f10f69926f2f3a10ef55a7d0e4 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Thu, 21 Mar 2024 13:10:38 +0100 Subject: [PATCH 3/7] Remove unnecesary public methods --- src/distances.rs | 41 ++++--------------------------------- src/hybrid_nj/mod.rs | 18 ++++++++++++++-- src/property_tests/tests.rs | 5 +---- 3 files changed, 21 insertions(+), 43 deletions(-) diff --git a/src/distances.rs b/src/distances.rs index 7ee6806..10e4578 100644 --- a/src/distances.rs +++ b/src/distances.rs @@ -1,5 +1,3 @@ -use rand::seq::SliceRandom; - use crate::ResultBox; use std::io::{self}; /// Distance matrix struct @@ -40,42 +38,11 @@ impl DistanceMatrix { pub fn size(&self) -> usize { self.matrix.len() } - /// Permutate the distance matrix for testing purposes - pub(crate) fn permutate(&mut self) { - let mut rng = rand::thread_rng(); - let mut perm = (0..self.size()).collect::>(); - perm.shuffle(&mut rng); - let mut new_matrix = vec![vec![0.0; self.size()]; self.size()]; - for i in 0..self.size() { - for j in 0..self.size() { - new_matrix[i][j] = self.matrix[perm[i]][perm[j]]; - } - } - self.matrix = new_matrix; - let mut new_names = vec![String::new(); self.size()]; - for i in 0..self.size() { - new_names[i] = self.names[perm[i]].clone(); - } - self.names = new_names; - } - /// Example from Wikipedia, https://en.wikipedia.org/wiki/Neighbor_joining - pub(crate) fn wikipedia_example() -> DistanceMatrix { - DistanceMatrix { - matrix: vec![ - vec![0.0, 5.0, 9.0, 9.0, 8.0], - vec![5.0, 0.0, 10.0, 10.0, 9.0], - vec![9.0, 10.0, 0.0, 8.0, 7.0], - vec![9.0, 10.0, 8.0, 0.0, 3.0], - vec![8.0, 9.0, 7.0, 3.0, 0.0], - ], - names: vec![ - "A".to_string(), - "B".to_string(), - "C".to_string(), - "D".to_string(), - "E".to_string(), - ], + pub fn build(matrix: Vec>, names: Vec) -> ResultBox{ + if matrix.len() != names.len() { + return Err("Matrix and names have different lengths".into()); } + Ok(DistanceMatrix { matrix, names }) } } diff --git a/src/hybrid_nj/mod.rs b/src/hybrid_nj/mod.rs index 8747773..616b528 100644 --- a/src/hybrid_nj/mod.rs +++ b/src/hybrid_nj/mod.rs @@ -5,11 +5,25 @@ pub use algorithm::neighbor_joining; #[cfg(test)] mod tests { use crate::distances::DistanceMatrix; - use super::*; #[test] fn test_example_wikipedia() { - let d = DistanceMatrix::wikipedia_example(); + let d = DistanceMatrix { + matrix: vec![ + vec![0.0, 5.0, 9.0, 9.0, 8.0], + vec![5.0, 0.0, 10.0, 10.0, 9.0], + vec![9.0, 10.0, 0.0, 8.0, 7.0], + vec![9.0, 10.0, 8.0, 0.0, 3.0], + vec![8.0, 9.0, 7.0, 3.0, 0.0], + ], + names: vec![ + "A".to_string(), + "B".to_string(), + "C".to_string(), + "D".to_string(), + "E".to_string(), + ], + }; let phylo = neighbor_joining(d, 4, 1); assert!(phylo.is_ok()); let tree = phylo.unwrap(); diff --git a/src/property_tests/tests.rs b/src/property_tests/tests.rs index c106cc5..b1d3c78 100644 --- a/src/property_tests/tests.rs +++ b/src/property_tests/tests.rs @@ -5,6 +5,7 @@ fn assert_equal_tree(a: &crate::Tree, b: &crate::Tree, i: usize) { assert!(petgraph::algo::is_isomorphic(&a.clone(), &b)); assert!(branch_score(a.clone(), b.clone(), i) < f64::EPSILON); } + #[test] fn test_random_additive_binary_trees_naive() { use crate::{ @@ -41,10 +42,6 @@ fn test_random_additive_binary_trees_mix() { use crate::property_tests::random_additive_tree::{ distance_matrix_from_tree, random_unrooted_binary_tree, }; - - let original_tree = random_unrooted_binary_tree(20); - let mut d: crate::distances::DistanceMatrix = distance_matrix_from_tree(original_tree.clone()); - d.permutate(); for i in (25..100).step_by(25) { let original_tree = random_unrooted_binary_tree(i); let d: crate::distances::DistanceMatrix = distance_matrix_from_tree(original_tree.clone()); From 50d902f12dca4385023c93c0f7c7ee0c904d9874 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Thu, 21 Mar 2024 14:51:26 +0100 Subject: [PATCH 4/7] Improve public API --- Cargo.lock | 7 -- Cargo.toml | 1 - src/lib.rs | 111 +++++++++++++++++++++++---- src/property_tests/tests.rs | 12 +-- src/property_tests/tree_distances.rs | 34 ++++++-- 5 files changed, 128 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a126792..61f9ade 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -50,12 +50,6 @@ dependencies = [ "windows-sys", ] -[[package]] -name = "anyhow" -version = "1.0.81" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" - [[package]] name = "autocfg" version = "1.1.0" @@ -412,7 +406,6 @@ checksum = "942b4a808e05215192e39f4ab80813e599068285906cc91aa64f923db842bd5a" name = "speedytree" version = "0.1.0" dependencies = [ - "anyhow", "bit-set", "bit-vec", "bitvec", diff --git a/Cargo.toml b/Cargo.toml index 0cf77b8..2aa631a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,6 @@ path = "src/lib.rs" name = "speedytree" path = "src/bin.rs" [dependencies] -anyhow = "1.0.81" bit-set = "0.5.3" bit-vec = "0.6.3" bitvec = "1.0.1" diff --git a/src/lib.rs b/src/lib.rs index cfe4313..d220ce1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,10 +3,10 @@ //! Provides Rust implementation of the Canonical algorithm and something in the spirit of RapidNJ but with B-trees. Work in progress. //! A minimal example is provided here. //! ``` -//! use speedytree::distances::DistanceMatrix; -//! use speedytree::canonical_neighbor_joining; -//! use speedytree::rapid_nj_neighbor_joining; +//! use speedytree::DistanceMatrix; //! use speedytree::robinson_foulds; +//! use speedytree::{NeighborJoiningSolver, Canonical, RapidBtrees, Hybrid}; +//! use rayon; //!// Raw Phylip format //!let input = "5 //! a 0 5 9 9 8 @@ -15,12 +15,30 @@ //! d 9 10 8 0 3 //! e 8 9 7 3 0 //!".as_bytes();; -//!let distances = DistanceMatrix::read_from_phylip(input).unwrap(); -//! // Use canonical algorithm -//!let tree1 = canonical_neighbor_joining(distances.clone()).unwrap(); -//! // Use RapidNJ with b-trees- -//!let tree2 = rapid_nj_neighbor_joining(distances.clone(), 2).unwrap(); -//! assert_eq!(robinson_foulds(tree1, tree2, 5), 0); +//!let d = DistanceMatrix::read_from_phylip(input).unwrap(); +//! // Canonical Neighbor Joining. Optimal for small problems. +//! let tree1 = NeighborJoiningSolver::::default(d.clone()) +//! .solve() +//! .unwrap(); +//! // An algorithm in the spirit of RapidNJ using B-trees. Optimal for big problems. +//! let tree2 = NeighborJoiningSolver::::default(d.clone()) +//! .solve() +//! .unwrap(); +//! assert_eq!(robinson_foulds(tree1, tree2), 0); +//! +//! // You can improve the speed a lot by using multiple threads (see rayon::ThreadPoolBuilder::new()) +//! // If so, you may want to tune the chunk size every worker uses +//! let tree3 = NeighborJoiningSolver::::default(d.clone()) +//! .set_chunk_size(4) +//! .solve() +//! .unwrap(); +//! // An algorithm that starts using RapidBtrees, later uses Canonical +//! // Optimal for intermediate problems (only if you tune the number of "canonical steps") +//! let tree4 = NeighborJoiningSolver::::default(d.clone()) +//! .set_canonical_steps(2) +//! .solve() +//! .unwrap(); +//! assert_eq!(robinson_foulds(tree3, tree4), 0); //! ``` pub mod hybrid_nj; @@ -31,11 +49,74 @@ pub mod naive_nj; pub mod newick; pub mod rapid_nj; use std::error; - +pub use distances::DistanceMatrix; +pub use property_tests::tree_distances::{branch_score, robinson_foulds}; type ResultBox = std::result::Result>; -type Tree = petgraph::graph::UnGraph; +pub type Tree = petgraph::graph::UnGraph; + +pub struct NeighborJoiningSolver {algo: U, dist: DistanceMatrix} +pub struct Canonical{} +impl NeighborJoiningSolver { + pub fn build(dist: DistanceMatrix) -> Self{ + NeighborJoiningSolver { algo: Canonical {}, dist: dist} + } + pub fn default(dist: DistanceMatrix) -> Self{ + Self::build(dist) + } + pub fn solve(self) -> ResultBox { + naive_nj::canonical_neighbor_joining(self.dist) + } +} +pub struct RapidBtrees{chunk_size: usize} +impl NeighborJoiningSolver { + pub fn build(dist: DistanceMatrix, chunk_size: usize) -> Self{ + NeighborJoiningSolver { algo: RapidBtrees {chunk_size}, dist: dist} + } + pub fn default(dist: DistanceMatrix) -> Self { + let n = dist.size(); + let threads = rayon::current_num_threads(); + let chunk_size = std::cmp::max(n/threads, 1); + if chunk_size == 0 { + panic!("Chunk size cannot be zero."); + } + NeighborJoiningSolver { algo: RapidBtrees {chunk_size}, dist: dist} + } + pub fn set_chunk_size(self, chunk_size: usize) -> Self { + if chunk_size < 1 { + panic!("Chunk size must be > 0."); + } + Self::build(self.dist, chunk_size) + } + pub fn solve(self) -> ResultBox { + rapid_nj::rapid_nj(self.dist, self.algo.chunk_size) + } +} -pub use naive_nj::canonical_neighbor_joining as canonical_neighbor_joining; -pub use rapid_nj::rapid_nj as rapid_nj_neighbor_joining; -pub use property_tests::tree_distances::robinson_foulds as robinson_foulds; -pub use property_tests::tree_distances::branch_score as branch_score; \ No newline at end of file +pub struct Hybrid{chunk_size: usize, canonical_iters: usize} +impl NeighborJoiningSolver { + pub fn build(dist: DistanceMatrix, chunk_size: usize, canonical_iters: usize) -> Self{ + NeighborJoiningSolver { algo: Hybrid {chunk_size, canonical_iters}, dist: dist} + } + pub fn default(dist: DistanceMatrix) -> Self { + let n = dist.size(); + let threads = rayon::current_num_threads(); + let chunk_size = std::cmp::max(n/threads, 1); + let canonical_iters = std::cmp::max(n/2, 1); + NeighborJoiningSolver { algo: Hybrid {chunk_size, canonical_iters}, dist: dist} + } + pub fn solve(self) -> ResultBox { + hybrid_nj::neighbor_joining(self.dist, self.algo.canonical_iters, self.algo.chunk_size) + } + pub fn set_chunk_size(self, chunk_size: usize) -> Self { + if chunk_size < 1 { + panic!("Chunk size must be > 0."); + } + Self::build(self.dist, chunk_size, self.algo.canonical_iters) + } + pub fn set_canonical_steps(self, n: usize) -> Self { + if n < 1 { + panic!("n must be > 0."); + } + Self::build(self.dist, self.algo.chunk_size, n) + } +} diff --git a/src/property_tests/tests.rs b/src/property_tests/tests.rs index b1d3c78..1294a5c 100644 --- a/src/property_tests/tests.rs +++ b/src/property_tests/tests.rs @@ -1,9 +1,9 @@ #[cfg(test)] -fn assert_equal_tree(a: &crate::Tree, b: &crate::Tree, i: usize) { +fn assert_equal_tree(a: &crate::Tree, b: &crate::Tree) { use crate::property_tests::tree_distances::{branch_score, robinson_foulds}; - assert_eq!(robinson_foulds(a.clone(), b.clone(), i), 0); + assert_eq!(robinson_foulds(a.clone(), b.clone()), 0); assert!(petgraph::algo::is_isomorphic(&a.clone(), &b)); - assert!(branch_score(a.clone(), b.clone(), i) < f64::EPSILON); + assert!(branch_score(a.clone(), b.clone()) < f64::EPSILON); } #[test] @@ -18,7 +18,7 @@ fn test_random_additive_binary_trees_naive() { let original_tree = random_unrooted_binary_tree(i); let d = distance_matrix_from_tree(original_tree.clone()); let tree = canonical_neighbor_joining(d).unwrap(); - assert_equal_tree(&original_tree, &tree, i) + assert_equal_tree(&original_tree, &tree) } } #[test] @@ -32,7 +32,7 @@ fn test_random_additive_binary_trees_rapid() { let d = distance_matrix_from_tree(original_tree.clone()); let chunk_size = rand::random::() % (i + 1) + 1; let tree = rapid_nj(d, chunk_size).unwrap(); - assert_equal_tree(&original_tree, &tree, i) + assert_equal_tree(&original_tree, &tree) } } @@ -49,7 +49,7 @@ fn test_random_additive_binary_trees_mix() { let naive_steps = rand::random::() % (i + 1); let chunk_size = rand::random::() % (i + 1) + 1; let tree = neighbor_joining(d.clone(), naive_steps, chunk_size).unwrap(); - assert_equal_tree(&original_tree, &tree, i) + assert_equal_tree(&original_tree, &tree) } } } diff --git a/src/property_tests/tree_distances.rs b/src/property_tests/tree_distances.rs index 03a47ab..eb3c7e7 100644 --- a/src/property_tests/tree_distances.rs +++ b/src/property_tests/tree_distances.rs @@ -5,8 +5,22 @@ use bit_vec::BitVec; use petgraph::stable_graph::EdgeIndex; use crate::Tree; + +fn count_leaves(x: &Tree) -> usize { + let mut leaf_count = 0; + for node in x.node_indices() { + if x.neighbors(node).count() == 1 { + leaf_count += 1; + } + } + leaf_count +} + /// Calculate the Branch-Score distance between two trees -pub fn branch_score(a: Tree, b: Tree, n_leaves: usize) -> f64 { +pub fn branch_score(a: Tree, b: Tree) -> f64 { + let n_leaves = (count_leaves(&a), count_leaves(&b)); + assert_eq!(n_leaves.0, n_leaves.1); + let n_leaves = n_leaves.0; let mut bits_a = HashMap::new(); let mut bits_b = HashMap::new(); a.edge_indices() @@ -36,8 +50,12 @@ pub fn branch_score(a: Tree, b: Tree, n_leaves: usize) -> f64 { distance } + /// Calculate the Robinson-Foulds distance between two trees -pub fn robinson_foulds(a: Tree, b: Tree, n_leaves: usize) -> usize { +pub fn robinson_foulds(a: Tree, b: Tree) -> usize { + let n_leaves = (count_leaves(&a), count_leaves(&b)); + assert_eq!(n_leaves.0, n_leaves.1); + let n_leaves = n_leaves.0; let bits_a: HashSet = HashSet::from_iter( a.edge_indices() .map(|edge| collect_bit_vector(&a, edge, n_leaves)), @@ -146,12 +164,12 @@ mod tests { t2.add_edge(d, u, 1.0); t2.add_edge(v, u, 1.0); } - assert_eq!(robinson_foulds(t1.clone(), t2.clone(), 4), 2); - assert_eq!(robinson_foulds(t1.clone(), t1.clone(), 4), 0); - assert_eq!(robinson_foulds(t2.clone(), t2.clone(), 4), 0); + assert_eq!(robinson_foulds(t1.clone(), t2.clone()), 2); + assert_eq!(robinson_foulds(t1.clone(), t1.clone()), 0); + assert_eq!(robinson_foulds(t2.clone(), t2.clone()), 0); // - assert_eq!(branch_score(t1.clone(), t2.clone(), 4), 112.0); - assert_eq!(branch_score(t1.clone(), t1, 4), 0.0); - assert_eq!(branch_score(t2.clone(), t2, 4), 0.0); + assert_eq!(branch_score(t1.clone(), t2.clone()), 112.0); + assert_eq!(branch_score(t1.clone(), t1), 0.0); + assert_eq!(branch_score(t2.clone(), t2), 0.0); } } From 8a1e754c6403574785caa80d7bb53853a1da3963 Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Thu, 21 Mar 2024 14:56:32 +0100 Subject: [PATCH 5/7] Pass by reference --- src/lib.rs | 4 ++-- src/property_tests/tests.rs | 6 +++--- src/property_tests/tree_distances.rs | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d220ce1..0f3d6bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,7 @@ //! let tree2 = NeighborJoiningSolver::::default(d.clone()) //! .solve() //! .unwrap(); -//! assert_eq!(robinson_foulds(tree1, tree2), 0); +//! assert_eq!(robinson_foulds(&tree1, &tree2), 0); //! //! // You can improve the speed a lot by using multiple threads (see rayon::ThreadPoolBuilder::new()) //! // If so, you may want to tune the chunk size every worker uses @@ -38,7 +38,7 @@ //! .set_canonical_steps(2) //! .solve() //! .unwrap(); -//! assert_eq!(robinson_foulds(tree3, tree4), 0); +//! assert_eq!(robinson_foulds(&tree3, &tree4), 0); //! ``` pub mod hybrid_nj; diff --git a/src/property_tests/tests.rs b/src/property_tests/tests.rs index 1294a5c..c36022c 100644 --- a/src/property_tests/tests.rs +++ b/src/property_tests/tests.rs @@ -1,9 +1,9 @@ #[cfg(test)] fn assert_equal_tree(a: &crate::Tree, b: &crate::Tree) { use crate::property_tests::tree_distances::{branch_score, robinson_foulds}; - assert_eq!(robinson_foulds(a.clone(), b.clone()), 0); - assert!(petgraph::algo::is_isomorphic(&a.clone(), &b)); - assert!(branch_score(a.clone(), b.clone()) < f64::EPSILON); + assert_eq!(robinson_foulds(&a, &b), 0); + assert!(petgraph::algo::is_isomorphic(&a, &b)); + assert!(branch_score(&a, &b) < f64::EPSILON); } #[test] diff --git a/src/property_tests/tree_distances.rs b/src/property_tests/tree_distances.rs index eb3c7e7..7821521 100644 --- a/src/property_tests/tree_distances.rs +++ b/src/property_tests/tree_distances.rs @@ -17,7 +17,7 @@ fn count_leaves(x: &Tree) -> usize { } /// Calculate the Branch-Score distance between two trees -pub fn branch_score(a: Tree, b: Tree) -> f64 { +pub fn branch_score(a: &Tree, b: &Tree) -> f64 { let n_leaves = (count_leaves(&a), count_leaves(&b)); assert_eq!(n_leaves.0, n_leaves.1); let n_leaves = n_leaves.0; @@ -52,7 +52,7 @@ pub fn branch_score(a: Tree, b: Tree) -> f64 { /// Calculate the Robinson-Foulds distance between two trees -pub fn robinson_foulds(a: Tree, b: Tree) -> usize { +pub fn robinson_foulds(a: &Tree, b: &Tree) -> usize { let n_leaves = (count_leaves(&a), count_leaves(&b)); assert_eq!(n_leaves.0, n_leaves.1); let n_leaves = n_leaves.0; @@ -164,12 +164,12 @@ mod tests { t2.add_edge(d, u, 1.0); t2.add_edge(v, u, 1.0); } - assert_eq!(robinson_foulds(t1.clone(), t2.clone()), 2); - assert_eq!(robinson_foulds(t1.clone(), t1.clone()), 0); - assert_eq!(robinson_foulds(t2.clone(), t2.clone()), 0); + assert_eq!(robinson_foulds(&t1, &t2), 2); + assert_eq!(robinson_foulds(&t1, &t1), 0); + assert_eq!(robinson_foulds(&t2, &t2), 0); // - assert_eq!(branch_score(t1.clone(), t2.clone()), 112.0); - assert_eq!(branch_score(t1.clone(), t1), 0.0); - assert_eq!(branch_score(t2.clone(), t2), 0.0); + assert_eq!(branch_score(&t1, &t2), 112.0); + assert_eq!(branch_score(&t1, &t1), 0.0); + assert_eq!(branch_score(&t2, &t2), 0.0); } } From 0e1a568b17a5bd29aa6ccf63d2bc0927277275fa Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Thu, 21 Mar 2024 14:58:41 +0100 Subject: [PATCH 6/7] Apply linter --- src/bin.rs | 4 +- src/distances.rs | 2 +- src/hybrid_nj/mod.rs | 2 +- src/lib.rs | 153 ++++++++++++++++----------- src/property_tests/tree_distances.rs | 13 ++- 5 files changed, 100 insertions(+), 74 deletions(-) diff --git a/src/bin.rs b/src/bin.rs index 73965b1..05bbe11 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -1,4 +1,4 @@ -extern crate speedytree; +extern crate speedytree; use clap::Parser; /// # speedytree /// `speedytree` is a command line tool for quickly creating a directory tree. @@ -20,8 +20,6 @@ use std::{ }; type ResultBox = std::result::Result>; - - /// Define the configuration of the program /// It contains the algorithm to use and the number of threads to use /// diff --git a/src/distances.rs b/src/distances.rs index 10e4578..5701d2f 100644 --- a/src/distances.rs +++ b/src/distances.rs @@ -38,7 +38,7 @@ impl DistanceMatrix { pub fn size(&self) -> usize { self.matrix.len() } - pub fn build(matrix: Vec>, names: Vec) -> ResultBox{ + pub fn build(matrix: Vec>, names: Vec) -> ResultBox { if matrix.len() != names.len() { return Err("Matrix and names have different lengths".into()); } diff --git a/src/hybrid_nj/mod.rs b/src/hybrid_nj/mod.rs index 616b528..67cf60b 100644 --- a/src/hybrid_nj/mod.rs +++ b/src/hybrid_nj/mod.rs @@ -4,8 +4,8 @@ pub use algorithm::neighbor_joining; #[cfg(test)] mod tests { - use crate::distances::DistanceMatrix; use super::*; + use crate::distances::DistanceMatrix; #[test] fn test_example_wikipedia() { let d = DistanceMatrix { diff --git a/src/lib.rs b/src/lib.rs index 0f3d6bd..3490c27 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ //! Canonical and RapidNJ implementations of Neighbor-joining in Rust //! //! Provides Rust implementation of the Canonical algorithm and something in the spirit of RapidNJ but with B-trees. Work in progress. -//! A minimal example is provided here. +//! A minimal example is provided here. //! ``` //! use speedytree::DistanceMatrix; //! use speedytree::robinson_foulds; @@ -16,7 +16,7 @@ //! e 8 9 7 3 0 //!".as_bytes();; //!let d = DistanceMatrix::read_from_phylip(input).unwrap(); -//! // Canonical Neighbor Joining. Optimal for small problems. +//! // Canonical Neighbor Joining. Optimal for small problems. //! let tree1 = NeighborJoiningSolver::::default(d.clone()) //! .solve() //! .unwrap(); @@ -25,7 +25,7 @@ //! .solve() //! .unwrap(); //! assert_eq!(robinson_foulds(&tree1, &tree2), 0); -//! +//! //! // You can improve the speed a lot by using multiple threads (see rayon::ThreadPoolBuilder::new()) //! // If so, you may want to tune the chunk size every worker uses //! let tree3 = NeighborJoiningSolver::::default(d.clone()) @@ -41,82 +41,111 @@ //! assert_eq!(robinson_foulds(&tree3, &tree4), 0); //! ``` -pub mod hybrid_nj; -/// Property tests for neighbor joining algorithm -pub mod property_tests; pub mod distances; +pub mod hybrid_nj; pub mod naive_nj; pub mod newick; +/// Property tests for neighbor joining algorithm +pub mod property_tests; pub mod rapid_nj; -use std::error; pub use distances::DistanceMatrix; pub use property_tests::tree_distances::{branch_score, robinson_foulds}; +use std::error; type ResultBox = std::result::Result>; pub type Tree = petgraph::graph::UnGraph; -pub struct NeighborJoiningSolver {algo: U, dist: DistanceMatrix} -pub struct Canonical{} +pub struct NeighborJoiningSolver { + algo: U, + dist: DistanceMatrix, +} +pub struct Canonical {} impl NeighborJoiningSolver { - pub fn build(dist: DistanceMatrix) -> Self{ - NeighborJoiningSolver { algo: Canonical {}, dist: dist} - } - pub fn default(dist: DistanceMatrix) -> Self{ - Self::build(dist) - } - pub fn solve(self) -> ResultBox { - naive_nj::canonical_neighbor_joining(self.dist) - } + pub fn build(dist: DistanceMatrix) -> Self { + NeighborJoiningSolver { + algo: Canonical {}, + dist: dist, + } + } + pub fn default(dist: DistanceMatrix) -> Self { + Self::build(dist) + } + pub fn solve(self) -> ResultBox { + naive_nj::canonical_neighbor_joining(self.dist) + } +} +pub struct RapidBtrees { + chunk_size: usize, } -pub struct RapidBtrees{chunk_size: usize} impl NeighborJoiningSolver { - pub fn build(dist: DistanceMatrix, chunk_size: usize) -> Self{ - NeighborJoiningSolver { algo: RapidBtrees {chunk_size}, dist: dist} - } - pub fn default(dist: DistanceMatrix) -> Self { - let n = dist.size(); - let threads = rayon::current_num_threads(); - let chunk_size = std::cmp::max(n/threads, 1); - if chunk_size == 0 { - panic!("Chunk size cannot be zero."); + pub fn build(dist: DistanceMatrix, chunk_size: usize) -> Self { + NeighborJoiningSolver { + algo: RapidBtrees { chunk_size }, + dist: dist, + } + } + pub fn default(dist: DistanceMatrix) -> Self { + let n = dist.size(); + let threads = rayon::current_num_threads(); + let chunk_size = std::cmp::max(n / threads, 1); + if chunk_size == 0 { + panic!("Chunk size cannot be zero."); + } + NeighborJoiningSolver { + algo: RapidBtrees { chunk_size }, + dist: dist, + } } - NeighborJoiningSolver { algo: RapidBtrees {chunk_size}, dist: dist} - } - pub fn set_chunk_size(self, chunk_size: usize) -> Self { - if chunk_size < 1 { - panic!("Chunk size must be > 0."); + pub fn set_chunk_size(self, chunk_size: usize) -> Self { + if chunk_size < 1 { + panic!("Chunk size must be > 0."); + } + Self::build(self.dist, chunk_size) + } + pub fn solve(self) -> ResultBox { + rapid_nj::rapid_nj(self.dist, self.algo.chunk_size) } - Self::build(self.dist, chunk_size) - } - pub fn solve(self) -> ResultBox { - rapid_nj::rapid_nj(self.dist, self.algo.chunk_size) - } } -pub struct Hybrid{chunk_size: usize, canonical_iters: usize} +pub struct Hybrid { + chunk_size: usize, + canonical_iters: usize, +} impl NeighborJoiningSolver { - pub fn build(dist: DistanceMatrix, chunk_size: usize, canonical_iters: usize) -> Self{ - NeighborJoiningSolver { algo: Hybrid {chunk_size, canonical_iters}, dist: dist} - } - pub fn default(dist: DistanceMatrix) -> Self { - let n = dist.size(); - let threads = rayon::current_num_threads(); - let chunk_size = std::cmp::max(n/threads, 1); - let canonical_iters = std::cmp::max(n/2, 1); - NeighborJoiningSolver { algo: Hybrid {chunk_size, canonical_iters}, dist: dist} - } - pub fn solve(self) -> ResultBox { - hybrid_nj::neighbor_joining(self.dist, self.algo.canonical_iters, self.algo.chunk_size) - } - pub fn set_chunk_size(self, chunk_size: usize) -> Self { - if chunk_size < 1 { - panic!("Chunk size must be > 0."); + pub fn build(dist: DistanceMatrix, chunk_size: usize, canonical_iters: usize) -> Self { + NeighborJoiningSolver { + algo: Hybrid { + chunk_size, + canonical_iters, + }, + dist: dist, + } + } + pub fn default(dist: DistanceMatrix) -> Self { + let n = dist.size(); + let threads = rayon::current_num_threads(); + let chunk_size = std::cmp::max(n / threads, 1); + let canonical_iters = std::cmp::max(n / 2, 1); + NeighborJoiningSolver { + algo: Hybrid { + chunk_size, + canonical_iters, + }, + dist: dist, + } + } + pub fn solve(self) -> ResultBox { + hybrid_nj::neighbor_joining(self.dist, self.algo.canonical_iters, self.algo.chunk_size) + } + pub fn set_chunk_size(self, chunk_size: usize) -> Self { + if chunk_size < 1 { + panic!("Chunk size must be > 0."); + } + Self::build(self.dist, chunk_size, self.algo.canonical_iters) } - Self::build(self.dist, chunk_size, self.algo.canonical_iters) - } - pub fn set_canonical_steps(self, n: usize) -> Self { - if n < 1 { - panic!("n must be > 0."); + pub fn set_canonical_steps(self, n: usize) -> Self { + if n < 1 { + panic!("n must be > 0."); + } + Self::build(self.dist, self.algo.chunk_size, n) } - Self::build(self.dist, self.algo.chunk_size, n) - } } diff --git a/src/property_tests/tree_distances.rs b/src/property_tests/tree_distances.rs index 7821521..cb6b9eb 100644 --- a/src/property_tests/tree_distances.rs +++ b/src/property_tests/tree_distances.rs @@ -18,7 +18,7 @@ fn count_leaves(x: &Tree) -> usize { /// Calculate the Branch-Score distance between two trees pub fn branch_score(a: &Tree, b: &Tree) -> f64 { - let n_leaves = (count_leaves(&a), count_leaves(&b)); + let n_leaves = (count_leaves(a), count_leaves(b)); assert_eq!(n_leaves.0, n_leaves.1); let n_leaves = n_leaves.0; let mut bits_a = HashMap::new(); @@ -26,12 +26,12 @@ pub fn branch_score(a: &Tree, b: &Tree) -> f64 { a.edge_indices() .zip(a.edge_weights()) .for_each(|(edge, w)| { - bits_a.insert(collect_bit_vector(&a, edge, n_leaves), *w); + bits_a.insert(collect_bit_vector(a, edge, n_leaves), *w); }); b.edge_indices() .zip(b.edge_weights()) .for_each(|(edge, w)| { - bits_b.insert(collect_bit_vector(&b, edge, n_leaves), *w); + bits_b.insert(collect_bit_vector(b, edge, n_leaves), *w); }); // Get union of a and b keys @@ -50,19 +50,18 @@ pub fn branch_score(a: &Tree, b: &Tree) -> f64 { distance } - /// Calculate the Robinson-Foulds distance between two trees pub fn robinson_foulds(a: &Tree, b: &Tree) -> usize { - let n_leaves = (count_leaves(&a), count_leaves(&b)); + let n_leaves = (count_leaves(a), count_leaves(b)); assert_eq!(n_leaves.0, n_leaves.1); let n_leaves = n_leaves.0; let bits_a: HashSet = HashSet::from_iter( a.edge_indices() - .map(|edge| collect_bit_vector(&a, edge, n_leaves)), + .map(|edge| collect_bit_vector(a, edge, n_leaves)), ); let bits_b: HashSet = HashSet::from_iter( b.edge_indices() - .map(|edge| collect_bit_vector(&b, edge, n_leaves)), + .map(|edge| collect_bit_vector(b, edge, n_leaves)), ); let mut distance = 0; From 99b8ed30f9178a63185d87687ced791850591a0a Mon Sep 17 00:00:00 2001 From: Curro Campuzano Date: Thu, 21 Mar 2024 15:35:13 +0100 Subject: [PATCH 7/7] Add new documentation version --- src/bin.rs | 19 +++--- src/lib.rs | 67 ++++++++++++++++------ src/newick.rs | 1 + src/property_tests/random_additive_tree.rs | 3 + 4 files changed, 60 insertions(+), 30 deletions(-) diff --git a/src/bin.rs b/src/bin.rs index 05bbe11..6ce950e 100644 --- a/src/bin.rs +++ b/src/bin.rs @@ -6,13 +6,9 @@ use clap::Parser; /// It is intended to be a drop-in replacement for the `tree` command. /// It is not intended to be a complete implementation of the `tree` command. /// It is intended to be a fast implementation of the `tree` command. -use hybrid_nj::neighbor_joining; -use speedytree::hybrid_nj; +use speedytree::DistanceMatrix; +use speedytree::{Canonical, Hybrid, NeighborJoiningSolver, RapidBtrees}; -use speedytree::distances::DistanceMatrix; -use speedytree::naive_nj::canonical_neighbor_joining; -use speedytree::newick::to_newick; -use speedytree::rapid_nj::rapid_nj; use std::{ error, io::{self, Write}, @@ -122,12 +118,13 @@ pub fn run(config: Config) { }); let d = match config.algo { - Algorithm::Naive => canonical_neighbor_joining(d), - Algorithm::RapidNJ => rapid_nj(d, config.chunk_size), + Algorithm::Naive => NeighborJoiningSolver::::default(d).solve(), + Algorithm::RapidNJ => { + NeighborJoiningSolver::::build(d, config.chunk_size).solve() + } Algorithm::Hybrid => { let naive_steps = d.size() * config.naive_percentage / 100; - dbg!(naive_steps); - neighbor_joining(d, naive_steps, config.chunk_size) + NeighborJoiningSolver::::build(d, config.chunk_size, naive_steps).solve() } }; let graph = d.unwrap_or_else(|err| { @@ -135,7 +132,7 @@ pub fn run(config: Config) { process::exit(1); }); io::stdout() - .write_all(to_newick(&graph).as_bytes()) + .write_all(speedytree::to_newick(&graph).as_bytes()) .unwrap_or_else(|err| { eprintln!("{err}"); process::exit(1); diff --git a/src/lib.rs b/src/lib.rs index 3490c27..d5c0528 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,9 @@ //! Canonical and RapidNJ implementations of Neighbor-joining in Rust //! -//! Provides Rust implementation of the Canonical algorithm and something in the spirit of RapidNJ but with B-trees. Work in progress. -//! A minimal example is provided here. +//! Provides Rust implementation of the Canonical algorithm and something in the spirit of RapidNJ but with B-trees. Some helper functions are also provided. +//! +//! This repository provides (a) a command line application that reads files in Phylip format and (b) a small library you can easily integrate in your projects. It relies on rayon for parallelization. PR's are more than wellcome! +//! A minimal example of the library is provided here. You can read more about the command line app by running speedytree -h //! ``` //! use speedytree::DistanceMatrix; //! use speedytree::robinson_foulds; @@ -9,12 +11,12 @@ //! use rayon; //!// Raw Phylip format //!let input = "5 -//! a 0 5 9 9 8 -//! b 5 0 10 10 9 -//! c 9 10 0 8 7 -//! d 9 10 8 0 3 -//! e 8 9 7 3 0 -//!".as_bytes();; +//! a 0 5 9 9 8 +//! b 5 0 10 10 9 +//! c 9 10 0 8 7 +//! d 9 10 8 0 3 +//! e 8 9 7 3 0 +//!".as_bytes(); //!let d = DistanceMatrix::read_from_phylip(input).unwrap(); //! // Canonical Neighbor Joining. Optimal for small problems. //! let tree1 = NeighborJoiningSolver::::default(d.clone()) @@ -41,48 +43,59 @@ //! assert_eq!(robinson_foulds(&tree3, &tree4), 0); //! ``` -pub mod distances; -pub mod hybrid_nj; -pub mod naive_nj; -pub mod newick; +mod distances; +mod hybrid_nj; +mod naive_nj; +mod newick; /// Property tests for neighbor joining algorithm -pub mod property_tests; -pub mod rapid_nj; +mod property_tests; +mod rapid_nj; pub use distances::DistanceMatrix; +pub use newick::to_newick; pub use property_tests::tree_distances::{branch_score, robinson_foulds}; + use std::error; type ResultBox = std::result::Result>; +/// An undirected graph from Petgraph (internal nodes with empty names) pub type Tree = petgraph::graph::UnGraph; +/// Generic solver pub struct NeighborJoiningSolver { algo: U, dist: DistanceMatrix, } +/// Canonical Neighbor-Joining, similar to QuickTree pub struct Canonical {} impl NeighborJoiningSolver { + /// Construct solver from parameters pub fn build(dist: DistanceMatrix) -> Self { NeighborJoiningSolver { algo: Canonical {}, - dist: dist, + dist, } } + /// Default solver pub fn default(dist: DistanceMatrix) -> Self { Self::build(dist) } + /// Solve the Neighbor-Joining problem pub fn solve(self) -> ResultBox { naive_nj::canonical_neighbor_joining(self.dist) } } +/// In the spirit of RapidNJ, but with B-trees pub struct RapidBtrees { chunk_size: usize, } impl NeighborJoiningSolver { + /// Construct solver from parameters pub fn build(dist: DistanceMatrix, chunk_size: usize) -> Self { NeighborJoiningSolver { algo: RapidBtrees { chunk_size }, - dist: dist, + dist, } } + /// Default solver (based on available rayon threads) pub fn default(dist: DistanceMatrix) -> Self { let n = dist.size(); let threads = rayon::current_num_threads(); @@ -92,34 +105,39 @@ impl NeighborJoiningSolver { } NeighborJoiningSolver { algo: RapidBtrees { chunk_size }, - dist: dist, + dist, } } + /// Set chunk size (for every worker) pub fn set_chunk_size(self, chunk_size: usize) -> Self { if chunk_size < 1 { panic!("Chunk size must be > 0."); } Self::build(self.dist, chunk_size) } + /// Solve the Neighbor-Joining problem pub fn solve(self) -> ResultBox { rapid_nj::rapid_nj(self.dist, self.algo.chunk_size) } } +/// A mix of the Canonical and RapidNJ pub struct Hybrid { chunk_size: usize, canonical_iters: usize, } impl NeighborJoiningSolver { + /// Construct solver from parameters pub fn build(dist: DistanceMatrix, chunk_size: usize, canonical_iters: usize) -> Self { NeighborJoiningSolver { algo: Hybrid { chunk_size, canonical_iters, }, - dist: dist, + dist, } } + /// Default solver (based on available rayon threads and problem size) pub fn default(dist: DistanceMatrix) -> Self { let n = dist.size(); let threads = rayon::current_num_threads(); @@ -130,22 +148,33 @@ impl NeighborJoiningSolver { chunk_size, canonical_iters, }, - dist: dist, + dist, } } + /// Solve the Neighbor-Joining problem pub fn solve(self) -> ResultBox { hybrid_nj::neighbor_joining(self.dist, self.algo.canonical_iters, self.algo.chunk_size) } + /// Set chunk size (for every worker) pub fn set_chunk_size(self, chunk_size: usize) -> Self { if chunk_size < 1 { panic!("Chunk size must be > 0."); } Self::build(self.dist, chunk_size, self.algo.canonical_iters) } + /// Set number of canonical iterations will be done pub fn set_canonical_steps(self, n: usize) -> Self { if n < 1 { panic!("n must be > 0."); } Self::build(self.dist, self.algo.chunk_size, n) } + /// Set fraction of canonical iterations will be done + pub fn set_canonical_percentage(self, prop: f64) -> Self { + if prop <= 0.0 || prop >= 1.0 { + panic!("Proportion must be between 0 and 1."); + } + let n = self.dist.size() as f64 * prop / 100.0; + Self::build(self.dist, self.algo.chunk_size, n as usize) + } } diff --git a/src/newick.rs b/src/newick.rs index 95339a1..0fd2153 100644 --- a/src/newick.rs +++ b/src/newick.rs @@ -14,6 +14,7 @@ fn format_edge_float<'a>( buffer.format_finite(*t.edge_weight(e).expect("Valid edge")) } +/// Convert a tree (as generated from solver functions, as an undidrected Petgraph) to a string pub fn to_newick(t: &Tree) -> String { let mut buffer = dtoa::Buffer::new(); let root = root(t).expect("Node with three edges"); diff --git a/src/property_tests/random_additive_tree.rs b/src/property_tests/random_additive_tree.rs index f1d27ba..a90c7bd 100644 --- a/src/property_tests/random_additive_tree.rs +++ b/src/property_tests/random_additive_tree.rs @@ -7,6 +7,7 @@ use rand::Rng; use crate::distances::DistanceMatrix; use crate::Tree; +#[allow(dead_code)] fn random_rooted_binary_tree(leaves: usize) -> UnGraph { let mut tree = UnGraph::new_undirected(); let mut next_node = leaves; @@ -33,6 +34,7 @@ fn random_rooted_binary_tree(leaves: usize) -> UnGraph { /// Generate a random unrooted binary tree with n_leaves leaves /// The tree is generated by first generating a random rooted binary tree /// and then removing the root and adding an edge between two leaves +#[allow(dead_code)] pub fn random_unrooted_binary_tree(n_leaves: usize) -> UnGraph { let mut t = random_rooted_binary_tree(n_leaves); // Remove root as node with degree 2 @@ -53,6 +55,7 @@ pub fn random_unrooted_binary_tree(n_leaves: usize) -> UnGraph { /// The distance matrix is generated by computing the distance between /// all pairs of leaves. It uses the astar algorithm from petgraph, so it's not efficient. /// It's only used for testing purposes. +#[allow(dead_code)] pub fn distance_matrix_from_tree(t: Tree) -> DistanceMatrix { // Find all leaves let leaves: Vec = t