From 56343b4c0cc78f041801a49c7552e7167d102b14 Mon Sep 17 00:00:00 2001 From: Tarek Elsayed <60650661+tareknaser@users.noreply.github.com> Date: Tue, 3 Sep 2024 19:18:05 +0300 Subject: [PATCH] `fs-index`: Track API (#85) * feat(fs-index): implement method update_one() for track api Signed-off-by: Tarek * test(fs-index): unit tests for resource index track API Signed-off-by: Tarek * feat(fs-index): add benchmarks for Resourceindex::update_one Signed-off-by: Tarek * feat(fs-index): update_one() to be its own type of API Signed-off-by: Tarek * docs(fs-index): add a note that selective API is experimental Signed-off-by: Tarek * benchmark(fs-index): increase number of updates for update_one() Signed-off-by: Tarek * feat(fs-index): debug_assert that the path already exists in index Signed-off-by: Tarek --------- Signed-off-by: Tarek --- fs-index/README.md | 2 + fs-index/benches/resource_index_benchmark.rs | 60 +++++ fs-index/src/index.rs | 94 ++++++- fs-index/src/tests.rs | 248 ++++++++++++++++++- fs-storage/src/file_storage.rs | 2 +- 5 files changed, 394 insertions(+), 12 deletions(-) diff --git a/fs-index/README.md b/fs-index/README.md index 9253375a..ca3fb565 100644 --- a/fs-index/README.md +++ b/fs-index/README.md @@ -13,6 +13,8 @@ The most important struct in this crate is `ResourceIndex` which comes with: - **Snapshot API** - `get_resources_by_id`: Query resources from the index by ID. - `get_resource_by_path`: Query a resource from the index by its path. +- **Selective API** + - `update_one`: Method to manually update a specific resource by selectively rescanning a single file. ## Custom Serialization diff --git a/fs-index/benches/resource_index_benchmark.rs b/fs-index/benches/resource_index_benchmark.rs index f6d3f21a..ae46e727 100644 --- a/fs-index/benches/resource_index_benchmark.rs +++ b/fs-index/benches/resource_index_benchmark.rs @@ -84,6 +84,66 @@ fn resource_index_benchmark(c: &mut Criterion) { }); }); + // Benchmark `ResourceIndex::update_one()` + + // First, create a new temp directory specifically for the update_one + // benchmark since we will be creating new files, removing files, and + // modifying files + + let update_one_benchmarks_dir = + TempDir::with_prefix("ark-fs-index-benchmarks-update-one").unwrap(); + let update_one_benchmarks_dir = update_one_benchmarks_dir.path(); + + group.bench_function("index_update_one", |b| { + b.iter(|| { + // Clear the directory + std::fs::remove_dir_all(&update_one_benchmarks_dir).unwrap(); + std::fs::create_dir(&update_one_benchmarks_dir).unwrap(); + + // Create 5000 new files + for i in 0..5000 { + let new_file = + update_one_benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + std::fs::write(&new_file, format!("Hello, World! {}", i)) + .unwrap(); + } + let mut index: ResourceIndex = + ResourceIndex::build(black_box(&update_one_benchmarks_dir)) + .unwrap(); + + // Create 1000 new files + for i in 5000..6000 { + let new_file = + update_one_benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::File::create(&new_file).unwrap(); + std::fs::write(&new_file, format!("Hello, World! {}", i)) + .unwrap(); + } + + // Modify 1000 files + for i in 4000..5000 { + let modified_file = + update_one_benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::write(&modified_file, format!("Bye, World! {}", i)) + .unwrap(); + } + + // Remove 1000 files + for i in 3000..4000 { + let removed_file = + update_one_benchmarks_dir.join(format!("file_{}.txt", i)); + std::fs::remove_file(&removed_file).unwrap(); + } + + // Call update_one for each of the 3000 files + for i in 3000..6000 { + let file_path = format!("file_{}.txt", i); + let _update_result = index.update_one(&file_path).unwrap(); + } + }); + }); + group.finish(); } diff --git a/fs-index/src/index.rs b/fs-index/src/index.rs index 98bcafb8..6746ac29 100644 --- a/fs-index/src/index.rs +++ b/fs-index/src/index.rs @@ -75,6 +75,7 @@ type IndexedPaths = HashSet>; /// #### Reactive API /// - [`ResourceIndex::update_all`]: Method to update the index by rescanning /// files and returning changes (additions/deletions/updates). + /// /// #### Snapshot API /// - [`ResourceIndex::get_resources_by_id`]: Query resources from the index by @@ -82,13 +83,12 @@ type IndexedPaths = HashSet>; /// - [`ResourceIndex::get_resource_by_path`]: Query a resource from the index /// by its path. /// -/// #### Track API -/// Allows for fine-grained control over tracking changes in the index -/// - [`ResourceIndex::track_addition`]: Track a newly added file (checks if the -/// file exists in the file system). -/// - [`ResourceIndex::track_removal`]: Track the deletion of a file (checks if -/// the file was actually deleted). -/// - [`ResourceIndex::track_modification`]: Track an update on a single file. +/// #### Selective API +/// - [`ResourceIndex::update_one`]: An experimental method to manually update a +/// specific resource by rescanning a single file. It provides targeted +/// control but is less dynamic than the reactive `update_all()`. The reactive +/// API is typically preferred for broader updates. +/// /// /// ## Examples /// ```no_run @@ -97,7 +97,7 @@ type IndexedPaths = HashSet>; /// use dev_hash::Crc32; /// /// // Define the root path -/// let root_path = Path::new("animals"); +/// let root_path = Path::new("path/to/animals"); /// /// // Build the index /// let index: ResourceIndex = ResourceIndex::build(root_path).expect("Failed to build index"); @@ -474,4 +474,82 @@ impl ResourceIndex { Ok(IndexUpdate { added, removed }) } + + /// Update the index with the latest information from the file system + /// for a single resource + /// + /// This method accepts the relative path of a single resource and updates + /// the index regardless of whether the resource was added, removed, or + /// modified. + /// + /// **Note**: The caller must ensure that: + /// - The index is up-to-date with the file system except for the updated + /// resource + /// - In case of a addition, the resource was not already in the index + /// - In case of a modification or removal, the resource was already in the + /// index + pub fn update_one>( + &mut self, + relative_path: P, + ) -> Result<()> { + let path = relative_path.as_ref(); + let entry_path = self.root.join(path); + + // Check if the entry exists in the file system + if !entry_path.exists() { + // If the entry does not exist in the file system, it's a removal + + // Remove the resource from the path to ID map + debug_assert!( + self.path_to_id.contains_key(path), + "Caller must ensure that the resource exists in the index: {:?}", + path + ); + let id = self.path_to_id.remove(path).unwrap(); + self.id_to_paths + .get_mut(&id.item) + .unwrap() + .remove(path); + // If the ID has no paths, remove it from the ID to paths map + if self.id_to_paths[&id.item].is_empty() { + self.id_to_paths.remove(&id.item); + } + + log::trace!("Resource removed: {:?}", path); + } else { + // If the entry exists in the file system, it's an addition or + // update. In either case, we need to update the index + // with the latest information about the resource + + let id = Id::from_path(entry_path.clone())?; + let metadata = fs::metadata(&entry_path)?; + let last_modified = metadata.modified()?; + let resource_path = Timestamped { + item: id.clone(), + last_modified, + }; + + // In case of modification, we need to remove the old path from + // the ID to paths map + if let Some(prev_id) = self.path_to_id.get(path) { + self.id_to_paths + .get_mut(&prev_id.item) + .unwrap() + .remove(path); + } + + // Update the path to resource map + self.path_to_id + .insert(path.to_path_buf(), resource_path); + // Update the ID to paths map + self.id_to_paths + .entry(id.clone()) + .or_default() + .insert(path.to_path_buf()); + + log::trace!("Resource added/updated: {:?}", path); + } + + Ok(()) + } } diff --git a/fs-index/src/tests.rs b/fs-index/src/tests.rs index 678042ed..24f4defa 100644 --- a/fs-index/src/tests.rs +++ b/fs-index/src/tests.rs @@ -534,8 +534,8 @@ fn test_hidden_files() { /// - Build a resource index in the temporary directory. /// - Create a new file. /// - Update the resource index. -/// - Assert that the return from `update_all` is that `added` includes the -/// new file. +/// - Assert that the return from `update_all` is that `added` includes the new +/// file. #[test] fn test_update_all_added_files() { for_each_type!(Crc32, Blake3 => { @@ -567,7 +567,7 @@ fn test_update_all_added_files() { /// - Update the file. /// - Update the resource index. /// - Assert that the return from `update_all` is that `added` includes the -/// updated file. +/// updated file. #[test] fn test_update_all_updated_files() { for_each_type!(Crc32, Blake3 => { @@ -666,3 +666,245 @@ fn test_update_all_files_with_same_hash() { assert_eq!(index.collisions().len(), 1, "{:?}", index); }); } + +/// Simple test for tracking a single resource addition. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Create a new file. +/// - Call `update_one()` with the relative path of the new file. +/// - Assert that the index contains the expected number of entries after the +/// addition. +#[test] +fn test_track_addition() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_track_addition") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + let new_file_path = root_path.join("new_file.txt"); + fs::write(&new_file_path, "new file content") + .expect("Failed to write to file"); + let new_file_id = Id::from_path(&new_file_path).expect("Failed to get checksum"); + + index.update_one("new_file.txt").expect("Failed to update index"); + + assert_eq!(index.len(), 2, "{:?}", index); + let resource = index + .get_resource_by_path("new_file.txt") + .expect("Failed to get resource"); + assert_eq!(*resource.id(), new_file_id); + }); +} + +/// Test for tracking addition of a file with the same checksum as an existing +/// file in the index. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Create a new file with the same content as the existing file. +/// - Calculate the checksum of the new file. +/// - Call `update_one()` with the relative path of the new file. +/// - Assert that the index contains the expected number of entries with the +/// correct IDs and paths. +#[test] +fn test_track_addition_with_collision() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_track_addition_with_collision") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file1.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let file_path2 = root_path.join("file2.txt"); + fs::write(&file_path2, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + let new_file_path = root_path.join("file3.txt"); + fs::write(&new_file_path, "file content") + .expect("Failed to write to file"); + let new_file_id = Id::from_path(&new_file_path).expect("Failed to get checksum"); + + index.update_one("file3.txt").expect("Failed to update index"); + + assert_eq!(index.len(), 3, "{:?}", index); + let resource = index + .get_resource_by_path("file3.txt") + .expect("Failed to get resource"); + assert_eq!(*resource.id(), new_file_id); + assert_eq!(index.collisions().len(), 1, "{:?}", index); + // Collision should contain 3 entries + assert_eq!(index.collisions()[&new_file_id].len(), 3, "{:?}", index); + }); +} + +/// Simple test for tracking a single resource removal. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory and get its checksum. +/// - Build a resource index in the temporary directory. +/// - Remove the file from the file system. +/// - Call `update_one()` with the relative path of the removed file. +/// - Assert that the index contains the expected number of entries after the +/// removal. +#[test] +fn test_track_removal() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_track_removal") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + fs::remove_file(&file_path).expect("Failed to remove file"); + + index.update_one("file.txt").expect("Failed to update index"); + + assert_eq!(index.len(), 0, "{:?}", index); + }); +} + +/// Test for tracking removal of a file with the same checksum as an existing +/// file in the index. +/// +/// ## Test scenario: +/// - Create 2 files with the same content within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Remove one of the files from the file system. +/// - Call `update_one()` with the relative path of the removed file. +/// - Assert that the index contains the expected number of entries with the +/// correct IDs and paths after the removal. +#[test] +fn test_track_removal_with_collision() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_track_removal_with_collision") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file1.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + // Create a file with the same content as file1.txt + let file_path2 = root_path.join("file2.txt"); + fs::write(&file_path2, "file content").expect("Failed to write to file"); + + let file_id = Id::from_path(&file_path).expect("Failed to get checksum"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + fs::remove_file(&file_path).expect("Failed to remove file"); + + index.update_one("file1.txt").expect("Failed to update index"); + + assert_eq!(index.len(), 1, "{:?}", index); + let resource_by_path = index + .get_resource_by_path("file2.txt") + .expect("Failed to get resource"); + assert_eq!(*resource_by_path.id(), file_id); + + let resources_by_id = index.get_resources_by_id(&file_id).unwrap(); + assert_eq!(resources_by_id.len(), 1, "{:?}", resources_by_id); + + // The length of `collisions` should be 0 because file1.txt was removed + assert_eq!(index.collisions().len(), 0, "{:?}", index); + }); +} + +/// Simple test for tracking a single resource modification. +/// +/// ## Test scenario: +/// - Create a file within the temporary directory and get its checksum. +/// - Build a resource index in the temporary directory. +/// - Modify the content of the file. +/// - Call `update_one()` with the relative path of the modified file. +/// - Assert that the index has 1 entry with the correct ID and path. +#[test] +fn test_track_modification() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_track_modification") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + let updated_file_id = Id::from_path(&file_path).expect("Failed to get checksum"); + + index.update_one("file.txt").expect("Failed to update index"); + + assert_eq!(index.len(), 1, "{:?}", index); + let resource = index + .get_resource_by_path("file.txt") + .expect("Failed to get resource"); + assert_eq!(*resource.id(), updated_file_id); + }); +} + +/// Test for tracking modification of a file with the same checksum as an +/// existing file in the index. +/// +/// ## Test scenario: +/// - Create 2 files with the same content within the temporary directory. +/// - Build a resource index in the temporary directory. +/// - Modify the content of one of the files. +/// - Call `update_one()` with the relative path of the modified file. +/// - Assert that the index contains the expected number of entries with the +/// correct IDs and paths after the modification. +#[test] +fn test_track_modification_with_collision() { + for_each_type!(Crc32, Blake3 => { + let temp_dir = TempDir::with_prefix("ark_test_track_modification_with_collision") + .expect("Failed to create temp dir"); + let root_path = temp_dir.path(); + + let file_path = root_path.join("file1.txt"); + fs::write(&file_path, "file content").expect("Failed to write to file"); + let file_id = Id::from_path(&file_path).expect("Failed to get checksum"); + + let file_path2 = root_path.join("file2.txt"); + fs::write(&file_path2, "updated file content").expect("Failed to write to file"); + + let mut index: ResourceIndex = + ResourceIndex::build(root_path).expect("Failed to build index"); + + fs::write(&file_path, "updated file content") + .expect("Failed to write to file"); + let updated_file_id = Id::from_path(&file_path).expect("Failed to get checksum"); + + index.update_one("file1.txt").expect("Failed to update index"); + + assert_eq!(index.len(), 2, "{:?}", index); + let resource_by_path = index + .get_resource_by_path("file1.txt") + .expect("Failed to get resource"); + assert_eq!(*resource_by_path.id(), updated_file_id); + + let resources_by_id = index.get_resources_by_id(&file_id).unwrap(); + assert_eq!(resources_by_id.len(), 0, "{:?}", resources_by_id); + + // The length of `collisions` should be 1 because file1.txt and file2.txt + // have the same content. + assert_eq!(index.collisions().len(), 1, "{:?}", index); + }); +} diff --git a/fs-storage/src/file_storage.rs b/fs-storage/src/file_storage.rs index 26a50969..2e3332e9 100644 --- a/fs-storage/src/file_storage.rs +++ b/fs-storage/src/file_storage.rs @@ -362,7 +362,7 @@ mod tests { file_storage.set("key1".to_string(), "value1".to_string()); file_storage.set("key1".to_string(), "value2".to_string()); assert!(file_storage.write_fs().is_ok()); - assert_eq!(storage_path.exists(), true); + assert!(storage_path.exists()); if let Err(err) = file_storage.erase() { panic!("Failed to delete file: {:?}", err);