Skip to content

Commit

Permalink
Minor enhancements
Browse files Browse the repository at this point in the history
Signed-off-by: Tarek <[email protected]>
  • Loading branch information
kirillt authored and tareknaser committed Mar 12, 2024
1 parent 2005fa3 commit 16a9b23
Show file tree
Hide file tree
Showing 11 changed files with 185 additions and 176 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.idea
target
Cargo.lock
**/app_id
8 changes: 7 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
[workspace]
members = ["data-resource", "fs-atomic-versions", "fs-index", "fs-utils"]
members = [
"data-resource",
"fs-atomic-versions",
"fs-index",
"fs-utils"
]

default-members = [
"data-resource",
"fs-atomic-versions",
Expand Down
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ The purpose of the library is to manage _resource index_ of folders with various

<div align="center">

| Package | Description |
| -------------------- | ---------------------------------------- |
| `fs-index` | Resource Index construction and updating |
| `data-resource` | Resource hashing and ID construction |
| `fs-atomic-versions` | Version-based preventing of dirty writes |
| `fs-utils` | Utility functions and common code |
| Package | Description |
| -------------------- | ------------------------------------------ |
| `data-resource` | Resource hashing and ID construction |
| `fs-index` | Resource Index construction and updating |
| `fs-atomic-light` | Temp file-based preventing of dirty writes |
| `fs-atomic-versions` | Version-based preventing of dirty writes |
| `fs-utils` | Utility functions and common code |

</div>

Expand Down Expand Up @@ -61,6 +62,6 @@ cargo bench index_build

Our benchmark suite includes tests on local files and directories. These benchmarks are located in the `benches/` directory of some crates. Each benchmark sets a time limit using `group.measurement_time()`, which you can adjust manually based on your requirements.

You have the flexibility to benchmark specific files or folders by modifying the variables within the benchmark files. By default, the benchmarks operate on the [`testdata/`](../testdata/) directory and its contents. You can change the directory/files by setting the `DIR_PATH` and `FILE_PATHS` variables to the desired values.
You have the flexibility to benchmark specific files or folders by modifying the variables within the benchmark files. By default, the benchmarks operate on the [`test-assets/`](../test-assets/) directory and its contents. You can change the directory/files by setting the `DIR_PATH` and `FILE_PATHS` variables to the desired values.

For pre-benchmark assessment of required time to index a huge local folder, you can modify `test_build_resource_index` test case in `src/index.rs`.
6 changes: 5 additions & 1 deletion data-resource/benches/compute_bytes_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@ use pprof::criterion::{Output, PProfProfiler};
use rand::prelude::*;
use std::fs;

const FILE_PATHS: [&str; 2] = ["../testdata/lena.jpg", "../testdata/test.pdf"]; // Add files to benchmark here
const FILE_PATHS: [&str; 2] = [
// Add files to benchmark here
"../test-assets/lena.jpg",
"../test-assets/test.pdf",
];

fn generate_random_data(size: usize) -> Vec<u8> {
let mut rng = rand::thread_rng();
Expand Down
162 changes: 0 additions & 162 deletions data-resource/src/id.rs

This file was deleted.

163 changes: 161 additions & 2 deletions data-resource/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,162 @@
mod id;
use anyhow::anyhow;
use crc32fast::Hasher;
use log;
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter};
use std::fs;
use std::io::Read;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::str::FromStr;

pub use id::ResourceId;
use fs_utils::errors::{ArklibError, Result};

/// Identifier of a resource, derived from its content.
///
/// Displayed as `"{data_size}-{crc32}"` (see the `Display` impl) and parsed
/// back from that form by the `FromStr` impl. The derived `Ord` compares
/// `data_size` first, then `crc32` (field declaration order).
#[derive(
    Eq,
    Ord,
    PartialEq,
    PartialOrd,
    Hash,
    Clone,
    Copy,
    Debug,
    Deserialize,
    Serialize,
)]
pub struct ResourceId {
    // Size of the resource content in bytes.
    pub data_size: u64,
    // CRC32 checksum of the resource content.
    pub crc32: u32,
}

impl Display for ResourceId {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
write!(f, "{}-{}", self.data_size, self.crc32)
}
}

impl FromStr for ResourceId {
    type Err = ArklibError;

    /// Parses an id from the `"{data_size}-{crc32}"` form produced by
    /// `Display`; any malformed input maps to `ArklibError::Parse`.
    fn from_str(s: &str) -> Result<Self> {
        let (size_part, crc_part) =
            s.split_once('-').ok_or(ArklibError::Parse)?;

        let data_size = size_part
            .parse::<u64>()
            .map_err(|_| ArklibError::Parse)?;
        let crc32 = crc_part
            .parse::<u32>()
            .map_err(|_| ArklibError::Parse)?;

        Ok(ResourceId { data_size, crc32 })
    }
}

impl ResourceId {
    /// Computes the id of the file at `file_path`.
    ///
    /// `data_size` must equal the file's size in bytes; `compute_reader`
    /// asserts this after hashing. Returns an error if the file cannot
    /// be opened or read.
    pub fn compute<P: AsRef<Path>>(
        data_size: u64,
        file_path: P,
    ) -> Result<Self> {
        log::trace!(
            "[compute] file {} with size {} mb",
            file_path.as_ref().display(),
            data_size / MEGABYTE
        );

        let source = fs::OpenOptions::new()
            .read(true)
            .open(file_path.as_ref())?;

        let mut reader = BufReader::with_capacity(BUFFER_CAPACITY, source);
        ResourceId::compute_reader(data_size, &mut reader)
    }

    /// Computes the id of an in-memory byte slice.
    pub fn compute_bytes(bytes: &[u8]) -> Result<Self> {
        let data_size = bytes.len().try_into().map_err(|_| {
            ArklibError::Other(anyhow!("Can't convert usize to u64"))
        })?;
        let mut reader = BufReader::with_capacity(BUFFER_CAPACITY, bytes);
        ResourceId::compute_reader(data_size, &mut reader)
    }

    /// Hashes everything `reader` yields and builds the id from the
    /// resulting CRC32 and `data_size`.
    ///
    /// # Panics
    ///
    /// Panics if `reader` has already buffered data, or if the total
    /// number of bytes read differs from `data_size`.
    pub fn compute_reader<R: Read>(
        data_size: u64,
        reader: &mut BufReader<R>,
    ) -> Result<Self> {
        assert!(reader.buffer().is_empty());

        log::trace!(
            "Calculating hash of raw bytes (given size is {} megabytes)",
            data_size / MEGABYTE
        );

        let mut hasher = Hasher::new();
        // Accumulate in u64: a u32 counter would overflow for inputs of
        // 4 GiB or more, making the final size check fail spuriously
        // even though data_size itself is u64.
        let mut bytes_read: u64 = 0;
        loop {
            let chunk = reader.fill_buf()?;
            if chunk.is_empty() {
                break;
            }
            hasher.update(chunk);
            let consumed = chunk.len();
            reader.consume(consumed);
            // usize -> u64 is lossless on all supported targets.
            bytes_read += consumed as u64;
        }

        let crc32: u32 = hasher.finalize();
        log::trace!("[compute] {} bytes has been read", bytes_read);
        log::trace!("[compute] checksum: {:#02x}", crc32);
        assert_eq!(bytes_read, data_size);

        Ok(ResourceId { data_size, crc32 })
    }
}

// Byte-size units used for buffer sizing and trace-log reporting.
const KILOBYTE: u64 = 1024;
const MEGABYTE: u64 = 1024 * KILOBYTE;
// Read-buffer size (512 KiB) used when streaming file contents for hashing.
const BUFFER_CAPACITY: usize = 512 * KILOBYTE as usize;

#[cfg(test)]
mod tests {
    use fs_atomic_versions::initialize;

    use super::*;

    /// Hashing a file by path and hashing its raw bytes must agree,
    /// and both must match the known checksum/size of the test image.
    #[test]
    fn compute_id_test() {
        initialize();

        let file_path = Path::new("../test-assets/lena.jpg");
        let data_size = fs::metadata(file_path)
            .unwrap_or_else(|_| {
                // Fixed garbled message: was "…test file_path.{}".
                panic!(
                    "Could not open image test file: {}",
                    file_path.display()
                )
            })
            .len();

        let id1 = ResourceId::compute(data_size, file_path).unwrap();
        assert_eq!(id1.crc32, 0x342a3d4a);
        assert_eq!(id1.data_size, 128760);

        let raw_bytes = fs::read(file_path).unwrap();
        let id2 = ResourceId::compute_bytes(raw_bytes.as_slice()).unwrap();
        assert_eq!(id2.crc32, 0x342a3d4a);
        assert_eq!(id2.data_size, 128760);
    }

    /// Derived ordering compares `data_size` before `crc32`
    /// (field declaration order).
    #[test]
    fn resource_id_order() {
        let id1 = ResourceId {
            data_size: 1,
            crc32: 2,
        };
        let id2 = ResourceId {
            data_size: 2,
            crc32: 1,
        };

        assert!(id1 < id2);
        assert!(id2 > id1);
        assert!(id1 != id2);
        assert!(id1 == id1);
        assert!(id2 == id2);
    }
}
2 changes: 1 addition & 1 deletion fs-index/benches/index_build_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use criterion::{
use fs_index::index::ResourceIndex;
use pprof::criterion::{Output, PProfProfiler};

const DIR_PATH: &str = "../testdata/"; // Set the path to the directory containing the resources here
const DIR_PATH: &str = "../test-assets/"; // Set the path to the directory containing the resources here

fn index_build_benchmark(c: &mut Criterion) {
// assert the path exists and is a directory
Expand Down
Loading

0 comments on commit 16a9b23

Please sign in to comment.