Skip to content

Commit

Permalink
feat: initial implementation of header validation
Browse files Browse the repository at this point in the history
  • Loading branch information
theJasonFan committed Apr 24, 2024
1 parent 65213d2 commit 4580c54
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ flate2 = "^1"
# For auto-serialization of structs to csv/tsv
csv = "^1"
serde = { version = "^1.0.123", features = ["derive"] }
serde-aux = "^4"

[dev-dependencies]
tempfile = "3.2.0"
Expand Down
92 changes: 91 additions & 1 deletion src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,16 @@ impl<D: DeserializeOwned> DelimFileReader<D> {
.has_headers(true)
.quoting(quote)
.from_reader(reader);
assert!(csv_reader.has_headers(), "Expected input file to have a header row");

// NB: csv_reader.has_headers() does not check that a header row actually exists; it only
// reports whether the reader is configured to treat the first row as a header.

// If the header is not empty (try to parse it)
let header = csv_reader.headers().map_err(FgError::ConversionError)?.to_owned();
if !header.is_empty() {
Self::validate_header(&header, delimiter)?
}

let record_iter = csv_reader.into_deserialize();
Ok(Self { record_iter, header })
}
Expand All @@ -186,6 +194,25 @@ impl<D: DeserializeOwned> DelimFileReader<D> {
pub fn read(&mut self) -> Option<Result<D>> {
    // `None` from the underlying record iterator is passed straight through to the caller.
    let next_record = self.record_iter.next()?;
    // Wrap a csv-level failure in this crate's error type.
    Some(next_record.map_err(FgError::ConversionError))
}

/// Checks that every field name `D` expects to deserialize is present in `header`.
///
/// The expected names come from `serde_aux::prelude::serde_introspect::<D>()`. They only need
/// to be a _subset_ of the header: extra columns in the file are accepted and column order is
/// not checked.
///
/// # Errors
///
/// Returns `FgError::DelimFileHeaderError` when at least one expected field is missing. Both
/// `expected` and `found` are rendered as field names joined by `delimiter`.
fn validate_header(header: &StringRecord, delimiter: u8) -> Result<()> {
    let expected_header_parts = serde_aux::prelude::serde_introspect::<D>();

    // Expected header fields must be a _subset_ of found header fields
    let all_present = expected_header_parts
        .iter()
        .all(|field| header.iter().any(|found| found == *field));
    if all_present {
        return Ok(());
    }

    // Only build the message strings on the error path. `char::from` cannot fail, whereas
    // `String::from_utf8` on a single byte panics for any delimiter >= 0x80.
    let delim = char::from(delimiter).to_string();

    // `StringRecord::as_slice` returns the fields back-to-back with no separator, so the
    // found header is re-joined explicitly to keep multi-column headers readable.
    let found = header.iter().collect::<Vec<&str>>().join(delim.as_str());
    let expected = expected_header_parts.join(delim.as_str());

    Err(FgError::DelimFileHeaderError { expected, found })
}
}

impl<D: DeserializeOwned> Iterator for DelimFileReader<D> {
Expand Down Expand Up @@ -342,6 +369,7 @@ impl DelimFile {

#[cfg(test)]
mod tests {
use super::*;
use crate::io::{DelimFile, Io};
use serde::{Deserialize, Serialize};
use tempfile::TempDir;
Expand All @@ -355,6 +383,25 @@ mod tests {
o: Option<f64>,
}

// Trickier record types in which fields are skipped in de/serialization
/// Test record whose `o` field is skipped when deserializing but still serialized.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct RecWithSkipDe {
s: String,
i: usize,
b: bool,
// Written on output but not read back, so `o` is not among the header fields expected on input.
#[serde(skip_deserializing)]
o: Option<f64>,
}

/// Test record whose `o` field is skipped when serializing but still deserialized.
#[derive(Debug, Serialize, Deserialize, PartialEq)]
struct RecWithSkipSe {
s: String,
i: usize,
b: bool,
// Omitted from written output but still expected as a header field when reading.
#[serde(skip_serializing)]
o: Option<f64>,
}

#[test]
fn test_reading_and_writing_lines_to_file() {
let lines = vec!["foo", "bar,splat,whee", "baz\twhoopsie"];
Expand Down Expand Up @@ -431,4 +478,47 @@ mod tests {
assert_eq!(from_csv, recs);
assert_eq!(from_tsv, recs);
}

/// Writes records as CSV, reads them back as TSV, and checks that header validation rejects the
/// file with the expected/found field names for both skip-attribute record types.
#[test]
fn test_header_error() {
    let recs = vec![
        RecWithSkipDe { s: "Hello".to_string(), i: 123, b: true, o: None },
        RecWithSkipDe { s: "A,B,C".to_string(), i: 456, b: false, o: Some(123.45) },
    ];
    let tmp = TempDir::new().unwrap();
    let csv = tmp.path().join("recs.csv");
    let df = DelimFile::default();
    df.write_csv(&csv, recs).unwrap();

    // Reading a comma-delimited file as TSV yields a single header column, so validation fails.
    let result: Result<Vec<RecWithSkipDe>> = df.read_tsv(&csv);

    // The serialized CSV contains all fields; deserializing skips "o", so it is not expected.
    match result.unwrap_err() {
        FgError::DelimFileHeaderError { expected, found } => {
            assert_eq!(expected, "s\ti\tb");
            assert_eq!(found, "s,i,b,o");
        }
        other => panic!("expected DelimFileHeaderError, got: {:?}", other),
    }

    let recs = vec![
        RecWithSkipSe { s: "Hello".to_string(), i: 123, b: true, o: None },
        RecWithSkipSe { s: "A,B,C".to_string(), i: 456, b: false, o: Some(123.45) },
    ];
    let tmp = TempDir::new().unwrap();
    let csv = tmp.path().join("recs.csv");
    let df = DelimFile::default();
    df.write_csv(&csv, recs).unwrap();

    let result: Result<Vec<RecWithSkipSe>> = df.read_tsv(&csv);

    // The serialized CSV skips "o"; deserializing still expects all fields, including "o".
    match result.unwrap_err() {
        FgError::DelimFileHeaderError { expected, found } => {
            assert_eq!(expected, "s\ti\tb\to");
            assert_eq!(found, "s,i,b");
        }
        other => panic!("expected DelimFileHeaderError, got: {:?}", other),
    }
}
}
3 changes: 3 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ pub enum FgError {

#[error("Error parsing/formatting delimited data.")]
ConversionError(#[from] csv::Error),

#[error("Error parsing delimited data file header.")]
DelimFileHeaderError { expected: String, found: String },
}

/// Result type that should be used everywhere
Expand Down

0 comments on commit 4580c54

Please sign in to comment.