From 4580c54c237966e54afe8a0cf520715ebdc2ead8 Mon Sep 17 00:00:00 2001 From: Jason Fan Date: Wed, 24 Apr 2024 17:43:45 -0400 Subject: [PATCH] feat: initial implementation of header validation --- Cargo.toml | 1 + src/io/mod.rs | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 3 ++ 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 9b45b83..3ef84aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ flate2 = "^1" # For auto-serialization of structs to csv/tsv csv = "^1" serde = { version = "^1.0.123", features = ["derive"] } +serde-aux = "^4" [dev-dependencies] tempfile = "3.2.0" diff --git a/src/io/mod.rs b/src/io/mod.rs index ae8e2af..9ce1a1c 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -171,8 +171,16 @@ impl DelimFileReader { .has_headers(true) .quoting(quote) .from_reader(reader); - assert!(csv_reader.has_headers(), "Expected input file to have a header row"); + + // NB: csv_reader.has_headers() does not actually check for existence of a header, but only + // checks that the reader is configured to read a header. + + // Parse the header and, if it is not empty, validate it let header = csv_reader.headers().map_err(FgError::ConversionError)?.to_owned(); + if !header.is_empty() { + Self::validate_header(&header, delimiter)? 
+ } + let record_iter = csv_reader.into_deserialize(); Ok(Self { record_iter, header }) } @@ -186,6 +194,25 @@ impl DelimFileReader { pub fn read(&mut self) -> Option> { self.record_iter.next().map(|result| result.map_err(FgError::ConversionError)) } + + fn validate_header(header: &StringRecord, delimiter: u8) -> Result<()> { + let delim = String::from_utf8(vec![delimiter]).unwrap(); + let found_header_parts: Vec<&str> = header.iter().collect(); + let expected_header_parts = serde_aux::prelude::serde_introspect::(); + + // Expected header fields must be a _subset_ of found header fields + let ok = expected_header_parts.iter().all(|field| found_header_parts.contains(field)); + + if !ok { + let expected = expected_header_parts.join(&delim); + return Err(FgError::DelimFileHeaderError { + expected, + found: header.as_slice().to_owned(), + }); + } + + Ok(()) + } } impl Iterator for DelimFileReader { @@ -342,6 +369,7 @@ impl DelimFile { #[cfg(test)] mod tests { + use super::*; use crate::io::{DelimFile, Io}; use serde::{Deserialize, Serialize}; use tempfile::TempDir; @@ -355,6 +383,25 @@ mod tests { o: Option, } + // Trickier record types in which fields are skipped in de/serialization + #[derive(Debug, Serialize, Deserialize, PartialEq)] + struct RecWithSkipDe { + s: String, + i: usize, + b: bool, + #[serde(skip_deserializing)] + o: Option, + } + + #[derive(Debug, Serialize, Deserialize, PartialEq)] + struct RecWithSkipSe { + s: String, + i: usize, + b: bool, + #[serde(skip_serializing)] + o: Option, + } + #[test] fn test_reading_and_writing_lines_to_file() { let lines = vec!["foo", "bar,splat,whee", "baz\twhoopsie"]; @@ -431,4 +478,47 @@ mod tests { assert_eq!(from_csv, recs); assert_eq!(from_tsv, recs); } + + #[test] + fn test_header_error() { + let recs = vec![ + RecWithSkipDe { s: "Hello".to_string(), i: 123, b: true, o: None }, + RecWithSkipDe { s: "A,B,C".to_string(), i: 456, b: false, o: Some(123.45) }, + ]; + let tmp = TempDir::new().unwrap(); + let csv = 
tmp.path().join("recs.csv"); + let df = DelimFile::default(); + df.write_csv(&csv, recs).unwrap(); + + let result: Result> = df.read_tsv(&csv); + let err = result.unwrap_err(); + + // Serialized CSV should contain all fields, deserializing should skip "o" + if let FgError::DelimFileHeaderError { expected, found } = err { + assert_eq!(expected, "s\ti\tb"); + assert_eq!(found, "s,i,b,o"); + } else { + panic!() + } + + let recs = vec![ + RecWithSkipSe { s: "Hello".to_string(), i: 123, b: true, o: None }, + RecWithSkipSe { s: "A,B,C".to_string(), i: 456, b: false, o: Some(123.45) }, + ]; + let tmp = TempDir::new().unwrap(); + let csv = tmp.path().join("recs.csv"); + let df = DelimFile::default(); + df.write_csv(&csv, recs).unwrap(); + + let result: Result> = df.read_tsv(&csv); + let err = result.unwrap_err(); + + // Serialized CSV should skip "o", deserializing should expect all fields + if let FgError::DelimFileHeaderError { expected, found } = err { + assert_eq!(expected, "s\ti\tb\to"); + assert_eq!(found, "s,i,b"); + } else { + panic!() + } + } } diff --git a/src/lib.rs b/src/lib.rs index 475ddb5..a3b0099 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,9 @@ pub enum FgError { #[error("Error parsing/formatting delimited data.")] ConversionError(#[from] csv::Error), + + #[error("Error parsing delimited data file header.")] + DelimFileHeaderError { expected: String, found: String }, } /// Result type that should be used everywhere