From fe73d78ae3f5b68892baff4f61cb1d01a5b56faa Mon Sep 17 00:00:00 2001 From: Dieter Baron Date: Fri, 19 Apr 2024 14:44:44 +0200 Subject: [PATCH] Optimize adding hashes to CkmameDB. --- src/Archive.cc | 47 ++++++++++++++++++++++++++++------------ src/Archive.h | 11 +++++++++- src/ArchiveLibarchive.cc | 1 + src/ArchiveZip.cc | 2 +- src/CkmameDB.cc | 32 +++++++++++++++++++++++++-- src/CkmameDB.h | 5 ++++- src/Detector.h | 3 ++- src/File.cc | 2 +- src/archive_modify.cc | 46 +++++++++++++++++++++++++++++---------- src/detector_execute.cc | 15 ++++++++++--- 10 files changed, 129 insertions(+), 35 deletions(-) diff --git a/src/Archive.cc b/src/Archive.cc index 59136f4f..d4311d5e 100644 --- a/src/Archive.cc +++ b/src/Archive.cc @@ -83,7 +83,7 @@ Archive::Archive(ArchiveContentsPtr contents_) : name(contents->name), filetype(contents->filetype), where(contents->where), - cache_changed(false), + cache_changed(NONE), modified(false) { changes.resize(files.size()); } @@ -197,6 +197,7 @@ bool Archive::file_ensure_hashes(uint64_t idx, size_t detector_id, int hashtypes f->open(); } catch (Exception &e) { output.error("%s: %s: can't open: %s", name.c_str(), file.name.c_str(), e.what()); + set_cache_changed(FILES); file.broken = true; return false; } @@ -207,11 +208,13 @@ bool Archive::file_ensure_hashes(uint64_t idx, size_t detector_id, int hashtypes case READ_ERROR: output.error("%s: %s: can't compute hashes: %s", name.c_str(), file.name.c_str(), strerror(errno)); + set_cache_changed(FILES); file.broken = true; return false; case CRC_ERROR: output.error("%s: %s: CRC error: %08x != %08x", name.c_str(), file.name.c_str(), hashes.crc, file.hashes.crc); + set_cache_changed(FILES); file.broken = true; return false; } @@ -223,7 +226,8 @@ bool Archive::file_ensure_hashes(uint64_t idx, size_t detector_id, int hashtypes return false; } } - cache_changed = true; + set_cache_changed(HASHES_ONLY); + changes[idx].updated_hashes.insert(detector_id); return true; } @@ -388,8 +392,8 @@ ArchivePtr Archive::open_toplevel(const std::string &name, filetype_t filetype, bool Archive::read_infos() { std::vector files_cache; - cache_changed = false; - + set_cache_changed(NONE); + contents->read_infos_from_cachedb(&files_cache); if (contents->cache_id > 0) { @@ -398,7 +402,7 @@ bool Archive::read_infos() { return false; case 0: - cache_changed = true; + set_cache_changed(FILES); break; case 1: @@ -413,12 +417,12 @@ bool Archive::read_infos() { } if (!read_infos_xxx()) { - cache_changed = true; + set_cache_changed(FILES); return false; } - merge_files(files_cache); changes.resize(files.size()); + merge_files(files_cache); return true; } @@ -493,6 +497,8 @@ Archive::GetHashesStatus Archive::get_hashes(ZipSource *source, uint64_t length, void Archive::merge_files(const std::vector &files_cache) { + set_cache_changed(NONE); + for (uint64_t i = 0; i < files.size(); i++) { auto &file = files[i]; @@ -500,31 +506,37 @@ void Archive::merge_files(const std::vector &files_cache) { auto it = std::find_if(files_cache.cbegin(), files_cache.cend(), [&file](const File &file_cache){ return file.name == file_cache.name; }); if (it != files_cache.cend()) { if (file.mtime == (*it).mtime && file.compare_size_hashes(*it)) { + if ((file.hashes.get_types() & ~(it->hashes.get_types())) == 0) { + changes[i].updated_hashes.clear(); + } + else { + changes[i].updated_hashes.insert(0); + } file.hashes.merge((*it).hashes); file.detector_hashes = it->detector_hashes; } else { - cache_changed = true; + set_cache_changed(FILES); } } else { - cache_changed = true; + set_cache_changed(FILES); } if (want_crc() && !file.hashes.has_type(Hashes::TYPE_CRC)) { if (!file_ensure_hashes(i, Hashes::TYPE_ALL)) { file.broken = true; if (it == files_cache.cend() || !(*it).broken) { - cache_changed = true; + set_cache_changed(FILES); } continue; } - cache_changed = true; + set_cache_changed(HASHES_ONLY); } } if (files.size() != files_cache.size()) { - cache_changed = true; + set_cache_changed(FILES); } } @@ -619,7 +631,7 @@ bool Archive::compute_detector_hashes(const std::unordered_mapdetectors, &changes[index].updated_hashes); } @@ -690,3 +702,10 @@ bool Archive::compare_size_hashes(size_t index, size_t detector_id, const FileDa return ok; } + +void Archive::set_cache_changed(Archive::CacheChange new_changed) { + if (new_changed == HASHES_ONLY && cache_changed == FILES) { + return; + } + cache_changed = new_changed; +} \ No newline at end of file diff --git a/src/Archive.h b/src/Archive.h index 6255b133..d228a38e 100644 --- a/src/Archive.h +++ b/src/Archive.h @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -125,6 +126,11 @@ struct hash { class Archive { public: + enum CacheChange { + NONE, + HASHES_ONLY, + FILES + }; class Change { public: enum Status { @@ -140,6 +146,7 @@ class Archive { std::string source_name; ZipSourcePtr source; std::string file; + std::unordered_set updated_hashes; }; static ArchivePtr open(const std::string &name, filetype_t filetype, where_t where, int flags); @@ -190,6 +197,8 @@ class Archive { virtual std::string get_full_filename(uint64_t index) { return ""; } virtual std::string get_original_filename(uint64_t index) { return ""; } + void set_cache_changed(CacheChange new_changed); + ArchiveContentsPtr contents; std::vector &files; std::string &name; @@ -197,7 +206,7 @@ class Archive { const where_t where; std::vector changes; - bool cache_changed; + CacheChange cache_changed{NONE}; bool modified; protected: diff --git a/src/ArchiveLibarchive.cc b/src/ArchiveLibarchive.cc index bdd8b266..93b5c8a6 100644 --- a/src/ArchiveLibarchive.cc +++ b/src/ArchiveLibarchive.cc @@ -330,6 +330,7 @@ bool ArchiveLibarchive::read_infos_xxx() { r.name = archive_entry_pathname_utf8(entry); r.broken = false; files.push_back(r); + changes.emplace_back(); header_read = true; file_ensure_hashes(current_index, Hashes::TYPE_ALL); diff --git a/src/ArchiveZip.cc b/src/ArchiveZip.cc index 19982485..38481157 100644 --- a/src/ArchiveZip.cc +++ b/src/ArchiveZip.cc @@ -248,7 +248,7 @@ void ArchiveZip::commit_cleanup() { void ArchiveZip::get_last_update() { - if (cache_changed) { + if (cache_changed != NONE) { close_xxx(); } struct stat st; diff --git a/src/CkmameDB.cc b/src/CkmameDB.cc index f8fa58f4..f27056b0 100644 --- a/src/CkmameDB.cc +++ b/src/CkmameDB.cc @@ -114,7 +114,8 @@ std::unordered_map CkmameDB::queries = { { QUERY_ARCHIVE_ID, "select archive_id from archive where name = :name and file_type = :file_type" }, { QUERY_ARCHIVE_LAST_CHANGE, "select mtime, size from archive where archive_id = :archive_id" }, { QUERY_FILE, "select file_idx, detector_id, name, mtime, status, size, crc, md5, sha1 from file where archive_id = :archive_id order by file_idx, detector_id" }, - { QUERY_HAS_ARCHIVES, "select archive_id from archive limit 1" } + { QUERY_HAS_ARCHIVES, "select archive_id from archive limit 1" }, + { UPDATE_FILE_HASHES, "update file set crc = :crc, md5 = :md5, sha1 = :sha1 where archive_id = :archive_id and file_idx = :file_idx and detector_id = 0" } }; std::unordered_map CkmameDB::parameterized_queries = { @@ -515,4 +516,31 @@ bool CkmameDB::compute_detector_hashes(const std::unordered_mapset_int("archive_id", archive_id); + stmt->set_int("file_idx", static_cast(file_id)); + stmt->set_hashes(hashes, true); + + stmt->execute(); +} + +void CkmameDB::insert_file_detector_hashes(int archive_id, size_t file_id, size_t detector_id, const Hashes& hashes) { + auto local_detector_id = get_detector_id(detector_id); + + auto stmt = get_statement(INSERT_FILE); + + stmt->set_int("archive_id", archive_id); + stmt->set_int("file_idx", static_cast(file_id)); + stmt->set_uint64("detector_id", local_detector_id); + stmt->set_string("name", "", true); + stmt->set_int64("mtime", 0); + stmt->set_int("status", 0); + stmt->set_uint64("size", hashes.size); + stmt->set_hashes(hashes, true); + + stmt->execute(); +} diff --git a/src/CkmameDB.h b/src/CkmameDB.h index 8e25119d..9fd32252 100644 --- a/src/CkmameDB.h +++ b/src/CkmameDB.h @@ -74,7 +74,8 @@ class CkmameDB : public DB { QUERY_ARCHIVE_ID, QUERY_ARCHIVE_LAST_CHANGE, QUERY_FILE, - QUERY_HAS_ARCHIVES + QUERY_HAS_ARCHIVES, + UPDATE_FILE_HASHES }; enum ParameterizedStatement { QUERY_FIND_FILE @@ -95,6 +96,8 @@ class CkmameDB : public DB { std::vector list_archives(); int read_files(int archive_id, std::vector *files); void write_archive(ArchiveContents *archive); + void update_file_hashes(int archive_id, size_t file_id, const Hashes& hashes); + void insert_file_detector_hashes(int archive_id, size_t file_id, size_t detector_id, const Hashes& hashes); void find_file(filetype_t filetype, size_t detector_id, const FileData& file, std::vector &results); bool compute_detector_hashes(const std::unordered_map& detectors); diff --git a/src/Detector.h b/src/Detector.h index f450b4b2..410e9a97 100644 --- a/src/Detector.h +++ b/src/Detector.h @@ -35,6 +35,7 @@ */ #include +#include #include #include "DetectorCollection.h" @@ -128,7 +129,7 @@ class Detector { static const DetectorDescriptor *get_descriptor(size_t id) { return detector_ids.get_descriptor(id); } // Returns true if new hashes were computed. - static bool compute_hashes(const std::vector &data, File *file, const std::unordered_map &detectors); + static bool compute_hashes(const std::vector &data, File *file, const std::unordered_map &detectors, std::unordered_set* changed = {}); private: static uint64_t operation_unit_size(Operation operation); diff --git a/src/File.cc b/src/File.cc index c0cf3aa4..5be5d6e8 100644 --- a/src/File.cc +++ b/src/File.cc @@ -37,7 +37,7 @@ Hashes File::empty_hashes; bool File::has_all_hashes(size_t detector, int requested_types) const { - return hashes.has_all_types(requested_types) && get_hashes(detector).has_all_types(requested_types); + return get_hashes(detector).has_all_types(requested_types); } diff --git a/src/archive_modify.cc b/src/archive_modify.cc index 71d9c090..5bef8e3c 100644 --- a/src/archive_modify.cc +++ b/src/archive_modify.cc @@ -46,7 +46,7 @@ bool Archive::commit() { if (modified) { output.set_error_archive(name); - cache_changed = true; + set_cache_changed(FILES); if (!commit_xxx()) { return false; @@ -86,7 +86,7 @@ bool Archive::commit() { } void Archive::update_cache() { - if (!cache_changed) { + if (cache_changed == NONE) { return; } @@ -109,14 +109,38 @@ void Archive::update_cache() { } else { get_last_update(); - - try { - contents->cache_db->write_archive(contents.get()); + + // TODO: check if size/mtime changed + + if (contents->cache_id != 0 && cache_changed == HASHES_ONLY) { + for (size_t i = 0; i < changes.size(); i++) { + auto& change = changes[i]; + const auto& file = files[i]; + + if (change.updated_hashes.empty()) { + continue; + } + + for (auto detector_id: change.updated_hashes) { + if (detector_id == 0) { + contents->cache_db->update_file_hashes(contents->cache_id, i, file.hashes); + } + else { + contents->cache_db->insert_file_detector_hashes(contents->cache_id, i, detector_id, file.get_hashes(detector_id)); + } + } + change.updated_hashes.clear(); + } } - catch (Exception &exception) { - contents->cache_db->seterr(); - output.error_database("%s: error writing to %s", name.c_str(), CkmameDB::db_name.c_str()); - contents->cache_id = 0; + else { + try { + contents->cache_db->write_archive(contents.get()); + } + catch (Exception& exception) { + contents->cache_db->seterr(); + output.error_database("%s: error writing to %s", name.c_str(), CkmameDB::db_name.c_str()); + contents->cache_id = 0; + } } } } @@ -124,7 +148,7 @@ void Archive::update_cache() { contents->cache_id = 0; } - cache_changed = false; + set_cache_changed(NONE); } @@ -260,7 +284,7 @@ bool Archive::file_rename(uint64_t index, const std::string &filename) { return false; } if (changes[index].status != Change::EXISTS) { - output.archive_error("cannot copy broken/added/deleted file"); + output.archive_error("cannot rename broken/added/deleted file"); return false; } diff --git a/src/detector_execute.cc b/src/detector_execute.cc index b7acda17..2e79936c 100644 --- a/src/detector_execute.cc +++ b/src/detector_execute.cc @@ -231,13 +231,22 @@ bool Detector::Test::execute(const std::vector &data) const { } -bool Detector::compute_hashes(const std::vector &data, File *file, const std::unordered_map &detectors) { +bool Detector::compute_hashes(const std::vector &data, File *file, const std::unordered_map &detectors, std::unordered_set* changed) { if (file->get_size(0) > MAX_DETECTOR_FILE_SIZE) { return false; } - for (const auto &pair : detectors) { - file->detector_hashes[pair.first] = pair.second->execute(data); + for (const auto &[id, detector] : detectors) { + auto it = file->detector_hashes.find(id); + + if (it != file->detector_hashes.end() && !it->second.empty()) { + continue; + } + + file->detector_hashes[id] = detector->execute(data); + if (changed) { + changed->insert(id); + } } return true;