Skip to content

Commit

Permalink
Optimize tuple pair construction
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelS239 committed Dec 14, 2024
1 parent a1a86fe commit 38b7c52
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 36 deletions.
99 changes: 64 additions & 35 deletions src/core/algorithms/dd/split/split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,11 @@
#include <list>
#include <set>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

#include <boost/dynamic_bitset.hpp>
#include <boost/regex.hpp>
#include <easylogging++.h>

Expand Down Expand Up @@ -116,22 +118,27 @@ void Split::ParseDifferenceTable() {
}

unsigned long long Split::ExecuteInternal() {
auto const start_time = std::chrono::system_clock::now();
LOG(DEBUG) << "Start";

SetLimits();
CheckTypes();
ParseDifferenceTable();

auto const start_time = std::chrono::system_clock::now();
LOG(DEBUG) << "Start";

CalculateAllDistances();
CalculateIndexSearchSpaces();

LOG(INFO) << "Calculated distances";
auto elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
LOG(DEBUG) << "Current time: " << elapsed_milliseconds.count();

if (reduce_method_ == +Reduce::IEHybrid) {
CalculateTuplePairs();
}

LOG(INFO) << "Calculated distances";
auto elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
LOG(INFO) << "Calculated tuple pairs";
elapsed_milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now() - start_time);
LOG(DEBUG) << "Current time: " << elapsed_milliseconds.count();
LOG(INFO) << "Minimum and maximum distances for each column with non-empty search space:";
Expand Down Expand Up @@ -295,6 +302,33 @@ void Split::CalculateIndexSearchSpaces() {
plis_ = std::move(new_plis);
}

void Split::CalculateTuplePairs() {
std::size_t df_search_space_num = 0;
for (model::ColumnIndex index = 0; index < num_columns_; index++) {
df_search_space_num += index_search_spaces_[index].size();
}
std::unordered_set<boost::dynamic_bitset<>> tuple_pair_set;
for (std::size_t first_index = 0; first_index < num_rows_; first_index++) {
for (std::size_t second_index = first_index + 1; second_index < num_rows_; second_index++) {
boost::dynamic_bitset<> pair_bitset(df_search_space_num);
std::size_t df_index = 0;
for (model::ColumnIndex column_index = 0; column_index < num_columns_; column_index++) {
for (auto const& df_constraint : index_search_spaces_[column_index]) {
if (CheckDFConstraint(df_constraint, column_index,
{first_index, second_index})) {
pair_bitset.set(df_index);
}
df_index++;
}
}
auto const [it, is_new] = tuple_pair_set.insert(std::move(pair_bitset));
if (is_new) {
tuple_pairs_.emplace_back(first_index, second_index);
}
}
}
}

double Split::CalculateDistance(model::ColumnIndex column_index,
std::pair<std::size_t, std::size_t> tuple_pair) {
model::TypedColumnData const& column = typed_relation_->GetColumnData(column_index);
Expand All @@ -309,33 +343,38 @@ double Split::CalculateDistance(model::ColumnIndex column_index,
return dif;
}

// Tells whether the rows in `tuple_pair` satisfy `dif_constraint` on column
// `column_index`.
//
// The distance is not recomputed: both rows are mapped to their cluster via
// the column's inverted index and the value is read from distances_.
// Columns whose type id is kDouble delegate to DFConstraint::Contains; every
// other column type is checked against the closed interval
// [lower_bound, upper_bound].
inline bool Split::CheckDFConstraint(DFConstraint const& dif_constraint,
                                     model::ColumnIndex column_index,
                                     std::pair<std::size_t, std::size_t> tuple_pair) {
    auto const& inverted_index = plis_[column_index].GetInvertedIndex();
    ClusterIndex const lhs_cluster = inverted_index[tuple_pair.first];
    ClusterIndex const rhs_cluster = inverted_index[tuple_pair.second];
    double const distance = distances_[column_index][lhs_cluster][rhs_cluster];

    if (type_ids_[column_index] == +model::TypeId::kDouble) {
        return dif_constraint.Contains(distance);
    }

    // Written as a negated "outside" test on purpose: it keeps the exact
    // comparison semantics of the two-sided rejection check.
    bool const outside_bounds =
            distance < dif_constraint.lower_bound || distance > dif_constraint.upper_bound;
    return !outside_bounds;
}

// must be inline for optimization (gcc 11.4.0)
inline bool Split::CheckDF(DF const& dif_func, std::pair<std::size_t, std::size_t> tuple_pair) {
for (model::ColumnIndex column_index = 0; column_index < num_columns_; column_index++) {
ClusterIndex const first_cluster = plis_[column_index].GetInvertedIndex()[tuple_pair.first];
ClusterIndex const second_cluster =
plis_[column_index].GetInvertedIndex()[tuple_pair.second];
double const dif = distances_[column_index][first_cluster][second_cluster];

if (type_ids_[column_index] == +model::TypeId::kDouble) {
if (!dif_func[column_index].Contains(dif)) {
return false;
}
} else {
if (dif < dif_func[column_index].lower_bound ||
dif > dif_func[column_index].upper_bound) {
return false;
}
if (!CheckDFConstraint(dif_func[column_index], column_index, tuple_pair)) {
return false;
}
}
return true;
}

bool Split::VerifyDD(DF const& lhs, DF const& rhs) {
for (std::size_t i = 0; i < num_rows_; i++) {
for (std::size_t j = i + 1; j < num_rows_; j++) {
if (CheckDF(lhs, {i, j}) && !CheckDF(rhs, {i, j})) return false;
}
for (auto const& pair : tuple_pairs_) {
if (CheckDF(lhs, pair) && !CheckDF(rhs, pair)) return false;
}
return true;
}
Expand Down Expand Up @@ -372,10 +411,8 @@ void Split::CalculateAllDistances() {
}

bool Split::IsFeasible(DF const& d) {
for (std::size_t i = 0; i < num_rows_; i++) {
for (std::size_t j = i + 1; j < num_rows_; j++) {
if (CheckDF(d, {i, j})) return true;
}
for (auto const& pair : tuple_pairs_) {
if (CheckDF(d, pair)) return true;
}
return false;
}
Expand Down Expand Up @@ -669,14 +706,6 @@ std::list<DD> Split::InstanceExclusionReduce(
return dds;
}

// Appends every unordered row pair (i, j) with i < j < num_rows_ to
// tuple_pairs_, in lexicographic order.
void Split::CalculateTuplePairs() {
    // The number of pairs is known up front (n * (n - 1) / 2), so reserve once
    // instead of letting the vector reallocate repeatedly while it grows.
    std::size_t const pair_count = num_rows_ < 2 ? 0 : num_rows_ * (num_rows_ - 1) / 2;
    tuple_pairs_.reserve(tuple_pairs_.size() + pair_count);

    for (std::size_t i = 0; i < num_rows_; i++) {
        for (std::size_t j = i + 1; j < num_rows_; j++) {
            tuple_pairs_.emplace_back(i, j);
        }
    }
}

void Split::PrintResults() {
std::list<model::DDString> const result_strings = GetDDStringList();
LOG(INFO) << "Minimal cover size: " << result_strings.size();
Expand Down
4 changes: 3 additions & 1 deletion src/core/algorithms/dd/split/split.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,12 @@ class Split : public Algorithm {
std::pair<std::size_t, std::size_t> tuple_pair);
void InsertDistance(model::ColumnIndex column_index, std::size_t first_index,
std::size_t second_index, double& min_dif, double& max_dif);
bool CheckDFConstraint(DFConstraint const& dif_constraint, model::ColumnIndex column_index,
std::pair<std::size_t, std::size_t> tuple_pair);
bool CheckDF(DF const& dep, std::pair<std::size_t, std::size_t> tuple_pair);
bool VerifyDD(DF const& lhs, DF const& rhs);
void CalculateIndexSearchSpaces();
void CalculateTuplePairs();
void CalculateAllDistances();
bool IsFeasible(DF const& d);
std::vector<DF> SearchSpace(std::vector<model::ColumnIndex>& indices);
Expand All @@ -86,7 +89,6 @@ class Split : public Algorithm {
std::list<DD> InstanceExclusionReduce(
std::vector<std::pair<std::size_t, std::size_t>> const& tuple_pairs,
std::vector<DF> const& search, DF const& rhs, unsigned& cnt);
void CalculateTuplePairs();
unsigned ReduceDDs(auto const& start_time);
unsigned RemoveRedundantDDs();
unsigned RemoveTransitiveDDs();
Expand Down

0 comments on commit 38b7c52

Please sign in to comment.