Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix: add support output for ARs #461

Merged
merged 4 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 31 additions & 20 deletions examples/basic/mining_ar.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ def print_ars(ars):
print('Total count of ARs:', len(ars))
print('The first 10 ARs:')
for ar in ars[:10]:
print('conf: ', end='')

if ar.confidence > 0.9:
print(COLOR_CODES['bold_green'], end='')
elif ar.confidence > 0.3:
Expand All @@ -36,6 +38,17 @@ def print_ars(ars):

print('{:1.2f}'.format(ar.confidence),
COLOR_CODES['default'], end='\t')
print('sup: ', end='')

if ar.support > 0.9:
print(COLOR_CODES['bold_green'], end='')
elif ar.support > 0.3:
print(COLOR_CODES['bold_yellow'], end='')
else:
print(COLOR_CODES['bold_red'], end='')

print('{:1.2f}'.format(ar.support),
COLOR_CODES['default'], end='\t')
print(ar.left, '->', ar.right, )


Expand All @@ -47,7 +60,7 @@ def print_itemnames(itemnames):

def scenario_tabular():
algo = desbordante.ar.algorithms.Default()
algo.load_data(table=(TABLE_TABULAR, ',', True), input_format='tabular')
algo.load_data(table=(TABLE_TABULAR, ',', False), input_format='tabular')
algo.execute(minconf=1)
table = pandas.read_csv(TABLE_TABULAR, header=None)

Expand All @@ -60,20 +73,21 @@ def scenario_tabular():
print("\nLet's see the first 10 association rules (ARs) that are present "
'in the dataset with minconf=1. As no minsup is specified, '
'the default value of minsup=0 is used.\n')

print_ars(algo.get_ars())
print("\n['Eggs'] -> ['Milk'] with confidence 1 means that whenever eggs "
'are found in the receipt, milk will '
print("\n['Butter'] -> ['Bread'] with confidence 1 means that whenever butter "
'is found in the receipt, bread will '
f'{COLOR_CODES["green"]}always{COLOR_CODES["default"]} '
'be present as well. The same holds true for all other rules with '
f'{COLOR_CODES["bold_green"]}confidence 1{COLOR_CODES["default"]}.')

print("\n\nNow, let's examine the same dataset with "
f'{COLOR_CODES["yellow"]}minconf=0.7{COLOR_CODES["default"]}.')
algo.execute(minconf=0.7)
f'{COLOR_CODES["yellow"]}minconf=0.6{COLOR_CODES["default"]}.')
algo.execute(minconf=0.6)
print_ars(algo.get_ars())
print("\n['Milk'] -> ['Eggs'] with confidence 0.75 means that when milk "
print("\n['Yogurt'] -> ['Eggs'] with confidence 0.67 means that when milk "
'is found in the receipt, the chance of eggs being '
'present amounts to 75%. So, customers are '
'present amounts to 67%. So, customers are '
f'{COLOR_CODES["bold_yellow"]}likely{COLOR_CODES["default"]} '
'to buy eggs with milk.')

Expand All @@ -89,24 +103,21 @@ def scenario_tabular():
'Since the default support value is 0, the system discovers '
'all association rules, even those that only occur once '
"in the dataset. Now, let's see the results with "
f'{COLOR_CODES["yellow"]}minsup=0.5{COLOR_CODES["default"]} and '
f'{COLOR_CODES["yellow"]}minconf=0.5{COLOR_CODES["default"]}.\n')
algo.execute(minsup=0.5, minconf=0.5)
f'{COLOR_CODES["yellow"]}minsup=0.4{COLOR_CODES["default"]} and '
f'{COLOR_CODES["yellow"]}minconf=0.6{COLOR_CODES["default"]}.\n')
algo.execute(minsup=0.4, minconf=0.6)
print_ars(algo.get_ars())
print('\nNow you can see that the number of association rules have decreased '
'significantly. This happened due to minsup being set to 0.5. '
'Unfortunately, if you want to know what the support value is for a '
'particular association rule in a dataset, '
"you can't get it with Desbordante.\n"
'significantly. This happened due to minsup being set to 0.5. \n'
'\nA typical approach to controlling the algorithm is to employ '
f'{COLOR_CODES["bold_yellow"]}"usefulness"{COLOR_CODES["default"]}, '
'which is defined as confidence * support. '
'In the last example, we set up min '
'"usefulness" = 0.5 * 0.5 = 0.25. \n\nNow, let\'s try with '
f'{COLOR_CODES["green"]}minsup=0.7{COLOR_CODES["default"]}, '
f'{COLOR_CODES["green"]}minconf=0.7{COLOR_CODES["default"]} and '
f'{COLOR_CODES["bold_yellow"]}"usefulness"=0.49{COLOR_CODES["default"]}.\n')
algo.execute(minsup=0.7, minconf=0.7)
'"usefulness" = 0.6 * 0.4 = 0.24. \n\nNow, let\'s try with '
f'{COLOR_CODES["green"]}minsup=0.6{COLOR_CODES["default"]}, '
f'{COLOR_CODES["green"]}minconf=0.6{COLOR_CODES["default"]} and '
f'{COLOR_CODES["bold_yellow"]}"usefulness"=0.36{COLOR_CODES["default"]}.\n')
algo.execute(minsup=0.6, minconf=0.6)
print_ars(algo.get_ars())
print('\nSo, now the total number of returned association rules '
'is only four. We reduced the amount of "noisy" information '
Expand All @@ -117,7 +128,7 @@ def scenario_tabular():

def scenario_singular():
algo = desbordante.ar.algorithms.Default()
algo.load_data(table=(TABLE_SINGULAR, ',', True), input_format='singular')
algo.load_data(table=(TABLE_SINGULAR, ',', False), input_format='singular')
algo.execute()
table = pandas.read_csv(TABLE_SINGULAR, header=None, index_col=0)

Expand Down
23 changes: 18 additions & 5 deletions src/core/algorithms/association_rules/ar.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,16 @@ struct ArIDs {
std::vector<unsigned> left; // antecedent
std::vector<unsigned> right; // consequent
double confidence = -1;
double support = -1;

ArIDs() = default;

ArIDs(std::vector<unsigned> left, std::vector<unsigned> right, double confidence)
: left(std::move(left)), right(std::move(right)), confidence(confidence) {}
ArIDs(std::vector<unsigned> left, std::vector<unsigned> right, double confidence,
double support)
: left(std::move(left)),
right(std::move(right)),
confidence(confidence),
support(support) {}

ArIDs(ArIDs const& other) = default;
ArIDs& operator=(ArIDs const& other) = default;
Expand All @@ -27,14 +32,19 @@ struct ARStrings {
std::list<std::string> left; // antecedent
std::list<std::string> right; // consequent
double confidence = -1;
double support = -1;

ARStrings() = default;

ARStrings(std::list<std::string> left, std::list<std::string> right, double confidence)
: left(std::move(left)), right(std::move(right)), confidence(confidence) {}
ARStrings(std::list<std::string> left, std::list<std::string> right, double confidence,
double support)
: left(std::move(left)),
right(std::move(right)),
confidence(confidence),
support(support) {}

ARStrings(ArIDs const& id_format_rule, TransactionalData const* transactional_data)
: confidence(id_format_rule.confidence) {
: confidence(id_format_rule.confidence), support(id_format_rule.support) {
std::vector<std::string> const& item_names_map = transactional_data->GetItemUniverse();

for (auto item_id : id_format_rule.left) {
Expand All @@ -52,7 +62,10 @@ struct ARStrings {

std::string ToString() const {
std::string result;
result.append("conf: ");
result.append(std::to_string(confidence));
result.append("\tsup: ");
result.append(std::to_string(support));
result.append("\t{");
for (auto const& item_name : left) {
result.append(item_name);
Expand Down
8 changes: 4 additions & 4 deletions src/core/algorithms/association_rules/ar_algorithm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,8 @@ void ARAlgorithm::GenerateRulesFrom(std::vector<unsigned> const& frequent_itemse
auto const lhs_support = GetSupport(lhs);
auto const confidence = support / lhs_support;
if (confidence >= minconf_) {
auto const& new_ar =
ar_collection_.emplace_back(std::move(lhs), std::move(rhs), confidence);
auto const& new_ar = ar_collection_.emplace_back(std::move(lhs), std::move(rhs),
confidence, support);
root_.children.emplace_back(new_ar);
}
}
Expand Down Expand Up @@ -146,8 +146,8 @@ bool ARAlgorithm::MergeRules(std::vector<unsigned> const& frequent_itemset, doub
auto const lhs_support = GetSupport(lhs);
auto const confidence = support / lhs_support;
if (confidence >= minconf_) {
auto const& new_ar =
ar_collection_.emplace_back(std::move(lhs), std::move(rhs), confidence);
auto const& new_ar = ar_collection_.emplace_back(std::move(lhs), std::move(rhs),
confidence, support);
child_iter->children.emplace_back(new_ar);
is_rule_produced = true;
}
Expand Down
6 changes: 4 additions & 2 deletions src/python_bindings/ar/bind_ar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,14 @@ void BindAr(py::module_& main_module) {
.def("__str__", &ARStrings::ToString)
.def_readonly("left", &ARStrings::left)
.def_readonly("right", &ARStrings::right)
.def_readonly("confidence", &ARStrings::confidence);
.def_readonly("confidence", &ARStrings::confidence)
.def_readonly("support", &ARStrings::support);

py::class_<ArIDs>(ar_module, "ArIDs")
.def_readonly("left", &ArIDs::left)
.def_readonly("right", &ArIDs::right)
.def_readonly("confidence", &ArIDs::confidence);
.def_readonly("confidence", &ArIDs::confidence)
.def_readonly("support", &ArIDs::support);

py::class_<ARAlgorithm, Algorithm>(ar_module, "ArAlgorithm")
.def("get_ars", &ARAlgorithm::GetArStringsList, py::return_value_policy::move)
Expand Down
2 changes: 2 additions & 0 deletions src/tests/all_csv_configs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ CSVConfig const kTestZeros = CreateCsvConfig("TestZeros.csv", ',', true);
CSVConfig const kNullEmpty = CreateCsvConfig("NullEmpty.csv", ',', true);
CSVConfig const kSimpleTypes = CreateCsvConfig("SimpleTypes.csv", ',', true);
CSVConfig const kRulesBook = CreateCsvConfig("transactional_data/rules-book.csv", ',', false);
CSVConfig const kRulesBookRows =
CreateCsvConfig("transactional_data/rules-book-rows.csv", ',', false);
CSVConfig const kRulesPresentationExtended =
CreateCsvConfig("transactional_data/rules-presentation-extended.csv", ',', false);
CSVConfig const kRulesPresentation =
Expand Down
1 change: 1 addition & 0 deletions src/tests/all_csv_configs.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ extern CSVConfig const kTestZeros;
extern CSVConfig const kNullEmpty;
extern CSVConfig const kSimpleTypes;
extern CSVConfig const kRulesBook;
extern CSVConfig const kRulesBookRows;
extern CSVConfig const kRulesPresentationExtended;
extern CSVConfig const kRulesPresentation;
extern CSVConfig const kRulesSynthetic2;
Expand Down
44 changes: 44 additions & 0 deletions src/tests/test_apriori.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,26 @@ void CheckAssociationRulesListsEquality(
SUCCEED();
}

void CheckSupportAndConfidence(std::list<model::ARStrings> const& actual,
std::set<std::string> const& lhs, std::set<std::string> const& rhs,
double expected_support, double expected_confidence) {
for (auto const& rule : actual) {
std::set<std::string> actual_lhs(rule.left.begin(), rule.left.end());
std::set<std::string> actual_rhs(rule.right.begin(), rule.right.end());

if (lhs == actual_lhs && rhs == actual_rhs) {
EXPECT_DOUBLE_EQ(rule.support, expected_support)
<< "supports don't match: expected " << expected_support
<< ", got: " << rule.support;
EXPECT_DOUBLE_EQ(rule.confidence, expected_confidence)
<< "confidences don't match: expected " << expected_confidence
<< ", got: " << rule.confidence;
return;
}
}
ADD_FAILURE() << "expected rule not found in generated rules";
}

static std::set<std::pair<std::set<std::string>, std::set<std::string>>> ToSet(
std::list<model::ARStrings> const& rules) {
std::set<std::pair<std::set<std::string>, std::set<std::string>>> set;
Expand Down Expand Up @@ -318,4 +338,28 @@ TEST_F(ARAlgorithmTest, RepeatedExecutionConsistentResult) {
}
}

TEST_F(ARAlgorithmTest, SupportAndConfidenceSingular) {
auto algorithm = CreateAlgorithmInstance(kRulesBook, 0.2, 0.5, 0, 1);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess last parameter should be true

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it shouldn't. The last parameter is itemColumnIndex, which is unsigned int.

algorithm->Execute();
auto result = algorithm->GetArStringsList();
CheckSupportAndConfidence(result, {"Eggs"}, {"Milk"}, 0.6, 1);
CheckSupportAndConfidence(result, {"Bread"}, {"Eggs"}, 0.2, 0.5);
CheckSupportAndConfidence(result, {"Yogurt"}, {"Milk"}, 0.6, 1);
CheckSupportAndConfidence(result, {"Bread"}, {"Milk"}, 0.4, 1);
CheckSupportAndConfidence(result, {"Cheese"}, {"Milk"}, 0.4, 1);
CheckSupportAndConfidence(result, {"Milk", "Bread"}, {"Eggs"}, 0.2, 0.5);
}

TEST_F(ARAlgorithmTest, SupportAndConfidenceTabular) {
auto algorithm = CreateAlgorithmInstance(kRulesBookRows, 0.2, 0.5, false);
algorithm->Execute();
auto result = algorithm->GetArStringsList();
CheckSupportAndConfidence(result, {"Eggs"}, {"Milk"}, 0.6, 1);
CheckSupportAndConfidence(result, {"Bread"}, {"Eggs"}, 0.2, 0.5);
CheckSupportAndConfidence(result, {"Yogurt"}, {"Milk"}, 0.6, 1);
CheckSupportAndConfidence(result, {"Bread"}, {"Milk"}, 0.4, 1);
CheckSupportAndConfidence(result, {"Cheese"}, {"Milk"}, 0.4, 1);
CheckSupportAndConfidence(result, {"Milk", "Bread"}, {"Eggs"}, 0.2, 0.5);
}

} // namespace tests