diff --git a/MANIFEST.in b/MANIFEST.in index 6ef89360..057a5817 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ recursive-include doc *.bat *.css *.md *.png *.py *.rst Makefile -recursive-include orangecontrib *.svg icons/* *.html *.js *.css *.txt +recursive-include orangecontrib *.svg icons/* *.html *.js *.css *.txt *.tab include *.pypi *.md *.ini .coveragerc diff --git a/orangecontrib/prototypes/widgets/icons/Split.svg b/orangecontrib/prototypes/widgets/icons/Split.svg new file mode 100644 index 00000000..5594fb76 --- /dev/null +++ b/orangecontrib/prototypes/widgets/icons/Split.svg @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/orangecontrib/prototypes/widgets/owsplit.py b/orangecontrib/prototypes/widgets/owsplit.py new file mode 100644 index 00000000..ae8220b4 --- /dev/null +++ b/orangecontrib/prototypes/widgets/owsplit.py @@ -0,0 +1,149 @@ +import numpy as np + +from AnyQt.QtCore import Qt + +from Orange.widgets import gui +from Orange.widgets.settings import ContextSetting, DomainContextHandler +from Orange.widgets.widget import OWWidget, Msg, Output, Input +from Orange.widgets.utils.itemmodels import DomainModel +from Orange.widgets.utils.widgetpreview import WidgetPreview +from Orange.data import Table, Domain, DiscreteVariable, StringVariable +from Orange.data.util import SharedComputeValue, get_unique_names + +from orangewidget.settings import Setting + + +class SplitColumn: + def __init__(self, data, attr, delimiter): + self.attr = attr + self.delimiter = delimiter + + column = self.get_string_values(data, self.attr) + values = [s.split(self.delimiter) for s in column] + self.new_values = tuple(sorted({val if val else "?" for vals in + values for val in vals})) + + def __eq__(self, other): + return self.attr == other.attr and self.delimiter == \ + other.delimiter and self.new_values == other.new_values + + def __hash__(self): + return hash((self.attr, self.delimiter, self.new_values)) + + def __call__(self, data): + column = self.get_string_values(data, self.attr) + values = [set(s.split(self.delimiter)) for s in column] + shared_data = {v: [i for i, xs in enumerate(values) if v in xs] for v + in self.new_values} + return shared_data + + @staticmethod + def get_string_values(data, var): + # turn discrete to string variable + column = data.get_column_view(var)[0] + if var.is_discrete: + return [var.str_val(x) for x in column] + return column + + +class OneHotStrings(SharedComputeValue): + + def __init__(self, fn, new_feature): + super().__init__(fn) + self.new_feature = new_feature + + def __eq__(self, other): + return self.compute_shared == other.compute_shared \ + and self.new_feature == other.new_feature + + def __hash__(self): + return hash((self.compute_shared, self.new_feature)) + + def compute(self, data, shared_data): + indices = shared_data[self.new_feature] + col = np.zeros(len(data)) + col[indices] = 1 + return col + + +class OWSplit(OWWidget): + name = "Split" + description = "Split string variables to create discrete." + icon = "icons/Split.svg" + priority = 700 + + class Inputs: + data = Input("Data", Table) + + class Outputs: + data = Output("Data", Table) + + class Warning(OWWidget.Warning): + no_disc = Msg("Data contains only numeric variables.") + + want_main_area = False + resizing_enabled = False + + settingsHandler = DomainContextHandler() + attribute = ContextSetting(None) + delimiter = ContextSetting(";") + auto_apply = Setting(True) + + def __init__(self): + super().__init__() + self.data = None + + variable_select_box = gui.vBox(self.controlArea, "Variable") + + gui.comboBox(variable_select_box, self, "attribute", + orientation=Qt.Horizontal, searchable=True, + callback=self.apply.deferred, + model=DomainModel(valid_types=(StringVariable, + DiscreteVariable))) + gui.lineEdit( + variable_select_box, self, "delimiter", + orientation=Qt.Horizontal, callback=self.apply.deferred) + + gui.auto_apply(self.buttonsArea, self, commit=self.apply) + + @Inputs.data + def set_data(self, data): + self.closeContext() + self.data = data + + model = self.controls.attribute.model() + model.set_domain(data.domain if data is not None else None) + self.Warning.no_disc(shown=data is not None and not model) + if not model: + self.attribute = None + self.data = None + return + self.attribute = model[0] + self.openContext(data) + self.apply.now() + + @gui.deferred + def apply(self): + if self.attribute is None: + self.Outputs.data.send(None) + return + var = self.data.domain[self.attribute] + + sc = SplitColumn(self.data, var, self.delimiter) + + new_columns = tuple(DiscreteVariable( + get_unique_names(self.data.domain, v), values=("0", "1"), + compute_value=OneHotStrings(sc, v) + ) for v in sc.new_values) + + new_domain = Domain( + self.data.domain.attributes + new_columns, + self.data.domain.class_vars, self.data.domain.metas + ) + extended_data = self.data.transform(new_domain) + self.Outputs.data.send(extended_data) + + +if __name__ == "__main__": # pragma: no cover + WidgetPreview(OWSplit).run(Table.from_file( + "tests/orange-in-education.tab")) diff --git a/orangecontrib/prototypes/widgets/tests/orange-in-education.tab b/orangecontrib/prototypes/widgets/tests/orange-in-education.tab new file mode 100644 index 00000000..3f67d1c0 --- /dev/null +++ b/orangecontrib/prototypes/widgets/tests/orange-in-education.tab @@ -0,0 +1,103 @@ +Role Orange use Familiar with Timestamp Country Classes with Orange +professor student teaching\ assistant in-class,\ in\ hands-on\ workshops in-class,\ in\ hands-on\ workshops;outside\ the\ classroom in-class,\ in\ lectures in-class,\ in\ lectures;in-class,\ in\ hands-on\ workshops in-class,\ in\ lectures;in-class,\ in\ hands-on\ workshops;outside\ the\ classroom in-class,\ in\ lectures;outside\ the\ classroom outside\ the\ classroom YouTube\ videos YouTube\ videos;lectures\ notes\ published\ on\ the\ Orange\ blog YouTube\ videos;lectures\ notes\ published\ on\ the\ Orange\ blog;published\ literature YouTube\ videos;published\ literature lectures\ notes\ published\ on\ the\ Orange\ blog lectures\ notes\ published\ on\ the\ Orange\ blog;published\ literature published\ literature time string string + meta meta meta +professor outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-12 09:06:34 Pakistan Machine Learning +professor in-class, in lectures YouTube videos 2021-03-19 21:36:49 Portugal Data mining +student in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-10 03:35:34 Canada - Ontario prediction +student outside the classroom 2021-04-12 11:15:13 Italy computer science +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-30 01:18:39 Ecuador computer science;text mining +student in-class, in hands-on workshops YouTube videos 2021-03-31 01:54:17 France business analytics +professor in-class, in lectures YouTube videos 2020-12-10 16:51:59 Germany Material Science +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-29 04:39:05 Canada computer science +student in-class, in lectures;outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-10 23:36:42 Sweden digital humanities +professor outside the classroom YouTube videos 2021-04-13 15:18:12 Brazil computer science;text mining +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-27 19:43:11 Czech Republic big data analysis in management +teaching assistant in-class, in lectures YouTube videos 2020-12-11 13:39:51 Indonesia computer science;text mining +professor in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2021-04-17 23:57:00 Switzerland digital humanities +professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog;published literature 2020-12-11 07:26:54 Bulgaria computer science +professor in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2020-12-16 14:49:04 Spain computer science;text mining +student in-class, in lectures;in-class, in hands-on workshops YouTube videos 2021-03-24 08:09:51 India data science +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2020-12-18 11:11:20 United Kingdom computer science +student in-class, in lectures YouTube videos 2020-12-20 12:07:00 Turkey digital humanities +student outside the classroom YouTube videos 2021-04-22 04:04:37 Argentina data science +student outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-04-05 07:34:26 Indonesia biology +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom 2020-12-10 13:11:04 Latvia computer science;text mining +teaching assistant in-class, in hands-on workshops;outside the classroom lectures notes published on the Orange blog;published literature 2020-12-16 16:48:41 Portugal text mining +teaching assistant in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2021-04-16 07:33:47 Egypt computer science +professor outside the classroom published literature 2021-04-08 03:01:17 Brazil digital humanities +professor in-class, in lectures YouTube videos 2020-12-15 04:49:08 India Management +student in-class, in lectures published literature 2020-12-12 18:52:57 Colombia text mining +professor in-class, in lectures YouTube videos 2020-12-17 16:54:40 Turkey computer science +student outside the classroom YouTube videos 2020-12-10 18:43:37 Ireland computer science +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2020-12-11 08:23:55 India Business Administration +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-03-22 11:44:04 Turkey Data Mining +student in-class, in lectures;outside the classroom YouTube videos 2021-03-20 16:03:47 Netherlands digital humanities +student in-class, in lectures;in-class, in hands-on workshops YouTube videos 2020-12-12 07:59:12 Indonesia computer science;text mining +student in-class, in lectures lectures notes published on the Orange blog 2021-03-21 17:38:31 Saudi Arabia Statistics +student outside the classroom YouTube videos 2021-03-26 21:24:08 United States of America - Massachusetts computer science;text mining +student in-class, in lectures;outside the classroom 2020-12-11 00:42:56 Malaysia computer science +teaching assistant outside the classroom YouTube videos 2021-03-21 21:23:57 United States of America - California Astronomy +professor in-class, in lectures;in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2020-12-10 22:16:14 Brazil computer science;data science +professor in-class, in hands-on workshops 2021-03-29 19:06:32 France Final studies project +student outside the classroom YouTube videos 2020-12-13 02:38:30 Australia Chemistry +professor outside the classroom 2021-04-06 19:26:58 China biology +student in-class, in hands-on workshops published literature 2020-12-12 22:27:35 China - Hong Kong SAR computer science +professor outside the classroom YouTube videos 2021-04-06 12:09:38 New Zealand text mining +professor in-class, in lectures;in-class, in hands-on workshops YouTube videos 2021-04-09 15:37:47 France computer science;text mining;data mining +teaching assistant outside the classroom YouTube videos 2020-12-19 12:45:24 Saudi Arabia computer science +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2020-12-16 17:42:32 Brazil computer science +professor in-class, in hands-on workshops YouTube videos 2020-12-10 09:03:23 Russian Federation sport sciences +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2021-03-19 23:54:38 Portugal Data Mining +professor outside the classroom YouTube videos;lectures notes published on the Orange blog;published literature 2021-03-29 17:27:09 Philippines text mining;Research Methods in Medicine +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;published literature 2020-12-16 17:46:16 Ukraine computer science;artificial intelligence +professor in-class, in hands-on workshops YouTube videos 2021-03-25 14:22:49 Thailand computer science +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos 2020-12-11 19:45:35 United States of America - California text mining;Consumer Insights +student in-class, in lectures;in-class, in hands-on workshops 2020-12-14 08:52:29 Netherlands computer science;digital humanities;design +professor in-class, in lectures published literature 2021-04-07 05:02:58 Korea (Republic of) Smart Factory +student in-class, in lectures YouTube videos 2020-12-25 21:04:48 Croatia computer science +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-04-19 15:31:54 United Kingdom computer science +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-04-08 15:25:51 India HR Analytics +professor in-class, in hands-on workshops;outside the classroom YouTube videos;published literature 2020-12-10 16:33:04 United States of America - Pennsylvania Electrodynamics +professor in-class, in lectures;in-class, in hands-on workshops YouTube videos 2020-12-16 17:19:53 Canada - Quebec / Québec agronomy +student in-class, in hands-on workshops published literature 2021-04-13 13:53:44 Romania text mining +student in-class, in hands-on workshops 2021-03-25 23:57:48 Brazil computer science;text mining +professor outside the classroom YouTube videos 2020-12-09 17:48:41 Thailand computer science;biology +professor outside the classroom YouTube videos 2021-03-21 14:42:52 Brazil text mining +student outside the classroom YouTube videos 2021-03-20 21:45:37 India text mining +teaching assistant outside the classroom YouTube videos 2021-04-15 19:08:14 China Transportation data analysis +student in-class, in hands-on workshops lectures notes published on the Orange blog 2020-12-15 05:49:47 India computer science;text mining +professor in-class, in hands-on workshops YouTube videos 2021-03-30 20:43:35 France computer science +student outside the classroom YouTube videos 2021-03-23 11:28:40 Argentina computer science;text mining +teaching assistant in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2020-12-22 16:22:35 Germany text mining +student outside the classroom lectures notes published on the Orange blog 2021-04-08 13:22:44 India text mining +professor in-class, in lectures;outside the classroom lectures notes published on the Orange blog 2021-04-15 07:56:12 Korea (Republic of) computer science +professor in-class, in hands-on workshops lectures notes published on the Orange blog 2021-03-24 14:45:22 India computer science +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom published literature 2021-04-15 04:44:38 India computer science +student in-class, in hands-on workshops YouTube videos 2021-03-29 14:36:24 United States of America - Ohio data science +student in-class, in hands-on workshops YouTube videos 2021-03-23 10:27:41 Singapore text mining +professor outside the classroom YouTube videos 2020-12-11 14:16:14 Indonesia computer science +teaching assistant in-class, in lectures;outside the classroom YouTube videos 2020-12-22 22:29:56 Japan text mining +student in-class, in lectures YouTube videos 2020-12-15 14:56:45 Indonesia computer science +student outside the classroom YouTube videos 2021-04-10 19:19:58 Italy biology +student in-class, in lectures 2021-03-22 12:51:18 United Kingdom computer science +student in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog;published literature 2021-04-12 18:44:38 Brazil text mining +student outside the classroom YouTube videos 2021-04-08 20:43:15 Brazil My personal work +professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2021-04-10 15:18:30 China - Taiwan Big data analysis +teaching assistant in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2020-12-10 01:18:39 Indonesia text mining +professor outside the classroom published literature 2020-12-23 21:06:49 Turkey biology +professor in-class, in lectures YouTube videos 2021-04-12 07:42:57 Korea (Republic of) Business Administration +professor in-class, in hands-on workshops YouTube videos;lectures notes published on the Orange blog 2021-04-13 12:01:24 Oman computer science +teaching assistant outside the classroom YouTube videos;lectures notes published on the Orange blog 2021-03-22 21:14:46 Canada - Ontario Geological Engineering +student outside the classroom YouTube videos 2021-04-19 18:31:40 Argentina computer science +professor in-class, in hands-on workshops published literature 2021-04-10 11:06:15 Russian Federation computer science +professor in-class, in hands-on workshops published literature 2020-12-10 13:19:53 Mexico computer science +professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2020-12-13 21:39:59 United States of America - Florida text mining;sport analytics +professor in-class, in lectures;in-class, in hands-on workshops;outside the classroom 2021-03-19 21:39:43 Germany ethics in digital transformation +teaching assistant outside the classroom YouTube videos;published literature 2021-03-19 17:56:23 Hungary computer science;text mining;health management +student in-class, in lectures;in-class, in hands-on workshops;outside the classroom 2021-03-22 13:44:29 India data science +professor in-class, in lectures lectures notes published on the Orange blog 2020-12-10 21:41:44 Brazil industrial automation +student outside the classroom YouTube videos 2020-12-09 16:39:59 Spain text mining +student outside the classroom published literature 2020-12-18 18:13:38 Brazil biology +student outside the classroom lectures notes published on the Orange blog;published literature 2021-03-30 17:45:22 Brazil computer science;text mining +professor in-class, in hands-on workshops;outside the classroom YouTube videos 2021-03-25 14:34:24 Brazil computer science +professor in-class, in lectures YouTube videos;lectures notes published on the Orange blog 2021-04-22 00:44:51 Portugal computer science diff --git a/orangecontrib/prototypes/widgets/tests/test_owsplit.py b/orangecontrib/prototypes/widgets/tests/test_owsplit.py new file mode 100644 index 00000000..0dc8fe3d --- /dev/null +++ b/orangecontrib/prototypes/widgets/tests/test_owsplit.py @@ -0,0 +1,108 @@ +# pylint: disable=missing-docstring,unsubscriptable-object +import os +import unittest + +import numpy as np + +from Orange.data import Table, StringVariable, Domain +from Orange.widgets.tests.base import WidgetTest + +from orangecontrib.prototypes.widgets.owsplit import OWSplit + + +class TestOWSplit(WidgetTest): + def setUp(self): + self.widget = self.create_widget(OWSplit) + test_path = os.path.dirname(os.path.abspath(__file__)) + self.data = Table.from_file( + os.path.join(test_path, "orange-in-education.tab")) + self._create_simple_corpus() + + def _set_attr(self, attr, widget=None): + if widget is None: + widget = self.widget + attr_combo = widget.controls.attribute + idx = attr_combo.model().indexOf(attr) + attr_combo.setCurrentIndex(idx) + attr_combo.activated.emit(idx) + + def _create_simple_corpus(self) -> None: + """ + Creat a simple dataset with 4 documents. + """ + metas = np.array( + [ + ["foo,"], + ["bar,baz "], + ["foo,bar"], + [""], + ] + ) + text_var = StringVariable("foo") + domain = Domain([], metas=[text_var]) + self.small_table = Table.from_numpy( + domain, + X=np.empty((len(metas), 0)), + metas=metas, + ) + + def test_data(self): + """Basic functionality""" + self.send_signal(self.widget.Inputs.data, self.data) + self._set_attr(self.data.domain.attributes[1]) + output = self.get_output(self.widget.Outputs.data) + self.assertEqual(len(output.domain.attributes), + len(self.data.domain.attributes) + 3) + self.assertTrue("in-class, in hands-on workshops" in output.domain + and "in-class, in lectures" in output.domain and + "outside the classroom" in output.domain) + np.testing.assert_array_equal(output[:10, "in-class, in hands-on " + "workshops"], + np.array([0, 0, 1, 0, 1, 1, 0, 1, 0, 0] + ).reshape(-1, 1)) + np.testing.assert_array_equal(output[:10, "in-class, in lectures"], + np.array([0, 1, 0, 0, 1, 0, 1, 1, 1, 0] + ).reshape(-1, 1)) + np.testing.assert_array_equal(output[:10, "outside the classroom"], + np.array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1] + ).reshape(-1, 1)) + + def test_empty_data(self): + """Do not crash on empty data""" + self.send_signal(self.widget.Inputs.data, None) + + def test_discrete(self): + """No crash on data attributes of different types""" + self.send_signal(self.widget.Inputs.data, self.data) + self.assertEqual(self.widget.attribute, self.data.domain.metas[1]) + self._set_attr(self.data.domain.attributes[1]) + self.assertEqual(self.widget.attribute, self.data.domain.attributes[1]) + + def test_numeric_only(self): + """Error raised when only numeric variables given""" + housing = Table.from_file("housing") + self.send_signal(self.widget.Inputs.data, housing) + self.assertTrue(self.widget.Warning.no_disc.is_shown()) + + def test_split_nonexisting(self): + """Test splitting when delimiter doesn't exist""" + self.widget.delimiter = "|" + self.send_signal(self.widget.Inputs.data, self.data) + new_cols = set(self.data.get_column_view("Country")[0]) + self.assertFalse(any(self.widget.delimiter in v for v in new_cols)) + self.assertEqual(len(self.get_output( + self.widget.Outputs.data).domain.attributes), + len(self.data.domain.attributes) + len(new_cols)) + + def test_empty_split(self): + """Test a case of nan column. At the same time, test duplicate + variable name.""" + self.widget.delimiter = "," + self.send_signal(self.widget.Inputs.data, self.small_table) + # new columns will be ["?", "bar", "baz ", "foo (1)"] + self.assertEqual(len(self.get_output(self.widget.Outputs.data).domain), + 5) + + +if __name__ == "__main__": + unittest.main() diff --git a/setup.py b/setup.py index cb57fd6d..f289eb8b 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,7 @@ def include_documentation(local_dir, install_dir): long_description_content_type='text/markdown', packages=find_packages(), package_data={ - "orangecontrib.prototypes.widgets": ["icons/*.svg"] + "orangecontrib.prototypes.widgets": ["icons/*.svg", "tests/*.tab"] }, install_requires=[ 'Orange3>=3.28',