Word trees: allow selecting the column that contains the text

digitalmethodsinitiative · Oct 8, 2024 · e4c0099 · e4c0099
1 parent d769be4
commit e4c0099
Showing 1 changed file with 86 additions and 62 deletions.
diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py
@@ -38,71 +38,94 @@ class MakeWordtree(BasicProcessor):
 		"Wattenberg, M., & Viégas, F. B. (2008). The Word Tree, an Interactive Visual Concordance. IEEE Transactions on Visualization and Computer Graphics, 14(6), 1221–1228. <https://doi.org/10.1109/TVCG.2008.172>"
 	]
 
-	options = {
-		"query": {
-			"type": UserInput.OPTION_TEXT,
-			"default": "",
-			"help": "Word tree root query",
-			"tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace."
-		},
-		"limit": {
-			"type": UserInput.OPTION_TEXT,
-			"default": 3,
-			"min": 1,
-			"max": 25,
-			"help": "Max branches/level",
-			"tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25."
-		},
-		"window": {
-			"type": UserInput.OPTION_TEXT,
-			"min": 1,
-			"max": 10,
-			"default": 5,
-			"help": "Window size",
-			"tooltip": "Up to this many words before and/or after the queried phrase will be visualised"
-		},
-		"sides": {
-			"type": UserInput.OPTION_CHOICE,
-			"default": "right",
-			"options": {
-				"left": "Before query",
-				"right": "After query",
-				"both": "Before and after query"
+	@classmethod
+	def get_options(cls, parent_dataset=None, user=None):
+		"""
+		Get processor options; the text column becomes a choice of the parent dataset's columns when those are available
+		"""
+		options = {
+			"column": {
+				"type": UserInput.OPTION_TEXT,
+				"help": "Text column",
+				"default": "url",
+				"inline": True,
+				"tooltip": "Select the column containing the text from which to generate the word tree.",
+			},
+			"query": {
+				"type": UserInput.OPTION_TEXT,
+				"default": "",
+				"help": "Word tree root query",
+				"tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace."
+			},
+			"limit": {
+				"type": UserInput.OPTION_TEXT,
+				"default": 3,
+				"min": 1,
+				"max": 25,
+				"help": "Max branches/level",
+				"tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25."
 			},
-			"help": "Query context to visualise"
-		},
-		"align": {
-			"type": UserInput.OPTION_CHOICE,
-			"default": "middle",
-			"options": {
-				"middle": "Vertically centered",
-				"top": "Top",
+			"window": {
+				"type": UserInput.OPTION_TEXT,
+				"min": 1,
+				"max": 10,
+				"default": 5,
+				"help": "Window size",
+				"tooltip": "Up to this many words before and/or after the queried phrase will be visualised"
 			},
-			"help": "Visual alignment"
-		},
-		"tokeniser_type": {
-			"type": UserInput.OPTION_CHOICE,
-			"default": "regular",
-			"options": {
-				"regular": "nltk word_tokenize",
-				"jieba-cut": "jieba (for Chinese text; accurate mode, recommended)",
-				"jieba-cut-all": "jieba (for Chinese text; full mode)",
-				"jieba-search": "jieba (for Chinese text; search engine suggestion style)",
+			"sides": {
+				"type": UserInput.OPTION_CHOICE,
+				"default": "right",
+				"options": {
+					"left": "Before query",
+					"right": "After query",
+					"both": "Before and after query"
+				},
+				"help": "Query context to visualise"
 			},
-			"help": "Tokeniser",
-			"tooltip": "What heuristic to use to split up the text into separate words."
-		},
-		"strip-urls": {
-			"type": UserInput.OPTION_TOGGLE,
-			"default": True,
-			"help": "Remove URLs"
-		},
-		"strip-symbols": {
-			"type": UserInput.OPTION_TOGGLE,
-			"default": True,
-			"help": "Remove punctuation"
+			"align": {
+				"type": UserInput.OPTION_CHOICE,
+				"default": "middle",
+				"options": {
+					"middle": "Vertically centered",
+					"top": "Top",
+				},
+				"help": "Visual alignment"
+			},
+			"tokeniser_type": {
+				"type": UserInput.OPTION_CHOICE,
+				"default": "regular",
+				"options": {
+					"regular": "nltk word_tokenize",
+					"jieba-cut": "jieba (for Chinese text; accurate mode, recommended)",
+					"jieba-cut-all": "jieba (for Chinese text; full mode)",
+					"jieba-search": "jieba (for Chinese text; search engine suggestion style)",
+				},
+				"help": "Tokeniser",
+				"tooltip": "What heuristic to use to split up the text into separate words."
+			},
+			"strip-urls": {
+				"type": UserInput.OPTION_TOGGLE,
+				"default": True,
+				"help": "Remove URLs"
+			},
+			"strip-symbols": {
+				"type": UserInput.OPTION_TOGGLE,
+				"default": True,
+				"help": "Remove punctuation"
+			}
 		}
-	}
+
+		# If the parent dataset reports its columns, offer them as choices and default to "body" or a text-like column
+		if parent_dataset and parent_dataset.get_columns():
+			columns = parent_dataset.get_columns()
+			options["column"]["type"] = UserInput.OPTION_CHOICE
+			options["column"]["options"] = {v: v for v in columns}
+			options["column"]["default"] = "body" if "body" in columns else sorted(
+				columns,
+				key=lambda k: any([name in k for name in ["text", "subject", "description"]]), reverse=True).pop(0)
+
+		return options
 
 	@classmethod
 	def is_compatible_with(cls, module=None, user=None):
@@ -146,6 +169,7 @@ def process(self):
 		delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")
 
 		# settings
+		column = self.parameters.get("column")
 		strip_urls = self.parameters.get("strip-urls")
 		strip_symbols = self.parameters.get("strip-symbols")
 		sides = self.parameters.get("sides")
@@ -187,7 +211,7 @@ def process(self):
 			processed += 1
 			if processed % 500 == 0:
 				self.dataset.update_status("Processing and tokenising post %i" % processed)
-			body = post["body"]
+			body = post.get(column)
 			if not body:
 				continue