Skip to content

Commit

Permalink
word-trees allow selection of column
Browse files Browse the repository at this point in the history
  • Loading branch information
dale-wahl committed Oct 8, 2024
1 parent d769be4 commit e4c0099
Showing 1 changed file with 86 additions and 62 deletions.
148 changes: 86 additions & 62 deletions processors/visualisation/word-trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,71 +38,94 @@ class MakeWordtree(BasicProcessor):
"Wattenberg, M., & Viégas, F. B. (2008). The Word Tree, an Interactive Visual Concordance. IEEE Transactions on Visualization and Computer Graphics, 14(6), 1221–1228. <https://doi.org/10.1109/TVCG.2008.172>"
]

options = {
"query": {
"type": UserInput.OPTION_TEXT,
"default": "",
"help": "Word tree root query",
"tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace."
},
"limit": {
"type": UserInput.OPTION_TEXT,
"default": 3,
"min": 1,
"max": 25,
"help": "Max branches/level",
"tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25."
},
"window": {
"type": UserInput.OPTION_TEXT,
"min": 1,
"max": 10,
"default": 5,
"help": "Window size",
"tooltip": "Up to this many words before and/or after the queried phrase will be visualised"
},
"sides": {
"type": UserInput.OPTION_CHOICE,
"default": "right",
"options": {
"left": "Before query",
"right": "After query",
"both": "Before and after query"
@classmethod
def get_options(cls, parent_dataset=None, user=None):
"""
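Get processor options

If the parent dataset reports its columns, the "column" option becomes a
choice between those columns. Its default is then "body" when present,
otherwise the first column whose name contains "text", "subject" or
"description", otherwise the first column. Without known columns the
option stays a free-text field.

:param parent_dataset:  Dataset the processor would run on, if known
:param user:  User requesting the options (not used in the code shown here)
:return dict:  Option definitions, keyed by option name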
"""
options = {
"column": {
"type": UserInput.OPTION_TEXT,
"help": "Text column",
"default": "url",
"inline": True,
"tooltip": "Select the column containing the text from which to generate the word tree.",
},
"query": {
"type": UserInput.OPTION_TEXT,
"default": "",
"help": "Word tree root query",
"tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace."
},
"limit": {
"type": UserInput.OPTION_TEXT,
"default": 3,
"min": 1,
"max": 25,
"help": "Max branches/level",
"tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25."
},
"help": "Query context to visualise"
},
"align": {
"type": UserInput.OPTION_CHOICE,
"default": "middle",
"options": {
"middle": "Vertically centered",
"top": "Top",
"window": {
"type": UserInput.OPTION_TEXT,
"min": 1,
"max": 10,
"default": 5,
"help": "Window size",
"tooltip": "Up to this many words before and/or after the queried phrase will be visualised"
},
"help": "Visual alignment"
},
"tokeniser_type": {
"type": UserInput.OPTION_CHOICE,
"default": "regular",
"options": {
"regular": "nltk word_tokenize",
"jieba-cut": "jieba (for Chinese text; accurate mode, recommended)",
"jieba-cut-all": "jieba (for Chinese text; full mode)",
"jieba-search": "jieba (for Chinese text; search engine suggestion style)",
"sides": {
"type": UserInput.OPTION_CHOICE,
"default": "right",
"options": {
"left": "Before query",
"right": "After query",
"both": "Before and after query"
},
"help": "Query context to visualise"
},
"help": "Tokeniser",
"tooltip": "What heuristic to use to split up the text into separate words."
},
"strip-urls": {
"type": UserInput.OPTION_TOGGLE,
"default": True,
"help": "Remove URLs"
},
"strip-symbols": {
"type": UserInput.OPTION_TOGGLE,
"default": True,
"help": "Remove punctuation"
"align": {
"type": UserInput.OPTION_CHOICE,
"default": "middle",
"options": {
"middle": "Vertically centered",
"top": "Top",
},
"help": "Visual alignment"
},
"tokeniser_type": {
"type": UserInput.OPTION_CHOICE,
"default": "regular",
"options": {
"regular": "nltk word_tokenize",
"jieba-cut": "jieba (for Chinese text; accurate mode, recommended)",
"jieba-cut-all": "jieba (for Chinese text; full mode)",
"jieba-search": "jieba (for Chinese text; search engine suggestion style)",
},
"help": "Tokeniser",
"tooltip": "What heuristic to use to split up the text into separate words."
},
"strip-urls": {
"type": UserInput.OPTION_TOGGLE,
"default": True,
"help": "Remove URLs"
},
"strip-symbols": {
"type": UserInput.OPTION_TOGGLE,
"default": True,
"help": "Remove punctuation"
}
}
}

# If the parent dataset's columns are known, offer them as choices for the text column option
if parent_dataset and parent_dataset.get_columns():
columns = parent_dataset.get_columns()
options["column"]["type"] = UserInput.OPTION_CHOICE
options["column"]["options"] = {v: v for v in columns}
options["column"]["default"] = "body" if "body" in columns else sorted(
columns,
key=lambda k: any([name in k for name in ["text", "subject", "description"]]), reverse=True).pop(0)

return options

@classmethod
def is_compatible_with(cls, module=None, user=None):
Expand Down Expand Up @@ -146,6 +169,7 @@ def process(self):
delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]")

# settings
column = self.parameters.get("column")
strip_urls = self.parameters.get("strip-urls")
strip_symbols = self.parameters.get("strip-symbols")
sides = self.parameters.get("sides")
Expand Down Expand Up @@ -187,7 +211,7 @@ def process(self):
processed += 1
if processed % 500 == 0:
self.dataset.update_status("Processing and tokenising post %i" % processed)
body = post["body"]
body = post.get(column)
if not body:
continue

Expand Down

0 comments on commit e4c0099

Please sign in to comment.