From 81db52c690905aceb8fd78d56dd234b8ba51bfae Mon Sep 17 00:00:00 2001 From: CI Date: Wed, 17 Feb 2021 16:01:01 -0400 Subject: [PATCH] major bugfixing commit + documentation --- README.md | 111 ++++++++++++++++++ algorithmia_benchmark.py | 39 ++++-- .../src/classification_albert.py | 14 ++- algorithms/hello_world/requirements.txt | 2 + algorithms/hello_world/src/__init__.py | 0 algorithms/hello_world/src/hello_world.py | 7 ++ .../hello_world/src/hello_world_test.py | 4 + src/algorithm_creation.py | 10 +- workflows/classification_albert.json | 2 +- workflows/hello_world.json | 23 ++++ workflows/image_parallel_pipelining.json | 6 +- 11 files changed, 193 insertions(+), 25 deletions(-) create mode 100644 algorithms/hello_world/requirements.txt create mode 100644 algorithms/hello_world/src/__init__.py create mode 100644 algorithms/hello_world/src/hello_world.py create mode 100644 algorithms/hello_world/src/hello_world_test.py create mode 100644 workflows/hello_world.json diff --git a/README.md b/README.md index 2cbb9b1..d5c822b 100644 --- a/README.md +++ b/README.md @@ -91,3 +91,114 @@ A workflow is a json file consisting of 3 separate objects, `source_info`, `test * If the algorithm fails to process your test payload, the process will return an exception, and the error message that was returned +## How to create a workflow +Creating a new workflow has 2 steps: +* Create a workflow.json file in `/workflows` +* Creating an Algorithm template (or templates) in the `/algorithms` directory +Lets explore each of step in detail. + +### Workflow Creation +First create a workflow with a useful filename, you'll use this to refer to your workflow operations. + +`touch workflows/hello_world.json` + +**some important things to note:** +* Algorithm order matters! + * Make sure that you define downstream algorithms first, and walk back towards your "orchestrator" if you have one. 
+ * For example, define `smartImageDownloader` first, before defining your Image Classifier that uses it. +* All algorithm data is to be stored in the Data API + * It can be any collection, but the reason for this is to ensure that we can export data into closed off networks. + +Let's look at a basic template and walk through the different components. +```json +{ + "source_info": { + "cluster_address": "https://api.algorithmia.com" + }, + "test_info": { + "entrypoint": "hello_world", + "tests": [ + { + "name": "basic test", + "payload": "Algorithmia", + "timeout": 10 + } + ] + }, + "algorithms": [ + { + "name": "hello_world", + "data_files": [], + "language": "python3", + "test_payload": "Algorithmia" + } + ] +} +``` + +#### source_info +For the moment, this contains only the `cluster_address` algorithmia cluster api address where data files are located, in the future this may be optional. +Unless you're storing data files on a different cluster than production, please leave this as is. +#### test_info +This is where you define the benchmark tests for your algorithm to pass +* `"entrypoint"` - defines which algorithm should be "called" during all tests, only one algorithm can be an entrypoint per workflow. Name must match up exactly with the name you defined in `algorithms`. +* `"tests"` - a list of tests which get run in order, each test consists of: + * `"name"` - the name of the test (for reporting purposes) + * `"payload"` - the json encoded payload to provide to your entrypoint algorithm. + * If you're interacting with data files, it's recommended to define them in your algorithm's `data_files` object, and to refer to them with the following schema: + * `"data://.my//..."`, replacing ... with the name of your datafile. + * If your algorithm writes to a particular location, to ensure that the collection exists it's recommended to use the following output schema: + * `"data://.algo/temp/..."`, replacing ... with the name of your datafile. 
+ * `"timeout"` - the amount of seconds we should wait for your algorithm before determining that the test failed, maximum value is `3000`. +#### algorithms + +This is where you define the algorithms that your workflow will need to get executed, this includes any dependent algorithms (like smart image downloader). +Please ensure that you define your algorithms in order of dependency requirements. If one of your algorithms depends on another, list the downstream one first. +* `"algorithms"` - a list of algorithm objects that this workflow will use + * `"name"` - the name of your algorithm, must match the name of the directory defined in `/algorithms` as well as the name of the algorithm files associated. + * for example, if your algorithm is "hello_world", the directory path containing your algorithm code must be `/algorithms/hello_world` which in the src directory contains `hello_world.py` and `hello_world_test.py` + * `"data_files"` - this list object contains all model files and other objects required at runtime for your algorithm, as a data API URI prefixed with '.my' + * for the moment, these files should be stored in a data collection owned by user `quality` on production + * data file collection paths are not used, so they can be anything + * If your algorithm uses an image or data file as input for testing, those objects should be stored using this system as well + * `"language"` - the environments `language` enum that should be used to create this algorithm. 
+ * the concept of "language" is not quite right, as we're potentially using the same language but with different dependencies + * check to make sure that your required dependencies exist as a language already defined in `/src/images.py` + * if running the benchmark on an AEMS cluster that does not have access to the PROD or TEST AEMS masters, you'll need to interact with the `environments/current` webapi endpoint to populate your environments list + * if you hit any kind of system 500 error during the build stage, make sure that your language is configured and that the language `environment id` is valid. + * `"test_payload"` - the json encodable (string, list, dict, etc) algorithm payload you wish to send to your algorithm to verify runtime functionality + * not used during the benchmark process, you may use different payloads during validation and benchmarking + * If you're interacting with data files, it's recommended to define them in your algorithm's `data_files` object, and to refer to them with the following schema: + * `"data://.my//..."`, replacing ... with the name of your datafile. +If you have any questions as to the schema or best practices in regards to creating a workflow file, please ping zeryx or the algo-team on slack and we should be able to help :) + +### Algorithm Template Creation +Now that we have the workflow configured, let's take a look at the `/algorithms` directory, and what it takes to set up a new algorithm template. + +Currently our templating service supports the following languages: +* [x] Python +* [ ] Scala +* [ ] Java +* [ ] R +* [ ] Javascript +* [ ] Ruby +* [ ] C# +* [ ] Rust +#### For Python +* the name of each directory **is equal to** the name of the algorithm, this is used for lookups and is important. + * This is also case sensitive, as algorithm names are also case sensitive. 
+ * eg: "/algorithms/BoundingBoxOnImage" contains the `BoundingBoxOnImage` algorithm +* inside an algorithm directory, we have a `/src` directory and a `requirements.txt` file + * The `/src` directory should contain all algorithm files present in the original algorithms `src` directory. + * However, for any references to an algorithm or data file, should be replaced with the following: + * for data files: + * original - `"data://myuser/mycollection/somedata_file"` + * template friendly version - `"data://.my//somedata_file"` + * for algorithms: + * original - `"algo://someuser/somealgo/0.2.4"` + * template friendly version - `"algo://.my/somealgo/latestPrivate"` +* typically, no changes are required for `requirements.txt` files, just copy them from the original algorithm. +* if you end up using a data location on disk that contains the algorithm name, consider renaming it as there may be a conflict with our algorithm creation service. + + + diff --git a/algorithmia_benchmark.py b/algorithmia_benchmark.py index 851191e..1754368 100644 --- a/algorithmia_benchmark.py +++ b/algorithmia_benchmark.py @@ -5,7 +5,7 @@ import shutil import requests from src.utilities import algorithm_exists, call_algo -from time import sleep +from uuid import uuid4 import sys from os import environ, path, listdir from src.algorithm_creation import initialize_algorithm, migrate_datafiles, update_algorithm @@ -33,6 +33,16 @@ def find_algo(algo_name, artifact_path): raise Exception(f"algorithm {algo_name} not found in local cache (algorithms)") +def template_payload(payload, template_name): + if isinstance(payload, str): + payload = payload.replace("", template_name) + elif isinstance(payload, dict): + for key in payload.keys(): + if isinstance(payload[key], str): + payload[key] = payload[key].replace("", template_name) + return payload + + def delete_workflows(workflows, destination_client: Client): for workflow in workflows: for algorithm in workflow.get("algorithms", []): @@ -55,13 +65,18 
@@ def delete_workflows(workflows, destination_client: Client): def create_workflows(workflows, source_client, destination_aems_master, destination_client): entrypoints = [] for workflow in workflows: - print("----- Creating workflow {} -----".format(workflow["name"])) + print(f"----- Creating workflow {workflow['name']} -----") + workflow_suffix = str(uuid4()).split('-')[-1] + print(f"----- Workflow Suffix is: {workflow_suffix} -----") entrypoint_path = workflow['test_info'].get("entrypoint", None) + algorithm_pairs = [] for algorithm in workflow.get("algorithms", []): if path.exists(WORKING_DIR): shutil.rmtree(WORKING_DIR) print("\n") - algorithm_name = algorithm['name'] + template_algorithm_name = algorithm['name'] + new_algorithm_name = f"{template_algorithm_name}_{workflow_suffix}" + algorithm_pairs.append(( template_algorithm_name, new_algorithm_name)) remote_code_path = algorithm.get("code", None) language = algorithm['language'] data_file_paths = algorithm['data_files'] @@ -75,19 +90,20 @@ def create_workflows(workflows, source_client, destination_aems_master, destinat f.extractall(path=artifact_path) else: print("checking for local code...") - find_algo(algorithm_name, artifact_path) + find_algo(template_algorithm_name, artifact_path) print("initializing algorithm...") - algo_object = initialize_algorithm(algorithm_name, language, destination_aems_master, destination_client) + algo_object = initialize_algorithm(new_algorithm_name, language, destination_aems_master, destination_client) print("migrating datafiles...") migrate_datafiles(algo_object, data_file_paths, source_client, destination_client, WORKING_DIR) print("updating algorithm source...") - update_algorithm(algo_object, destination_client, WORKING_DIR, artifact_path) + update_algorithm(algo_object, template_algorithm_name, algorithm_pairs, destination_client, WORKING_DIR, artifact_path) print("testing algorithm...") - algorithm_test(algo_object, test_payload) + payload = 
template_payload(test_payload, new_algorithm_name) + algorithm_test(algo_object, payload) print("publishing algorithm...") published_algorithm = algorithm_publish(algo_object, test_payload) - if entrypoint_path and entrypoint_path == algorithm_name: + if entrypoint_path and entrypoint_path == template_algorithm_name: entrypoints.append(published_algorithm) return entrypoints @@ -99,6 +115,7 @@ def workflow_test(algorithms, workflows): for test in test_info['tests']: name = test['name'] payload = test['payload'] + payload = template_payload(payload, algorithm.algoname) timeout = test['timeout'] message = f"test {name} for {algorithm.username}/{algorithm.algoname} with timeout {timeout}" print("starting " + message) @@ -115,7 +132,7 @@ def workflow_test(algorithms, workflows): destination_ca_cert = environ.get("DESTINATION_CA_CERT", None) destination_aems_master = environ.get("DESTINATION_AEMS_MASTER", "prod") if len(sys.argv) > 1: - workflow_names = str(sys.argv[1]) + workflow_names = [str(sys.argv[1])] else: workflow_names = [] for file in listdir("workflows"): @@ -127,10 +144,6 @@ def workflow_test(algorithms, workflows): ca_cert=source_ca_cert) destination_client = Algorithmia.client(api_key=destination_api_key, api_address=destination_api_address, ca_cert=destination_ca_cert) - # print("----deleting algorithms-----") - # delete_workflows(workflows, destination_client) - print("------- waiting for algorithm caches to clear ---------") - sleep(15) print("------- Starting Algorithm Export/Import Procedure -------") entrypoint_algos = create_workflows(workflows, source_client, destination_aems_master, destination_client) print("------- Workflow Created, initiating QA Test Procedure -------") diff --git a/algorithms/classification_albert/src/classification_albert.py b/algorithms/classification_albert/src/classification_albert.py index 490f52f..d2c8eef 100644 --- a/algorithms/classification_albert/src/classification_albert.py +++ 
b/algorithms/classification_albert/src/classification_albert.py @@ -2,7 +2,7 @@ import time import zipfile - +from os import walk import tensorflow as tf from transformers import AlbertTokenizer, AlbertConfig from transformers.modeling_tf_albert import TFAlbertForSequenceClassification @@ -14,8 +14,8 @@ DEFAULT_MAX_LEN = 128 DEFAULT_BATCH_SIZE = 16 -MODEL_ZIP_PATH = "data://.my/classification_albert/classification_albert_model_params.zip" -UNZIPPED_MODEL_PATH = "classification_albert_model_params" +MODEL_ZIP_PATH = "data://.my/classification_albert/model_params.zip" +UNZIPPED_MODEL_PATH = "model_params" client = Algorithmia.client() @@ -24,16 +24,20 @@ def get_unzipped_dir_path(zip_path_in_collection, dir_name): start = time.time() zip_in_collection = client.file(zip_path_in_collection).getFile().name - output_dir = "/tmp" + output_dir = "/tmp/somedir" try: zipped_file = zipfile.ZipFile(zip_in_collection, "r") zipped_file.extractall(output_dir) zipped_file.close() duration = time.time() - start + output_directory_name = None + for dirpath, dirnames, filenames in walk(output_dir): + for dirname in dirnames: + output_directory_name = dirname print(f"Getting model data took {duration}") except Exception as e: print("Exception occurred while creating dir: {}".format(e)) - return "{}/{}".format(output_dir, dir_name) + return "{}/{}".format(output_dir, output_directory_name) def load_model_and_tokenizer(): diff --git a/algorithms/hello_world/requirements.txt b/algorithms/hello_world/requirements.txt new file mode 100644 index 0000000..1ef65c6 --- /dev/null +++ b/algorithms/hello_world/requirements.txt @@ -0,0 +1,2 @@ +algorithmia>=1.0.0,<2.0 +six \ No newline at end of file diff --git a/algorithms/hello_world/src/__init__.py b/algorithms/hello_world/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/algorithms/hello_world/src/hello_world.py b/algorithms/hello_world/src/hello_world.py new file mode 100644 index 0000000..3a47892 --- /dev/null +++ 
b/algorithms/hello_world/src/hello_world.py @@ -0,0 +1,7 @@ +import Algorithmia + +# API calls will begin at the apply() method, with the request body passed as 'input' +# For more details, see algorithmia.com/developers/algorithm-development/languages +def apply(input): + return "hello {}".format(input) + diff --git a/algorithms/hello_world/src/hello_world_test.py b/algorithms/hello_world/src/hello_world_test.py new file mode 100644 index 0000000..ea6f68c --- /dev/null +++ b/algorithms/hello_world/src/hello_world_test.py @@ -0,0 +1,4 @@ +from . import hello_world + +def test_hello(): + assert hello_world.apply("Jane") == "hello Jane" diff --git a/src/algorithm_creation.py b/src/algorithm_creation.py index dd52939..6aa097d 100644 --- a/src/algorithm_creation.py +++ b/src/algorithm_creation.py @@ -26,7 +26,7 @@ def create_algorithm(algo, algoname, mode, aems_master): algo.create( details={ - "label": f"QA - {algoname} - {str(uuid4())}", + "label": f"QA - {algoname}", }, settings={ "source_visibility": "closed", @@ -55,7 +55,7 @@ def migrate_datafiles(algo, data_file_paths, source_client, destination_client, print(f"{collection_path} already exists, assuming datafiles are correct; skipping migration...") -def update_algorithm(algo, remote_client, workspace_path, artifact_path): +def update_algorithm(algo, original_name, algorithm_pairs, remote_client, workspace_path, artifact_path): api_key = remote_client.apiKey api_address = remote_client.apiAddress destination_algorithm_name = algo.algoname @@ -73,7 +73,11 @@ def update_algorithm(algo, remote_client, workspace_path, artifact_path): sh.rm("-r", f"{repo_path}/src") sh.cp("-R", f"{artifact_path}/src", f"{repo_path}/src") sh.cp("-R", f"{artifact_path}/requirements.txt", f"{repo_path}/requirements.txt") - sh.xargs.sed(sh.find(repo_path, "-type", "f"), i=f"s/{templatable_username}/{destination_username}/g") + sh.xargs.sed(sh.find(repo_path, "-not", "-path", "*/\.*", "-type", "f"), 
i=f"s/{templatable_username}/{destination_username}/g") + for template_name, new_name in algorithm_pairs: + sh.xargs.sed(sh.find(f"{repo_path}/src", "-not", "-path", "*/\.*", "-type", "f"), i=f"s/{template_name}/{new_name}/g") + sh.mv(f"{repo_path}/src/{original_name}.py", f"{repo_path}/src/{destination_algorithm_name}.py") + sh.mv(f"{repo_path}/src/{original_name}_test.py", f"{repo_path}/src/{destination_algorithm_name}_test.py") try: publish_bake.add(".") publish_bake.commit(m="automatic initialization commit") diff --git a/workflows/classification_albert.json b/workflows/classification_albert.json index d910b09..f6110e9 100644 --- a/workflows/classification_albert.json +++ b/workflows/classification_albert.json @@ -21,7 +21,7 @@ { "name": "classification_albert", "data_files": [ - "data://.my/artifacts/classification_albert_model_params.zip" + "data://.my/classification_albert/model_params.zip" ], "language": "tensorflow-2.3", "test_payload": {"texts": ["Hello", "Nice to meet you", "I can not login", "It does not work"]} diff --git a/workflows/hello_world.json b/workflows/hello_world.json new file mode 100644 index 0000000..90c9d1c --- /dev/null +++ b/workflows/hello_world.json @@ -0,0 +1,23 @@ +{ + "source_info": { + "cluster_address": "https://api.algorithmia.com" + }, + "test_info": { + "entrypoint": "hello_world", + "tests": [ + { + "name": "basic test", + "payload": "Algorithmia", + "timeout": 5 + } + ] + }, + "algorithms": [ + { + "name": "hello_world", + "data_files": [], + "language": "python3", + "test_payload": "Algorithmia" + } + ] +} \ No newline at end of file diff --git a/workflows/image_parallel_pipelining.json b/workflows/image_parallel_pipelining.json index 2159e7b..82b05e0 100644 --- a/workflows/image_parallel_pipelining.json +++ b/workflows/image_parallel_pipelining.json @@ -25,7 +25,7 @@ "data://.my/artifacts/willow_example.jpeg" ], "test_payload": { - "image": "data://.my/smartimagedownloader/willow_example.jpeg", + "image": 
"data://.my//willow_example.jpeg", "resize": 600 } }, @@ -36,7 +36,7 @@ "data://.my/artifacts/willow_example.jpeg" ], "test_payload": { - "imageUrl": "data://.my/boundingboxonimage/willow_example.jpeg", + "imageUrl": "data://.my//willow_example.jpeg", "imageSaveUrl": "data://.algo/temp/willow.jpeg", "boundingBoxes": [ { @@ -65,7 +65,7 @@ "data://.my/artifacts/willow_example.jpeg" ], "language": "tensorflow-1.14", - "test_payload": "data://.my/boundingboxonimage/willow_example.jpeg" + "test_payload": "data://.my//willow_example.jpeg" }, { "name": "ParallelPipelining",