Merge pull request #668 from epfml/662-improve-tasks-julien

Improves and fixes the web-client UI
epfml · May 13, 2024 · d454f30 · d454f30
2 parents 39115a6 + 7b301c5
commit d454f30
Show file tree

Hide file tree

Showing 62 changed files with 729 additions and 1,517 deletions.
diff --git a/.github/workflows/lint-test-build.yml b/.github/workflows/lint-test-build.yml
@@ -332,7 +332,7 @@ jobs:
           cache: npm
       - run: npm ci
       - run: npm --workspace={./discojs/discojs-{core,node},./server} run build
-      - run: npm --workspace=./cli start -- -t cifar10 -u 1 -e 1
+      - run: npm --workspace=./cli start -- -t cifar10 -u 3 -e 1
 
   test-docs-examples:
     needs: [build-lib-core, build-lib-node, build-server, download-datasets]

diff --git a/cli/src/data.ts b/cli/src/data.ts
@@ -1,4 +1,4 @@
-import { Range } from 'immutable'
+import { Range, Repeat } from 'immutable'
 import fs from 'node:fs/promises'
 import path from 'node:path'
 
@@ -23,7 +23,7 @@ async function simplefaceData (task: Task): Promise<data.DataSplit> {
 async function cifar10Data (cifar10: Task): Promise<data.DataSplit> {
   const dir = '../datasets/CIFAR10/'
   const files = (await fs.readdir(dir)).map((file) => path.join(dir, file))
-  const labels = Range(0, 24).map((label) => (label % 10).toString()).toArray()
+  const labels = Repeat('airplane', 24).toArray() // TODO: read the labels from the CSV file instead of hard-coding 'airplane'
   return await new NodeImageLoader(cifar10).loadAll(files, { labels })
 }
 

diff --git a/discojs/discojs-core/src/dataset/data/image_data.spec.ts b/discojs/discojs-core/src/dataset/data/image_data.spec.ts
@@ -2,21 +2,13 @@ import { assert, expect } from 'chai'
 import * as tf from '@tensorflow/tfjs'
 
 import { ImageData } from './image_data.js'
-import type { Task } from '../../index.js'
+import { defaultTasks } from '../../index.js'
 
 describe('image data checks', () => {
-  const simplefaceMock: Task = {
-    id: 'simpleface',
-    displayInformation: {},
-    trainingInformation: {
-      IMAGE_H: 200,
-      IMAGE_W: 200
-    }
-  } as unknown as Task
-
+  const simpleFaceTask = defaultTasks.simpleFace.getTask()
   it('throw an error on incorrectly formatted data', async () => {
     try {
-      await ImageData.init(tf.data.array([tf.zeros([150, 150, 3]), tf.zeros([150, 150, 3])]), simplefaceMock, 3)
+      await ImageData.init(tf.data.array([tf.zeros([150, 150, 3]), tf.zeros([150, 150, 3])]), simpleFaceTask, 3)
     } catch (e) {
       expect(e).to.be.an.instanceOf(Error)
       return
@@ -26,6 +18,6 @@ describe('image data checks', () => {
   })
 
   it('do nothing on correctly formatted data', async () => {
-    await ImageData.init(tf.data.array([tf.zeros([200, 200, 3]), tf.zeros([200, 200, 3])]), simplefaceMock, 3)
+    await ImageData.init(tf.data.array([tf.zeros([200, 200, 3]), tf.zeros([200, 200, 3])]), simpleFaceTask, 3)
   })
 })
diff --git a/discojs/discojs-core/src/dataset/data/preprocessing/text_preprocessing.spec.ts b/discojs/discojs-core/src/dataset/data/preprocessing/text_preprocessing.spec.ts
@@ -10,7 +10,10 @@ describe('text preprocessing', function () {
   function initMockTask(): Task {
     return {
       id: 'mock-task-id',
-      displayInformation: {},
+      displayInformation: {
+      taskTitle: 'mock title',
+      summary: { overview: '', preview: '' }
+    },
       trainingInformation: {
         modelID: 'model-id',
         epochs: 1,

diff --git a/discojs/discojs-core/src/dataset/data/tabular_data.spec.ts b/discojs/discojs-core/src/dataset/data/tabular_data.spec.ts
@@ -3,37 +3,16 @@ import { Map, Set } from 'immutable'
 import * as tf from '@tensorflow/tfjs'
 
 import { TabularData } from './tabular_data.js'
-import type { Task } from '../../index.js'
+import { defaultTasks } from '../../index.js'
+
 
 describe('tabular data checks', () => {
-  const titanicMock: Task = {
-    id: 'titanic',
-    displayInformation: {},
-    trainingInformation: {
-      modelID: 'titanic',
-      epochs: 1,
-      roundDuration: 1,
-      validationSplit: 0,
-      batchSize: 1,
-      dataType: 'tabular',
-      scheme: 'federated',
-      inputColumns: [
-        'PassengerId',
-        'Age',
-        'SibSp',
-        'Parch',
-        'Fare',
-        'Pclass'
-      ],
-      outputColumns: [
-        'Survived'
-      ]
-    }
-  }
+  const titanicTask = defaultTasks.titanic.getTask()
+
 
   const dataConfig = {
-    features: titanicMock.trainingInformation.inputColumns,
-    labels: titanicMock.trainingInformation.outputColumns
+    features: titanicTask.trainingInformation.inputColumns,
+    labels: titanicTask.trainingInformation.outputColumns
   }
 
   const columnConfigs = Map(
@@ -51,7 +30,7 @@ describe('tabular data checks', () => {
 
   it('throw an error on incorrectly formatted data', async () => {
     try {
-      await TabularData.init(tf.data.csv('file://../../datasets/cifar10-labels.csv', csvConfig), titanicMock, 3)
+      await TabularData.init(tf.data.csv('file://../../datasets/cifar10-labels.csv', csvConfig), titanicTask, 3)
     } catch (e) {
       expect(e).to.be.an.instanceOf(Error)
       return
@@ -61,6 +40,6 @@ describe('tabular data checks', () => {
   })
 
   it('do nothing on correctly formatted data', async () => {
-    await TabularData.init(tf.data.csv('file://../../datasets/titanic_train.csv', csvConfig), titanicMock, 3)
+    await TabularData.init(tf.data.csv('file://../../datasets/titanic_train.csv', csvConfig), titanicTask, 3)
   })
 })
diff --git a/discojs/discojs-core/src/dataset/dataset_builder.ts b/discojs/discojs-core/src/dataset/dataset_builder.ts
@@ -98,7 +98,7 @@ export class DatasetBuilder<Source> {
   async build (config?: DataConfig): Promise<DataSplit> {
     // Require that at least one source collection is non-empty, but not both
     if ((this._sources.length > 0) === (this.labelledSources.size > 0)) {
-      throw new Error('Please provide dataset input files')
+      throw new Error('Please provide dataset input files') // NOTE: DatasetInput.vue parses this error message; update it there if the wording changes
     }
 
     let dataTuple: DataSplit

diff --git a/discojs/discojs-core/src/default_tasks/cifar10.ts b/discojs/discojs-core/src/default_tasks/cifar10.ts
@@ -13,11 +13,10 @@ export const cifar10: TaskProvider = {
           preview: 'In this challenge, we ask you to classify images into categories based on the objects shown on the image.',
           overview: 'The CIFAR-10 dataset is a collection of images that are commonly used to train machine learning and computer vision algorithms. It is one of the most widely used datasets for machine learning research.'
         },
-        limitations: 'The training data is limited to small images of size 32x32.',
-        tradeoffs: 'Training success strongly depends on label distribution',
         dataFormatInformation: 'Images should be of .png format and of size 32x32. <br> The label file should be .csv, where each row contains a file_name, class.  <br> <br> e.g. if you have images: 0.png (of a frog) and 1.png (of a car) <br> labels.csv contains: (Note that no header is needed)<br> 0.png, frog <br> 1.png, car',
         dataExampleText: 'Below you can find 10 random examples from each of the 10 classes in the dataset.',
-        dataExampleImage: 'https://storage.googleapis.com/deai-313515.appspot.com/example_training_data/cifar10-example.png'
+        dataExampleImage: 'https://storage.googleapis.com/deai-313515.appspot.com/example_training_data/cifar10-example.png',
+        sampleDatasetLink: 'https://www.kaggle.com/competitions/cifar-10/data'
       },
       trainingInformation: {
         modelID: 'cifar10-model',
@@ -29,7 +28,7 @@ export const cifar10: TaskProvider = {
         preprocessingFunctions: [data.ImagePreprocessing.Resize],
         IMAGE_H: 224,
         IMAGE_W: 224,
-        LABEL_LIST: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
+        LABEL_LIST: ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'],
         scheme: 'decentralized',
         noiseScale: undefined,
         clippingRadius: 20,

diff --git a/discojs/discojs-core/src/default_tasks/geotags.ts b/discojs/discojs-core/src/default_tasks/geotags.ts
diff --git a/discojs/discojs-core/src/default_tasks/index.ts b/discojs/discojs-core/src/default_tasks/index.ts
@@ -1,8 +1,6 @@
 export { cifar10 } from './cifar10.js'
-export { geotags } from './geotags.js'
 export { lusCovid } from './lus_covid.js'
 export { mnist } from './mnist.js'
 export { simpleFace } from './simple_face.js'
-export { skinMnist } from './skin_mnist.js'
 export { titanic } from './titanic.js'
 export { wikitext } from './wikitext.js'
diff --git a/discojs/discojs-core/src/default_tasks/lus_covid.ts b/discojs/discojs-core/src/default_tasks/lus_covid.ts
@@ -14,16 +14,16 @@ export const lusCovid: TaskProvider = {
           overview: "Don’t have a dataset of your own? Download a sample of a few cases <a class='underline' href='https://drive.switch.ch/index.php/s/zM5ZrUWK3taaIly' target='_blank'>here</a>."
         },
         model: "We use a simplified* version of the <b>DeepChest model</b>: A deep learning model developed in our lab (<a class='underline' href='https://www.epfl.ch/labs/mlo/igh-intelligent-global-health/'>intelligent Global Health</a>.). On a cohort of 400 Swiss patients suspected of LRTI, the model obtained over 90% area under the ROC curve for this task. <br><br>*Simplified to ensure smooth running on your browser, the performance is minimally affected. Details of the adaptations are below <br>- <b>Removed</b>: positional embedding (i.e. we don’t take the anatomic position into consideration). Rather, the model now does mean pooling over the feature vector of the images for each patient <br>- <b>Replaced</b>: ResNet18 by Mobilenet",
-        tradeoffs: 'We are using a simpler version of DeepChest in order to be able to run it on the browser.',
         dataFormatInformation: 'This model takes as input an image dataset. It consists on a set of lung ultrasound images per patient with its corresponding label of covid positive or negative. Moreover, to identify the images per patient you have to follow the follwing naming pattern: "patientId_*.png"',
         dataExampleText: 'Below you can find an example of an expected lung image for patient 2 named: 2_QAID_1.masked.reshaped.squared.224.png',
-        dataExampleImage: 'https://storage.googleapis.com/deai-313515.appspot.com/example_training_data/2_QAID_1.masked.reshaped.squared.224.png'
+        dataExampleImage: 'https://storage.googleapis.com/deai-313515.appspot.com/example_training_data/2_QAID_1.masked.reshaped.squared.224.png',
+        sampleDatasetLink: 'https://drive.switch.ch/index.php/s/zM5ZrUWK3taaIly'
       },
       trainingInformation: {
         modelID: 'lus-covid-model',
         epochs: 50,
         roundDuration: 2,
-        validationSplit: 0,
+        validationSplit: 0.2,
         batchSize: 5,
         IMAGE_H: 100,
         IMAGE_W: 100,

diff --git a/discojs/discojs-core/src/default_tasks/mnist.ts b/discojs/discojs-core/src/default_tasks/mnist.ts
@@ -14,7 +14,6 @@ export const mnist: TaskProvider = {
           overview: 'The MNIST handwritten digit classification problem is a standard dataset used in computer vision and deep learning. Although the dataset is effectively solved, we use it to test our Decentralised Learning algorithms and platform.'
         },
         model: 'The current model is a very simple CNN and its main goal is to test the app and the Decentralizsed Learning functionality.',
-        tradeoffs: 'We are using a simple model, first a 2d convolutional layer > max pooling > 2d convolutional layer > max pooling > convolutional layer > 2 dense layers.',
         dataFormatInformation: 'This model is trained on images corresponding to digits 0 to 9. You can upload each digit image of your dataset in the box corresponding to its label. The model taskes images of size 28x28 as input.',
         dataExampleText: 'Below you can find an example of an expected image representing the digit 9.',
         dataExampleImage: 'http://storage.googleapis.com/deai-313515.appspot.com/example_training_data/9-mnist-example.png'

diff --git a/discojs/discojs-core/src/default_tasks/simple_face.ts b/discojs/discojs-core/src/default_tasks/simple_face.ts
@@ -13,8 +13,6 @@ export const simpleFace: TaskProvider = {
           preview: 'Can you detect if the person in a picture is a child or an adult?',
           overview: 'Simple face is a small subset of face_task from Kaggle'
         },
-        limitations: 'The training data is limited to small images of size 200x200.',
-        tradeoffs: 'Training success strongly depends on label distribution',
         dataFormatInformation: '',
         dataExampleText: 'Below you find an example',
         dataExampleImage: 'https://storage.googleapis.com/deai-313515.appspot.com/example_training_data/simple_face-example.png'

diff --git a/discojs/discojs-core/src/default_tasks/skin_mnist.ts b/discojs/discojs-core/src/default_tasks/skin_mnist.ts
diff --git a/discojs/discojs-core/src/default_tasks/titanic.ts b/discojs/discojs-core/src/default_tasks/titanic.ts
@@ -14,7 +14,6 @@ export const titanic: TaskProvider = {
           overview: 'We all know the unfortunate story of the Titanic: this flamboyant new transatlantic boat that sunk in 1912 in the North Atlantic Ocean. Today, we revist this tragedy by trying to predict the survival odds of the passenger given some basic features.'
         },
         model: 'The current model does not normalize the given data and applies only a very simple pre-processing of the data.',
-        tradeoffs: 'We are using a small model for this task: 4 fully connected layers with few neurons. This allows fast training but can yield to reduced accuracy.',
         dataFormatInformation: 'This model takes as input a CSV file with 12 columns. The features are general information about the passenger (sex, age, name, etc.) and specific related Titanic data such as the ticket class bought by the passenger, its cabin number, etc.<br><br>pclass: A proxy for socio-economic status (SES)<br>1st = Upper<br>2nd = Middle<br>3rd = Lower<br><br>age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5<br><br>sibsp: The dataset defines family relations in this way:<br>Sibling = brother, sister, stepbrother, stepsister<br>Spouse = husband, wife (mistresses and fiancés were ignored)<br><br>parch: The dataset defines family relations in this way:<br>Parent = mother, father<br>Child = daughter, son, stepdaughter, stepson<br>Some children travelled only with a nanny, therefore parch=0 for them.<br><br>The first line of the CSV contains the header:<br> PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked<br><br>Each susequent row contains the corresponding data.',
         dataExampleText: 'Below one can find an example of a datapoint taken as input by our model. In this datapoint, the person is young man named Owen Harris that unfortunnalty perished with the Titanic. He boarded the boat in South Hamptons and was a 3rd class passenger. On the testing & validation page, the data should not contain the label column (Survived).',
         dataExample: [