diff --git a/README.md b/README.md
index cff8854..ee9c94e 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ This helm chart is intended to be used in two ways:
 * Development: The ./run_demo.sh script allows the infrastructure to be ran locally with docker+kind
 * Production: TODO
 
-![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.0.1a](https://img.shields.io/badge/AppVersion-0.0.1a-informational?style=flat-square)
+![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.0.1a17](https://img.shields.io/badge/AppVersion-0.0.1a17-informational?style=flat-square)
 
 ![DiracX Chart tests](https://github.com/DIRACGrid/diracx-charts/actions/workflows/main.yml/badge.svg?branch=master)
@@ -301,7 +301,7 @@ Note that this configuration is trivial and does not follow production recommand
 | elasticsearch.volumeClaimTemplate.resources.requests.storage | string | `"100M"` | |
 | elasticsearch.volumeClaimTemplate.storageClassName | string | `"standard"` | |
 | fullnameOverride | string | `""` | |
-| global.activeDeadlineSeconds | int | `900` | timeout for job deadlines |
+| global.activeDeadlineSeconds | int | `3600` | timeout for job deadlines |
 | global.batchJobTTL | int | `600` | How long should batch jobs be retained after completing? |
 | global.imagePullPolicy | string | `"Always"` | |
 | global.images.client | string | `"ghcr.io/diracgrid/diracx/client"` | |
@@ -381,6 +381,12 @@ Note that this configuration is trivial and does not follow production recommand
 | mysql.auth.username | string | `"sqldiracx"` | |
 | mysql.enabled | bool | `true` | |
 | mysql.initdbScriptsConfigMap | string | `"mysql-init-diracx-dbs"` | |
+| mysql.startupProbe.enabled | bool | `true` | |
+| mysql.startupProbe.failureThreshold | int | `30` | |
+| mysql.startupProbe.initialDelaySeconds | int | `15` | |
+| mysql.startupProbe.periodSeconds | int | `10` | |
+| mysql.startupProbe.successThreshold | int | `1` | |
+| mysql.startupProbe.timeoutSeconds | int | `1` | |
 | nameOverride | string | `""` | type=kubernetes.io/dockerconfigjson imagePullSecrets: - name: regcred |
 | nodeSelector | object | `{}` | |
 | opensearch.config | object | `{}` | |
diff --git a/diracx/Chart.yaml b/diracx/Chart.yaml
index e7d108b..82d6fd9 100644
--- a/diracx/Chart.yaml
+++ b/diracx/Chart.yaml
@@ -17,7 +17,7 @@ version: 0.1.0
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "0.0.1a" +appVersion: "0.0.1a17" dependencies: @@ -30,7 +30,7 @@ dependencies: - name: elasticsearch version: 8.5.1 repository: https://helm.elastic.co - # condition: elasticsearch.enabled + condition: elasticsearch.enabled - name: dex version: 0.14.2 diff --git a/diracx/values.yaml b/diracx/values.yaml index 3f7cdff..4f7adbb 100644 --- a/diracx/values.yaml +++ b/diracx/values.yaml @@ -13,7 +13,7 @@ global: # What storage class should we use for DiracX volumes storageClassName: standard # -- timeout for job deadlines - activeDeadlineSeconds: 900 + activeDeadlineSeconds: 3600 images: tag: "dev" services: ghcr.io/diracgrid/diracx/services @@ -351,15 +351,16 @@ mysql: # if mysql pod is failing and restarting due to mysql update # it can be that the prob failure treshold is too low - # increasing this number can help: + # increasing this number can help. # - # startupProbe: - # enabled: true - # initialDelaySeconds: 15 - # periodSeconds: 10 - # timeoutSeconds: 1 - # failureThreshold: 30 - # successThreshold: 1 + # Also have a look at https://github.com/bitnami/charts/issues/7433#issuecomment-938748980 + startupProbe: + enabled: true + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + failureThreshold: 30 + successThreshold: 1 ########################## diff --git a/k3s/README.md b/k3s/README.md index a380d40..d309795 100644 --- a/k3s/README.md +++ b/k3s/README.md @@ -1,9 +1,11 @@ -# diracx-k3s +# Deploy diracx on a k3s cluster remotely -Deploy diracx on a k3s cluster remotely +## Before you start -## Resources +Make sure you go at least through the requirements below + +### Resources (to study) kubectl: https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/ @@ -18,26 +20,28 @@ diracx: https://github.com/DIRACGrid/diracx diracx-charts: https://github.com/DIRACGrid/diracx-charts -## Requirements - -- Accessible cluster machines via ssh - -- kubectl (client for managing kubernetes cluster) - -- helm (tool for managing kubernetes deployments via charts) +### Requirements for the (Virtual) machines of your cluster: -- Clone this repo on your laptop +You should have either one machine (for a test setup) or 3 or more. Avoid 2. -- Ports +Minimal requirements: +- Accessible via ssh (root) +- Ports that need to be open - 6443 (kubernetes) - - 8001 (kubernetes dashboard) + - 8001 (kubectl dashboard) - 8080 (longhorn dashboard) - 9000 (traefik dashboard) +- Check that you follow the recommendations https://docs.k3s.io/installation/requirements especially paying attention to the point about firewall -Check that you follow the recommendations https://docs.k3s.io/installation/requirements +The diracx chart includes by default all services that will ever be needed. These include MySQL, OpenSearch, MinIO, etc. Refer to the `values.yml` file for info (look for the `enabled` entries). +If you think you will install everything, pay attention to provide enough disk space, so if your machines come from a private cloud create and assign volumes. Few minimal relevant info: +- **longhorn** normally uses `/var/lib/longhorn` for storing its volumes. You can modify this value, for example below we used `/mnt/longhorn` +- **minIO** suggestions can be found [here](https://min.io/docs/minio/linux/operations/checklists/hardware.html#use-xfs-formatted-drives-with-labels). The info you need to extract is basically to use `mkfs.xfs` for formatting the volume. 
-Install kubectl (on laptop)
---------------------------
+
+## Client software to install
+
+### kubectl
 
 ```bash
 # kubectl
@@ -52,63 +56,68 @@ echo "$(cat kubectl.sha256) kubectl" | sha256sum --check
 
 # install
 sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl
+
+# Enable completion (optional but useful)
+source <(kubectl completion bash)
 ```
 
-Install helm (on laptop)
---------------------------
+
+### helm
 
 ```bash
 curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
 chmod 700 get_helm.sh
 ./get_helm.sh
+
+# Enable completion (optional but useful)
+source <(helm completion bash)
 ```
 
-Enable completion (optional but useful)
----------------------------------------
+
+### longhornctl (optional)
 
 ```bash
-# kubectl
-source <(kubectl completion bash)
+curl -L https://github.com/longhorn/cli/releases/download/${LonghornVersion}/longhornctl-${OS}-${ARCH} -o longhornctl # check https://github.com/longhorn/cli/releases
+chmod +x longhornctl
+mv ./longhornctl /usr/local/bin/longhornctl
 
-# helm
-source <(helm completion bash)
+# Enable completion (optional but useful)
+source <(longhornctl completion bash)
 ```
 
-## Deploy K3S remotely (using k3sup)
-
-Install k3sup (on laptop)
--------------------------
+### k3sup
 
 ```bash
 curl -sLS https://get.k3sup.dev | sh
 sudo install k3sup /usr/local/bin/
 ```
 
-Assuming your cluster is composed of 2 machines (main server and agent server)
-```bash
-# install k3s on main server
-
-export SERVER_IP=xxx.xxx.xxx.xxx
-export USER=root
-k3sup install --ip $SERVER_IP --user $USER --k3s-extra-args '--flannel-backend=wireguard-native'
+## Deploy K3S remotely (using k3sup)
 
+1. Move to the directory where you would like to keep the deployment files and clone this repository:
+```bash
+git clone git@github.com:DIRACGrid/diracx-charts
+```
 
-# join agent server
+2. For the main server:
 
-export AGENT_IP=xxx.xxx.xxx.xxx
+```bash
+export SERVER_IP=xxx.xxx.xxx.xxx
+k3sup install --ip $SERVER_IP --user root --k3s-extra-args '--flannel-backend=wireguard-native'
+```
 
-k3sup join --ip $AGENT_IP --server-ip $SERVER_IP --user $USER
+3. For each agent server:
+```bash
+export AGENT_IP=xxx.xxx.xxx.xxx
+k3sup join --ip $AGENT_IP --server-ip $SERVER_IP --user root
 ```
 
-Test your cluster
------------------
+### Test your cluster
 
 ```bash
-export KUBECONFIG=`pwd`/kubeconfig
+export KUBECONFIG=$PWD/kubeconfig
 
 kubectl config use-context default
 
 kubectl get node
@@ -116,22 +125,18 @@ kubectl get node
 
 kubectl get pods -A
 ```
 
-## Deploy Kubernetes Dashboard (optional but useful)
+### Deploy Kubernetes Dashboard (optional but useful)
 
 ```bash
 kubectl apply -f https://raw.githubusercontent.com/kubernetes/dashboard/v2.7.0/aio/deploy/recommended.yaml
-kubectl apply -f ./manifest/dashboard/cluster-role.yaml
-kubectl apply -f ./manifest/dashboard/secret.yaml
-kubectl apply -f ./manifest/dashboard/service-account.yaml
-```
+kubectl apply -f ./diracx-charts/k3s/manifest/dashboard/cluster-role.yaml
+kubectl apply -f ./diracx-charts/k3s/manifest/dashboard/secret.yaml
+kubectl apply -f ./diracx-charts/k3s/manifest/dashboard/service-account.yaml
 
-```bash
 # generate token
 kubectl -n kubernetes-dashboard create token admin-user
-```
 
-```bash
 # launch web server
 kubectl proxy &
 ```
@@ -141,7 +146,7 @@ Note: use token created just above for login
 
 Choose `Token` as login method, paste the token just generated
 
-## Get Traefik Dashboard
+### Get Traefik Dashboard
 
 Traefik comes out of the box with k3s.
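+
+You can first check that the bundled `traefik` deployment (the same one that is port-forwarded just below) is up:
+
+```bash
+kubectl --namespace kube-system get deployment traefik
+```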
 
 In order to access Traefik Dashboard from your laptop:
@@ -151,35 +156,30 @@ kubectl --namespace kube-system port-forward deployments/traefik 9000:9000 &
 
 In a web browser, go to : http://localhost:9000/dashboard/
 
-Storage configuration (Longhorn)
---------------------------------
+### Storage configuration (Longhorn)
 
 Deploy longhorn in your cluster:
 
 ```bash
 kubectl apply -f https://raw.githubusercontent.com/longhorn/longhorn/v1.5.3/deploy/prerequisite/longhorn-iscsi-installation.yaml
-
 kubectl apply -f https://raw.githubusercontent.com/longhorn/longhorn/v1.5.3/deploy/prerequisite/longhorn-nfs-installation.yaml
-```
-
-**Single or two nodes cluster** (less than 3 nodes)
-
-```bash
 wget https://raw.githubusercontent.com/longhorn/longhorn/v1.5.3/deploy/longhorn.yaml
 ```
 
-edit `longhorn.yaml` and modify `numberOfReplicas: ` (i.e 1 or 2)
+Edit `longhorn.yaml` and:
+- modify `numberOfReplicas:` if your cluster has fewer than 3 nodes (e.g. set it to 1 or 2)
+- OPTIONAL: look for the `longhorn-default-setting` section and, depending on the configuration you applied to your (virtual) machine(s), modify its `data` part as follows:
+```yaml
+data:
+  default-setting.yaml: |-
+    default-data-path: /mnt/longhorn # adapt to the configuration you want to apply; without it, the default is /var/lib/longhorn
+```
 
 ```bash
 kubectl apply -f longhorn.yaml
 ```
 
-**Multi node cluster** (more than 2 nodes)
-
-```bash
-kubectl apply -f https://raw.githubusercontent.com/longhorn/longhorn/v1.5.3/deploy/longhorn.yaml
-```
 
 Check environnment
 ------------------
@@ -197,11 +197,15 @@ sed -i -e "s/storageclass.kubernetes.io\/is-default-class: \"true\"/storageclass
 
 ```
 
+Now, on your client, start the Longhorn UI with
 ```bash
 kubectl port-forward -n longhorn-system svc/longhorn-frontend 8080:80 &
 ```
 
-## What is your hostname ?
+and then open it in your browser at http://localhost:8080
+
+
 
 ## Deploy diracx
 --------------------------
 
 ```bash
-# Clone diracx repositories
-
-git clone https://github.com/DIRACGrid/diracx-charts.git
-
 # Update the config with your hostname
 sed -i 's//thenameyouareacutally.using.com/g' ./diracx-charts/k3s/examples/*
+```
 
-# Deploy via provided helm charts
+Now it is time to choose what to install, so go through the `./diracx-charts/diracx/values.yaml` file and edit it accordingly.
+
+```bash
+# Deploy time!
 helm install --timeout 3600s diracx ./diracx-charts/diracx/ -f ./diracx-charts/k3s/examples/my.values.yaml --debug
 ```
 
@@ -253,6 +255,12 @@ git add default.yml
 git commit -m 'Initial config'
 ```
 
+## Post-install tips
+
+In case you would like to make use of the services installed inside the Kubernetes cluster (e.g. MySQL or OpenSearch) from the outside, there are different solutions and configurations to put in place: LoadBalancer, NodePort, or Ingress are the options, and one of them needs to be set up (a minimal sketch is given at the end of this page).
+
+Similar considerations apply to the use of certificates; see https://github.com/DIRACGrid/diracx-charts/issues/107
+
 ## Uninstall k3s on main server
 
 https://docs.k3s.io/installation/uninstall
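+
+## Example: exposing a bundled service
+
+A minimal sketch of the NodePort option mentioned in the post-install tips above. The service name `diracx-mysql` is an assumption made for illustration; check `kubectl get svc` for the actual names created by your release:
+
+```bash
+# list the services created by the chart
+kubectl get svc
+
+# hypothetical example: expose the bundled MySQL on a port of every cluster node
+kubectl expose service diracx-mysql --type=NodePort --name=mysql-external --port=3306
+
+# find out which node port was allocated
+kubectl get svc mysql-external
+```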