# SPDX-License-Identifier: Apache-2.0
# ============LICENSE_END=========================================================
+#################################################################################################################
+# Define the settings for the rook-ceph cluster with common settings for a production cluster.
+# All nodes with available raw devices will be used for the Ceph cluster. At least three nodes are required
+# in this example. See the documentation for more details on storage settings available.
+
+# For example, to create the cluster:
+# kubectl create -f crds.yaml -f common.yaml -f operator.yaml
+# kubectl create -f cluster.yaml
+#################################################################################################################
+
apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
name: rook-ceph
- namespace: "{{ rook_namespace }}"
+ namespace: "{{ rook_namespace }}" # namespace:cluster
spec:
cephVersion:
# The container image used to launch the Ceph daemon pods (mon, mgr, osd, mds, rgw).
- # v12 is luminous, v13 is mimic, and v14 is nautilus.
- # RECOMMENDATION: In production, use a specific version tag instead of the general v13 flag, which pulls the latest release and could result in different
+ # v13 is mimic, v14 is nautilus, and v15 is octopus.
+ # RECOMMENDATION: In production, use a specific version tag instead of the general v14 flag, which pulls the latest release and could result in different
# versions running within the cluster. See tags available at https://hub.docker.com/r/ceph/ceph/tags/.
+ # If you want to be more precise, you can always use a timestamp tag such as ceph/ceph:v15.2.9-20201217
+ # This tag might not contain a new Ceph version, just security fixes from the underlying operating system, which will reduce vulnerabilities
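+ # Illustrative example: with the Ansible variables used below, the image could be pinned by setting
+ # ceph_repository: "ceph/ceph" and ceph_version: "v15.2.9-20201217" so the rendered tag matches one exact release.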
image: "{{ ceph_repository }}:{{ ceph_version }}"
- # Whether to allow unsupported versions of Ceph. Currently only luminous and mimic are supported.
- # After nautilus is released, Rook will be updated to support nautilus.
+ # Whether to allow unsupported versions of Ceph. Currently `nautilus` and `octopus` are supported.
+ # Future versions such as `pacific` would require this to be set to `true`.
# Do not set to true in production.
allowUnsupported: false
- # The path on the host where configuration files will be persisted. If not specified, a kubernetes emptyDir will be created (not recommended).
+ # The path on the host where configuration files will be persisted. Must be specified.
# Important: if you reinstall the cluster, make sure you delete this directory from each host or else the mons will fail to start on the new cluster.
# In Minikube, the '/data' directory is configured to persist across reboots. Use "/data/rook" in Minikube environment.
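+ # Illustrative example: when reinstalling, wipe this directory on every storage node first, e.g.
+ #   rm -rf {{ rook_data_dir_path }}   # the exact path depends on how the variable is set in your environment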
dataDirHostPath: "{{ rook_data_dir_path }}"
# Use at your OWN risk
# To understand Rook's upgrade process of Ceph, read https://rook.io/docs/rook/master/ceph-upgrade.html#ceph-version-upgrades
skipUpgradeChecks: false
- # set the amount of mons to be started
+ # Whether or not to continue if PGs are not clean during an upgrade
+ continueUpgradeAfterChecksEvenIfNotHealthy: false
+ # WaitTimeoutForHealthyOSDInMinutes defines the time (in minutes) the operator would wait before an OSD can be stopped for upgrade or restart.
+ # If the timeout is exceeded and the OSD is not ok to stop, then the operator would skip the upgrade for the current OSD and proceed with the next one
+ # if `continueUpgradeAfterChecksEvenIfNotHealthy` is `false`. If `continueUpgradeAfterChecksEvenIfNotHealthy` is `true`, then the operator would
+ # continue with the upgrade of an OSD even if it is not ok to stop after the timeout. This timeout won't be applied if `skipUpgradeChecks` is `true`.
+ # The default wait timeout is 10 minutes.
+ waitTimeoutForHealthyOSDInMinutes: 10
mon:
- count: 3
+ # Set the number of mons to be started. Must be an odd number, and is generally recommended to be 3.
+ count: {{ rook_ceph_mon_count }}
+ # The mons should be on unique nodes. For production, at least 3 nodes are recommended for this reason.
+ # Mons should only be allowed on the same node for test environments where data loss is acceptable.
allowMultiplePerNode: true
mgr:
- modules:
+ modules: []
# Several modules should not need to be included in this list. The "dashboard" and "monitoring" modules
- # are already enabled by other settings in the cluster CR and the "rook" module is always enabled.
+ # are already enabled by other settings in the cluster CR.
# - name: pg_autoscaler
# enabled: true
# enable the ceph dashboard for viewing cluster status
# port: 8443
# serve the dashboard using SSL
ssl: true
+ # enable prometheus alerting for cluster
monitoring:
# requires Prometheus to be pre-installed
enabled: false
# If you have a single rook-ceph cluster, set the rulesNamespace to the same namespace as the cluster or keep it empty.
# If you have multiple rook-ceph clusters in the same k8s cluster, choose the same namespace (ideally, namespace with prometheus
# deployed) to set rulesNamespace for all the clusters. Otherwise, you will get duplicate alerts with multiple alert definitions.
- rulesNamespace: {{ rook_namespace }}
+ rulesNamespace: "{{ rook_namespace }}"
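+ # Illustrative note: enabling this assumes the prometheus-operator CRDs (ServiceMonitor, PrometheusRule) already exist;
+ # with `enabled: true` the operator is expected to create a ServiceMonitor for the ceph-mgr metrics endpoint.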
network:
- # toggle to use hostNetwork
hostNetwork: {{ rook_use_host_network }}
- rbdMirroring:
- # The number of daemons that will perform the rbd mirroring.
- # rbd mirroring must be configured with "rbd mirror" from the rook toolbox.
- workers: 0
+ # enable the crash collector for ceph daemon crash collection
+ crashCollector:
+ disable: {{ rook_ceph_crashcollector_disable }}
+ # automate [data cleanup process](https://github.com/rook/rook/blob/master/Documentation/ceph-teardown.md#delete-the-data-on-hosts) in cluster destruction.
+ cleanupPolicy:
+ # Since cluster cleanup is destructive to data, confirmation is required.
+ # To destroy all Rook data on hosts during uninstall, confirmation must be set to "yes-really-destroy-data".
+ # This value should only be set when the cluster is about to be deleted. After the confirmation is set,
+ # Rook will immediately stop configuring the cluster and only wait for the delete command.
+ # If the empty string is set, Rook will not destroy any data on hosts during uninstall.
+ confirmation: ""
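+ # Illustrative example, following the teardown documentation linked above: the confirmation is typically set
+ # just before deletion with something like:
+ #   kubectl -n {{ rook_namespace }} patch cephcluster rook-ceph --type merge -p '{"spec":{"cleanupPolicy":{"confirmation":"yes-really-destroy-data"}}}'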
+ # sanitizeDisks represents settings for sanitizing OSD disks on cluster deletion
+ sanitizeDisks:
+ # method indicates if the entire disk should be sanitized or simply ceph's metadata
+ # in both cases, re-installation is possible
+ # possible choices are 'complete' or 'quick' (default)
+ method: quick
+ # dataSource indicates where to get random bytes to write on the disk
+ # possible choices are 'zero' (default) or 'random'
+ # using random sources will consume entropy from the system and will take much more time than the zero source
+ dataSource: zero
+ # iteration overwrites the disk N times instead of the default (1)
+ # takes an integer value
+ iteration: 1
+ # allowUninstallWithVolumes defines how the uninstall should be performed
+ # If set to true, cephCluster deletion does not wait for the PVs to be deleted.
+ allowUninstallWithVolumes: false
# To control where various services will be scheduled by kubernetes, use the placement configuration sections below.
# The example under 'all' would have all services scheduled on kubernetes nodes labeled with 'role=storage-node' and
# tolerate taints with a key of 'storage-node'.
-# placement:
-# all:
-# nodeAffinity:
-# requiredDuringSchedulingIgnoredDuringExecution:
-# nodeSelectorTerms:
-# - matchExpressions:
-# - key: role
-# operator: In
-# values:
-# - storage-node
+ placement:
+ all:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: "{{ rook_storage_label }}"
+ operator: In
+ values:
+ - "true"
# podAffinity:
# podAntiAffinity:
-# tolerations:
-# - key: storage-node
-# operator: Exists
+# topologySpreadConstraints:
+ tolerations:
+ - key: "{{ rook_storage_label }}"
+ operator: Exists
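+ # Illustrative note: the affinity and toleration above assume storage nodes carry the "{{ rook_storage_label }}" label
+ # (and optionally a matching taint), e.g.:
+ #   kubectl label node <node-name> {{ rook_storage_label }}=true
+ #   kubectl taint node <node-name> {{ rook_storage_label }}=true:NoSchedule   # only if a taint is desired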
# The above placement information can also be specified for mon, osd, and mgr components
# mon:
# Monitor deployments may contain an anti-affinity rule for avoiding monitor
# preferred rule with weight: 50.
# osd:
# mgr:
+# cleanup:
annotations:
# all:
# mon:
# osd:
+# cleanup:
+# prepareosd:
# If no mgr annotations are set, prometheus scrape annotations will be set by default.
-# mgr:
+# mgr:
+ labels:
+# all:
+# mon:
+# osd:
+# cleanup:
+# mgr:
+# prepareosd:
resources:
# The requests and limits set here, allow the mgr pod to use half of one CPU core and 1 gigabyte of memory
# mgr:
# The above example requests/limits can also be added to the mon and osd components
# mon:
# osd:
+# prepareosd:
+# crashcollector:
+# logcollector:
+# cleanup:
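+ # Illustrative, commented-out example matching the description above (half a CPU core and 1 GiB of memory for the mgr):
+# mgr:
+#   limits:
+#     cpu: "500m"
+#     memory: "1024Mi"
+#   requests:
+#     cpu: "500m"
+#     memory: "1024Mi"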
+ # The option to automatically remove OSDs that are out and are safe to destroy.
+ removeOSDsIfOutAndSafeToRemove: false
storage: # cluster level storage configuration and selection
useAllNodes: true
useAllDevices: false
+ deviceFilter: "{{ rook_node_device_filter }}"
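+ # Illustrative examples for the filter value: "^sd[b-d]" matches /dev/sdb through /dev/sdd, "^vd." matches virtio disks;
+ # the filter is a regular expression applied to device names, so set the variable to match your raw devices.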
location:
config:
- # The default and recommended storeType is dynamically set to bluestore for devices and filestore for directories.
- # Set the storeType explicitly only if it is required not to use the default.
- # storeType: bluestore
- databaseSizeMB: "1024" # this value can be removed for environments with normal sized disks (100 GB or larger)
- journalSizeMB: "1024" # this value can be removed for environments with normal sized disks (20 GB or larger)
+ # crushRoot: "custom-root" # specify a non-default root label for the CRUSH map
+ # metadataDevice: "md0" # specify a non-rotational storage so ceph-volume will use it as block db device of bluestore.
+ databaseSizeMB: "1024" # only needed if the disks are smaller than 100 GB
+ journalSizeMB: "1024" # only needed if the disks are 20 GB or smaller
osdsPerDevice: "1" # this value can be overridden at the node or device level
-# Cluster level list of directories to use for storage. These values will be set for all nodes that have no `directories` set.
- directories:
- - path: "{{ rook_storage_dir_path }}"
+ # encryptedDevice: "true" # the default value for this option is "false"
+ # directories:
+ # - path: "{{ rook_storage_dir_path }}"
# Individual nodes and their config can be specified as well, but 'useAllNodes' above must be set to false. Then, only the named
# nodes below will be used as storage resources. Each node's 'name' field should match their 'kubernetes.io/hostname' label.
# nodes:
-# - name: "172.17.4.101"
-# directories: # specific directories to use for storage can be specified for each node
-# - path: "/rook/storage-dir"
-# resources:
-# limits:
-# cpu: "500m"
-# memory: "1024Mi"
-# requests:
-# cpu: "500m"
-# memory: "1024Mi"
# - name: "172.17.4.201"
# devices: # specific devices to use for storage can be specified for each node
# - name: "sdb"
# - name: "nvme01" # multiple osds can be created on high performance devices
# config:
# osdsPerDevice: "5"
+# - name: "/dev/disk/by-id/ata-ST4000DM004-XXXX" # devices can be specified using full udev paths
# config: # configuration can be specified at the node level which overrides the cluster level config
-# storeType: filestore
+# storeType: filestore # this option is obsolete and only provided as an example
# - name: "172.17.4.301"
-# deviceFilter: ^vdb
+# deviceFilter: "^vdb"
# The section for configuring management of daemon disruptions during upgrade or fencing.
disruptionManagement:
# If true, the operator will create and manage PodDisruptionBudgets for OSD, Mon, RGW, and MDS daemons. OSD PDBs are managed dynamically
- # via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph-managed-disruptionbudgets.md). The operator will
+ # via the strategy outlined in the [design](https://github.com/rook/rook/blob/master/design/ceph/ceph-managed-disruptionbudgets.md). The operator will
# block eviction of OSDs by default and unblock them safely when drains are detected.
managePodBudgets: false
# A duration in minutes that determines how long an entire failureDomain like `region/zone/host` will be held in `noout` (in addition to the
# default DOWN/OUT interval) when it is draining. This is only relevant when `managePodBudgets` is `true`. The default value is `30` minutes.
osdMaintenanceTimeout: 30
+ # A duration in minutes that the operator will wait for the placement groups to become healthy (active+clean) after a drain was completed and OSDs came back up.
+ # The operator will continue with the next drain if the timeout is exceeded. It only works if `managePodBudgets` is `true`.
+ # No values or 0 means that the operator will wait until the placement groups are healthy before unblocking the next drain.
+ pgHealthCheckTimeout: 0
# If true, the operator will create and manage MachineDisruptionBudgets to ensure OSDs are only fenced when the cluster is healthy.
# Only available on OpenShift.
manageMachineDisruptionBudgets: false
# Namespace in which to watch for the MachineDisruptionBudgets.
machineDisruptionBudgetNamespace: openshift-machine-api
+
+ # healthChecks
+ # Valid values for daemons are 'mon', 'osd', 'status'
+ healthCheck:
+ daemonHealth:
+ mon:
+ disabled: false
+ interval: 45s
+ osd:
+ disabled: false
+ interval: 60s
+ status:
+ disabled: false
+ interval: 60s
+ # Change pod liveness probe settings; this works for all mon, mgr and osd daemons
+ livenessProbe:
+ mon:
+ disabled: false
+ mgr:
+ disabled: false
+ osd:
+ disabled: false