wip

csviri · csviri · commit c527bf113865 · 2026-04-17T11:36:16.000+02:00
Signed-off-by: Attila Mészáros <a_meszaros@apple.com>
diff --git a/docs/content/en/docs/documentation/operations/health-probes.md b/docs/content/en/docs/documentation/operations/health-probes.md
@@ -17,72 +17,69 @@ API.
 | `isStarted()` | `true` once the operator and all its controllers have fully started |
 | `allEventSourcesAreHealthy()` | `true` when every registered event source (informers, polling sources, etc.) reports a healthy status |
 | `unhealthyEventSources()` | returns a map of controller name → unhealthy event sources, useful for diagnostics |
+| `unhealthyInformerWrappingEventSourceHealthIndicator()` | returns a map of controller name → unhealthy informer-wrapping event sources, each exposing per-informer details via `InformerHealthIndicator` (`hasSynced()`, `isWatching()`, `isRunning()`, `getTargetNamespace()`) |
 
-These map naturally to Kubernetes probes:
+In most cases a single readiness probe backed by `allEventSourcesAreHealthy()` is sufficient: before the
+operator has fully started the informers will not have synced yet, so the check naturally covers the startup
+case as well. Once running, it detects runtime degradation such as a lost watch connection.
 
-- **Startup probe** → `isStarted()` — fails until all informers have synced and the operator is ready to
-  reconcile.
-- **Readiness probe** → `allEventSourcesAreHealthy()` — fails if an informer loses its watch connection
-  or any event source reports an unhealthy status.
+### Fine-Grained Informer Diagnostics
 
-## Setting Up Probe Endpoints
+For advanced use cases — such as exposing per-informer health in a diagnostic endpoint or logging which
+specific namespace lost its watch — `unhealthyInformerWrappingEventSourceHealthIndicator()` gives access to
+individual `InformerHealthIndicator` instances. Each indicator exposes `hasSynced()`, `isWatching()`,
+`isRunning()`, and `getTargetNamespace()`. This is typically not needed for a standard health probe but can
+be valuable for operational dashboards or troubleshooting.
 
-The example below uses [Jetty](https://eclipse.dev/jetty/) to expose health probe endpoints. Any HTTP
+## Setting Up a Probe Endpoint
+
+The example below uses [Jetty](https://eclipse.dev/jetty/) to expose a `/healthz` endpoint. Any HTTP
 server library works — the key is calling the `RuntimeInfo` methods to determine the response code.
 
 ```java
 import org.eclipse.jetty.server.Server;
 import org.eclipse.jetty.server.handler.ContextHandler;
-import org.eclipse.jetty.server.handler.ContextHandlerCollection;
 
 Operator operator = new Operator();
 operator.register(new MyReconciler());
-operator.start();
 
-var startup = new ContextHandler(new StartupHandler(operator), "/startup");
-var readiness = new ContextHandler(new ReadinessHandler(operator), "/ready");
+// start the health server before the operator so probes can be queried during startup
+var health = new ContextHandler(new HealthHandler(operator), "/healthz");
 Server server = new Server(8080);
-server.setHandler(new ContextHandlerCollection(startup, readiness));
+server.setHandler(health);
 server.start();
+
+operator.start();
 ```
 
-Where `StartupHandler` and `ReadinessHandler` extend `org.eclipse.jetty.server.Handler.Abstract` and
-check `operator.getRuntimeInfo().isStarted()` and
-`operator.getRuntimeInfo().allEventSourcesAreHealthy()` respectively.
+Where `HealthHandler` extends `org.eclipse.jetty.server.Handler.Abstract` and responds `200` only when
+`isStarted()` and `allEventSourcesAreHealthy()` on `operator.getRuntimeInfo()` both return `true` (else `503`).
 
 See the
 [`operations` sample operator](https://github.com/java-operator-sdk/java-operator-sdk/tree/main/sample-operators/operations)
 for a complete working example.
 
 ## Kubernetes Deployment Configuration
 
-Once your operator exposes probe endpoints, configure them in your Deployment manifest:
+Once your operator exposes the probe endpoint, configure a readiness probe in your Deployment manifest:
 
 ```yaml
 containers:
 - name: operator
   ports:
   - name: probes
     containerPort: 8080
-  startupProbe:
-    httpGet:
-      path: /startup
-      port: probes
-    initialDelaySeconds: 1
-    periodSeconds: 3
-    failureThreshold: 20
   readinessProbe:
     httpGet:
-      path: /ready
+      path: /healthz
       port: probes
     initialDelaySeconds: 5
     periodSeconds: 5
     failureThreshold: 3
 ```
 
-The startup probe gives the operator time to start (up to ~60 s with the settings above). Once the startup
-probe succeeds, the readiness probe takes over and will mark the pod as not-ready if any event source
-becomes unhealthy.
+The readiness probe will mark the pod as not-ready until all informers have synced. After that, it
+continues to monitor event source health at runtime.
 
 ## Helm Chart Support
 
@@ -92,12 +89,9 @@ Enable them in your `values.yaml`:
 ```yaml
 probes:
   port: 8080
-  startup:
-    enabled: true
-    path: /startup
   readiness:
     enabled: true
-    path: /ready
+    path: /healthz
 ```
 
 All probe timing parameters (`initialDelaySeconds`, `periodSeconds`, `failureThreshold`) have sensible
diff --git a/helm/generic-helm-chart/tests/deployment_test.yaml b/helm/generic-helm-chart/tests/deployment_test.yaml
@@ -305,7 +305,7 @@ tests:
     asserts:
       - equal:
           path: spec.template.spec.containers[0].startupProbe.httpGet.path
-          value: /startup
+          value: /healthz
       - equal:
           path: spec.template.spec.containers[0].startupProbe.httpGet.port
           value: probes
@@ -325,7 +325,7 @@ tests:
     asserts:
       - equal:
           path: spec.template.spec.containers[0].readinessProbe.httpGet.path
-          value: /ready
+          value: /healthz
       - equal:
           path: spec.template.spec.containers[0].readinessProbe.httpGet.port
           value: probes
diff --git a/helm/generic-helm-chart/values.yaml b/helm/generic-helm-chart/values.yaml
@@ -134,13 +134,13 @@ probes:
   port: 8080
   startup:
     enabled: false
-    path: /startup
+    path: /healthz
     initialDelaySeconds: 1
     periodSeconds: 3
     failureThreshold: 20
   readiness:
     enabled: false
-    path: /ready
+    path: /healthz
     initialDelaySeconds: 5
     periodSeconds: 5
     failureThreshold: 3
diff --git a/sample-operators/operations/pom.xml b/sample-operators/operations/pom.xml
@@ -85,7 +85,7 @@
     <dependency>
       <groupId>org.eclipse.jetty</groupId>
       <artifactId>jetty-server</artifactId>
-      <version>12.1.0</version>
+      <version>12.1.8</version>
     </dependency>
     <dependency>
       <groupId>io.javaoperatorsdk</groupId>
diff --git a/sample-operators/operations/src/main/java/io/javaoperatorsdk/operator/sample/metrics/HealthHandler.java b/sample-operators/operations/src/main/java/io/javaoperatorsdk/operator/sample/metrics/HealthHandler.java
@@ -25,20 +25,26 @@
 
 import io.javaoperatorsdk.operator.Operator;
 
-public class StartupHandler extends Handler.Abstract {
+/**
+ * Combined health endpoint that checks whether the operator has started and all event sources
+ * (informers, polling sources, etc.) are healthy. Before the operator has fully started, the
+ * informers will not have synced yet, so this endpoint naturally covers the startup case as well.
+ */
+public class HealthHandler extends Handler.Abstract {
 
   private final Operator operator;
 
-  public StartupHandler(Operator operator) {
+  public HealthHandler(Operator operator) {
     this.operator = operator;
   }
 
   @Override
   public boolean handle(Request request, Response response, Callback callback) {
-    if (operator.getRuntimeInfo().isStarted()) {
-      sendMessage(response, 200, "started", callback);
+    var runtimeInfo = operator.getRuntimeInfo();
+    if (runtimeInfo.isStarted() && runtimeInfo.allEventSourcesAreHealthy()) {
+      sendMessage(response, 200, "healthy", callback);
     } else {
-      sendMessage(response, 400, "not started yet", callback);
+      sendMessage(response, 503, "not healthy", callback);
     }
     return true;
   }
diff --git a/sample-operators/operations/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java b/sample-operators/operations/src/main/java/io/javaoperatorsdk/operator/sample/metrics/MetricsHandlingSampleOperator.java
@@ -25,7 +25,6 @@
 
 import org.eclipse.jetty.server.Server;
 import org.eclipse.jetty.server.handler.ContextHandler;
-import org.eclipse.jetty.server.handler.ContextHandlerCollection;
 import org.jspecify.annotations.NonNull;
 import org.jspecify.annotations.Nullable;
 import org.slf4j.Logger;
@@ -79,10 +78,9 @@ public static void main(String[] args) throws Exception {
     operator.register(
         new MetricsHandlingReconciler2(),
         configLoader.applyControllerConfigs(MetricsHandlingReconciler2.NAME));
-    var startup = new ContextHandler(new StartupHandler(operator), "/startup");
-    var readiness = new ContextHandler(new ReadinessHandler(operator), "/ready");
+    var health = new ContextHandler(new HealthHandler(operator), "/healthz");
     Server server = new Server(8080);
-    server.setHandler(new ContextHandlerCollection(startup, readiness));
+    server.setHandler(health);
     server.start();
     log.info("Health probe server started on port 8080");
 
diff --git a/sample-operators/operations/src/main/java/io/javaoperatorsdk/operator/sample/metrics/ReadinessHandler.java b/sample-operators/operations/src/main/java/io/javaoperatorsdk/operator/sample/metrics/ReadinessHandler.java
diff --git a/sample-operators/operations/src/test/resources/helm-values.yaml b/sample-operators/operations/src/test/resources/helm-values.yaml
@@ -34,8 +34,7 @@ primaryResources:
     - metricshandlingcustomresource2s
 
 probes:
-  startup:
-    enabled: true
   readiness:
     enabled: true
+    path: /healthz