Skip to content

Commit f3f0944

Browse files
committed
(bug) handle helm chart errors in pullmode
Previously, when Sveltos processed a Helm chart in pull mode, it would generate ConfigurationBundle instances as it prepared the resources. If an error was encountered halfway through this process, Sveltos would still commit the resources it had successfully prepared up to that point. This led to a critical synchronization failure: 1. Partial State: The ConfigurationBundle contained only a subset of the intended resources. 2. Applier Misinterpretation: The applier would treat the missing resources as "deleted" or "out of scope," causing it to prune existing resources on the managed cluster or deploy an incomplete, broken stack. This PR introduces atomic preparation logic. If an error occurs at any point during the preparation of resources for a Helm chart: 1. Discard: All partially prepared resources are discarded. 2. No Commit: Nothing is committed to the ConfigurationBundle, ensuring the applier does not act on incomplete data. 3. Report: The error is captured and reported specifically within the ClusterSummary status for visibility.
1 parent 4db50cb commit f3f0944

File tree

2 files changed

+210
-4
lines changed

2 files changed

+210
-4
lines changed

controllers/handlers_helm.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -928,6 +928,13 @@ func handleCharts(ctx context.Context, clusterSummary *configv1beta1.ClusterSumm
928928
}
929929

930930
if isPullMode {
931+
if deployError != nil {
932+
_ = pullmode.DiscardStagedResourcesForDeployment(ctx, c, clusterSummary.Spec.ClusterNamespace,
933+
clusterSummary.Spec.ClusterName, configv1beta1.ClusterSummaryKind, clusterSummary.Name,
934+
string(libsveltosv1beta1.FeatureHelm), logger)
935+
return deployError
936+
}
937+
931938
err = commitStagedResourcesForDeployment(ctx, clusterSummary, configurationHash, mgmtResources, logger)
932939
if err != nil {
933940
return err
@@ -937,10 +944,6 @@ func handleCharts(ctx context.Context, clusterSummary *configv1beta1.ClusterSumm
937944
if err != nil {
938945
return err
939946
}
940-
941-
if deployError != nil {
942-
return deployError
943-
}
944947
} else {
945948
// First get the helm releases currently managed and uninstall all the ones
946949
// not referenced anymore. Only if this operation succeeds, removes all stale

test/fv/helm_error_test.go

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
/*
2+
Copyright 2026. projectsveltos.io. All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package fv_test
18+
19+
import (
20+
"context"
21+
22+
. "github.com/onsi/ginkgo/v2"
23+
. "github.com/onsi/gomega"
24+
25+
appsv1 "k8s.io/api/apps/v1"
26+
corev1 "k8s.io/api/core/v1"
27+
apierrors "k8s.io/apimachinery/pkg/api/errors"
28+
"k8s.io/apimachinery/pkg/types"
29+
"k8s.io/client-go/util/retry"
30+
31+
configv1beta1 "github.com/projectsveltos/addon-controller/api/v1beta1"
32+
"github.com/projectsveltos/addon-controller/lib/clusterops"
33+
libsveltosv1beta1 "github.com/projectsveltos/libsveltos/api/v1beta1"
34+
"github.com/projectsveltos/libsveltos/lib/k8s_utils"
35+
)
36+
37+
var (
38+
certManagerValues = `apiVersion: v1
39+
kind: ConfigMap
40+
metadata:
41+
name: cert-manager-values
42+
namespace: default
43+
data:
44+
values: |
45+
crds:
46+
enabled: true`
47+
48+
certManagerIncorrectValues = `apiVersion: v1
49+
kind: ConfigMap
50+
metadata:
51+
name: cert-manager-values
52+
namespace: default
53+
data:
54+
values: |
55+
crds:
56+
enabled: true
57+
replicas: 2`
58+
)
59+
60+
var _ = Describe("Feature", Serial, func() {
61+
const (
62+
namePrefix = "helm-error-"
63+
certManager = "cert-manager"
64+
)
65+
66+
It("An error in helm values does not remove helm chart", Label("FV", "PULLMODE"), func() {
67+
Byf("Create a configMap with valid helm values")
68+
configMap, err := k8s_utils.GetUnstructured([]byte(certManagerValues))
69+
Expect(err).To(BeNil())
70+
Expect(k8sClient.Create(context.TODO(), configMap)).To(Succeed())
71+
72+
Byf("Create a ClusterProfile matching Cluster %s/%s", kindWorkloadCluster.GetNamespace(), kindWorkloadCluster.GetName())
73+
clusterProfile := getClusterProfile(namePrefix, map[string]string{key: value})
74+
clusterProfile.Spec.SyncMode = configv1beta1.SyncModeContinuous
75+
Expect(k8sClient.Create(context.TODO(), clusterProfile)).To(Succeed())
76+
77+
verifyClusterProfileMatches(clusterProfile)
78+
79+
verifyClusterSummary(clusterops.ClusterProfileLabelName,
80+
clusterProfile.Name, &clusterProfile.Spec, kindWorkloadCluster.GetNamespace(),
81+
kindWorkloadCluster.GetName(), getClusterType())
82+
83+
Byf("Update ClusterProfile %s to deploy helm charts and referencing ConfigMap in ValuesFrom %s/%s",
84+
clusterProfile.Name, configMap.GetNamespace(), configMap.GetName())
85+
86+
currentClusterProfile := &configv1beta1.ClusterProfile{}
87+
88+
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
89+
Expect(k8sClient.Get(context.TODO(), types.NamespacedName{Name: clusterProfile.Name},
90+
currentClusterProfile)).To(Succeed())
91+
92+
currentClusterProfile.Spec.HelmCharts = []configv1beta1.HelmChart{
93+
{
94+
RepositoryURL: "https://charts.jetstack.io",
95+
RepositoryName: "jetstack",
96+
ChartName: "jetstack/cert-manager",
97+
ChartVersion: "v1.19.4",
98+
ReleaseName: "cert-manager",
99+
ReleaseNamespace: "cert-manager",
100+
HelmChartAction: configv1beta1.HelmChartActionInstall,
101+
ValuesFrom: []configv1beta1.ValueFrom{
102+
{
103+
Namespace: configMap.GetNamespace(),
104+
Name: configMap.GetName(),
105+
Kind: string(libsveltosv1beta1.ConfigMapReferencedResourceKind),
106+
},
107+
},
108+
},
109+
}
110+
return k8sClient.Update(context.TODO(), currentClusterProfile)
111+
})
112+
Expect(err).To(BeNil())
113+
114+
Expect(k8sClient.Get(context.TODO(),
115+
types.NamespacedName{Name: clusterProfile.Name}, currentClusterProfile)).To(Succeed())
116+
117+
clusterSummary := verifyClusterSummary(clusterops.ClusterProfileLabelName,
118+
currentClusterProfile.Name, &currentClusterProfile.Spec,
119+
kindWorkloadCluster.GetNamespace(), kindWorkloadCluster.GetName(), getClusterType())
120+
121+
Byf("Getting client to access the workload cluster")
122+
workloadClient, err := getKindWorkloadClusterKubeconfig()
123+
Expect(err).To(BeNil())
124+
Expect(workloadClient).ToNot(BeNil())
125+
126+
Byf("Verifying cert-manager deployment is created in the workload cluster")
127+
Eventually(func() error {
128+
depl := &appsv1.Deployment{}
129+
return workloadClient.Get(context.TODO(),
130+
types.NamespacedName{Namespace: certManager, Name: certManager}, depl)
131+
}, timeout, pollingInterval).Should(BeNil())
132+
133+
Byf("Verifying ClusterSummary %s status is set to Deployed for Helm feature", clusterSummary.Name)
134+
verifyFeatureStatusIsProvisioned(kindWorkloadCluster.GetNamespace(), clusterSummary.Name, libsveltosv1beta1.FeatureHelm)
135+
136+
By("Change ConfigMap with HelmValues to contain incorrect values")
137+
incorrectValuesConfigMap, err := k8s_utils.GetUnstructured([]byte(certManagerIncorrectValues))
138+
Expect(err).To(BeNil())
139+
140+
currentConfigMap := &corev1.ConfigMap{}
141+
Expect(k8sClient.Get(context.TODO(),
142+
types.NamespacedName{Namespace: configMap.GetNamespace(), Name: configMap.GetName()},
143+
currentConfigMap)).To(Succeed())
144+
incorrectValuesConfigMap.SetResourceVersion(currentConfigMap.GetResourceVersion())
145+
Expect(k8sClient.Update(context.TODO(), incorrectValuesConfigMap))
146+
147+
Byf("Verifying ClusterSummary reports the error")
148+
Eventually(func() bool {
149+
currentClusterSummary := &configv1beta1.ClusterSummary{}
150+
err = k8sClient.Get(context.TODO(),
151+
types.NamespacedName{Namespace: clusterSummary.Namespace, Name: clusterSummary.Name},
152+
currentClusterSummary)
153+
if err != nil {
154+
return false
155+
}
156+
for i := range currentClusterSummary.Status.FeatureSummaries {
157+
if currentClusterSummary.Status.FeatureSummaries[i].FeatureID == libsveltosv1beta1.FeatureHelm {
158+
return currentClusterSummary.Status.FeatureSummaries[i].FailureMessage != nil
159+
}
160+
}
161+
return false
162+
}, timeout, pollingInterval).Should(BeTrue())
163+
164+
Byf("Verifying cert-manager deployment is still present in the workload cluster")
165+
Eventually(func() error {
166+
depl := &appsv1.Deployment{}
167+
return workloadClient.Get(context.TODO(),
168+
types.NamespacedName{Namespace: certManager, Name: certManager}, depl)
169+
}, timeout, pollingInterval).Should(BeNil())
170+
171+
By("Change ConfigMap with HelmValues to contain correct values")
172+
Expect(k8sClient.Get(context.TODO(),
173+
types.NamespacedName{Namespace: configMap.GetNamespace(), Name: configMap.GetName()},
174+
currentConfigMap)).To(Succeed())
175+
configMap.SetResourceVersion(currentConfigMap.GetResourceVersion())
176+
Expect(k8sClient.Update(context.TODO(), configMap))
177+
178+
Byf("Verifying ClusterSummary %s status is set to Deployed for Helm feature", clusterSummary.Name)
179+
verifyFeatureStatusIsProvisioned(kindWorkloadCluster.GetNamespace(), clusterSummary.Name, libsveltosv1beta1.FeatureHelm)
180+
181+
Byf("Verifying cert-manager deployment is still present in the workload cluster")
182+
Eventually(func() error {
183+
depl := &appsv1.Deployment{}
184+
return workloadClient.Get(context.TODO(),
185+
types.NamespacedName{Namespace: certManager, Name: certManager}, depl)
186+
}, timeout, pollingInterval).Should(BeNil())
187+
188+
deleteClusterProfile(clusterProfile)
189+
190+
Byf("Verifying cert-manager deployment is removed from workload cluster")
191+
Eventually(func() bool {
192+
depl := &appsv1.Deployment{}
193+
err = workloadClient.Get(context.TODO(),
194+
types.NamespacedName{Namespace: certManager, Name: certManager}, depl)
195+
return apierrors.IsNotFound(err)
196+
}, timeout, pollingInterval).Should(BeTrue())
197+
198+
Expect(k8sClient.Get(context.TODO(),
199+
types.NamespacedName{Namespace: configMap.GetNamespace(), Name: configMap.GetName()},
200+
currentConfigMap)).To(Succeed())
201+
Expect(k8sClient.Delete(context.TODO(), currentConfigMap))
202+
})
203+
})

0 commit comments

Comments
 (0)