diff --git a/api/v1alpha1/bmc_types.go b/api/v1alpha1/bmc_types.go index 5c2d04d60..d6b7d5a5c 100644 --- a/api/v1alpha1/bmc_types.go +++ b/api/v1alpha1/bmc_types.go @@ -206,6 +206,10 @@ type BMCStatus struct { // +optional LastResetTime *metav1.Time `json:"lastResetTime,omitempty"` + // Tasks tracks ongoing and recent BMC operations. + // +optional + Tasks []BMCTask `json:"tasks,omitempty"` + // Conditions represents the latest available observations of the BMC's current state. // +patchStrategy=merge // +patchMergeKey=type @@ -227,6 +231,67 @@ const ( BMCStatePending BMCState = "Pending" ) +// BMCTask represents a single BMC operation task. +type BMCTask struct { + // TaskURI is the URI to monitor the task on the BMC. + // +required + TaskURI string `json:"taskURI"` + + // TaskType indicates the type of operation. + // +required + // +kubebuilder:validation:Enum=DiskErase;BIOSReset;BMCReset;NetworkClear;FirmwareUpdate;ConfigurationChange;AccountManagement;Other + TaskType BMCTaskType `json:"taskType"` + + // TargetID identifies what the task is operating on (e.g., "BIOS", "BMC", "Drive-1"). + // +optional + TargetID string `json:"targetID,omitempty"` + + // State is the current state of the task. + // +optional + State string `json:"state,omitempty"` + + // PercentComplete indicates completion percentage (0-100). + // +optional + PercentComplete int32 `json:"percentComplete,omitempty"` + + // Message provides additional information about the task. + // +optional + Message string `json:"message,omitempty"` + + // LastUpdateTime is when this task status was last updated. + // +optional + LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"` +} + +// BMCTaskType defines the type of BMC task. +type BMCTaskType string + +const ( + // BMCTaskTypeDiskErase indicates a disk erasing task. + BMCTaskTypeDiskErase BMCTaskType = "DiskErase" + + // BMCTaskTypeBIOSReset indicates a BIOS reset task. + BMCTaskTypeBIOSReset BMCTaskType = "BIOSReset" + + // BMCTaskTypeBMCReset indicates a BMC reset task. + BMCTaskTypeBMCReset BMCTaskType = "BMCReset" + + // BMCTaskTypeNetworkClear indicates a network configuration clear task. + BMCTaskTypeNetworkClear BMCTaskType = "NetworkClear" + + // BMCTaskTypeFirmwareUpdate indicates a firmware update task (BIOS or BMC). + BMCTaskTypeFirmwareUpdate BMCTaskType = "FirmwareUpdate" + + // BMCTaskTypeConfigurationChange indicates a configuration change task. + BMCTaskTypeConfigurationChange BMCTaskType = "ConfigurationChange" + + // BMCTaskTypeAccountManagement indicates an account management task. + BMCTaskTypeAccountManagement BMCTaskType = "AccountManagement" + + // BMCTaskTypeOther indicates a task type not covered by the specific types. + BMCTaskTypeOther BMCTaskType = "Other" +) + // +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster diff --git a/api/v1alpha1/server_types.go b/api/v1alpha1/server_types.go index e280f9570..0cc0204a3 100644 --- a/api/v1alpha1/server_types.go +++ b/api/v1alpha1/server_types.go @@ -128,6 +128,10 @@ type ServerSpec struct { // the BIOS configuration for this server. // +optional BIOSSettingsRef *v1.LocalObjectReference `json:"biosSettingsRef,omitempty"` + + // Taints is a list of taints that affect this server. + // +optional + Taints []v1.Taint `json:"taints,omitempty"` } // ServerState defines the possible states of a server. @@ -146,6 +150,10 @@ const ( // ServerStateReserved indicates that the server is reserved for a specific use or user. 
ServerStateReserved ServerState = "Reserved" + // ServerStateTainted indicates that the server is tainted and requires cleaning + // before transitioning back to Available. + ServerStateTainted ServerState = "Tainted" + // ServerStateError indicates that there is an error with the server. ServerStateError ServerState = "Error" diff --git a/api/v1alpha1/servercleaning_types.go b/api/v1alpha1/servercleaning_types.go new file mode 100644 index 000000000..f4fd147e5 --- /dev/null +++ b/api/v1alpha1/servercleaning_types.go @@ -0,0 +1,175 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// ServerCleaningSpec defines the desired cleaning operations +// +kubebuilder:validation:XValidation:rule="has(self.serverRef) || has(self.serverSelector)", message="either serverRef or serverSelector must be specified" +type ServerCleaningSpec struct { + // ServerRef references a specific Server to be cleaned. + // Mutually exclusive with ServerSelector. + // +optional + ServerRef *corev1.LocalObjectReference `json:"serverRef,omitempty"` + + // ServerSelector specifies a label selector to identify servers to be cleaned. + // Mutually exclusive with ServerRef. + // +optional + ServerSelector *metav1.LabelSelector `json:"serverSelector,omitempty"` + + // DiskWipe specifies disk erasing configuration + // +optional + DiskWipe *DiskWipeConfig `json:"diskWipe,omitempty"` + + // BMCReset specifies if BMC should be reset to defaults + // +optional + BMCReset bool `json:"bmcReset,omitempty"` + + // BIOSReset specifies if BIOS should be reset to defaults + // +optional + BIOSReset bool `json:"biosReset,omitempty"` + + // NetworkCleanup specifies if network configurations should be cleared + // +optional + NetworkCleanup bool `json:"networkCleanup,omitempty"` + + // ServerBootConfigurationTemplate defines the boot configuration for cleaning agent + // If not specified, cleaning operations are performed via BMC APIs + // +optional + ServerBootConfigurationTemplate *ServerBootConfigurationTemplate `json:"serverBootConfigurationTemplate,omitempty"` +} + +// DiskWipeConfig defines disk erasing behavior +type DiskWipeConfig struct { + // Method specifies the disk erasing method + // +kubebuilder:validation:Enum=quick;secure;dod + // +kubebuilder:default=quick + Method DiskWipeMethod `json:"method"` + + // IncludeBootDrives specifies whether to erase boot drives + // +optional + IncludeBootDrives bool `json:"includeBootDrives,omitempty"` +} + +// DiskWipeMethod defines the available disk erasing methods +type DiskWipeMethod string + +const ( + // DiskWipeMethodQuick performs a quick erase (single pass) + DiskWipeMethodQuick DiskWipeMethod = "quick" + + // DiskWipeMethodSecure performs a secure erase (3 passes) + DiskWipeMethodSecure DiskWipeMethod = "secure" + + // DiskWipeMethodDoD performs DoD 5220.22-M standard erase (7 passes) + DiskWipeMethodDoD DiskWipeMethod = "dod" +) + +// ServerCleaningState defines the state of the cleaning process +type ServerCleaningState string + +const ( + // ServerCleaningStatePending indicates cleaning is waiting to start + ServerCleaningStatePending ServerCleaningState = "Pending" + + // ServerCleaningStateInProgress indicates cleaning is in progress + ServerCleaningStateInProgress ServerCleaningState = "InProgress" + + // ServerCleaningStateCompleted indicates cleaning completed successfully 
+ ServerCleaningStateCompleted ServerCleaningState = "Completed" + + // ServerCleaningStateFailed indicates cleaning failed + ServerCleaningStateFailed ServerCleaningState = "Failed" +) + +// ServerCleaningStatus defines the observed state of ServerCleaning +type ServerCleaningStatus struct { + // State represents the current state of the cleaning process + // +optional + State ServerCleaningState `json:"state,omitempty"` + + // SelectedServers is the total number of servers selected for cleaning + // +optional + SelectedServers int32 `json:"selectedServers,omitempty"` + + // PendingCleanings is the number of servers with pending cleaning + // +optional + PendingCleanings int32 `json:"pendingCleanings,omitempty"` + + // InProgressCleanings is the number of servers currently being cleaned + // +optional + InProgressCleanings int32 `json:"inProgressCleanings,omitempty"` + + // CompletedCleanings is the number of servers successfully cleaned + // +optional + CompletedCleanings int32 `json:"completedCleanings,omitempty"` + + // FailedCleanings is the number of servers where cleaning failed + // +optional + FailedCleanings int32 `json:"failedCleanings,omitempty"` + + // ServerCleaningStatuses contains per-server cleaning status + // +optional + ServerCleaningStatuses []ServerCleaningStatusEntry `json:"serverCleaningStatuses,omitempty"` + + // Conditions represents the latest available observations + // +patchStrategy=merge + // +patchMergeKey=type + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty" patchStrategy:"merge" patchMergeKey:"type"` +} + +// ServerCleaningStatusEntry represents the cleaning status for a single server +type ServerCleaningStatusEntry struct { + // ServerName is the name of the server + // +required + ServerName string `json:"serverName"` + + // State is the cleaning state for this server + // +required + State ServerCleaningState `json:"state"` + + // Message provides additional information about the cleaning state + // +optional + Message string `json:"message,omitempty"` + + // LastUpdateTime is the last time this status was updated + // +optional + LastUpdateTime metav1.Time `json:"lastUpdateTime,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:scope=Namespaced,shortName=scl +// +kubebuilder:printcolumn:name="Selected",type=integer,JSONPath=`.status.selectedServers` +// +kubebuilder:printcolumn:name="Completed",type=integer,JSONPath=`.status.completedCleanings` +// +kubebuilder:printcolumn:name="InProgress",type=integer,JSONPath=`.status.inProgressCleanings` +// +kubebuilder:printcolumn:name="Failed",type=integer,JSONPath=`.status.failedCleanings` +// +kubebuilder:printcolumn:name="State",type=string,JSONPath=`.status.state` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// ServerCleaning is the Schema for the servercleaning API +type ServerCleaning struct { + metav1.TypeMeta `json:",inline"` + metav1.ObjectMeta `json:"metadata,omitempty"` + + Spec ServerCleaningSpec `json:"spec,omitempty"` + Status ServerCleaningStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// ServerCleaningList contains a list of ServerCleaning +type ServerCleaningList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []ServerCleaning `json:"items"` +} + +func init() { + SchemeBuilder.Register(&ServerCleaning{}, &ServerCleaningList{}) +} diff --git a/api/v1alpha1/zz_generated.deepcopy.go 
b/api/v1alpha1/zz_generated.deepcopy.go index 8b96f5e64..2ade1a876 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -927,6 +927,13 @@ func (in *BMCStatus) DeepCopyInto(out *BMCStatus) { in, out := &in.LastResetTime, &out.LastResetTime *out = (*in).DeepCopy() } + if in.Tasks != nil { + in, out := &in.Tasks, &out.Tasks + *out = make([]BMCTask, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions *out = make([]metav1.Condition, len(*in)) @@ -946,6 +953,22 @@ func (in *BMCStatus) DeepCopy() *BMCStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *BMCTask) DeepCopyInto(out *BMCTask) { + *out = *in + in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BMCTask. +func (in *BMCTask) DeepCopy() *BMCTask { + if in == nil { + return nil + } + out := new(BMCTask) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *BMCUser) DeepCopyInto(out *BMCUser) { *out = *in @@ -1317,6 +1340,21 @@ func (in *ConsoleProtocol) DeepCopy() *ConsoleProtocol { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DiskWipeConfig) DeepCopyInto(out *DiskWipeConfig) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DiskWipeConfig. +func (in *DiskWipeConfig) DeepCopy() *DiskWipeConfig { + if in == nil { + return nil + } + out := new(DiskWipeConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Endpoint) DeepCopyInto(out *Endpoint) { *out = *in @@ -1783,6 +1821,145 @@ func (in *ServerClaimStatus) DeepCopy() *ServerClaimStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServerCleaning) DeepCopyInto(out *ServerCleaning) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerCleaning. +func (in *ServerCleaning) DeepCopy() *ServerCleaning { + if in == nil { + return nil + } + out := new(ServerCleaning) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ServerCleaning) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServerCleaningList) DeepCopyInto(out *ServerCleaningList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]ServerCleaning, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerCleaningList. 
+func (in *ServerCleaningList) DeepCopy() *ServerCleaningList { + if in == nil { + return nil + } + out := new(ServerCleaningList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *ServerCleaningList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServerCleaningSpec) DeepCopyInto(out *ServerCleaningSpec) { + *out = *in + if in.ServerRef != nil { + in, out := &in.ServerRef, &out.ServerRef + *out = new(v1.LocalObjectReference) + **out = **in + } + if in.ServerSelector != nil { + in, out := &in.ServerSelector, &out.ServerSelector + *out = new(metav1.LabelSelector) + (*in).DeepCopyInto(*out) + } + if in.DiskWipe != nil { + in, out := &in.DiskWipe, &out.DiskWipe + *out = new(DiskWipeConfig) + **out = **in + } + if in.ServerBootConfigurationTemplate != nil { + in, out := &in.ServerBootConfigurationTemplate, &out.ServerBootConfigurationTemplate + *out = new(ServerBootConfigurationTemplate) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerCleaningSpec. +func (in *ServerCleaningSpec) DeepCopy() *ServerCleaningSpec { + if in == nil { + return nil + } + out := new(ServerCleaningSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServerCleaningStatus) DeepCopyInto(out *ServerCleaningStatus) { + *out = *in + if in.ServerCleaningStatuses != nil { + in, out := &in.ServerCleaningStatuses, &out.ServerCleaningStatuses + *out = make([]ServerCleaningStatusEntry, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]metav1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerCleaningStatus. +func (in *ServerCleaningStatus) DeepCopy() *ServerCleaningStatus { + if in == nil { + return nil + } + out := new(ServerCleaningStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ServerCleaningStatusEntry) DeepCopyInto(out *ServerCleaningStatusEntry) { + *out = *in + in.LastUpdateTime.DeepCopyInto(&out.LastUpdateTime) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerCleaningStatusEntry. +func (in *ServerCleaningStatusEntry) DeepCopy() *ServerCleaningStatusEntry { + if in == nil { + return nil + } + out := new(ServerCleaningStatusEntry) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ServerList) DeepCopyInto(out *ServerList) { *out = *in @@ -1977,6 +2154,13 @@ func (in *ServerSpec) DeepCopyInto(out *ServerSpec) { *out = new(v1.LocalObjectReference) **out = **in } + if in.Taints != nil { + in, out := &in.Taints, &out.Taints + *out = make([]v1.Taint, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ServerSpec. diff --git a/bmc/bmc.go b/bmc/bmc.go index d9f6634b7..ff90de3b3 100644 --- a/bmc/bmc.go +++ b/bmc/bmc.go @@ -20,6 +20,60 @@ const ( ManufacturerSupermicro Manufacturer = "Supermicro" ) +// DiskWipeMethod defines the disk wiping method +type DiskWipeMethod string + +const ( + // DiskWipeMethodQuick performs a quick wipe (single pass with zeros) + DiskWipeMethodQuick DiskWipeMethod = "quick" + + // DiskWipeMethodSecure performs a secure wipe (3 passes) + DiskWipeMethodSecure DiskWipeMethod = "secure" + + // DiskWipeMethodDoD performs DoD 5220.22-M standard wipe (7 passes) + DiskWipeMethodDoD DiskWipeMethod = "dod" +) + +// CleaningTaskInfo contains information about a cleaning task +type CleaningTaskInfo struct { + // TaskURI is the URI to monitor the task + TaskURI string + // TaskType indicates what type of cleaning task this is + TaskType CleaningTaskType + // TargetID identifies the target resource (e.g., drive ID for disk wipe) + TargetID string +} + +// CleaningTaskType defines the type of cleaning task +type CleaningTaskType string + +const ( + // CleaningTaskTypeDiskErase indicates a disk erasing task + CleaningTaskTypeDiskErase CleaningTaskType = "DiskErase" + // CleaningTaskTypeBIOSReset indicates a BIOS reset task + CleaningTaskTypeBIOSReset CleaningTaskType = "BIOSReset" + // CleaningTaskTypeBMCReset indicates a BMC reset task + CleaningTaskTypeBMCReset CleaningTaskType = "BMCReset" + // CleaningTaskTypeNetworkClear indicates a network config clear task + CleaningTaskTypeNetworkClear CleaningTaskType = "NetworkClear" +) + +// CleaningTaskStatus represents the status of a cleaning task +type CleaningTaskStatus struct { + // TaskURI is the URI to monitor the task + TaskURI string + // State is the current state of the task + State string + // PercentComplete indicates the completion percentage (0-100) + PercentComplete int + // Message provides additional information about the task + Message string + // TaskType indicates what type of cleaning task this is + TaskType CleaningTaskType + // TargetID identifies the target resource + TargetID string +} + // BMC defines an interface for interacting with a Baseboard Management Controller. type BMC interface { // PowerOn powers on the system. @@ -109,6 +163,9 @@ type BMC interface { // GetBMCUpgradeTask retrieves the task for the BMC upgrade. GetBMCUpgradeTask(ctx context.Context, manufacturer string, taskURI string) (*schemas.Task, error) + // GetTaskStatus retrieves the status of a task by its URI. + GetTaskStatus(ctx context.Context, taskURI string) (*schemas.Task, error) + // CreateOrUpdateAccount creates or updates a BMC user account. CreateOrUpdateAccount(ctx context.Context, userName, role, password string, enabled bool) error @@ -120,6 +177,18 @@ type BMC interface { // GetAccountService retrieves the account service. GetAccountService() (*schemas.AccountService, error) + + // EraseDisk initiates disk erasing operation via Redfish. Returns task URIs for long-running operations. 
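	// A hedged usage sketch (caller names are illustrative, not part of this
	// change): start the erase, then poll each returned task via GetTaskStatus
	// until it reports a terminal Redfish TaskState:
	//
	//	tasks, err := bmcClient.EraseDisk(ctx, systemURI, DiskWipeMethodQuick)
	//	if err != nil { /* handle */ }
	//	for _, t := range tasks {
	//		task, _ := bmcClient.GetTaskStatus(ctx, t.TaskURI)
	//		// requeue until the task reports "Completed" or "Exception"
	//	}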
+ EraseDisk(ctx context.Context, systemURI string, method DiskWipeMethod) ([]CleaningTaskInfo, error) + + // ResetBIOSToDefaults resets BIOS configuration to factory defaults. Returns task URI if operation is async. + ResetBIOSToDefaults(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) + + // ResetBMCToDefaults resets BMC configuration to factory defaults. Returns task URI if operation is async. + ResetBMCToDefaults(ctx context.Context, managerUUID string) (*CleaningTaskInfo, error) + + // ClearNetworkConfiguration clears network configuration settings. Returns task URI if operation is async. + ClearNetworkConfiguration(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) } type Entity struct { diff --git a/bmc/cleaning.go b/bmc/cleaning.go new file mode 100644 index 000000000..b807815d1 --- /dev/null +++ b/bmc/cleaning.go @@ -0,0 +1,31 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package bmc + +import ( + "context" + "net/http" + + "github.com/stmcginnis/gofish/schemas" +) + +// CleaningInterface defines methods for OEM-specific cleaning operations +type CleaningInterface interface { + // EraseDisk erases disks using vendor-specific methods + EraseDisk(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) error + + // ResetBIOS resets BIOS to factory defaults + ResetBIOS(ctx context.Context, biosURI string) error + + // ResetBMC resets BMC to factory defaults + ResetBMC(ctx context.Context, manager *schemas.Manager) error + + // ClearNetworkConfig clears network configuration + ClearNetworkConfig(ctx context.Context, systemURI string) error +} + +// HTTPClient interface for making HTTP requests +type HTTPClient interface { + Post(uri string, payload any) (*http.Response, error) +} diff --git a/bmc/cleaning_test.go b/bmc/cleaning_test.go new file mode 100644 index 000000000..41d68bf65 --- /dev/null +++ b/bmc/cleaning_test.go @@ -0,0 +1,96 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package bmc + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" +) + +var _ = Describe("Server Cleaning Operations", func() { + Describe("Vendor-Specific Disk Wipe Configuration", func() { + Describe("Dell Disk Wipe Passes", func() { + It("should return correct pass count for quick wipe", func() { + passes := getDellWipePasses(DiskWipeMethodQuick) + Expect(passes).To(Equal(1)) + }) + + It("should return correct pass count for secure wipe", func() { + passes := getDellWipePasses(DiskWipeMethodSecure) + Expect(passes).To(Equal(3)) + }) + + It("should return correct pass count for DoD wipe", func() { + passes := getDellWipePasses(DiskWipeMethodDoD) + Expect(passes).To(Equal(7)) + }) + + It("should default to 1 pass for unknown method", func() { + passes := getDellWipePasses("unknown") + Expect(passes).To(Equal(1)) + }) + }) + + Describe("HPE Wipe Type", func() { + It("should return correct type for quick wipe", func() { + wipeType := getHPEWipeType(DiskWipeMethodQuick) + Expect(wipeType).To(Equal("BlockErase")) + }) + + It("should return correct type for secure wipe", func() { + wipeType := getHPEWipeType(DiskWipeMethodSecure) + Expect(wipeType).To(Equal("Overwrite")) + }) + + It("should return correct type for DoD wipe", func() { + wipeType := getHPEWipeType(DiskWipeMethodDoD) + Expect(wipeType).To(Equal("CryptographicErase")) + }) + + It("should default to BlockErase for unknown method", func() { + wipeType := getHPEWipeType("unknown") + Expect(wipeType).To(Equal("BlockErase")) + }) + }) + + Describe("Lenovo Wipe Method", func() { + It("should return correct method for quick wipe", func() { + method := getLenovoWipeMethod(DiskWipeMethodQuick) + Expect(method).To(Equal("Simple")) + }) + + It("should return correct method for secure wipe", func() { + method := getLenovoWipeMethod(DiskWipeMethodSecure) + Expect(method).To(Equal("Cryptographic")) + }) + + It("should return correct method for DoD wipe", func() { + method := getLenovoWipeMethod(DiskWipeMethodDoD) + Expect(method).To(Equal("Sanitize")) + }) + + It("should default to Simple for unknown method", func() { + method := getLenovoWipeMethod("unknown") + Expect(method).To(Equal("Simple")) + }) + }) + }) + + Describe("DiskWipeMethod Constants", func() { + It("should have expected constant values", func() { + Expect(DiskWipeMethodQuick).To(Equal(DiskWipeMethod("quick"))) + Expect(DiskWipeMethodSecure).To(Equal(DiskWipeMethod("secure"))) + Expect(DiskWipeMethodDoD).To(Equal(DiskWipeMethod("dod"))) + }) + }) + + Describe("Manufacturer Constants", func() { + It("should have expected manufacturer values", func() { + Expect(ManufacturerDell).To(Equal(Manufacturer("Dell Inc."))) + Expect(ManufacturerHPE).To(Equal(Manufacturer("HPE"))) + Expect(ManufacturerLenovo).To(Equal(Manufacturer("Lenovo"))) + Expect(ManufacturerSupermicro).To(Equal(Manufacturer("Supermicro"))) + }) + }) +}) diff --git a/bmc/dell_cleaning.go b/bmc/dell_cleaning.go new file mode 100644 index 000000000..b0ba72197 --- /dev/null +++ b/bmc/dell_cleaning.go @@ -0,0 +1,143 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package bmc + +import ( + "context" + "fmt" + "io" + + "github.com/stmcginnis/gofish/schemas" + ctrl "sigs.k8s.io/controller-runtime" +) + +// DellCleaning implements cleaning operations for Dell servers +type DellCleaning struct { + client HTTPClient +} + +// NewDellCleaning creates a new DellCleaning instance +func NewDellCleaning(client HTTPClient) *DellCleaning { + return &DellCleaning{client: client} +} + +// 
EraseDisk performs disk erasing for Dell servers using iDRAC OEM extensions
+func (d *DellCleaning) EraseDisk(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Dell iDRAC supports secure erase via Storage Controller actions
+	for _, storage := range storages {
+		drives, err := storage.Drives()
+		if err != nil {
+			log.Error(err, "Failed to get drives for storage", "storage", storage.Name)
+			continue
+		}
+
+		for _, drive := range drives {
+			// Construct OEM action URI for Dell
+			// Dell uses: /redfish/v1/Systems/{id}/Storage/{storageId}/Drives/{driveId}/Actions/Drive.SecureErase
+			actionURI := fmt.Sprintf("%s/Actions/Drive.SecureErase", drive.ODataID)
+
+			payload := map[string]any{
+				"OverwritePasses": getDellWipePasses(method),
+			}
+
+			log.V(1).Info("Initiating Dell drive wipe", "drive", drive.Name, "uri", actionURI)
+
+			resp, err := d.client.Post(actionURI, payload)
+			if err != nil {
+				log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name)
+				continue
+			}
+			if resp.StatusCode >= 300 {
+				body, _ := io.ReadAll(resp.Body)
+				_ = resp.Body.Close()
+				log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive",
+					"drive", drive.Name, "status", resp.StatusCode, "body", string(body))
+				continue
+			}
+			_ = resp.Body.Close()
+		}
+	}
+
+	return nil
+}
+
+// ResetBIOS resets BIOS configuration to factory defaults for Dell servers
+func (d *DellCleaning) ResetBIOS(ctx context.Context, biosURI string) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Dell iDRAC: POST to /redfish/v1/Systems/{id}/Bios/Actions/Bios.ResetBios
+	actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI)
+
+	log.V(1).Info("Resetting Dell BIOS to defaults", "uri", actionURI)
+
+	resp, err := d.client.Post(actionURI, map[string]any{})
+	if err != nil {
+		return fmt.Errorf("failed to reset BIOS: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 300 {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body))
+	}
+
+	return nil
+}
+
+// ResetBMC resets BMC configuration to factory defaults for Dell servers
+func (d *DellCleaning) ResetBMC(ctx context.Context, manager *schemas.Manager) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Dell iDRAC: Use OEM action to reset to defaults
+	// /redfish/v1/Managers/{id}/Actions/Oem/DellManager.ResetToDefaults
+	actionURI := fmt.Sprintf("%s/Actions/Oem/DellManager.ResetToDefaults", manager.ODataID)
+
+	payload := map[string]any{
+		"ResetType": "ResetAllWithRootDefaults",
+	}
+
+	log.V(1).Info("Resetting Dell iDRAC to defaults", "uri", actionURI)
+
+	resp, err := d.client.Post(actionURI, payload)
+	if err != nil {
+		return fmt.Errorf("failed to reset BMC: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 300 {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body))
+	}
+
+	return nil
+}
+
+// ClearNetworkConfig clears network configuration for Dell servers
+func (d *DellCleaning) ClearNetworkConfig(ctx context.Context, systemURI string) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Dell: Clear network adapters configuration via OEM extensions
+	// This typically involves resetting NIC settings to defaults
+	actionURI := fmt.Sprintf("%s/NetworkAdapters/Actions/Oem/DellNetworkAdapter.ClearConfiguration", systemURI)
+
+	log.V(1).Info("Clearing Dell network configuration", "uri", actionURI)
+
+	resp, err := d.client.Post(actionURI, map[string]any{})
+	if err != nil {
+		// Network config clear might not be critical, log and continue
+		log.Error(err, "Failed to clear network configuration (non-critical)")
+		return nil
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 300 {
+		body, _ := io.ReadAll(resp.Body)
+		log.Error(fmt.Errorf("network config clear failed"), "Failed with status",
+			"status", resp.StatusCode, "body", string(body))
+	}
+
+	return nil
+}
diff --git a/bmc/hpe_cleaning.go b/bmc/hpe_cleaning.go
new file mode 100644
index 000000000..9b9545026
--- /dev/null
+++ b/bmc/hpe_cleaning.go
@@ -0,0 +1,141 @@
+// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
+// SPDX-License-Identifier: Apache-2.0
+
+package bmc
+
+import (
+	"context"
+	"fmt"
+	"io"
+
+	"github.com/stmcginnis/gofish/schemas"
+	ctrl "sigs.k8s.io/controller-runtime"
+)
+
+// HPECleaning implements cleaning operations for HPE servers
+type HPECleaning struct {
+	client HTTPClient
+}
+
+// NewHPECleaning creates a new HPECleaning instance
+func NewHPECleaning(client HTTPClient) *HPECleaning {
+	return &HPECleaning{client: client}
+}
+
+// EraseDisk performs disk erasing for HPE servers using iLO OEM extensions
+func (h *HPECleaning) EraseDisk(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// HPE iLO supports sanitize operations via OEM extensions
+	for _, storage := range storages {
+		drives, err := storage.Drives()
+		if err != nil {
+			log.Error(err, "Failed to get drives for storage", "storage", storage.Name)
+			continue
+		}
+
+		for _, drive := range drives {
+			// HPE OEM action: /redfish/v1/Systems/{id}/Storage/{storageId}/Drives/{driveId}/Actions/Oem/Hpe/HpeDrive.SecureErase
+			actionURI := fmt.Sprintf("%s/Actions/Oem/Hpe/HpeDrive.SecureErase", drive.ODataID)
+
+			payload := map[string]any{
+				"SanitizeType": getHPEWipeType(method),
+			}
+
+			log.V(1).Info("Initiating HPE drive wipe", "drive", drive.Name, "uri", actionURI)
+
+			resp, err := h.client.Post(actionURI, payload)
+			if err != nil {
+				log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name)
+				continue
+			}
+			if resp.StatusCode >= 300 {
+				body, _ := io.ReadAll(resp.Body)
+				_ = resp.Body.Close()
+				log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive",
+					"drive", drive.Name, "status", resp.StatusCode, "body", string(body))
+				continue
+			}
+			_ = resp.Body.Close()
+		}
+	}
+
+	return nil
+}
+
+// ResetBIOS resets BIOS configuration to factory defaults for HPE servers
+func (h *HPECleaning) ResetBIOS(ctx context.Context, biosURI string) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// HPE iLO: Use ResetBios action
+	// /redfish/v1/Systems/{id}/Bios/Actions/Bios.ResetBios
+	actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI)
+
+	log.V(1).Info("Resetting HPE BIOS to defaults", "uri", actionURI)
+
+	resp, err := h.client.Post(actionURI, map[string]any{})
+	if err != nil {
+		return fmt.Errorf("failed to reset BIOS: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 300 {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body))
+	}
+
+	return nil
+}
+
+// ResetBMC resets BMC configuration to factory defaults for HPE servers
+func (h *HPECleaning) ResetBMC(ctx context.Context, manager *schemas.Manager) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// HPE iLO: Use OEM action to reset to factory defaults
+	// /redfish/v1/Managers/{id}/Actions/Oem/Hpe/HpiLO.ResetToFactoryDefaults
+	actionURI := fmt.Sprintf("%s/Actions/Oem/Hpe/HpiLO.ResetToFactoryDefaults", manager.ODataID)
+
+	payload := map[string]any{
+		"ResetType": "Default",
+	}
+
+	log.V(1).Info("Resetting HPE iLO to defaults", "uri", actionURI)
+
+	resp, err := h.client.Post(actionURI, payload)
+	if err != nil {
+		return fmt.Errorf("failed to reset BMC: %w", err)
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 300 {
+		body, _ := io.ReadAll(resp.Body)
+		return fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body))
+	}
+
+	return nil
+}
+
+// ClearNetworkConfig clears network configuration for HPE servers
+func (h *HPECleaning) ClearNetworkConfig(ctx context.Context, systemURI string) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// HPE: Clear network adapters configuration
+	actionURI := fmt.Sprintf("%s/NetworkAdapters/Actions/Oem/Hpe/HpeNetworkAdapter.ClearConfiguration", systemURI)
+
+	log.V(1).Info("Clearing HPE network configuration", "uri", actionURI)
+
+	resp, err := h.client.Post(actionURI, map[string]any{})
+	if err != nil {
+		log.Error(err, "Failed to clear network configuration (non-critical)")
+		return nil
+	}
+	defer func() { _ = resp.Body.Close() }()
+
+	if resp.StatusCode >= 300 {
+		body, _ := io.ReadAll(resp.Body)
+		log.Error(fmt.Errorf("network config clear failed"), "Failed with status",
+			"status", resp.StatusCode, "body", string(body))
+	}
+
+	return nil
+}
diff --git a/bmc/lenovo_cleaning.go b/bmc/lenovo_cleaning.go
new file mode 100644
index 000000000..be3bcc959
--- /dev/null
+++ b/bmc/lenovo_cleaning.go
@@ -0,0 +1,140 @@
+// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
+// SPDX-License-Identifier: Apache-2.0
+
+package bmc
+
+import (
+	"context"
+	"fmt"
+	"io"
+
+	"github.com/stmcginnis/gofish/schemas"
+	ctrl "sigs.k8s.io/controller-runtime"
+)
+
+// LenovoCleaning implements cleaning operations for Lenovo servers
+type LenovoCleaning struct {
+	client HTTPClient
+}
+
+// NewLenovoCleaning creates a new LenovoCleaning instance
+func NewLenovoCleaning(client HTTPClient) *LenovoCleaning {
+	return &LenovoCleaning{client: client}
+}
+
+// EraseDisk performs disk erasing for Lenovo servers using XClarity OEM extensions
+func (l *LenovoCleaning) EraseDisk(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Lenovo XClarity supports secure erase via OEM extensions
+	for _, storage := range storages {
+		drives, err := storage.Drives()
+		if err != nil {
+			log.Error(err, "Failed to get drives for storage", "storage", storage.Name)
+			continue
+		}
+
+		for _, drive := range drives {
+			// Lenovo OEM action path
+			actionURI := fmt.Sprintf("%s/Actions/Drive.SecureErase", drive.ODataID)
+
+			payload := map[string]any{
+				"EraseMethod": getLenovoWipeMethod(method),
+			}
+
+			log.V(1).Info("Initiating Lenovo drive wipe", "drive", drive.Name, "uri", actionURI)
+
+			resp, err := l.client.Post(actionURI, payload)
+			if err != nil {
+				log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name)
+				continue
+			}
+			if resp.StatusCode >= 300 {
+				body, _ := io.ReadAll(resp.Body)
+				_ = resp.Body.Close()
+				log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive",
+					"drive", drive.Name, "status", resp.StatusCode, "body", string(body))
+				continue
+			}
+			_ = resp.Body.Close()
+		}
+	}
+
+	return nil
+}
+
+// ResetBIOS resets BIOS configuration to factory defaults for Lenovo servers
+func (l
*LenovoCleaning) ResetBIOS(ctx context.Context, biosURI string) error { + log := ctrl.LoggerFrom(ctx) + + // Lenovo XClarity: POST to reset action + actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI) + + log.V(1).Info("Resetting Lenovo BIOS to defaults", "uri", actionURI) + + resp, err := l.client.Post(actionURI, map[string]any{}) + if err != nil { + return fmt.Errorf("failed to reset BIOS: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + return nil +} + +// ResetBMC resets BMC configuration to factory defaults for Lenovo servers +func (l *LenovoCleaning) ResetBMC(ctx context.Context, manager *schemas.Manager) error { + log := ctrl.LoggerFrom(ctx) + + // Lenovo XClarity: Use OEM action to reset to factory defaults + // /redfish/v1/Managers/{id}/Actions/Manager.ResetToDefaults + actionURI := fmt.Sprintf("%s/Actions/Manager.ResetToDefaults", manager.ODataID) + + payload := map[string]any{ + "ResetToDefaultsType": "ResetAll", + } + + log.V(1).Info("Resetting Lenovo XCC to defaults", "uri", actionURI) + + resp, err := l.client.Post(actionURI, payload) + if err != nil { + return fmt.Errorf("failed to reset BMC: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + return nil +} + +// ClearNetworkConfig clears network configuration for Lenovo servers +func (l *LenovoCleaning) ClearNetworkConfig(ctx context.Context, systemURI string) error { + log := ctrl.LoggerFrom(ctx) + + // Lenovo: Clear network adapters configuration + actionURI := fmt.Sprintf("%s/NetworkAdapters/Actions/NetworkAdapter.ClearConfiguration", systemURI) + + log.V(1).Info("Clearing Lenovo network configuration", "uri", actionURI) + + resp, err := l.client.Post(actionURI, map[string]any{}) + if err != nil { + log.Error(err, "Failed to clear network configuration (non-critical)") + return nil + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + log.Error(fmt.Errorf("network config clear failed"), "Failed with status", + "status", resp.StatusCode, "body", string(body)) + } + + return nil +} diff --git a/bmc/redfish.go b/bmc/redfish.go index 08d3d195e..c754a1bd8 100644 --- a/bmc/redfish.go +++ b/bmc/redfish.go @@ -12,6 +12,7 @@ import ( "io" "maps" "math/big" + "net/http" "slices" "strings" "time" @@ -873,6 +874,33 @@ func (r *RedfishBaseBMC) GetBMCUpgradeTask(_ context.Context, _ string, _ string return nil, fmt.Errorf("firmware upgrade task not supported for manufacturer %q", r.manufacturer) } +// GetTaskStatus retrieves the status of a task by its URI. 
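+// The URI is typically taken from the Location header of an asynchronous
+// Redfish action response. Callers are expected to poll until the returned
+// task reaches a terminal state; a minimal polling sketch (hedged, assuming
+// gofish-style TaskState values, not part of this change):
+//
+//	task, err := r.GetTaskStatus(ctx, taskURI)
+//	if err == nil && (task.TaskState == "Completed" || task.TaskState == "Exception") {
+//		// terminal: surface PercentComplete and Messages to the caller
+//	}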
+func (r *RedfishBaseBMC) GetTaskStatus(ctx context.Context, taskURI string) (*schemas.Task, error) { + log := ctrl.LoggerFrom(ctx) + client := r.client.GetService().GetClient() + + resp, err := client.Get(taskURI) + if err != nil { + return nil, fmt.Errorf("failed to get task status: %w", err) + } + defer func() { + if closeErr := resp.Body.Close(); closeErr != nil { + log.Error(closeErr, "Failed to close response body") + } + }() + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("unexpected status code %d when getting task status", resp.StatusCode) + } + + var task schemas.Task + if err := json.NewDecoder(resp.Body).Decode(&task); err != nil { + return nil, fmt.Errorf("failed to decode task response: %w", err) + } + + return &task, nil +} + const ( charLower = "abcdefghijklmnopqrstuvwxyz" charUpper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" @@ -975,3 +1003,832 @@ func shuffleRunes(a []rune) error { } return nil } + +// extractTaskURIFromResponse extracts the task URI from HTTP response headers or body +func (r *RedfishBaseBMC) extractTaskURIFromResponse(resp *http.Response) string { + // Check Location header (standard Redfish async response) + if location := resp.Header.Get("Location"); location != "" { + return location + } + + // Check for task monitor in response body + if resp.Body != nil { + body, err := io.ReadAll(resp.Body) + if err == nil { + var taskResponse struct { + TaskMonitor string `json:"@odata.id"` + } + if err := json.Unmarshal(body, &taskResponse); err == nil && taskResponse.TaskMonitor != "" { + return taskResponse.TaskMonitor + } + } + } + + return "" +} + +// EraseDisk initiates disk erasing operation via Redfish. +// This implementation uses vendor-specific OEM extensions when available. +func (r *RedfishBaseBMC) EraseDisk(ctx context.Context, systemURI string, method DiskWipeMethod) ([]CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Erasing disks", "systemURI", systemURI, "method", method) + + system, err := r.getSystemFromUri(ctx, systemURI) + if err != nil { + return nil, fmt.Errorf("failed to get computer system: %w", err) + } + + manufacturer := system.Manufacturer + log.V(1).Info("Detected manufacturer", "manufacturer", manufacturer) + + // Get system storage + systemStorage, err := system.Storage() + if err != nil { + return nil, fmt.Errorf("failed to get storage: %w", err) + } + + if len(systemStorage) == 0 { + log.V(1).Info("No storage devices found") + return nil, nil + } + + // Use OEM-specific wipe if available + switch Manufacturer(manufacturer) { + case ManufacturerDell: + return r.wipeDiskDell(ctx, systemStorage, method) + case ManufacturerHPE: + return r.wipeDiskHPE(ctx, systemStorage, method) + case ManufacturerLenovo: + return r.wipeDiskLenovo(ctx, systemStorage, method) + default: + // Generic Redfish SecureErase + return r.wipeDiskGeneric(ctx, systemStorage, method) + } +} + +// wipeDiskDell performs disk wiping for Dell servers using iDRAC OEM extensions +func (r *RedfishBaseBMC) wipeDiskDell(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) ([]CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + var tasks []CleaningTaskInfo + + // Dell iDRAC supports secure erase via Storage Controller actions + for _, storage := range storages { + drives, err := storage.Drives() + if err != nil { + log.Error(err, "Failed to get drives for storage", "storage", storage.Name) + continue + } + + for _, drive := range drives { + // Construct OEM action URI for Dell + // Dell uses: 
/redfish/v1/Systems/{id}/Storage/{storageId}/Drives/{driveId}/Actions/Drive.SecureErase + actionURI := fmt.Sprintf("%s/Actions/Drive.SecureErase", drive.ODataID) + + payload := map[string]any{ + "OverwritePasses": getDellWipePasses(method), + } + + log.V(1).Info("Initiating Dell drive wipe", "drive", drive.Name, "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name) + continue + } + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive", + "drive", drive.Name, "status", resp.StatusCode, "body", string(body)) + continue + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + _ = resp.Body.Close() + + if taskURI != "" { + tasks = append(tasks, CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeDiskErase, + TargetID: drive.ID, + }) + log.V(1).Info("Dell disk wipe task created", "drive", drive.Name, "taskURI", taskURI) + } else { + log.V(1).Info("Dell disk wipe completed synchronously", "drive", drive.Name) + } + } + } + + return tasks, nil +} + +func getDellWipePasses(method DiskWipeMethod) int { + switch method { + case DiskWipeMethodQuick: + return 1 + case DiskWipeMethodSecure: + return 3 + case DiskWipeMethodDoD: + return 7 + default: + return 1 + } +} + +// wipeDiskHPE performs disk wiping for HPE servers using iLO OEM extensions +func (r *RedfishBaseBMC) wipeDiskHPE(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) ([]CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + var tasks []CleaningTaskInfo + + // HPE iLO supports sanitize operations via OEM extensions + for _, storage := range storages { + drives, err := storage.Drives() + if err != nil { + log.Error(err, "Failed to get drives for storage", "storage", storage.Name) + continue + } + + for _, drive := range drives { + // HPE OEM action: /redfish/v1/Systems/{id}/Storage/{storageId}/Drives/{driveId}/Actions/Oem/Hpe/HpeDrive.SecureErase + actionURI := fmt.Sprintf("%s/Actions/Oem/Hpe/HpeDrive.SecureErase", drive.ODataID) + + payload := map[string]any{ + "SanitizeType": getHPEWipeType(method), + } + + log.V(1).Info("Initiating HPE drive wipe", "drive", drive.Name, "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name) + continue + } + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive", + "drive", drive.Name, "status", resp.StatusCode, "body", string(body)) + continue + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + _ = resp.Body.Close() + + if taskURI != "" { + tasks = append(tasks, CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeDiskErase, + TargetID: drive.ID, + }) + log.V(1).Info("HPE disk wipe task created", "drive", drive.Name, "taskURI", taskURI) + } else { + log.V(1).Info("HPE disk wipe completed synchronously", "drive", drive.Name) + } + } + } + + return tasks, nil +} + +func getHPEWipeType(method DiskWipeMethod) string { + switch method { + case DiskWipeMethodQuick: + return "BlockErase" + case DiskWipeMethodSecure: + return "Overwrite" + case DiskWipeMethodDoD: + return "CryptographicErase" + default: + return "BlockErase" + } +} + +// wipeDiskLenovo performs 
disk wiping for Lenovo servers using XClarity OEM extensions +func (r *RedfishBaseBMC) wipeDiskLenovo(ctx context.Context, storages []*schemas.Storage, method DiskWipeMethod) ([]CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + var tasks []CleaningTaskInfo + + // Lenovo XClarity supports secure erase via OEM extensions + for _, storage := range storages { + drives, err := storage.Drives() + if err != nil { + log.Error(err, "Failed to get drives for storage", "storage", storage.Name) + continue + } + + for _, drive := range drives { + // Lenovo OEM action path + actionURI := fmt.Sprintf("%s/Actions/Drive.SecureErase", drive.ODataID) + + payload := map[string]any{ + "EraseMethod": getLenovoWipeMethod(method), + } + + log.V(1).Info("Initiating Lenovo drive wipe", "drive", drive.Name, "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name) + continue + } + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive", + "drive", drive.Name, "status", resp.StatusCode, "body", string(body)) + continue + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + _ = resp.Body.Close() + + if taskURI != "" { + tasks = append(tasks, CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeDiskErase, + TargetID: drive.ID, + }) + log.V(1).Info("Lenovo disk wipe task created", "drive", drive.Name, "taskURI", taskURI) + } else { + log.V(1).Info("Lenovo disk wipe completed synchronously", "drive", drive.Name) + } + } + } + + return tasks, nil +} + +func getLenovoWipeMethod(method DiskWipeMethod) string { + switch method { + case DiskWipeMethodQuick: + return "Simple" + case DiskWipeMethodSecure: + return "Cryptographic" + case DiskWipeMethodDoD: + return "Sanitize" + default: + return "Simple" + } +} + +// wipeDiskGeneric performs generic Redfish disk wiping for unsupported vendors +func (r *RedfishBaseBMC) wipeDiskGeneric(ctx context.Context, storages []*schemas.Storage, _ DiskWipeMethod) ([]CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Using generic Redfish disk wipe") + var tasks []CleaningTaskInfo + + // Standard Redfish SecureErase action + for _, storage := range storages { + drives, err := storage.Drives() + if err != nil { + log.Error(err, "Failed to get drives for storage", "storage", storage.Name) + continue + } + + for _, drive := range drives { + actionURI := fmt.Sprintf("%s/Actions/Drive.SecureErase", drive.ODataID) + + payload := map[string]any{} + + log.V(1).Info("Initiating generic drive wipe", "drive", drive.Name, "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + log.Error(err, "Failed to initiate disk wipe for drive", "drive", drive.Name) + continue + } + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + log.Error(fmt.Errorf("wipe request failed"), "Failed to wipe drive", + "drive", drive.Name, "status", resp.StatusCode, "body", string(body)) + continue + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + _ = resp.Body.Close() + + if taskURI != "" { + tasks = append(tasks, CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeDiskErase, + TargetID: drive.ID, + }) + log.V(1).Info("Generic disk wipe task created", "drive", drive.Name, "taskURI", taskURI) + } else { + 
log.V(1).Info("Generic disk wipe completed synchronously", "drive", drive.Name) + } + } + } + + return tasks, nil +} + +// ResetBIOSToDefaults resets BIOS configuration to factory defaults +func (r *RedfishBaseBMC) ResetBIOSToDefaults(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Resetting BIOS to defaults", "systemURI", systemURI) + + system, err := r.getSystemFromUri(ctx, systemURI) + if err != nil { + return nil, fmt.Errorf("failed to get computer system: %w", err) + } + + manufacturer := system.Manufacturer + log.V(1).Info("Detected manufacturer", "manufacturer", manufacturer) + + // Get BIOS + bios, err := system.Bios() + if err != nil { + return nil, fmt.Errorf("failed to get BIOS for system %s: %w", systemURI, err) + } + + biosURI := bios.ODataID + if biosURI == "" { + return nil, fmt.Errorf("BIOS URI not found for system %s", systemURI) + } + + // Use vendor-specific reset methods + switch Manufacturer(manufacturer) { + case ManufacturerDell: + return r.resetBIOSDell(ctx, biosURI) + case ManufacturerHPE: + return r.resetBIOSHPE(ctx, biosURI) + case ManufacturerLenovo: + return r.resetBIOSLenovo(ctx, biosURI) + default: + return r.resetBIOSGeneric(ctx, biosURI) + } +} + +func (r *RedfishBaseBMC) resetBIOSDell(ctx context.Context, biosURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Dell iDRAC: POST to /redfish/v1/Systems/{id}/Bios/Actions/Bios.ResetBios + actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI) + + log.V(1).Info("Resetting Dell BIOS to defaults", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + return nil, fmt.Errorf("failed to reset BIOS: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Dell BIOS reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBIOSReset, + TargetID: biosURI, + }, nil + } + + log.V(1).Info("Dell BIOS reset completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) resetBIOSHPE(ctx context.Context, biosURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // HPE iLO: Use ChangePassword action with default parameters + // /redfish/v1/Systems/{id}/Bios/Actions/Bios.ResetBios + actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI) + + log.V(1).Info("Resetting HPE BIOS to defaults", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + return nil, fmt.Errorf("failed to reset BIOS: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("HPE BIOS reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBIOSReset, + TargetID: biosURI, + }, nil + } + + log.V(1).Info("HPE BIOS reset completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) resetBIOSLenovo(ctx context.Context, biosURI string) 
(*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Lenovo XClarity: POST to reset action + actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI) + + log.V(1).Info("Resetting Lenovo BIOS to defaults", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + return nil, fmt.Errorf("failed to reset BIOS: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Lenovo BIOS reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBIOSReset, + TargetID: biosURI, + }, nil + } + + log.V(1).Info("Lenovo BIOS reset completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) resetBIOSGeneric(ctx context.Context, biosURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Generic Redfish: Try standard ResetBios action + actionURI := fmt.Sprintf("%s/Actions/Bios.ResetBios", biosURI) + + log.V(1).Info("Resetting BIOS to defaults (generic)", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + return nil, fmt.Errorf("failed to reset BIOS: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BIOS reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Generic BIOS reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBIOSReset, + TargetID: biosURI, + }, nil + } + + log.V(1).Info("Generic BIOS reset completed synchronously") + return nil, nil +} + +// ResetBMCToDefaults resets BMC configuration to factory defaults +func (r *RedfishBaseBMC) ResetBMCToDefaults(ctx context.Context, managerUUID string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Resetting BMC to defaults", "managerUUID", managerUUID) + + manager, err := r.GetManager(managerUUID) + if err != nil { + return nil, fmt.Errorf("failed to get manager: %w", err) + } + + manufacturer := manager.Manufacturer + log.V(1).Info("Detected manufacturer", "manufacturer", manufacturer) + + // Use vendor-specific reset methods + switch Manufacturer(manufacturer) { + case ManufacturerDell: + return r.resetBMCDell(ctx, manager) + case ManufacturerHPE: + return r.resetBMCHPE(ctx, manager) + case ManufacturerLenovo: + return r.resetBMCLenovo(ctx, manager) + default: + return r.resetBMCGeneric(ctx, manager) + } +} + +func (r *RedfishBaseBMC) resetBMCDell(ctx context.Context, manager *schemas.Manager) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Dell iDRAC: Use OEM action to reset to defaults + // /redfish/v1/Managers/{id}/Actions/Oem/DellManager.ResetToDefaults + actionURI := fmt.Sprintf("%s/Actions/Oem/DellManager.ResetToDefaults", manager.ODataID) + + payload := map[string]any{ + "ResetType": "ResetAllWithRootDefaults", + } + + log.V(1).Info("Resetting Dell iDRAC to defaults", "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + return nil, fmt.Errorf("failed to reset BMC: %w", err) + } + defer 
func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Dell BMC reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBMCReset, + TargetID: manager.ID, + }, nil + } + + log.V(1).Info("Dell BMC reset completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) resetBMCHPE(ctx context.Context, manager *schemas.Manager) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // HPE iLO: Use OEM action to reset to factory defaults + // /redfish/v1/Managers/{id}/Actions/Oem/Hpe/HpiLO.ResetToFactoryDefaults + actionURI := fmt.Sprintf("%s/Actions/Oem/Hpe/HpiLO.ResetToFactoryDefaults", manager.ODataID) + + payload := map[string]any{ + "ResetType": "Default", + } + + log.V(1).Info("Resetting HPE iLO to defaults", "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + return nil, fmt.Errorf("failed to reset BMC: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("HPE BMC reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBMCReset, + TargetID: manager.ID, + }, nil + } + + log.V(1).Info("HPE BMC reset completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) resetBMCLenovo(ctx context.Context, manager *schemas.Manager) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Lenovo XClarity: Use OEM action to reset to factory defaults + // /redfish/v1/Managers/{id}/Actions/Manager.ResetToDefaults + actionURI := fmt.Sprintf("%s/Actions/Manager.ResetToDefaults", manager.ODataID) + + payload := map[string]any{ + "ResetToDefaultsType": "ResetAll", + } + + log.V(1).Info("Resetting Lenovo XCC to defaults", "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + return nil, fmt.Errorf("failed to reset BMC: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Lenovo BMC reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBMCReset, + TargetID: manager.ID, + }, nil + } + + log.V(1).Info("Lenovo BMC reset completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) resetBMCGeneric(ctx context.Context, manager *schemas.Manager) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Generic Redfish: Try standard ResetToDefaults action + actionURI := fmt.Sprintf("%s/Actions/Manager.ResetToDefaults", manager.ODataID) + + payload := map[string]any{ + "ResetToDefaultsType": "ResetAll", + } + + log.V(1).Info("Resetting BMC to defaults (generic)", "uri", actionURI) + + resp, err := r.client.Post(actionURI, payload) + if err != nil { + return nil, 
fmt.Errorf("failed to reset BMC: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("BMC reset failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Generic BMC reset task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeBMCReset, + TargetID: manager.ID, + }, nil + } + + log.V(1).Info("Generic BMC reset completed synchronously") + return nil, nil +} + +// ClearNetworkConfiguration clears network configuration settings +func (r *RedfishBaseBMC) ClearNetworkConfiguration(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Clearing network configuration", "systemURI", systemURI) + + system, err := r.getSystemFromUri(ctx, systemURI) + if err != nil { + return nil, fmt.Errorf("failed to get computer system: %w", err) + } + + manufacturer := system.Manufacturer + log.V(1).Info("Detected manufacturer", "manufacturer", manufacturer) + + // Use vendor-specific methods when available + switch Manufacturer(manufacturer) { + case ManufacturerDell: + return r.clearNetworkConfigDell(ctx, systemURI) + case ManufacturerHPE: + return r.clearNetworkConfigHPE(ctx, systemURI) + case ManufacturerLenovo: + return r.clearNetworkConfigLenovo(ctx, systemURI) + default: + return r.clearNetworkConfigGeneric(ctx, systemURI) + } +} + +func (r *RedfishBaseBMC) clearNetworkConfigDell(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Dell: Clear network adapters configuration via OEM extensions + // This typically involves resetting NIC settings to defaults + actionURI := fmt.Sprintf("%s/NetworkAdapters/Actions/Oem/DellNetworkAdapter.ClearConfiguration", systemURI) + + log.V(1).Info("Clearing Dell network configuration", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + // Network config clear might not be critical, log and continue + log.Error(err, "Failed to clear network configuration (non-critical)") + return nil, nil + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + log.Error(fmt.Errorf("network config clear failed"), "Failed with status", + "status", resp.StatusCode, "body", string(body)) + return nil, nil + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Dell network config clear task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeNetworkClear, + TargetID: systemURI, + }, nil + } + + log.V(1).Info("Dell network config clear completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) clearNetworkConfigHPE(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // HPE: Clear network adapters configuration + actionURI := fmt.Sprintf("%s/NetworkAdapters/Actions/Oem/Hpe/HpeNetworkAdapter.ClearConfiguration", systemURI) + + log.V(1).Info("Clearing HPE network configuration", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + log.Error(err, "Failed to clear network configuration (non-critical)") + return nil, nil + } + defer func() { _ = resp.Body.Close() }() + + if 
resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + log.Error(fmt.Errorf("network config clear failed"), "Failed with status", + "status", resp.StatusCode, "body", string(body)) + return nil, nil + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("HPE network config clear task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeNetworkClear, + TargetID: systemURI, + }, nil + } + + log.V(1).Info("HPE network config clear completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) clearNetworkConfigLenovo(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + + // Lenovo: Clear network adapters configuration + actionURI := fmt.Sprintf("%s/NetworkAdapters/Actions/NetworkAdapter.ClearConfiguration", systemURI) + + log.V(1).Info("Clearing Lenovo network configuration", "uri", actionURI) + + resp, err := r.client.Post(actionURI, map[string]any{}) + if err != nil { + log.Error(err, "Failed to clear network configuration (non-critical)") + return nil, nil + } + defer func() { _ = resp.Body.Close() }() + + if resp.StatusCode >= 300 { + body, _ := io.ReadAll(resp.Body) + log.Error(fmt.Errorf("network config clear failed"), "Failed with status", + "status", resp.StatusCode, "body", string(body)) + return nil, nil + } + + // Extract task URI from response + taskURI := r.extractTaskURIFromResponse(resp) + if taskURI != "" { + log.V(1).Info("Lenovo network config clear task created", "taskURI", taskURI) + return &CleaningTaskInfo{ + TaskURI: taskURI, + TaskType: CleaningTaskTypeNetworkClear, + TargetID: systemURI, + }, nil + } + + log.V(1).Info("Lenovo network config clear completed synchronously") + return nil, nil +} + +func (r *RedfishBaseBMC) clearNetworkConfigGeneric(ctx context.Context, _ string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Network configuration clearing not supported for this vendor (generic)") + // For generic vendors, this operation is optional and non-critical + return nil, nil +} diff --git a/bmc/redfish_kube.go b/bmc/redfish_kube.go index 7e31cbda0..2acaa92a9 100644 --- a/bmc/redfish_kube.go +++ b/bmc/redfish_kube.go @@ -367,6 +367,12 @@ func (r *RedfishKubeBMC) GetBMCUpgradeTask(ctx context.Context, manufacturer, ta return task, nil } +// GetTaskStatus retrieves the status of a task by its URI. +func (r *RedfishKubeBMC) GetTaskStatus(ctx context.Context, taskURI string) (*schemas.Task, error) { + // Delegate to the underlying RedfishBaseBMC implementation + return r.RedfishBaseBMC.GetTaskStatus(ctx, taskURI) +} + // SetPXEBootOnce sets the boot device for the next system boot using Redfish. 
func (r *RedfishKubeBMC) SetPXEBootOnce(ctx context.Context, systemURI string) error { system, err := r.getSystemFromUri(ctx, systemURI) @@ -455,3 +461,23 @@ func (r *RedfishKubeBMC) createJob( } return nil } + +// EraseDisk delegates to the underlying RedfishBaseBMC +func (r *RedfishKubeBMC) EraseDisk(ctx context.Context, systemURI string, method DiskWipeMethod) ([]CleaningTaskInfo, error) { + return r.RedfishBaseBMC.EraseDisk(ctx, systemURI, method) +} + +// ResetBIOSToDefaults delegates to the underlying RedfishBaseBMC +func (r *RedfishKubeBMC) ResetBIOSToDefaults(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + return r.RedfishBaseBMC.ResetBIOSToDefaults(ctx, systemURI) +} + +// ResetBMCToDefaults delegates to the underlying RedfishBaseBMC +func (r *RedfishKubeBMC) ResetBMCToDefaults(ctx context.Context, managerUUID string) (*CleaningTaskInfo, error) { + return r.RedfishBaseBMC.ResetBMCToDefaults(ctx, managerUUID) +} + +// ClearNetworkConfiguration delegates to the underlying RedfishBaseBMC +func (r *RedfishKubeBMC) ClearNetworkConfiguration(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + return r.RedfishBaseBMC.ClearNetworkConfiguration(ctx, systemURI) +} diff --git a/bmc/redfish_local.go b/bmc/redfish_local.go index 8839f5a41..ee026c39d 100644 --- a/bmc/redfish_local.go +++ b/bmc/redfish_local.go @@ -267,3 +267,50 @@ func (r *RedfishLocalBMC) GetBMCUpgradeTask(ctx context.Context, manufacturer, t } return task, nil } + +// EraseDisk simulates disk erasing for testing +func (r *RedfishLocalBMC) EraseDisk(ctx context.Context, systemURI string, method DiskWipeMethod) ([]CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Simulating disk erase", "systemURI", systemURI, "method", method) + // Mock implementation - does nothing but succeeds + return nil, nil +} + +// ResetBIOSToDefaults simulates BIOS reset for testing +func (r *RedfishLocalBMC) ResetBIOSToDefaults(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Simulating BIOS reset", "systemURI", systemURI) + // Mock implementation - does nothing but succeeds + return nil, nil +} + +// ResetBMCToDefaults simulates BMC reset for testing +func (r *RedfishLocalBMC) ResetBMCToDefaults(ctx context.Context, managerUUID string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Simulating BMC reset", "managerUUID", managerUUID) + // Mock implementation - does nothing but succeeds + return nil, nil +} + +// ClearNetworkConfiguration simulates network config clearing for testing +func (r *RedfishLocalBMC) ClearNetworkConfiguration(ctx context.Context, systemURI string) (*CleaningTaskInfo, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Simulating network config clear", "systemURI", systemURI) + // Mock implementation - does nothing but succeeds + return nil, nil +} + +// GetTaskStatus simulates task status retrieval for testing +func (r *RedfishLocalBMC) GetTaskStatus(ctx context.Context, taskURI string) (*schemas.Task, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Simulating task status check", "taskURI", taskURI) + // Mock implementation - returns completed status + percentComplete := uint(100) + return &schemas.Task{ + TaskState: schemas.CompletedTaskState, + PercentComplete: &percentComplete, + Messages: []schemas.Message{ + {Message: "Mock task completed"}, + }, + }, nil +} diff --git a/cmd/main.go b/cmd/main.go index 5610fd63f..4ee2abac2 100644 --- a/cmd/main.go +++ 
b/cmd/main.go @@ -93,6 +93,7 @@ func main() { // nolint: gocyclo serverMaxConcurrentReconciles int serverClaimMaxConcurrentReconciles int dnsRecordTemplatePath string + taskPollInterval time.Duration ) flag.IntVar(&serverMaxConcurrentReconciles, "server-max-concurrent-reconciles", 5, @@ -153,6 +154,8 @@ func main() { // nolint: gocyclo "Timeout for BIOS Settings Controller") flag.StringVar(&dnsRecordTemplatePath, "dns-record-template-path", "", "Path to the DNS record template file used for creating DNS records for Servers.") + flag.DurationVar(&taskPollInterval, "task-poll-interval", 30*time.Second, + "Interval for polling BMC task status.") opts := zap.Options{ Development: true, @@ -407,6 +410,13 @@ func main() { // nolint: gocyclo setupLog.Error(err, "Failed to create controller", "controller", "ServerMaintenance") os.Exit(1) } + if err = (&controller.ServerCleaningReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "Failed to create controller", "controller", "ServerCleaning") + os.Exit(1) + } if err = (&controller.BIOSSettingsReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), @@ -527,6 +537,18 @@ func main() { // nolint: gocyclo setupLog.Error(err, "Failed to create controller", "controller", "BMCUser") os.Exit(1) } + if err = (&controller.BMCTaskReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Insecure: insecure, + PollInterval: taskPollInterval, + BMCOptions: bmc.Options{ + BasicAuth: true, + }, + }).SetupWithManager(mgr); err != nil { + setupLog.Error(err, "Failed to create controller", "controller", "BMCTask") + os.Exit(1) + } // nolint:goconst if os.Getenv("ENABLE_WEBHOOKS") != "false" { diff --git a/config/crd/bases/metal.ironcore.dev_bmcs.yaml b/config/crd/bases/metal.ironcore.dev_bmcs.yaml index efdefb8fa..2404d3fcb 100644 --- a/config/crd/bases/metal.ironcore.dev_bmcs.yaml +++ b/config/crd/bases/metal.ironcore.dev_bmcs.yaml @@ -284,6 +284,52 @@ spec: State represents the current state of the BMC. kubebuilder:validation:Enum=Enabled;Error;Pending type: string + tasks: + description: Tasks tracks ongoing and recent BMC operations. + items: + description: BMCTask represents a single BMC operation task. + properties: + lastUpdateTime: + description: LastUpdateTime is when this task status was last + updated. + format: date-time + type: string + message: + description: Message provides additional information about the + task. + type: string + percentComplete: + description: PercentComplete indicates completion percentage + (0-100). + format: int32 + type: integer + state: + description: State is the current state of the task. + type: string + targetID: + description: TargetID identifies what the task is operating + on (e.g., "BIOS", "BMC", "Drive-1"). + type: string + taskType: + description: TaskType indicates the type of operation. + enum: + - DiskErase + - BIOSReset + - BMCReset + - NetworkClear + - FirmwareUpdate + - ConfigurationChange + - AccountManagement + - Other + type: string + taskURI: + description: TaskURI is the URI to monitor the task on the BMC. 
+ type: string + required: + - taskType + - taskURI + type: object + type: array type: object type: object served: true diff --git a/config/crd/bases/metal.ironcore.dev_servercleanings.yaml b/config/crd/bases/metal.ironcore.dev_servercleanings.yaml new file mode 100644 index 000000000..180235be2 --- /dev/null +++ b/config/crd/bases/metal.ironcore.dev_servercleanings.yaml @@ -0,0 +1,331 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.20.1 + name: servercleanings.metal.ironcore.dev +spec: + group: metal.ironcore.dev + names: + kind: ServerCleaning + listKind: ServerCleaningList + plural: servercleanings + shortNames: + - scl + singular: servercleaning + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.selectedServers + name: Selected + type: integer + - jsonPath: .status.completedCleanings + name: Completed + type: integer + - jsonPath: .status.inProgressCleanings + name: InProgress + type: integer + - jsonPath: .status.failedCleanings + name: Failed + type: integer + - jsonPath: .status.state + name: State + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: ServerCleaning is the Schema for the servercleaning API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ServerCleaningSpec defines the desired cleaning operations + properties: + biosReset: + description: BIOSReset specifies if BIOS should be reset to defaults + type: boolean + bmcReset: + description: BMCReset specifies if BMC should be reset to defaults + type: boolean + diskWipe: + description: DiskWipe specifies disk erasing configuration + properties: + includeBootDrives: + description: IncludeBootDrives specifies whether to erase boot + drives + type: boolean + method: + default: quick + description: Method specifies the disk erasing method + enum: + - quick + - secure + - dod + type: string + required: + - method + type: object + networkCleanup: + description: NetworkCleanup specifies if network configurations should + be cleared + type: boolean + serverBootConfigurationTemplate: + description: |- + ServerBootConfigurationTemplate defines the boot configuration for cleaning agent + If not specified, cleaning operations are performed via BMC APIs + properties: + name: + description: Name specifies the name of the boot configuration. + type: string + spec: + description: Spec specifies the boot configuration to be rendered. + properties: + ignitionSecretRef: + description: |- + IgnitionSecretRef is a reference to the Secret object that contains + the ignition configuration for the server. + properties: + name: + default: "" + description: |- + Name of the referent. 
+ This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + image: + description: Image specifies the boot image to be used for + the server. + type: string + serverRef: + description: ServerRef is a reference to the server for which + this boot configuration is intended. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + required: + - serverRef + type: object + required: + - name + - spec + type: object + serverRef: + description: |- + ServerRef references a specific Server to be cleaned. + Mutually exclusive with ServerSelector. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + serverSelector: + description: |- + ServerSelector specifies a label selector to identify servers to be cleaned. + Mutually exclusive with ServerRef. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. 
+ type: object + type: object + x-kubernetes-map-type: atomic + type: object + x-kubernetes-validations: + - message: either serverRef or serverSelector must be specified + rule: has(self.serverRef) || has(self.serverSelector) + status: + description: ServerCleaningStatus defines the observed state of ServerCleaning + properties: + completedCleanings: + description: CompletedCleanings is the number of servers successfully + cleaned + format: int32 + type: integer + conditions: + description: Conditions represents the latest available observations + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + failedCleanings: + description: FailedCleanings is the number of servers where cleaning + failed + format: int32 + type: integer + inProgressCleanings: + description: InProgressCleanings is the number of servers currently + being cleaned + format: int32 + type: integer + pendingCleanings: + description: PendingCleanings is the number of servers with pending + cleaning + format: int32 + type: integer + selectedServers: + description: SelectedServers is the total number of servers selected + for cleaning + format: int32 + type: integer + serverCleaningStatuses: + description: ServerCleaningStatuses contains per-server cleaning status + items: + description: ServerCleaningStatusEntry represents the cleaning status + for a single server + properties: + lastUpdateTime: + description: LastUpdateTime is the last time this status was + updated + format: date-time + type: string + message: + description: Message provides additional information about the + cleaning state + type: string + serverName: + description: ServerName is the name of the server + type: string + state: + description: State is the cleaning state for this server + type: string + required: + - serverName + - state + type: object + type: array + state: + description: State represents the current state of the cleaning process + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/metal.ironcore.dev_servers.yaml b/config/crd/bases/metal.ironcore.dev_servers.yaml index f53eb3987..eb3c4f88e 100644 --- a/config/crd/bases/metal.ironcore.dev_servers.yaml +++ b/config/crd/bases/metal.ironcore.dev_servers.yaml @@ -294,6 +294,35 @@ spec: systemUUID: description: SystemUUID is the unique identifier for the server. type: string + taints: + description: Taints is a list of taints that affect this server. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: TimeAdded represents the time at which the taint + was added. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. 
+ type: string + required: + - effect + - key + type: object + type: array required: - systemUUID type: object diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 7b33068c8..c83ef73b0 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -54,6 +54,7 @@ rules: - endpoints - serverbootconfigurations - serverclaims + - servercleanings - serverconfigurations - servermaintenances - servers @@ -82,6 +83,7 @@ rules: - endpoints/finalizers - serverbootconfigurations/finalizers - serverclaims/finalizers + - servercleanings/finalizers - servermaintenances/finalizers - servers/finalizers verbs: @@ -103,6 +105,7 @@ rules: - endpoints/status - serverbootconfigurations/status - serverclaims/status + - servercleanings/status - servermaintenances/status - servers/status verbs: diff --git a/config/samples/metal_v1alpha1_servercleaning.yaml b/config/samples/metal_v1alpha1_servercleaning.yaml new file mode 100644 index 000000000..d8a0cda20 --- /dev/null +++ b/config/samples/metal_v1alpha1_servercleaning.yaml @@ -0,0 +1,58 @@ +# Example 1: Cleaning a single server using serverRef +apiVersion: metal.ironcore.dev/v1alpha1 +kind: ServerCleaning +metadata: + name: servercleaning-single-server + namespace: default +spec: + serverRef: + name: server-sample + diskWipe: + method: secure + includeBootDrives: true + bmcReset: true + biosReset: true + networkCleanup: true + serverBootConfigurationTemplate: + name: cleaning-boot-config + spec: + serverRef: + name: server-sample + image: "ghcr.io/ironcore-dev/metal-operator/cleaning-agent:latest" + ignitionSecretRef: + name: cleaning-ignition-secret +--- +# Example 2: Cleaning multiple servers using serverSelector +apiVersion: metal.ironcore.dev/v1alpha1 +kind: ServerCleaning +metadata: + name: servercleaning-multi-server + namespace: default +spec: + serverSelector: + matchLabels: + environment: staging + cleanup-required: "true" + diskWipe: + method: quick + includeBootDrives: false + bmcReset: false + biosReset: false + networkCleanup: true +--- +# Example 3: DoD-compliant cleaning for multiple decommissioned servers +apiVersion: metal.ironcore.dev/v1alpha1 +kind: ServerCleaning +metadata: + name: servercleaning-decommission + namespace: default +spec: + serverSelector: + matchLabels: + status: decommissioned + diskWipe: + method: dod + includeBootDrives: true + bmcReset: true + biosReset: true + networkCleanup: true diff --git a/dist/chart/templates/crd/metal.ironcore.dev_servercleanings.yaml b/dist/chart/templates/crd/metal.ironcore.dev_servercleanings.yaml new file mode 100644 index 000000000..87ac6d041 --- /dev/null +++ b/dist/chart/templates/crd/metal.ironcore.dev_servercleanings.yaml @@ -0,0 +1,376 @@ +{{- if .Values.crd.enable }} +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + labels: + {{- include "chart.labels" . 
| nindent 4 }} + annotations: + {{- if .Values.crd.keep }} + "helm.sh/resource-policy": keep + {{- end }} + controller-gen.kubebuilder.io/version: v0.20.1 + name: servercleanings.metal.ironcore.dev +spec: + group: metal.ironcore.dev + names: + kind: ServerCleaning + listKind: ServerCleaningList + plural: servercleanings + shortNames: + - scl + singular: servercleaning + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.selectedServers + name: Selected + type: integer + - jsonPath: .status.completedCleanings + name: Completed + type: integer + - jsonPath: .status.inProgressCleanings + name: InProgress + type: integer + - jsonPath: .status.failedCleanings + name: Failed + type: integer + - jsonPath: .status.state + name: State + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: ServerCleaning is the Schema for the servercleaning API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: ServerCleaningSpec defines the desired cleaning operations + properties: + biosReset: + description: BIOSReset specifies if BIOS should be reset to defaults + type: boolean + bmcReset: + description: BMCReset specifies if BMC should be reset to defaults + type: boolean + diskWipe: + description: DiskWipe specifies disk erasing configuration + properties: + includeBootDrives: + description: IncludeBootDrives specifies whether to erase boot + drives + type: boolean + method: + default: quick + description: Method specifies the disk erasing method + enum: + - quick + - secure + - dod + type: string + required: + - method + type: object + networkCleanup: + description: NetworkCleanup specifies if network configurations should + be cleared + type: boolean + serverBootConfigurationTemplate: + description: |- + ServerBootConfigurationTemplate defines the boot configuration for cleaning agent + If not specified, cleaning operations are performed via BMC APIs + properties: + name: + description: Name specifies the name of the boot configuration. + type: string + spec: + description: Spec specifies the boot configuration to be rendered. + properties: + ignitionSecretRef: + description: |- + IgnitionSecretRef is a reference to the Secret object that contains + the ignition configuration for the server. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + image: + description: Image specifies the boot image to be used for + the server. 
+ type: string + serverRef: + description: ServerRef is a reference to the server for which + this boot configuration is intended. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + required: + - serverRef + type: object + required: + - name + - spec + type: object + serverRef: + description: |- + ServerRef references a specific Server to be cleaned. + Mutually exclusive with ServerSelector. + properties: + name: + default: "" + description: |- + Name of the referent. + This field is effectively required, but due to backwards compatibility is + allowed to be empty. Instances of this type with an empty value here are + almost certainly wrong. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + type: object + x-kubernetes-map-type: atomic + serverSelector: + description: |- + ServerSelector specifies a label selector to identify servers to be cleaned. + Mutually exclusive with ServerRef. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: |- + A label selector requirement is a selector that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: |- + operator represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: |- + values is an array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator + type: object + type: array + x-kubernetes-list-type: atomic + matchLabels: + additionalProperties: + type: string + description: |- + matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, whose key field is "key", the + operator is "In", and the values array contains only "value". The requirements are ANDed. + type: object + type: object + x-kubernetes-map-type: atomic + type: object + x-kubernetes-validations: + - message: either serverRef or serverSelector must be specified + rule: has(self.serverRef) || has(self.serverSelector) + status: + description: ServerCleaningStatus defines the observed state of ServerCleaning + properties: + completedCleanings: + description: CompletedCleanings is the number of servers successfully + cleaned + format: int32 + type: integer + conditions: + description: Conditions represents the latest available observations + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + failedCleanings: + description: FailedCleanings is the number of servers where cleaning + failed + format: int32 + type: integer + inProgressCleanings: + description: InProgressCleanings is the number of servers currently + being cleaned + format: int32 + type: integer + pendingCleanings: + description: PendingCleanings is the number of servers with pending + cleaning + format: int32 + type: integer + selectedServers: + description: SelectedServers is the total number of servers selected + for cleaning + format: int32 + type: integer + serverCleaningStatuses: + description: ServerCleaningStatuses contains per-server cleaning status + items: + description: ServerCleaningStatusEntry represents the cleaning status + for a single server + properties: + cleaningTasks: + description: CleaningTasks contains information about the cleaning + tasks for this server + items: + description: CleaningTaskStatus represents the status of a + cleaning task + properties: + lastUpdateTime: + description: LastUpdateTime is the last time this task + status was updated + format: date-time + type: string + message: + description: Message provides additional information about + the task + type: string + percentComplete: + description: PercentComplete indicates the completion + percentage (0-100) + type: integer + state: + description: State is the current state of the task + type: string + targetID: + description: TargetID identifies the target resource (e.g., + drive ID for disk erase) + type: string + taskType: + description: TaskType indicates what type of cleaning + task this is + type: string + taskURI: + description: TaskURI is the URI to monitor the task + type: string + required: + - taskType + type: object + type: array + lastUpdateTime: + description: LastUpdateTime is the last time this status was + updated + format: date-time + type: string + message: + description: Message provides additional information about the + 
cleaning state + type: string + serverName: + description: ServerName is the name of the server + type: string + state: + description: State is the cleaning state for this server + type: string + required: + - serverName + - state + type: object + type: array + state: + description: State represents the current state of the cleaning process + type: string + type: object + type: object + served: true + storage: true + subresources: + status: {} +{{- end -}} diff --git a/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml b/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml index d48b6e80c..7b2cf18f5 100755 --- a/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml +++ b/dist/chart/templates/crd/metal.ironcore.dev_servers.yaml @@ -300,6 +300,40 @@ spec: systemUUID: description: SystemUUID is the unique identifier for the server. type: string + taints: + description: Taints is a list of taints that affect this server. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: TimeAdded represents the time at which the taint + was added. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + type: array + uuid: + description: |- + UUID is the unique identifier for the server. + Deprecated in favor of systemUUID. + type: string required: - systemUUID type: object diff --git a/dist/chart/templates/rbac/role.yaml b/dist/chart/templates/rbac/role.yaml index 96c75c9ae..899d532db 100755 --- a/dist/chart/templates/rbac/role.yaml +++ b/dist/chart/templates/rbac/role.yaml @@ -57,6 +57,7 @@ rules: - endpoints - serverbootconfigurations - serverclaims + - servercleanings - serverconfigurations - servermaintenances - servers @@ -85,6 +86,7 @@ rules: - endpoints/finalizers - serverbootconfigurations/finalizers - serverclaims/finalizers + - servercleanings/finalizers - servermaintenances/finalizers - servers/finalizers verbs: @@ -106,6 +108,7 @@ rules: - endpoints/status - serverbootconfigurations/status - serverclaims/status + - servercleanings/status - servermaintenances/status - servers/status verbs: diff --git a/docs/api-reference/api.md b/docs/api-reference/api.md index 0a09802c8..b50485c26 100644 --- a/docs/api-reference/api.md +++ b/docs/api-reference/api.md @@ -26,6 +26,7 @@ Package v1alpha1 contains API Schema definitions for the metal v1alpha1 API grou - [Server](#server) - [ServerBootConfiguration](#serverbootconfiguration) - [ServerClaim](#serverclaim) +- [ServerCleaning](#servercleaning) - [ServerMaintenance](#servermaintenance) @@ -904,6 +905,28 @@ _Appears in:_ | `device` _string_ | Device is the device to boot from. 
| | | +#### CleaningTaskStatus + + + +CleaningTaskStatus represents the status of a cleaning task + + + +_Appears in:_ +- [ServerCleaningStatusEntry](#servercleaningstatusentry) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `taskURI` _string_ | TaskURI is the URI to monitor the task | | | +| `taskType` _string_ | TaskType indicates what type of cleaning task this is | | | +| `targetID` _string_ | TargetID identifies the target resource (e.g., drive ID for disk erase) | | | +| `state` _string_ | State is the current state of the task | | | +| `percentComplete` _integer_ | PercentComplete indicates the completion percentage (0-100) | | | +| `message` _string_ | Message provides additional information about the task | | | +| `lastUpdateTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#time-v1-meta)_ | LastUpdateTime is the last time this task status was updated | | | + + #### ConsoleProtocol @@ -939,6 +962,41 @@ _Appears in:_ | `SSHLenovo` | ConsoleProtocolNameSSHLenovo represents the SSH console protocol specific to Lenovo hardware.
| +#### DiskWipeConfig + + + +DiskWipeConfig defines disk erasing behavior + + + +_Appears in:_ +- [ServerCleaningSpec](#servercleaningspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `method` _[DiskWipeMethod](#diskwipemethod)_ | Method specifies the disk erasing method | quick | Enum: [quick secure dod]
| +| `includeBootDrives` _boolean_ | IncludeBootDrives specifies whether to erase boot drives | | | + + +#### DiskWipeMethod + +_Underlying type:_ _string_ + +DiskWipeMethod defines the available disk erasing methods + + + +_Appears in:_ +- [DiskWipeConfig](#diskwipeconfig) + +| Field | Description | +| --- | --- | +| `quick` | DiskWipeMethodQuick performs a quick erase (single pass)
| +| `secure` | DiskWipeMethodSecure performs a secure erase (3 passes)
| +| `dod` | DiskWipeMethodDoD performs DoD 5220.22-M standard erase (7 passes)
| + + #### Endpoint @@ -1346,6 +1404,7 @@ ServerBootConfigurationTemplate defines the parameters to be used for rendering _Appears in:_ +- [ServerCleaningSpec](#servercleaningspec) - [ServerMaintenanceSpec](#servermaintenancespec) | Field | Description | Default | Validation | @@ -1409,6 +1468,110 @@ _Appears in:_ | `phase` _[Phase](#phase)_ | Phase represents the current phase of the server claim. | | | +#### ServerCleaning + + + +ServerCleaning is the Schema for the servercleaning API + + + + + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `apiVersion` _string_ | `metal.ironcore.dev/v1alpha1` | | | +| `kind` _string_ | `ServerCleaning` | | | +| `metadata` _[ObjectMeta](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#objectmeta-v1-meta)_ | Refer to Kubernetes API documentation for fields of `metadata`. | | | +| `spec` _[ServerCleaningSpec](#servercleaningspec)_ | | | | +| `status` _[ServerCleaningStatus](#servercleaningstatus)_ | | | | + + +#### ServerCleaningSpec + + + +ServerCleaningSpec defines the desired cleaning operations + + + +_Appears in:_ +- [ServerCleaning](#servercleaning) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `serverRef` _[LocalObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#localobjectreference-v1-core)_ | ServerRef references a specific Server to be cleaned.
Mutually exclusive with ServerSelector. | | | +| `serverSelector` _[LabelSelector](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#labelselector-v1-meta)_ | ServerSelector specifies a label selector to identify servers to be cleaned.
Mutually exclusive with ServerRef. | | | +| `diskWipe` _[DiskWipeConfig](#diskwipeconfig)_ | DiskWipe specifies disk erasing configuration | | | +| `bmcReset` _boolean_ | BMCReset specifies if BMC should be reset to defaults | | | +| `biosReset` _boolean_ | BIOSReset specifies if BIOS should be reset to defaults | | | +| `networkCleanup` _boolean_ | NetworkCleanup specifies if network configurations should be cleared | | | +| `serverBootConfigurationTemplate` _[ServerBootConfigurationTemplate](#serverbootconfigurationtemplate)_ | ServerBootConfigurationTemplate defines the boot configuration for cleaning agent
If not specified, cleaning operations are performed via BMC APIs | | | + + +#### ServerCleaningState + +_Underlying type:_ _string_ + +ServerCleaningState defines the state of the cleaning process + + + +_Appears in:_ +- [ServerCleaningStatus](#servercleaningstatus) +- [ServerCleaningStatusEntry](#servercleaningstatusentry) + +| Field | Description | +| --- | --- | +| `Pending` | ServerCleaningStatePending indicates cleaning is waiting to start
| +| `InProgress` | ServerCleaningStateInProgress indicates cleaning is in progress
| +| `Completed` | ServerCleaningStateCompleted indicates cleaning completed successfully
| +| `Failed` | ServerCleaningStateFailed indicates cleaning failed
| + + +#### ServerCleaningStatus + + + +ServerCleaningStatus defines the observed state of ServerCleaning + + + +_Appears in:_ +- [ServerCleaning](#servercleaning) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `state` _[ServerCleaningState](#servercleaningstate)_ | State represents the current state of the cleaning process | | | +| `selectedServers` _integer_ | SelectedServers is the total number of servers selected for cleaning | | | +| `pendingCleanings` _integer_ | PendingCleanings is the number of servers with pending cleaning | | | +| `inProgressCleanings` _integer_ | InProgressCleanings is the number of servers currently being cleaned | | | +| `completedCleanings` _integer_ | CompletedCleanings is the number of servers successfully cleaned | | | +| `failedCleanings` _integer_ | FailedCleanings is the number of servers where cleaning failed | | | +| `serverCleaningStatuses` _[ServerCleaningStatusEntry](#servercleaningstatusentry) array_ | ServerCleaningStatuses contains per-server cleaning status | | | +| `conditions` _[Condition](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#condition-v1-meta) array_ | Conditions represents the latest available observations | | | + + +#### ServerCleaningStatusEntry + + + +ServerCleaningStatusEntry represents the cleaning status for a single server + + + +_Appears in:_ +- [ServerCleaningStatus](#servercleaningstatus) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `serverName` _string_ | ServerName is the name of the server | | | +| `state` _[ServerCleaningState](#servercleaningstate)_ | State is the cleaning state for this server | | | +| `message` _string_ | Message provides additional information about the cleaning state | | | +| `lastUpdateTime` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#time-v1-meta)_ | LastUpdateTime is the last time this status was updated | | | +| `cleaningTasks` _[CleaningTaskStatus](#cleaningtaskstatus) array_ | CleaningTasks contains information about the cleaning tasks for this server | | | + + #### ServerMaintenance @@ -1568,6 +1731,7 @@ _Appears in:_ | `maintenanceBootConfigurationRef` _[ObjectReference](#objectreference)_ | MaintenanceBootConfigurationRef is a reference to a BootConfiguration object that specifies
the boot configuration for this server during maintenance. | | | | `bootOrder` _[BootOrder](#bootorder) array_ | BootOrder specifies the boot order of the server. | | | | `biosSettingsRef` _[LocalObjectReference](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#localobjectreference-v1-core)_ | BIOSSettingsRef is a reference to a biossettings object that specifies
the BIOS configuration for this server. | | | +| `taints` _[Taint](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#taint-v1-core) array_ | Taints is a list of taints that affect this server. | | | #### ServerState @@ -1587,6 +1751,7 @@ _Appears in:_ | `Discovery` | ServerStateDiscovery indicates that the server is in its discovery state.
| | `Available` | ServerStateAvailable indicates that the server is available for use.
| | `Reserved` | ServerStateReserved indicates that the server is reserved for a specific use or user.
| +| `Tainted` | ServerStateTainted indicates that the server is tainted and requires cleaning
before transitioning back to Available.
| | `Error` | ServerStateError indicates that there is an error with the server.
| | `Maintenance` | ServerStateMaintenance indicates that the server is in maintenance.
|
diff --git a/docs/bmc-task-tracking.md b/docs/bmc-task-tracking.md
new file mode 100644
index 000000000..7f0930c79
--- /dev/null
+++ b/docs/bmc-task-tracking.md
@@ -0,0 +1,553 @@
+# BMC Task Tracking
+
+## Overview
+
+All BMC operations are tracked centrally in `BMC.Status.Tasks[]`. This gives every controller a single source of truth for ongoing and recent BMC operations.
+
+## Architecture
+
+### Dedicated Task Controller (New in v0.x.x): Initial Rollout for ServerCleaning
+
+The **BMCTask controller** is a dedicated controller responsible for monitoring BMC task progress. This separation of concerns provides:
+
+- ✅ **Consistent polling** - All tasks polled at configurable intervals (default 30s)
+- ✅ **Automatic monitoring** - Tasks update even when parent resources don't change
+- ✅ **Better performance** - No task polling overhead on cleaning operations
+- ✅ **Simplified controllers** - Controllers only create tasks, don't poll
+
+**Current Implementation Status:**
+- ✅ **ServerCleaning Controller** - Uses BMCTask controller for task monitoring
+- 🔄 **Other Controllers** - Still use their own polling mechanisms (future enhancement)
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                        BMC Resource                         │
+│  ┌───────────────────────────────────────────────────────┐  │
+│  │ Status:                                               │  │
+│  │   Tasks: []BMCTask  ← Single source of truth          │  │
+│  │     - TaskURI, Type, State, Progress, Message         │  │
+│  └───────────────────────────────────────────────────────┘  │
+└─────────────────────────────────────────────────────────────┘
+       ▲                                    ▲
+       │ Creates tasks                      │ Polls & updates
+       │                                    │
+  ┌────┴─────┐                      ┌──────┴────────┐
+  │ SrvClean │ ◄─────watches────────│    BMCTask    │
+  │          │    task updates      │  Controller   │
+  │          │                      │               │
+  └──────────┘                      │ • Watches BMC │
+                                    │ • Polls tasks │
+                                    │ • Updates     │
+                                    │   progress    │
+                                    │ • Requeues    │
+                                    └───────────────┘
+```
+
+### Controller Responsibilities
+
+**BMCTask Controller (Dedicated Task Monitor):**
+- Watches BMC resources that have tasks
+- Polls the BMC API for task status every 30s (configurable via `--task-poll-interval`)
+- Updates `BMC.Status.Tasks` with the latest State, PercentComplete, Message
+- Automatically requeues when active tasks exist
+- Stops polling when all tasks reach terminal states
+- **Currently used by**: ServerCleaning controller
+
+**Controllers Using BMCTask Controller:**
+- **ServerCleaning Controller**: Creates tasks for cleaning operations, watches BMC for updates
+
+**Controllers Using Own Polling (Future Migration):**
+- **BMC Controller**: Still polls tasks during reconciliation (uses `updateBMCTaskStatus()`)
+- **BMCVersion Controller**: Still has 2-minute polling via `ResyncInterval`
+- **BMCSettings Controller**: Synchronous operations (no polling needed)
+
+**Interaction Pattern (ServerCleaning):**
+1. **Task Creation**: ServerCleaning adds a task entry to `BMC.Status.Tasks` with an initial state
+2. **Automatic Monitoring**: BMCTask controller automatically detects the new task and begins polling
+3. **Progress Updates**: BMCTask controller updates the task status every 30s
+4. **Completion Detection**: BMCTask controller stops polling when the task reaches a terminal state
+5. **Watch for Updates**: ServerCleaning controller watches BMC resources and reacts to task status changes (an example of the resulting task list is shown below)
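+
+For illustration, `BMC.Status.Tasks` for a server mid-cleaning might look like the excerpt below. The task URIs, IDs, and timestamps are made up; only the field names and the `taskType`/`state` values follow the API in this repository:
+
+```yaml
+# Hypothetical status excerpt: one running and one completed cleaning task.
+status:
+  tasks:
+  - taskURI: /redfish/v1/TaskService/Tasks/545   # example value
+    taskType: DiskErase
+    targetID: Drive-1
+    state: Running
+    percentComplete: 40
+    message: "Sanitize in progress"
+    lastUpdateTime: "2025-01-01T12:00:00Z"
+  - taskURI: /redfish/v1/TaskService/Tasks/544   # example value
+    taskType: BIOSReset
+    targetID: BIOS
+    state: Completed
+    percentComplete: 100
+    message: "BIOS reset finished"
+    lastUpdateTime: "2025-01-01T11:58:30Z"
+```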
+
+### Task Structure
+
+Each `BMCTask` contains:
+
+```go
+type BMCTask struct {
+    TaskURI         string      // Unique identifier for the task
+    TaskType        BMCTaskType // Type of operation
+    TargetID        string      // What the task operates on (e.g., "BMC", "BIOS", "Drive-1")
+    State           string      // Current state (e.g., "New", "Running", "Completed", "Failed")
+    PercentComplete int32       // Progress (0-100)
+    Message         string      // Additional information
+    LastUpdateTime  metav1.Time // When the task was last updated
+}
+```
+
+### Task Types
+
+- **FirmwareUpdate**: BMC/BIOS firmware upgrades
+- **ConfigurationChange**: BMC/BIOS attribute changes
+- **DiskErase**: Disk wiping operations
+- **BMCReset**: BMC reset operations
+- **BIOSReset**: BIOS reset to defaults
+- **NetworkClear**: Network configuration cleanup
+- **AccountManagement**: User account operations
+- **Other**: Operations not covered by the specific types above
+
+## Task Lifecycle
+
+### Automatic Task Monitoring (BMCTask Controller)
+
+The BMCTask controller automatically monitors all in-progress tasks:
+
+**How it works:**
+1. **Watches BMC resources** that have non-empty `Status.Tasks` arrays
+2. **Runs every 30 seconds** (configurable via the `--task-poll-interval` flag)
+3. **Iterates through tasks** in `BMC.Status.Tasks`
+4. **Skips terminal states**: `Completed`, `Failed`, `Killed`, `Exception`, `Cancelled`
+5. **Polls the BMC** via `bmcClient.GetTaskStatus(taskURI)` for active tasks
+6. **Updates task status** with the latest `State`, `PercentComplete`, `Message`, and `LastUpdateTime`
+7. **Persists changes** via `Status().Update()` if any tasks were updated
+8. **Requeues automatically**: continues polling as long as active tasks exist
+
+**Key Benefits:**
+- ✅ **Automatic monitoring** - Tasks update even if the BMC resource doesn't change
+- ✅ **Consistent frequency** - All tasks are polled at the same interval regardless of source
+- ✅ **No event dependency** - Doesn't rely on BMC reconciliation to trigger updates
+- ✅ **Works across restarts** - Tasks persisted in BMC status survive controller restarts
+- ✅ **Simplified controllers** - BMCVersion/BMCSettings/ServerCleaning don't need polling logic
+
+**Terminal States** (tasks that are no longer polled):
+- `Completed` - Task finished successfully
+- `Failed` - Task encountered an error
+- `Killed` - Task was terminated
+- `Exception` - Task threw an exception
+- `Cancelled` - Task was cancelled
+
+**Configuration:**
+```bash
+# Default 30 second polling interval
+./manager
+
+# Custom interval (e.g., 15 seconds)
+./manager --task-poll-interval=15s
+
+# Longer interval for less frequent updates (e.g., 1 minute)
+./manager --task-poll-interval=1m
+```
+
+### 1. Synchronous Operations
+
+For operations that complete immediately (e.g., BMC settings changes), the task is recorded already in a terminal state:
+
+```go
+task := metalv1alpha1.BMCTask{
+    TaskURI:         fmt.Sprintf("config-change-%s-%s", name, time.Now().Format("20060102-150405")),
+    TaskType:        metalv1alpha1.BMCTaskTypeConfigurationChange,
+    TargetID:        "BMC",
+    State:           "Completed",
+    PercentComplete: 100,
+    Message:         fmt.Sprintf("Applied %d BMC attributes", len(attributes)),
+    LastUpdateTime:  metav1.Now(),
+}
+```
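+
+A task entry like the one above still has to be written to `BMC.Status.Tasks`. The `addTaskToBMC` helper referenced in the controller sections below is not reproduced in this document; the following is a minimal sketch of one possible shape, assuming the `github.com/ironcore-dev/metal-operator` module path, a controller-runtime client, and deduplication by `TaskURI` (the real helper may differ, e.g., by retrying on conflicts):
+
+```go
+import (
+    "context"
+    "fmt"
+
+    metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// addTaskToBMC (sketch): record a task on the referenced BMC, replacing any
+// existing entry with the same TaskURI instead of appending a duplicate.
+func addTaskToBMC(ctx context.Context, c client.Client, bmcName, namespace string, task metalv1alpha1.BMCTask) error {
+    bmcObj := &metalv1alpha1.BMC{}
+    if err := c.Get(ctx, client.ObjectKey{Name: bmcName, Namespace: namespace}, bmcObj); err != nil {
+        return fmt.Errorf("failed to get BMC %s: %w", bmcName, err)
+    }
+    for i := range bmcObj.Status.Tasks {
+        if bmcObj.Status.Tasks[i].TaskURI == task.TaskURI {
+            // Same task reported again: refresh the entry in place.
+            bmcObj.Status.Tasks[i] = task
+            return c.Status().Update(ctx, bmcObj)
+        }
+    }
+    bmcObj.Status.Tasks = append(bmcObj.Status.Tasks, task)
+    return c.Status().Update(ctx, bmcObj)
+}
+```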
Asynchronous Operations + +For long-running operations (e.g., firmware updates): + +**Initial Creation:** +```go +task := metalv1alpha1.BMCTask{ + TaskURI: taskMonitorURI, // From BMC client + TaskType: metalv1alpha1.BMCTaskTypeFirmwareUpdate, + TargetID: "BMC", + State: "New", + PercentComplete: 0, + Message: fmt.Sprintf("Upgrading BMC firmware to %s", version), + LastUpdateTime: metav1.Now(), +} +``` + +**Progress Updates:** +```go +// Poll task status from BMC client +taskStatus, err := bmcClient.GetBMCUpgradeTask(ctx, manufacturer, taskURI) + +// Update task in BMC status +updateBMCTask(ctx, bmcName, namespace, taskURI, func(bmcTask *metalv1alpha1.BMCTask) { + bmcTask.State = string(taskStatus.TaskState) + bmcTask.PercentComplete = int32(*taskStatus.PercentComplete) + bmcTask.Message = fmt.Sprintf("Status: %s", taskStatus.TaskStatus) +}) +``` + +## Controller-Specific Implementations + +### BMCTask Controller (Dedicated Task Monitor) + +**Responsibility:** +- Automatic monitoring of all BMC tasks across all controllers + +**Operations:** +- Polls task status from BMC API +- Updates `BMC.Status.Tasks` with progress +- Manages requeue for active tasks + +**Implementation Details:** +```go +// Only reconciles BMCs with tasks (via event filter) +func hasTasksPredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + bmc := e.Object.(*metalv1alpha1.BMC) + return len(bmc.Status.Tasks) > 0 + }, + UpdateFunc: func(e event.UpdateEvent) bool { + bmc := e.ObjectNew.(*metalv1alpha1.BMC) + return len(bmc.Status.Tasks) > 0 + }, + } +} + +// Polls tasks and updates status +func (r *BMCTaskReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + // Fetch BMC, skip if no tasks + // Get BMC client + // Iterate through tasks, poll non-terminal ones + // Update BMC.Status.Tasks if changed + // Requeue if active tasks exist + return ctrl.Result{RequeueAfter: r.PollInterval}, nil +} +``` + +**Configuration:** +- `--task-poll-interval` flag controls polling frequency (default 30s) + +### BMC Controller + +**Operations Tracked:** +- BMC reset operations + +**Helper Functions:** +- `addBMCTask(bmcObj, task)` - Add new task to BMC status +- `updateBMCTask(bmcObj, taskURI, updateFn)` - Update existing task +- `getBMCTask(bmcObj, taskURI)` - Retrieve task by URI + +**Important:** The BMC controller **no longer polls tasks**. It only creates tasks for its operations. The BMCTask controller handles all polling automatically. + +**Example Usage:** +```go +func (r *BMCReconciler) resetBMC(ctx context.Context, bmcObj *metalv1alpha1.BMC) error { + // ... perform reset ... + + task := metalv1alpha1.BMCTask{ + TaskURI: fmt.Sprintf("bmc-reset-%s", time.Now().Format("20060102-150405")), + TaskType: metalv1alpha1.BMCTaskTypeBMCReset, + TargetID: "BMC", + State: "Completed", + PercentComplete: 100, + Message: "BMC reset initiated", + LastUpdateTime: metav1.Now(), + } + r.addBMCTask(bmcObj, task) + + return r.updateBMCState(ctx, bmcObj, metalv1alpha1.BMCStatePending) +} +``` + +### BMCVersion Controller + +**Operations Tracked:** +- Firmware upgrade operations + +**Helper Functions:** +- `addTaskToBMC(ctx, bmcName, namespace, task)` - Add task to referenced BMC + +**Important:** The BMCVersion controller **no longer polls** for task progress. The BMCTask controller automatically monitors all in-progress tasks. The BMCVersion controller only needs to: +1. Create the task when starting a firmware upgrade +2. 
Watch the BMC resource for task status updates
+3. React to task completion/failure
+
+**Example Usage:**
+```go
+// When issuing upgrade
+taskMonitor, _, err := bmcClient.UpgradeBMCVersion(ctx, manufacturer, params)
+if err != nil {
+    return err
+}
+if taskMonitor != "" {
+    r.addTaskToBMC(ctx, bmcVersion.Spec.BMCRef.Name, bmcVersion.Namespace, metalv1alpha1.BMCTask{
+        TaskURI:         taskMonitor,
+        TaskType:        metalv1alpha1.BMCTaskTypeFirmwareUpdate,
+        TargetID:        "BMC",
+        State:           "New",
+        PercentComplete: 0,
+        Message:         fmt.Sprintf("Upgrading BMC firmware to %s", bmcVersion.Spec.Version),
+        LastUpdateTime:  metav1.Now(),
+    })
+}
+
+// To check progress, read from BMC.Status.Tasks (the BMCTask controller updates it automatically)
+bmc := &metalv1alpha1.BMC{}
+if err := r.Get(ctx, types.NamespacedName{Name: bmcName, Namespace: namespace}, bmc); err != nil {
+    return err
+}
+for _, task := range bmc.Status.Tasks {
+    if task.TaskURI == taskMonitor {
+        // Task is automatically updated by BMCTask controller
+        if task.State == "Completed" {
+            // Firmware upgrade complete
+        } else if task.State == "Failed" {
+            // Firmware upgrade failed
+        }
+        break
+    }
+}
+```
+
+### BMCSettings Controller
+
+**Operations Tracked:**
+- BMC attribute configuration changes
+
+**Helper Functions:**
+- `addTaskToBMC(ctx, bmcName, namespace, task)` - Add task to referenced BMC
+
+**Important:** For synchronous operations (immediate configuration changes), tasks are created with `State: "Completed"`. The BMCTask controller will not poll these since they're already in a terminal state.
+
+**Example Usage:**
+```go
+err = bmcClient.SetBMCAttributesImmediately(ctx, bmcObj.Spec.BMCUUID, attributes)
+if err != nil {
+    return fmt.Errorf("failed to set BMC settings: %w", err)
+}
+
+// Record configuration change (synchronous operation, already completed)
+taskURI := fmt.Sprintf("config-change-%s-%s", bmcSetting.Name, time.Now().Format("20060102-150405"))
+r.addTaskToBMC(ctx, bmcSetting.Spec.BMCRef.Name, bmcSetting.Namespace, metalv1alpha1.BMCTask{
+    TaskURI:         taskURI,
+    TaskType:        metalv1alpha1.BMCTaskTypeConfigurationChange,
+    TargetID:        "BMC",
+    State:           "Completed",
+    PercentComplete: 100,
+    Message:         fmt.Sprintf("Applied %d BMC attributes", len(attributes)),
+    LastUpdateTime:  metav1.Now(),
+})
+```
+
+### ServerCleaning Controller
+
+**Operations Tracked:**
+- Disk erase operations
+- BIOS reset operations
+- Network configuration cleanup
+- Account management operations
+
+**Helper Functions:**
+- `addTaskToBMC(ctx, bmcName, namespace, task)` - Add task to referenced BMC
+
+**Important:** The ServerCleaning controller **no longer polls** for task progress. The BMCTask controller automatically monitors all in-progress tasks. The ServerCleaning controller only needs to:
+1. Create tasks when starting cleaning operations
+2. Watch the BMC resource for task status updates
+3. React to task completion/failure to proceed with the next cleaning steps (see the sketch below)
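+
+The reaction step amounts to checking whether every task the cleaning created has reached a terminal state. The following is an illustrative sketch, not the controller's actual code; `taskURIs`, the set of task URIs recorded when the tasks were created, is hypothetical bookkeeping:
+
+```go
+// allTasksDone reports whether every tracked task reached a terminal state
+// and whether any of them ended unsuccessfully.
+func allTasksDone(bmc *metalv1alpha1.BMC, taskURIs map[string]struct{}) (done, failed bool) {
+    done = true
+    for _, task := range bmc.Status.Tasks {
+        if _, ours := taskURIs[task.TaskURI]; !ours {
+            continue // task created by another controller
+        }
+        switch task.State {
+        case "Completed":
+            // terminal and successful
+        case "Failed", "Killed", "Exception", "Cancelled":
+            failed = true // terminal but unsuccessful
+        default:
+            done = false // still active; the BMCTask controller keeps polling it
+        }
+    }
+    return done, failed
+}
+```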
+
+**Example Usage:**
+```go
+// Start disk erase operation
+taskURI, err := bmcClient.ErasePhysicalDrive(ctx, driveURI)
+if err != nil {
+    return err
+}
+
+// Create task in BMC status
+r.addTaskToBMC(ctx, bmcName, namespace, metalv1alpha1.BMCTask{
+    TaskURI:         taskURI,
+    TaskType:        metalv1alpha1.BMCTaskTypeDiskErase,
+    TargetID:        driveURI,
+    State:           "New",
+    PercentComplete: 0,
+    Message:         fmt.Sprintf("Erasing drive %s", driveURI),
+    LastUpdateTime:  metav1.Now(),
+})
+
+// BMCTask controller will automatically poll and update this task
+// ServerCleaning controller watches BMC and reacts to task completion
+```
+
+## Task Cleanup
+
+Tasks are automatically pruned to prevent unbounded growth:
+- Only the **last 10 tasks** are retained per BMC
+- Older tasks are automatically removed when new tasks are added
+- This happens transparently in the `addBMCTask()` helper functions
+
+## Querying Tasks
+
+### From CLI
+
+```bash
+# List all tasks for a BMC
+kubectl get bmc <bmc-name> -o jsonpath='{.status.tasks[*]}' | jq
+
+# Get tasks of a specific type
+kubectl get bmc <bmc-name> -o jsonpath='{.status.tasks[?(@.taskType=="FirmwareUpdate")]}' | jq
+
+# Watch task progress
+watch 'kubectl get bmc <bmc-name> -o jsonpath="{.status.tasks[0]}" | jq'
+
+# Get tasks in a specific state
+kubectl get bmc <bmc-name> -o jsonpath='{.status.tasks[?(@.state=="Running")]}' | jq
+```
+
+### From Code
+
+```go
+// Get BMC object
+bmc := &metalv1alpha1.BMC{}
+err := client.Get(ctx, types.NamespacedName{Name: bmcName}, bmc)
+
+// List all tasks
+for _, task := range bmc.Status.Tasks {
+    fmt.Printf("Task: %s, Type: %s, State: %s, Progress: %d%%\n",
+        task.TaskURI, task.TaskType, task.State, task.PercentComplete)
+}
+
+// Find a specific task
+for _, task := range bmc.Status.Tasks {
+    if task.TaskURI == targetURI {
+        fmt.Printf("Found task: %s at %d%% complete\n", task.Message, task.PercentComplete)
+        break
+    }
+}
+```
+
+## Benefits
+
+### Single Source of Truth
+- All BMC operations tracked in one place
+- Eliminates duplication across controller status fields
+- Simplifies operational monitoring
+
+### Cross-Controller Awareness
+- See all operations affecting a BMC regardless of source
+- Better understanding of BMC state and activity
+- Prevents conflicting operations
+
+### Operational Transparency
+- Complete audit trail of BMC operations
+- Task history preserved (last 10 tasks)
+- Clear progress indicators for async operations
+
+### Better Failure Recovery
+- Tasks persist in BMC status across controller restarts
+- Monitoring of long-running operations can resume after a restart
+- Clear indication of failed operations
+
+## Migration Notes
+
+### Backward Compatibility
+
+**BMCVersion Controller:**
+- Still maintains the `Status.UpgradeTask` field (deprecated but updated)
+- This allows existing monitoring/tooling to continue working
+- Planned for removal in a future version once consumers migrate
+
+**BMCSettings Controller:**
+- No previous task tracking existed
+- Pure addition of functionality
+
+**BMC Controller:**
+- Tasks field was previously unused
+- Now actively populated
+
+### Architecture Changes (v0.x.x)
+
+**What Changed:**
+
+**Before (Old Architecture):**
+- BMC controller polled tasks during every reconciliation (event-driven, inconsistent)
+- BMCVersion controller had its own 2-minute polling loop
+- ServerCleaning controller had its own 30-second polling loop
+- Tasks only updated when reconciliation was triggered
+- Redundant BMC API calls from multiple controllers
+
+**After (New Architecture):**
+- Dedicated BMCTask controller handles ALL task polling
+- Consistent 30-second polling interval (configurable)
+- Tasks update automatically even without reconciliation events
+- Single BMC API call per task per interval
+- Other controllers only create tasks and watch for updates
+
+**Migration Impact:**
+
+✅ **No API changes** - `BMC.Status.Tasks` structure unchanged
+✅ **No configuration changes** - Works with existing BMC resources
+✅ **New flag available** - `--task-poll-interval` (the 30s default keeps the previous polling cadence)
+✅ **Better consistency** - Tasks now update predictably every 30s
+✅ **Improved performance** - Eliminates redundant polling overhead
+
+**Deployment:**
+
+1. Deploy the new controller version with the BMCTask controller
+2. Verify task polling works as expected
+3. Monitor logs for any issues
+4. Roll back if needed (the old architecture code is preserved in git history)
+
+**Testing:**
+
+```bash
+# Verify BMCTask controller is running
+kubectl get pods -n metal-operator-system
+kubectl logs -n metal-operator-system deployment/controller-manager | grep BMCTaskReconciler
+
+# Test task polling
+kubectl apply -f test-bmcversion.yaml
+
+# Watch task progress (should update every 30s)
+watch 'kubectl get bmc <bmc-name> -o jsonpath="{.status.tasks[0]}" | jq'
+
+# Verify consistent updates
+kubectl get bmc <bmc-name> -o jsonpath='{.status.tasks[0].lastUpdateTime}'
+# Should update every ~30 seconds for active tasks
+```
+
+**Rollback Plan:**
+
+If issues are found:
+1. Revert to the previous version
+2. The BMC controller will resume event-driven polling
+3. No data loss - tasks are persisted in BMC.Status.Tasks
+4. Report the issue with logs and reproduction steps
+
+### Migrating Consumers
+
+If you're consuming BMC operation status:
+
+**Before:**
+```go
+// Old way - check the specific controller status
+bmcVersion := &metalv1alpha1.BMCVersion{}
+client.Get(ctx, key, bmcVersion)
+progress := bmcVersion.Status.UpgradeTask.PercentComplete
+```
+
+**After:**
+```go
+// New way - check BMC tasks
+bmc := &metalv1alpha1.BMC{}
+client.Get(ctx, key, bmc)
+for _, task := range bmc.Status.Tasks {
+    if task.TaskType == metalv1alpha1.BMCTaskTypeFirmwareUpdate {
+        progress := task.PercentComplete
+        break
+    }
+}
+```
+
+## Future Enhancements
+
+Potential improvements:
+- Task filtering by date range
+- Task persistence to external storage for long-term audit
+- Webhooks/events when tasks complete
+- Task cancellation support
+- Task priority/scheduling
diff --git a/internal/controller/bmctask_controller.go b/internal/controller/bmctask_controller.go
new file mode 100644
index 000000000..adaef7970
--- /dev/null
+++ b/internal/controller/bmctask_controller.go
@@ -0,0 +1,191 @@
+// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
+// SPDX-License-Identifier: Apache-2.0
+
+package controller
+
+import (
+    "context"
+    "time"
+
+    metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
+    "github.com/ironcore-dev/metal-operator/bmc"
+    "github.com/ironcore-dev/metal-operator/internal/bmcutils"
+    "github.com/stmcginnis/gofish/schemas"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+    ctrl "sigs.k8s.io/controller-runtime"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+    "sigs.k8s.io/controller-runtime/pkg/event"
+    "sigs.k8s.io/controller-runtime/pkg/predicate"
+)
+
+// BMCTaskReconciler reconciles BMC tasks by polling task status from the BMC.
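+// It creates no task entries of its own: producer controllers append entries
+// to BMC.Status.Tasks, and this reconciler only mirrors the progress reported
+// by the BMC back into those entries.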
+// This controller is responsible for monitoring all in-progress BMC operations
+// and updating task status in BMC.Status.Tasks.
+type BMCTaskReconciler struct {
+    client.Client
+    Scheme *runtime.Scheme
+    // Insecure allows insecure connections to the BMC.
+    Insecure bool
+    // BMCOptions contains additional options for BMC clients.
+    BMCOptions bmc.Options
+    // PollInterval defines how often to poll task status from the BMC.
+    PollInterval time.Duration
+}
+
+// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=bmcs,verbs=get;list;watch
+// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=bmcs/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=bmcsecrets,verbs=get;list;watch
+// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=endpoints,verbs=get;list;watch
+
+// Reconcile monitors BMC tasks and updates their status by polling the BMC.
+func (r *BMCTaskReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+    log := ctrl.LoggerFrom(ctx)
+    log.V(1).Info("Reconciling BMC tasks")
+
+    // Fetch the BMC object
+    bmcObj := &metalv1alpha1.BMC{}
+    if err := r.Get(ctx, req.NamespacedName, bmcObj); err != nil {
+        return ctrl.Result{}, client.IgnoreNotFound(err)
+    }
+
+    // Skip reconciliation if the BMC is being deleted
+    if !bmcObj.DeletionTimestamp.IsZero() {
+        return ctrl.Result{}, nil
+    }
+
+    // Skip if there are no tasks to monitor
+    if len(bmcObj.Status.Tasks) == 0 {
+        log.V(1).Info("No tasks to monitor")
+        return ctrl.Result{}, nil
+    }
+
+    // Check if there are any non-terminal tasks
+    hasActiveTasks := false
+    for i := range bmcObj.Status.Tasks {
+        task := &bmcObj.Status.Tasks[i]
+        if !isTerminalState(task.State) {
+            hasActiveTasks = true
+            break
+        }
+    }
+
+    if !hasActiveTasks {
+        log.V(1).Info("All tasks are in terminal state")
+        return ctrl.Result{}, nil
+    }
+
+    // Get BMC client
+    bmcClient, err := bmcutils.GetBMCClientFromBMC(ctx, r.Client, bmcObj, r.Insecure, r.BMCOptions)
+    if err != nil {
+        log.V(1).Info("Failed to get BMC client, will retry", "error", err)
+        // Don't fail the reconciliation, just requeue
+        return ctrl.Result{RequeueAfter: r.PollInterval}, nil
+    }
+    defer bmcClient.Logout()
+
+    // Snapshot the object before mutating any task entries: MergeFrom must be
+    // given the pre-mutation state, otherwise the computed patch is empty and
+    // the updates never reach the API server.
+    bmcBase := bmcObj.DeepCopy()
+
+    // Poll and update task statuses
+    needsUpdate := false
+    for i := range bmcObj.Status.Tasks {
+        task := &bmcObj.Status.Tasks[i]
+
+        // Skip tasks in terminal states
+        if isTerminalState(task.State) {
+            continue
+        }
+
+        // Poll task status from BMC
+        taskStatus, err := bmcClient.GetTaskStatus(ctx, task.TaskURI)
+        if err != nil {
+            log.V(1).Info("Failed to get task status", "taskURI", task.TaskURI, "error", err)
+            continue
+        }
+
+        // Update task if status changed
+        if taskStatus != nil {
+            oldState := task.State
+            oldPercent := task.PercentComplete
+            oldMessage := task.Message
+
+            task.State = string(taskStatus.TaskState)
+            if taskStatus.PercentComplete != nil {
+                task.PercentComplete = int32(*taskStatus.PercentComplete)
+            }
+            if taskStatus.TaskStatus != "" {
+                task.Message = string(taskStatus.TaskStatus)
+            }
+            task.LastUpdateTime = metav1.Now()
+
+            // Persist (and log) only if something actually changed
+            if oldState != task.State || oldPercent != task.PercentComplete || oldMessage != task.Message {
+                log.V(1).Info("Updated task status",
+                    "taskURI", task.TaskURI,
+                    "taskType", task.TaskType,
+                    "state", task.State,
+                    "percentComplete", task.PercentComplete)
+                needsUpdate = true
+            }
+        }
+    }
+
+    // Persist changes if any tasks were updated
+    if needsUpdate {
+        if err := r.Status().Patch(ctx, bmcObj, client.MergeFrom(bmcBase)); err != nil {
+            log.Error(err, "Failed to update 
BMC task status") + return ctrl.Result{}, err + } + log.V(1).Info("Successfully updated BMC task status") + } + + // Requeue to continue monitoring active tasks + return ctrl.Result{RequeueAfter: r.PollInterval}, nil +} + +// isTerminalState checks if a task state is terminal (no further updates expected). +func isTerminalState(state string) bool { + return state == "Completed" || + state == "Failed" || + state == string(schemas.CompletedTaskState) || + state == string(schemas.KilledTaskState) || + state == string(schemas.ExceptionTaskState) || + state == string(schemas.CancelledTaskState) +} + +// SetupWithManager sets up the controller with the Manager. +func (r *BMCTaskReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&metalv1alpha1.BMC{}). + WithEventFilter(hasTasksPredicate()). + Complete(r) +} + +// hasTasksPredicate filters BMC events to only reconcile BMCs that have tasks. +func hasTasksPredicate() predicate.Predicate { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + bmc, ok := e.Object.(*metalv1alpha1.BMC) + if !ok { + return false + } + return len(bmc.Status.Tasks) > 0 + }, + UpdateFunc: func(e event.UpdateEvent) bool { + bmcNew, ok := e.ObjectNew.(*metalv1alpha1.BMC) + if !ok { + return false + } + return len(bmcNew.Status.Tasks) > 0 + }, + DeleteFunc: func(e event.DeleteEvent) bool { + // Don't reconcile on delete + return false + }, + GenericFunc: func(e event.GenericEvent) bool { + bmc, ok := e.Object.(*metalv1alpha1.BMC) + if !ok { + return false + } + return len(bmc.Status.Tasks) > 0 + }, + } +} diff --git a/internal/controller/bmctask_controller_test.go b/internal/controller/bmctask_controller_test.go new file mode 100644 index 000000000..19f2af7ee --- /dev/null +++ b/internal/controller/bmctask_controller_test.go @@ -0,0 +1,908 @@ +// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "time" + + metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + . 
"sigs.k8s.io/controller-runtime/pkg/envtest/komega" + "sigs.k8s.io/controller-runtime/pkg/event" +) + +var _ = Describe("BMCTask Controller", func() { + _ = SetupTest(nil) + + AfterEach(func(ctx SpecContext) { + EnsureCleanState() + }) + + It("Should update BMC.Status.Tasks when polling active tasks", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with active tasks") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:ff", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + TargetID: "Drive-1", + State: "Running", + PercentComplete: 0, + Message: "Erasing disk", + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring that the task status is updated by the controller") + // The mock BMC will return Completed status + Eventually(Object(bmc)).Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(1)), + HaveField("Status.Tasks[0].State", "Completed"), + HaveField("Status.Tasks[0].PercentComplete", BeNumerically(">=", 0)), + )) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should only reconcile BMCs with tasks due to event filter", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource without tasks") + bmcWithoutTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-notasks-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:11", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + } + Expect(k8sClient.Create(ctx, bmcWithoutTasks)).To(Succeed()) + + By("Ensuring BMC without tasks remains unchanged") + Consistently(Object(bmcWithoutTasks)).Should(HaveField("Status.Tasks", BeEmpty())) + + By("Creating a BMC resource with tasks") + bmcWithTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-withtasks-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:22", + }, + Protocol: metalv1alpha1.Protocol{ + Name: 
metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + PercentComplete: 0, + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmcWithTasks)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmcWithTasks)).To(Succeed()) + + By("Ensuring BMC with tasks is reconciled") + Eventually(Object(bmcWithTasks)).Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(1)), + HaveField("Status.Tasks[0].State", "Completed"), + )) + + // cleanup + Expect(k8sClient.Delete(ctx, bmcWithoutTasks)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcWithTasks)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should automatically requeue when active tasks exist", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with an active task") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:33", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/active", + TaskType: metalv1alpha1.BMCTaskTypeFirmwareUpdate, + State: "Running", + PercentComplete: 25, + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring the task is polled multiple times due to requeue") + initialUpdateTime := metav1.Now() + + // Since the mock returns completed, we verify the task was updated + Eventually(Object(bmc)).Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(1)), + HaveField("Status.Tasks[0].State", "Completed"), + HaveField("Status.Tasks[0].LastUpdateTime", Not(Equal(initialUpdateTime))), + )) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should not requeue when all tasks are in terminal state", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with only terminal tasks") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:44", + }, + Protocol: metalv1alpha1.Protocol{ + Name: 
metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/completed", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Completed", + PercentComplete: 100, + LastUpdateTime: metav1.Now(), + }, + { + TaskURI: "/redfish/v1/TaskService/Tasks/failed", + TaskType: metalv1alpha1.BMCTaskTypeBIOSReset, + State: "Failed", + PercentComplete: 50, + Message: "Operation failed", + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring terminal tasks are not updated") + // Store the initial last update time + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bmc), bmc)).To(Succeed()) + initialUpdateTime1 := bmc.Status.Tasks[0].LastUpdateTime + initialUpdateTime2 := bmc.Status.Tasks[1].LastUpdateTime + + // Wait a bit and verify the tasks haven't changed + time.Sleep(200 * time.Millisecond) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bmc), bmc)).To(Succeed()) + + Expect(bmc.Status.Tasks).To(HaveLen(2)) + Expect(bmc.Status.Tasks[0].LastUpdateTime).To(Equal(initialUpdateTime1)) + Expect(bmc.Status.Tasks[1].LastUpdateTime).To(Equal(initialUpdateTime2)) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should handle BMC client errors gracefully", func(ctx SpecContext) { + By("Creating a BMCSecret with invalid credentials") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("invalid"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("invalid"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with active tasks") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP("192.0.2.1"), // TEST-NET-1 (unreachable) + MACAddress: "aa:bb:cc:dd:ee:55", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfish, + Port: 8000, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + PercentComplete: 0, + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring the controller handles the error gracefully") + // The controller should not crash and should keep retrying + Consistently(Object(bmc), "2s", "100ms").Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(1)), + HaveField("Status.Tasks[0].State", "Running"), + )) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should only update changed tasks", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + 
metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with mixed terminal and active tasks") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:66", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/completed", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Completed", + PercentComplete: 100, + Message: "Disk erased successfully", + LastUpdateTime: metav1.Now(), + }, + { + TaskURI: "/redfish/v1/TaskService/Tasks/active", + TaskType: metalv1alpha1.BMCTaskTypeBIOSReset, + State: "Running", + PercentComplete: 50, + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Getting the initial state") + Eventually(Get(bmc)).Should(Succeed()) + initialTask1UpdateTime := bmc.Status.Tasks[0].LastUpdateTime + initialTask2UpdateTime := bmc.Status.Tasks[1].LastUpdateTime + + By("Ensuring only active task is updated") + Eventually(Object(bmc)).Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(2)), + // First task (completed) should remain unchanged + HaveField("Status.Tasks[0].State", "Completed"), + HaveField("Status.Tasks[0].PercentComplete", BeNumerically("==", 100)), + // Second task (active) should be updated by the mock BMC + HaveField("Status.Tasks[1].State", "Completed"), + HaveField("Status.Tasks[1].LastUpdateTime", Not(Equal(initialTask2UpdateTime))), + )) + + // Verify first task was not updated + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bmc), bmc)).To(Succeed()) + Expect(bmc.Status.Tasks[0].LastUpdateTime).To(Equal(initialTask1UpdateTime)) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should handle multiple tasks with mixed states correctly", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with multiple tasks in various states") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:77", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/task1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + TargetID: "Drive-1", + State: "Running", + PercentComplete: 10, + Message: "Erasing drive 1", + LastUpdateTime: metav1.Now(), + }, + { + TaskURI: 
"/redfish/v1/TaskService/Tasks/task2", + TaskType: metalv1alpha1.BMCTaskTypeBMCReset, + State: "Completed", + PercentComplete: 100, + Message: "BMC reset completed", + LastUpdateTime: metav1.Now(), + }, + { + TaskURI: "/redfish/v1/TaskService/Tasks/task3", + TaskType: metalv1alpha1.BMCTaskTypeFirmwareUpdate, + State: "Running", + PercentComplete: 75, + Message: "Updating firmware", + LastUpdateTime: metav1.Now(), + }, + { + TaskURI: "/redfish/v1/TaskService/Tasks/task4", + TaskType: metalv1alpha1.BMCTaskTypeNetworkClear, + State: "Failed", + PercentComplete: 0, + Message: "Network clear failed", + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring only non-terminal tasks are updated") + Eventually(Object(bmc)).Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(4)), + // Task 1: was Running, should be updated to Completed by mock + HaveField("Status.Tasks[0].State", "Completed"), + // Task 2: was Completed, should remain Completed + HaveField("Status.Tasks[1].State", "Completed"), + HaveField("Status.Tasks[1].PercentComplete", BeNumerically("==", 100)), + // Task 3: was Running, should be updated to Completed by mock + HaveField("Status.Tasks[2].State", "Completed"), + // Task 4: was Failed, should remain Failed + HaveField("Status.Tasks[3].State", "Failed"), + HaveField("Status.Tasks[3].Message", "Network clear failed"), + )) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should skip reconciliation if BMC is being deleted", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource with tasks and a finalizer") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + Finalizers: []string{"test.finalizer"}, + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:88", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + PercentComplete: 0, + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Deleting the BMC") + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + + By("Ensuring tasks are not updated during deletion") + Eventually(Get(bmc)).Should(Succeed()) + Expect(bmc.DeletionTimestamp).NotTo(BeNil()) + + // Store the task state when deletion started + initialTaskState := bmc.Status.Tasks[0].State + initialUpdateTime := bmc.Status.Tasks[0].LastUpdateTime + + // Wait a bit and verify the task hasn't been updated + time.Sleep(200 * time.Millisecond) + Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bmc), bmc)).To(Succeed()) + 
Expect(bmc.Status.Tasks[0].State).To(Equal(initialTaskState)) + Expect(bmc.Status.Tasks[0].LastUpdateTime).To(Equal(initialUpdateTime)) + + By("Removing finalizer to allow deletion") + Eventually(Update(bmc, func() { + bmc.Finalizers = []string{} + })).Should(Succeed()) + + // cleanup + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should handle BMCs with empty task list", func(ctx SpecContext) { + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC resource") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:99", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{}, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring the controller doesn't fail with empty task list") + Consistently(Object(bmc), "1s", "100ms").Should(HaveField("Status.Tasks", BeEmpty())) + + // cleanup + Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) + + It("Should register BMCTask controller in the test setup", func(ctx SpecContext) { + By("Verifying the BMCTask controller is registered") + // This test verifies that the controller is properly set up in suite_test.go + // The fact that other tests pass indicates the controller is working + // This is a placeholder to ensure we remember to register it in suite_test.go + + By("Creating a BMCSecret") + bmcSecret := &metalv1alpha1.BMCSecret{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-", + }, + Data: map[string][]byte{ + metalv1alpha1.BMCSecretUsernameKeyName: []byte("foo"), + metalv1alpha1.BMCSecretPasswordKeyName: []byte("bar"), + }, + } + Expect(k8sClient.Create(ctx, bmcSecret)).To(Succeed()) + + By("Creating a BMC with tasks to trigger reconciliation") + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-bmc-controller-", + }, + Spec: metalv1alpha1.BMCSpec{ + Endpoint: &metalv1alpha1.InlineEndpoint{ + IP: metalv1alpha1.MustParseIP(MockServerIP), + MACAddress: "aa:bb:cc:dd:ee:00", + }, + Protocol: metalv1alpha1.Protocol{ + Name: metalv1alpha1.ProtocolRedfishLocal, + Port: MockServerPort, + }, + BMCSecretRef: v1.LocalObjectReference{ + Name: bmcSecret.Name, + }, + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + PercentComplete: 0, + LastUpdateTime: metav1.Now(), + }, + }, + }, + } + Expect(k8sClient.Create(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Status().Update(ctx, bmc)).To(Succeed()) + + By("Ensuring controller processes the BMC task") + Eventually(Object(bmc), "5s", "100ms").Should(SatisfyAll( + HaveField("Status.Tasks", HaveLen(1)), + HaveField("Status.Tasks[0].State", "Completed"), + )) + + // cleanup + 
Expect(k8sClient.Delete(ctx, bmc)).To(Succeed()) + Expect(k8sClient.Delete(ctx, bmcSecret)).To(Succeed()) + }) +}) + +var _ = Describe("BMCTask Event Filter", func() { + It("Should filter BMCs without tasks on create event", func() { + predicate := hasTasksPredicate() + + By("Testing with BMC without tasks") + bmcWithoutTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc-no-tasks", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{}, + }, + } + + // Create event should be filtered (return false) + Expect(predicate.Create(MockCreateEvent(bmcWithoutTasks))).To(BeFalse()) + + By("Testing with BMC with tasks") + bmcWithTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc-with-tasks", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + }, + }, + }, + } + + // Create event should pass (return true) + Expect(predicate.Create(MockCreateEvent(bmcWithTasks))).To(BeTrue()) + }) + + It("Should filter BMCs without tasks on update event", func() { + predicate := hasTasksPredicate() + + By("Testing update with old BMC having tasks, new BMC without tasks") + oldBMC := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + }, + }, + }, + } + newBMC := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{}, + }, + } + + // Update event should be filtered when new BMC has no tasks + Expect(predicate.Update(MockUpdateEvent(oldBMC, newBMC))).To(BeFalse()) + + By("Testing update with both BMCs having tasks") + newBMCWithTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Completed", + }, + }, + }, + } + + // Update event should pass when new BMC has tasks + Expect(predicate.Update(MockUpdateEvent(oldBMC, newBMCWithTasks))).To(BeTrue()) + }) + + It("Should always filter delete events", func() { + predicate := hasTasksPredicate() + + bmc := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: "/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + }, + }, + }, + } + + // Delete events should always be filtered regardless of tasks + Expect(predicate.Delete(MockDeleteEvent(bmc))).To(BeFalse()) + }) + + It("Should filter generic events based on task presence", func() { + predicate := hasTasksPredicate() + + By("Testing generic event without tasks") + bmcWithoutTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{}, + }, + } + + Expect(predicate.Generic(MockGenericEvent(bmcWithoutTasks))).To(BeFalse()) + + By("Testing generic event with tasks") + bmcWithTasks := &metalv1alpha1.BMC{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-bmc", + }, + Status: metalv1alpha1.BMCStatus{ + Tasks: []metalv1alpha1.BMCTask{ + { + TaskURI: 
"/redfish/v1/TaskService/Tasks/1", + TaskType: metalv1alpha1.BMCTaskTypeDiskErase, + State: "Running", + }, + }, + }, + } + + Expect(predicate.Generic(MockGenericEvent(bmcWithTasks))).To(BeTrue()) + }) +}) + +var _ = Describe("isTerminalState", func() { + It("Should identify terminal states correctly", func() { + By("Testing completed state") + Expect(isTerminalState("Completed")).To(BeTrue()) + + By("Testing failed state") + Expect(isTerminalState("Failed")).To(BeTrue()) + + By("Testing Redfish terminal states") + Expect(isTerminalState("Killed")).To(BeTrue()) + Expect(isTerminalState("Exception")).To(BeTrue()) + Expect(isTerminalState("Cancelled")).To(BeTrue()) + + By("Testing non-terminal states") + Expect(isTerminalState("Running")).To(BeFalse()) + Expect(isTerminalState("Pending")).To(BeFalse()) + Expect(isTerminalState("Starting")).To(BeFalse()) + Expect(isTerminalState("")).To(BeFalse()) + }) +}) + +// Helper functions for creating mock events for predicate testing + +// MockCreateEvent creates a mock CreateEvent for testing predicates. +func MockCreateEvent(obj client.Object) event.CreateEvent { + return event.CreateEvent{ + Object: obj, + } +} + +// MockUpdateEvent creates a mock UpdateEvent for testing predicates. +func MockUpdateEvent(oldObj, newObj client.Object) event.UpdateEvent { + return event.UpdateEvent{ + ObjectOld: oldObj, + ObjectNew: newObj, + } +} + +// MockDeleteEvent creates a mock DeleteEvent for testing predicates. +func MockDeleteEvent(obj client.Object) event.DeleteEvent { + return event.DeleteEvent{ + Object: obj, + } +} + +// MockGenericEvent creates a mock GenericEvent for testing predicates. +func MockGenericEvent(obj client.Object) event.GenericEvent { + return event.GenericEvent{ + Object: obj, + } +} diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index b8218cb6d..707054aa6 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -104,6 +104,7 @@ type ServerReconciler struct { // +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servers/status,verbs=get;update;patch // +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servers/finalizers,verbs=update // +kubebuilder:rbac:groups=metal.ironcore.dev,resources=serverconfigurations,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servercleanings,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="batch",resources=jobs,verbs=get;list;watch;create;update;patch;delete @@ -285,6 +286,8 @@ func (r *ServerReconciler) ensureServerStateTransition(ctx context.Context, bmcC return r.handleAvailableState(ctx, bmcClient, server) case metalv1alpha1.ServerStateReserved: return r.handleReservedState(ctx, bmcClient, server) + case metalv1alpha1.ServerStateTainted: + return r.handleTaintedState(ctx, bmcClient, server) case metalv1alpha1.ServerStateMaintenance: return r.handleMaintenanceState(ctx, bmcClient, server) default: @@ -417,8 +420,17 @@ func (r *ServerReconciler) handleReservedState(ctx context.Context, bmcClient bm // TODO: This needs be reworked later as the Server cleanup has to happen here. For now we just transition the server // back to available state. 
 if server.Spec.ServerClaimRef == nil {
-    if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
-        return true, err
+    // Check if server has taints
+    if len(server.Spec.Taints) > 0 {
+        log.V(1).Info("Server has taints, transitioning to Tainted state for cleaning")
+        if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateTainted); err != nil || modified {
+            return true, err
+        }
+    } else {
+        // No taints, transition directly to Available
+        if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
+            return true, err
+        }
+    }
 }
@@ -467,6 +479,54 @@ func (r *ServerReconciler) handleReservedState(ctx context.Context, bmcClient bm
 	return true, nil
 }
+func (r *ServerReconciler) handleTaintedState(ctx context.Context, _ bmc.BMC, server *metalv1alpha1.Server) (bool, error) {
+    log := ctrl.LoggerFrom(ctx)
+
+    // Check if a ServerCleaning exists for this server
+    cleaningList := &metalv1alpha1.ServerCleaningList{}
+    if err := r.List(ctx, cleaningList); err != nil {
+        return false, fmt.Errorf("failed to list ServerCleaning resources: %w", err)
+    }
+
+    var activeCleaning *metalv1alpha1.ServerCleaning
+    for i := range cleaningList.Items {
+        cleaning := &cleaningList.Items[i]
+        // ServerRef is optional (a ServerSelector may be used instead), so guard against nil
+        if cleaning.Spec.ServerRef == nil || cleaning.Spec.ServerRef.Name != server.Name {
+            continue
+        }
+        if cleaning.Status.State == metalv1alpha1.ServerCleaningStateCompleted {
+            // Cleaning completed, remove taints and transition to Available
+            log.V(1).Info("Cleaning completed, removing taints")
+            serverBase := server.DeepCopy()
+            server.Spec.Taints = nil
+            if err := r.Patch(ctx, server, client.MergeFrom(serverBase)); err != nil {
+                return false, fmt.Errorf("failed to remove taints: %w", err)
+            }
+
+            // Transition to Available
+            if modified, err := r.patchServerState(ctx, server, metalv1alpha1.ServerStateAvailable); err != nil || modified {
+                return modified, err
+            }
+            return false, nil
+        }
+        if cleaning.Status.State == metalv1alpha1.ServerCleaningStatePending ||
+            cleaning.Status.State == metalv1alpha1.ServerCleaningStateInProgress {
+            activeCleaning = cleaning
+            break
+        }
+    }
+
+    if activeCleaning == nil {
+        log.V(1).Info("No active ServerCleaning found, waiting for cleaning to be created")
+        // A separate controller or operator should create ServerCleaning
+        // Requeue to check again
+        return true, nil
+    }
+
+    log.V(1).Info("Server cleaning in progress", "cleaningState", activeCleaning.Status.State)
+    return true, nil
+}
+
 func (r *ServerReconciler) handleMaintenanceState(ctx context.Context, bmcClient bmc.BMC, server *metalv1alpha1.Server) (bool, error) {
 	log := ctrl.LoggerFrom(ctx)
 	if server.Spec.ServerMaintenanceRef == nil {
diff --git a/internal/controller/servercleaning_controller.go b/internal/controller/servercleaning_controller.go
new file mode 100644
index 000000000..3c256ad41
--- /dev/null
+++ b/internal/controller/servercleaning_controller.go
@@ -0,0 +1,766 @@
+// SPDX-FileCopyrightText: 2025 SAP SE or an SAP affiliate company and IronCore contributors
+// SPDX-License-Identifier: Apache-2.0
+
+package controller
+
+import (
+    "context"
+    "fmt"
+    "slices"
+    "time"
+
+    "github.com/ironcore-dev/controller-utils/clientutils"
+    metalv1alpha1 "github.com/ironcore-dev/metal-operator/api/v1alpha1"
+    "github.com/ironcore-dev/metal-operator/bmc"
+    "github.com/ironcore-dev/metal-operator/internal/bmcutils"
+    "k8s.io/apimachinery/pkg/api/meta"
+    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/runtime"
+    
"k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +const ( + // ServerCleaningFinalizer is the finalizer for the ServerCleaning resource. + ServerCleaningFinalizer = "metal.ironcore.dev/servercleaning" + + // ServerCleaningConditionTypeCleaning indicates that cleaning is in progress + ServerCleaningConditionTypeCleaning = "Cleaning" + + // ServerCleaningConditionReasonInProgress indicates cleaning is in progress + ServerCleaningConditionReasonInProgress = "CleaningInProgress" + + // ServerCleaningConditionReasonCompleted indicates cleaning is completed + ServerCleaningConditionReasonCompleted = "CleaningCompleted" + + // ServerCleaningConditionReasonFailed indicates cleaning failed + ServerCleaningConditionReasonFailed = "CleaningFailed" + + // Task state constants + taskStateCompleted = "Completed" + taskStateException = "Exception" + taskStateCancelled = "Cancelled" + taskStateKilled = "Killed" + taskStateFailed = "Failed" + taskStateNew = "New" +) + +// ServerCleaningReconciler reconciles a ServerCleaning object +type ServerCleaningReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servercleanings,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servercleanings/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servercleanings/finalizers,verbs=update +// +kubebuilder:rbac:groups=metal.ironcore.dev,resources=servermaintenances,verbs=get;list;watch;create;update;patch;delete + +// Reconcile is part of the main kubernetes reconciliation loop which aims to +// move the current state of the cluster closer to the desired state. 
+func (r *ServerCleaningReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + cleaning := &metalv1alpha1.ServerCleaning{} + if err := r.Get(ctx, req.NamespacedName, cleaning); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + return r.reconcileExists(ctx, cleaning) +} + +func (r *ServerCleaningReconciler) reconcileExists(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) { + if !cleaning.DeletionTimestamp.IsZero() { + return r.delete(ctx, cleaning) + } + return r.reconcile(ctx, cleaning) +} + +func (r *ServerCleaningReconciler) reconcile(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + log.V(1).Info("Reconciling ServerCleaning") + + // Ensure finalizer + if modified, err := clientutils.PatchEnsureFinalizer(ctx, r.Client, cleaning, ServerCleaningFinalizer); err != nil || modified { + return ctrl.Result{}, err + } + + // Set initial state if not set + if cleaning.Status.State == "" { + if modified, err := r.patchCleaningState(ctx, cleaning, metalv1alpha1.ServerCleaningStatePending); err != nil || modified { + return ctrl.Result{}, err + } + } + + return r.ensureServerCleaningStateTransition(ctx, cleaning) +} + +func (r *ServerCleaningReconciler) ensureServerCleaningStateTransition(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + switch cleaning.Status.State { + case metalv1alpha1.ServerCleaningStatePending: + return r.handlePendingState(ctx, cleaning) + case metalv1alpha1.ServerCleaningStateInProgress: + return r.handleInProgressState(ctx, cleaning) + case metalv1alpha1.ServerCleaningStateCompleted: + return r.handleCompletedState(ctx, cleaning) + case metalv1alpha1.ServerCleaningStateFailed: + return r.handleFailedState(ctx, cleaning) + default: + log.V(1).Info("Unknown ServerCleaning state, skipping reconciliation", "State", cleaning.Status.State) + return ctrl.Result{}, nil + } +} + +func (r *ServerCleaningReconciler) handlePendingState(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + + // Get list of servers to clean + servers, err := r.getServersForCleaning(ctx, cleaning) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get servers for cleaning: %w", err) + } + + if len(servers) == 0 { + log.V(1).Info("No servers found for cleaning") + return ctrl.Result{}, nil + } + + // Update selected servers count + if err := r.updateSelectedServersCount(ctx, cleaning, int32(len(servers))); err != nil { + return ctrl.Result{}, err + } + + // Initialize server status entries + if err := r.initializeServerStatuses(ctx, cleaning, servers); err != nil { + return ctrl.Result{}, err + } + + // Initiate BMC cleaning operations for each server + pendingCount := int32(0) + inProgressCount := int32(0) + failedCount := int32(0) + + for _, server := range servers { + if server.Status.State != metalv1alpha1.ServerStateTainted { + log.V(1).Info("Server is not in Tainted state, skipping", "Server", server.Name, "State", server.Status.State) + continue + } + + // Initiate cleaning operations via BMC + if err := r.initiateBMCCleaning(ctx, cleaning, &server); err != nil { + log.Error(err, "Failed to initiate BMC cleaning for server", "Server", server.Name) + if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateFailed, fmt.Sprintf("Failed to initiate cleaning: %v", err)); err != nil { + 
return ctrl.Result{}, err + } + failedCount++ + continue + } + + inProgressCount++ + if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateInProgress, "Cleaning initiated"); err != nil { + return ctrl.Result{}, err + } + } + + // Update status counts + if err := r.updateCleaningCounts(ctx, cleaning, pendingCount, inProgressCount, 0, failedCount); err != nil { + return ctrl.Result{}, err + } + + // Update status condition + if err := r.setCondition(ctx, cleaning, metav1.Condition{ + Type: ServerCleaningConditionTypeCleaning, + Status: metav1.ConditionTrue, + Reason: ServerCleaningConditionReasonInProgress, + Message: fmt.Sprintf("Cleaning operations initiated for %d servers", inProgressCount), + ObservedGeneration: cleaning.Generation, + }); err != nil { + return ctrl.Result{}, err + } + + // Transition to InProgress + if modified, err := r.patchCleaningState(ctx, cleaning, metalv1alpha1.ServerCleaningStateInProgress); err != nil || modified { + return ctrl.Result{}, err + } + + // Requeue to monitor task progress + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil +} + +func (r *ServerCleaningReconciler) handleInProgressState(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + + // Get servers for cleaning + servers, err := r.getServersForCleaning(ctx, cleaning) + if err != nil { + return ctrl.Result{}, fmt.Errorf("failed to get servers for cleaning: %w", err) + } + + if len(servers) == 0 { + log.V(1).Info("No servers found for monitoring") + return ctrl.Result{}, nil + } + + // Track counts + var inProgressCount, completedCount, failedCount int32 + allComplete := true + + // Monitor each server's cleaning tasks + for _, server := range servers { + // Find the server status entry + var serverStatus *metalv1alpha1.ServerCleaningStatusEntry + for i := range cleaning.Status.ServerCleaningStatuses { + if cleaning.Status.ServerCleaningStatuses[i].ServerName == server.Name { + serverStatus = &cleaning.Status.ServerCleaningStatuses[i] + break + } + } + + if serverStatus == nil { + log.V(1).Info("No status entry found for server", "server", server.Name) + continue + } + + // Skip servers that are already in terminal states + if serverStatus.State == metalv1alpha1.ServerCleaningStateCompleted { + completedCount++ + continue + } + if serverStatus.State == metalv1alpha1.ServerCleaningStateFailed { + failedCount++ + continue + } + + // Check BMC tasks for this server + // Tasks are now in BMC.Status.Tasks and monitored by BMCTask controller + tasks, err := r.getTasksForServer(ctx, &server, cleaning.Name) + if err != nil { + log.Error(err, "Failed to get BMC tasks for server", "server", server.Name) + allComplete = false + inProgressCount++ + continue + } + + // Check if all tasks are complete + tasksComplete, tasksFailed := r.checkTasksComplete(tasks) + + if tasksComplete { + // All tasks finished - update server status + if tasksFailed { + log.Info("Cleaning completed with failures", "server", server.Name) + if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateFailed, "One or more cleaning tasks failed"); err != nil { + return ctrl.Result{}, err + } + failedCount++ + } else { + log.Info("Cleaning completed successfully", "server", server.Name) + if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateCompleted, "All cleaning tasks completed successfully"); err != nil { + return ctrl.Result{}, err + } + completedCount++ + 
+
+func (r *ServerCleaningReconciler) handleInProgressState(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Get servers for cleaning
+	servers, err := r.getServersForCleaning(ctx, cleaning)
+	if err != nil {
+		return ctrl.Result{}, fmt.Errorf("failed to get servers for cleaning: %w", err)
+	}
+
+	if len(servers) == 0 {
+		log.V(1).Info("No servers found for monitoring")
+		return ctrl.Result{}, nil
+	}
+
+	// Track counts
+	var inProgressCount, completedCount, failedCount int32
+	allComplete := true
+
+	// Monitor each server's cleaning tasks
+	for _, server := range servers {
+		// Find the server status entry
+		var serverStatus *metalv1alpha1.ServerCleaningStatusEntry
+		for i := range cleaning.Status.ServerCleaningStatuses {
+			if cleaning.Status.ServerCleaningStatuses[i].ServerName == server.Name {
+				serverStatus = &cleaning.Status.ServerCleaningStatuses[i]
+				break
+			}
+		}
+
+		if serverStatus == nil {
+			log.V(1).Info("No status entry found for server", "server", server.Name)
+			continue
+		}
+
+		// Skip servers that are already in terminal states
+		if serverStatus.State == metalv1alpha1.ServerCleaningStateCompleted {
+			completedCount++
+			continue
+		}
+		if serverStatus.State == metalv1alpha1.ServerCleaningStateFailed {
+			failedCount++
+			continue
+		}
+
+		// Check BMC tasks for this server
+		// Tasks are now in BMC.Status.Tasks and monitored by BMCTask controller
+		tasks, err := r.getTasksForServer(ctx, &server, cleaning.Name)
+		if err != nil {
+			log.Error(err, "Failed to get BMC tasks for server", "server", server.Name)
+			allComplete = false
+			inProgressCount++
+			continue
+		}
+
+		// Check if all tasks are complete
+		tasksComplete, tasksFailed := r.checkTasksComplete(tasks)
+
+		if tasksComplete {
+			// All tasks finished - update server status
+			if tasksFailed {
+				log.Info("Cleaning completed with failures", "server", server.Name)
+				if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateFailed, "One or more cleaning tasks failed"); err != nil {
+					return ctrl.Result{}, err
+				}
+				failedCount++
+			} else {
+				log.Info("Cleaning completed successfully", "server", server.Name)
+				if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateCompleted, "All cleaning tasks completed successfully"); err != nil {
+					return ctrl.Result{}, err
+				}
+				completedCount++
+			}
+		} else {
+			// Tasks still in progress
+			inProgressCount++
+			allComplete = false
+
+			// Calculate progress
+			completedTaskCount := 0
+			totalPercent := int32(0)
+			for _, task := range tasks {
+				if task.State == taskStateCompleted {
+					completedTaskCount++
+				}
+				totalPercent += task.PercentComplete
+			}
+			avgPercent := int32(0)
+			if len(tasks) > 0 {
+				avgPercent = totalPercent / int32(len(tasks))
+			}
+			progressMsg := fmt.Sprintf("Cleaning in progress: %d%% (%d/%d tasks completed)", avgPercent, completedTaskCount, len(tasks))
+
+			if err := r.updateServerStatus(ctx, cleaning, server.Name, metalv1alpha1.ServerCleaningStateInProgress, progressMsg); err != nil {
+				return ctrl.Result{}, err
+			}
+		}
+	}
+
+	// Update counts
+	if err := r.updateCleaningCounts(ctx, cleaning, 0, inProgressCount, completedCount, failedCount); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	// Check if all cleanings are complete
+	totalServers := cleaning.Status.SelectedServers
+	processedServers := completedCount + failedCount
+
+	if allComplete && processedServers >= totalServers {
+		// All servers processed
+		if failedCount > 0 {
+			log.V(1).Info("Cleaning completed with failures", "completed", completedCount, "failed", failedCount)
+			if err := r.setCondition(ctx, cleaning, metav1.Condition{
+				Type:               ServerCleaningConditionTypeCleaning,
+				Status:             metav1.ConditionFalse,
+				Reason:             ServerCleaningConditionReasonFailed,
+				Message:            fmt.Sprintf("Cleaning completed: %d succeeded, %d failed", completedCount, failedCount),
+				ObservedGeneration: cleaning.Generation,
+			}); err != nil {
+				return ctrl.Result{}, err
+			}
+			if modified, err := r.patchCleaningState(ctx, cleaning, metalv1alpha1.ServerCleaningStateFailed); err != nil || modified {
+				return ctrl.Result{}, err
+			}
+		} else {
+			log.V(1).Info("Cleaning completed successfully", "completed", completedCount)
+			if err := r.setCondition(ctx, cleaning, metav1.Condition{
+				Type:               ServerCleaningConditionTypeCleaning,
+				Status:             metav1.ConditionTrue,
+				Reason:             ServerCleaningConditionReasonCompleted,
+				Message:            fmt.Sprintf("Cleaning completed successfully for %d servers", completedCount),
+				ObservedGeneration: cleaning.Generation,
+			}); err != nil {
+				return ctrl.Result{}, err
+			}
+			if modified, err := r.patchCleaningState(ctx, cleaning, metalv1alpha1.ServerCleaningStateCompleted); err != nil || modified {
+				return ctrl.Result{}, err
+			}
+		}
+		return ctrl.Result{}, nil
+	}
+
+	// Still in progress, requeue to check again
+	log.V(1).Info("Cleaning still in progress", "inProgress", inProgressCount, "completed", completedCount, "failed", failedCount)
+	return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
+}
+
+func (r *ServerCleaningReconciler) handleCompletedState(ctx context.Context, _ *metalv1alpha1.ServerCleaning) (ctrl.Result, error) {
+	log := ctrl.LoggerFrom(ctx)
+	log.V(1).Info("ServerCleaning completed, nothing to do")
+	return ctrl.Result{}, nil
+}
+
+func (r *ServerCleaningReconciler) handleFailedState(ctx context.Context, _ *metalv1alpha1.ServerCleaning) (ctrl.Result, error) {
+	log := ctrl.LoggerFrom(ctx)
+	log.V(1).Info("ServerCleaning failed, manual intervention required")
+	return ctrl.Result{}, nil
+}
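For orientation: the state handlers above are presumably dispatched from the controller's main reconcile function, which is not part of this hunk. A minimal sketch of that dispatch, with the function name and the default branch assumed, could look like this:

```go
// Hypothetical dispatch (not in this diff); the handler names match the
// functions above, everything else is an assumption.
func (r *ServerCleaningReconciler) reconcile(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) {
	switch cleaning.Status.State {
	case metalv1alpha1.ServerCleaningStateInProgress:
		return r.handleInProgressState(ctx, cleaning)
	case metalv1alpha1.ServerCleaningStateCompleted:
		return r.handleCompletedState(ctx, cleaning)
	case metalv1alpha1.ServerCleaningStateFailed:
		return r.handleFailedState(ctx, cleaning)
	default: // empty state or Pending
		return r.handlePendingState(ctx, cleaning)
	}
}
```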
+
+func (r *ServerCleaningReconciler) delete(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) (ctrl.Result, error) {
+	log := ctrl.LoggerFrom(ctx)
+	log.V(1).Info("Deleting ServerCleaning")
+
+	// Remove finalizer
+	if err := clientutils.PatchRemoveFinalizer(ctx, r.Client, cleaning, ServerCleaningFinalizer); err != nil {
+		return ctrl.Result{}, err
+	}
+
+	return ctrl.Result{}, nil
+}
+
+func (r *ServerCleaningReconciler) patchCleaningState(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, state metalv1alpha1.ServerCleaningState) (bool, error) {
+	if cleaning.Status.State == state {
+		return false, nil
+	}
+
+	cleaningBase := cleaning.DeepCopy()
+	cleaning.Status.State = state
+	if err := r.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase)); err != nil {
+		return false, fmt.Errorf("failed to patch ServerCleaning state: %w", err)
+	}
+
+	return true, nil
+}
+
+func (r *ServerCleaningReconciler) setCondition(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, condition metav1.Condition) error {
+	cleaningBase := cleaning.DeepCopy()
+	condition.LastTransitionTime = metav1.Now()
+	meta.SetStatusCondition(&cleaning.Status.Conditions, condition)
+	if err := r.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase)); err != nil {
+		return fmt.Errorf("failed to update conditions: %w", err)
+	}
+	return nil
+}
+
+// SetupWithManager sets up the controller with the Manager.
+func (r *ServerCleaningReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	return ctrl.NewControllerManagedBy(mgr).
+		For(&metalv1alpha1.ServerCleaning{}).
+		Owns(&metalv1alpha1.ServerMaintenance{}).
+		Watches(
+			&metalv1alpha1.Server{},
+			handler.EnqueueRequestsFromMapFunc(r.mapServerToServerCleaning),
+		).
+		Watches(
+			&metalv1alpha1.BMC{},
+			handler.EnqueueRequestsFromMapFunc(r.mapBMCToServerCleaning),
+		).
+		Complete(r)
+}
+
+func (r *ServerCleaningReconciler) mapServerToServerCleaning(ctx context.Context, obj client.Object) []reconcile.Request {
+	server := obj.(*metalv1alpha1.Server)
+
+	cleaningList := &metalv1alpha1.ServerCleaningList{}
+	if err := r.List(ctx, cleaningList); err != nil {
+		return nil
+	}
+
+	var requests []reconcile.Request
+	for _, cleaning := range cleaningList.Items {
+		// ServerRef is nil for selector-based cleanings; guard against it to
+		// avoid a nil pointer dereference.
+		if cleaning.Spec.ServerRef != nil && cleaning.Spec.ServerRef.Name == server.Name {
+			requests = append(requests, reconcile.Request{
+				NamespacedName: client.ObjectKeyFromObject(&cleaning),
+			})
+		}
+	}
+
+	return requests
+}
+
+// mapBMCToServerCleaning maps BMC updates (specifically task status changes) to ServerCleaning reconcile requests
+func (r *ServerCleaningReconciler) mapBMCToServerCleaning(ctx context.Context, obj client.Object) []reconcile.Request {
+	bmcObj := obj.(*metalv1alpha1.BMC)
+
+	// Find all servers that reference this BMC
+	serverList := &metalv1alpha1.ServerList{}
+	if err := r.List(ctx, serverList); err != nil {
+		return nil
+	}
+
+	var affectedServers []string
+	for _, server := range serverList.Items {
+		if server.Spec.BMCRef != nil && server.Spec.BMCRef.Name == bmcObj.Name {
+			affectedServers = append(affectedServers, server.Name)
+		}
+	}
+
+	// Find ServerCleaning objects that are working on these servers
+	cleaningList := &metalv1alpha1.ServerCleaningList{}
+	if err := r.List(ctx, cleaningList); err != nil {
+		return nil
+	}
+
+	var requests []reconcile.Request
+	for _, cleaning := range cleaningList.Items {
+		// Only reconcile if cleaning is in progress
+		if cleaning.Status.State != metalv1alpha1.ServerCleaningStateInProgress {
+			continue
+		}
+
+		// Check if this cleaning is working on any of the affected servers.
+		// Selector-based cleanings are not matched here yet; see the sketch below.
+		if cleaning.Spec.ServerRef != nil {
+			if slices.Contains(affectedServers, cleaning.Spec.ServerRef.Name) {
+				requests = append(requests, reconcile.Request{
+					NamespacedName: client.ObjectKeyFromObject(&cleaning),
+				})
+			}
+		}
+	}
+
+	return requests
+}
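As noted above, `mapBMCToServerCleaning` only wakes up ServerRef-based cleanings; selector-based cleanings currently learn about BMC task updates only through the periodic 30-second requeue. A possible extension of the loop body (a sketch, not part of this change; it would additionally need the `k8s.io/apimachinery/pkg/labels` import) could match selectors too:

```go
// Hypothetical addition inside the cleaningList loop: also enqueue cleanings
// whose label selector matches one of the affected servers.
if cleaning.Spec.ServerSelector != nil {
	selector, err := metav1.LabelSelectorAsSelector(cleaning.Spec.ServerSelector)
	if err != nil {
		continue
	}
	for _, server := range serverList.Items {
		if selector.Matches(labels.Set(server.Labels)) && slices.Contains(affectedServers, server.Name) {
			requests = append(requests, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(&cleaning)})
			break
		}
	}
}
```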
+
+func (r *ServerCleaningReconciler) getServersForCleaning(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning) ([]metalv1alpha1.Server, error) {
+	// If ServerRef is specified, return that single server
+	if cleaning.Spec.ServerRef != nil {
+		server, err := GetServerByName(ctx, r.Client, cleaning.Spec.ServerRef.Name)
+		if err != nil {
+			return nil, fmt.Errorf("failed to get server %s: %w", cleaning.Spec.ServerRef.Name, err)
+		}
+		return []metalv1alpha1.Server{*server}, nil
+	}
+
+	// If ServerSelector is specified, list matching servers
+	if cleaning.Spec.ServerSelector != nil {
+		serverList := &metalv1alpha1.ServerList{}
+		selector, err := metav1.LabelSelectorAsSelector(cleaning.Spec.ServerSelector)
+		if err != nil {
+			return nil, fmt.Errorf("failed to convert label selector: %w", err)
+		}
+
+		if err := r.List(ctx, serverList, client.MatchingLabelsSelector{Selector: selector}); err != nil {
+			return nil, fmt.Errorf("failed to list servers: %w", err)
+		}
+
+		return serverList.Items, nil
+	}
+
+	return nil, fmt.Errorf("neither serverRef nor serverSelector is specified")
+}
+
+func (r *ServerCleaningReconciler) updateSelectedServersCount(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, count int32) error {
+	cleaningBase := cleaning.DeepCopy()
+	cleaning.Status.SelectedServers = count
+	if err := r.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase)); err != nil {
+		return fmt.Errorf("failed to update selected servers count: %w", err)
+	}
+	return nil
+}
+
+func (r *ServerCleaningReconciler) initializeServerStatuses(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, servers []metalv1alpha1.Server) error {
+	cleaningBase := cleaning.DeepCopy()
+	cleaning.Status.ServerCleaningStatuses = make([]metalv1alpha1.ServerCleaningStatusEntry, 0, len(servers))
+
+	for _, server := range servers {
+		cleaning.Status.ServerCleaningStatuses = append(cleaning.Status.ServerCleaningStatuses, metalv1alpha1.ServerCleaningStatusEntry{
+			ServerName:     server.Name,
+			State:          metalv1alpha1.ServerCleaningStatePending,
+			Message:        "Waiting to start cleaning",
+			LastUpdateTime: metav1.Now(),
+		})
+	}
+
+	if err := r.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase)); err != nil {
+		return fmt.Errorf("failed to initialize server statuses: %w", err)
+	}
+	return nil
+}
+
+func (r *ServerCleaningReconciler) updateServerStatus(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, serverName string, state metalv1alpha1.ServerCleaningState, message string) error {
+	cleaningBase := cleaning.DeepCopy()
+
+	// Find and update the server status entry
+	found := false
+	for i := range cleaning.Status.ServerCleaningStatuses {
+		if cleaning.Status.ServerCleaningStatuses[i].ServerName == serverName {
+			cleaning.Status.ServerCleaningStatuses[i].State = state
+			cleaning.Status.ServerCleaningStatuses[i].Message = message
+			cleaning.Status.ServerCleaningStatuses[i].LastUpdateTime = metav1.Now()
+			found = true
+			break
+		}
+	}
+
+	// If not found, add new entry
+	if !found {
+		cleaning.Status.ServerCleaningStatuses = append(cleaning.Status.ServerCleaningStatuses, metalv1alpha1.ServerCleaningStatusEntry{
+			ServerName:     serverName,
+			State:          state,
+			Message:        message,
+			LastUpdateTime: metav1.Now(),
+		})
+	}
+
+	if err := r.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase)); err != nil {
+		return fmt.Errorf("failed to update server status: %w", err)
+	}
+	return nil
+}
+
+func (r *ServerCleaningReconciler) updateCleaningCounts(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, pending, inProgress, completed, failed int32) error {
+	cleaningBase := cleaning.DeepCopy()
+	cleaning.Status.PendingCleanings = pending
+	cleaning.Status.InProgressCleanings = inProgress
+	cleaning.Status.CompletedCleanings = completed
+	cleaning.Status.FailedCleanings = failed
+
+	if err := r.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase)); err != nil {
+		return fmt.Errorf("failed to update cleaning counts: %w", err)
+	}
+	return nil
+}
+
+// addTaskToBMC adds a BMCTask to the specified BMC's status
+func (r *ServerCleaningReconciler) addTaskToBMC(ctx context.Context, bmcName string, task metalv1alpha1.BMCTask) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Get the BMC resource
+	bmcObj := &metalv1alpha1.BMC{}
+	if err := r.Get(ctx, types.NamespacedName{Name: bmcName}, bmcObj); err != nil {
+		return fmt.Errorf("failed to get BMC %s: %w", bmcName, err)
+	}
+
+	// Add the task to BMC.Status.Tasks
+	bmcObj.Status.Tasks = append(bmcObj.Status.Tasks, task)
+
+	// Keep only the last 10 tasks to prevent unbounded growth
+	if len(bmcObj.Status.Tasks) > 10 {
+		bmcObj.Status.Tasks = bmcObj.Status.Tasks[len(bmcObj.Status.Tasks)-10:]
+	}
+
+	// Update BMC status
+	if err := r.Status().Update(ctx, bmcObj); err != nil {
+		return fmt.Errorf("failed to update BMC tasks: %w", err)
+	}
+
+	log.V(1).Info("Added task to BMC", "bmc", bmcName, "taskType", task.TaskType, "taskURI", task.TaskURI)
+	return nil
+}
+
+// getTasksForServer retrieves tasks from BMC.Status.Tasks for a specific server's
+// cleaning operation. The cleaning name parameter is currently unused: tasks carry
+// no reference back to the ServerCleaning that created them, so they are matched
+// by cleaning-related task type only.
+func (r *ServerCleaningReconciler) getTasksForServer(ctx context.Context, server *metalv1alpha1.Server, _ string) ([]metalv1alpha1.BMCTask, error) {
+	// Get the BMC for this server
+	if server.Spec.BMCRef == nil {
+		return nil, fmt.Errorf("server %s has no BMCRef", server.Name)
+	}
+
+	bmcObj := &metalv1alpha1.BMC{}
+	if err := r.Get(ctx, types.NamespacedName{Name: server.Spec.BMCRef.Name}, bmcObj); err != nil {
+		return nil, fmt.Errorf("failed to get BMC %s: %w", server.Spec.BMCRef.Name, err)
+	}
+
+	// Filter for tasks that belong to a cleaning operation
+	var relevantTasks []metalv1alpha1.BMCTask
+	for _, task := range bmcObj.Status.Tasks {
+		// Check if this is a cleaning-related task type
+		if task.TaskType == metalv1alpha1.BMCTaskTypeDiskErase ||
+			task.TaskType == metalv1alpha1.BMCTaskTypeBIOSReset ||
+			task.TaskType == metalv1alpha1.BMCTaskTypeBMCReset ||
+			task.TaskType == metalv1alpha1.BMCTaskTypeNetworkClear {
+			relevantTasks = append(relevantTasks, task)
+		}
+	}
+
+	return relevantTasks, nil
+}
+
+// checkTasksComplete checks if all tasks are in terminal states and returns completion status
+func (r *ServerCleaningReconciler) checkTasksComplete(tasks []metalv1alpha1.BMCTask) (allComplete bool, anyFailed bool) {
+	if len(tasks) == 0 {
+		return true, false
+	}
+
+	allComplete = true
+	anyFailed = false
+
+	for _, task := range tasks {
+		taskState := task.State
+
+		// Check if task is still running
+		if taskState != taskStateCompleted &&
+			taskState != taskStateException &&
+			taskState != taskStateCancelled &&
+			taskState != taskStateKilled &&
+			taskState != taskStateFailed {
+			allComplete = false
+		}
+
+		// Check if task failed
+		if taskState == taskStateException ||
+			taskState == taskStateCancelled ||
+			taskState == taskStateKilled ||
+			taskState == taskStateFailed {
+			anyFailed = true
+		}
+	}
+
+	return allComplete, anyFailed
+}
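`checkTasksComplete` compares against `taskState*` constants defined elsewhere in the package, outside this hunk. Assuming they mirror the Redfish TaskState enum (plus one non-standard value), they presumably look roughly like this:

```go
// Assumed values only; the real constants are not part of this diff.
// DMTF's TaskState enum includes New, Running, Completed, Exception,
// Cancelled, and Killed; "Failed" is not a DMTF TaskState and is
// presumably a vendor-specific or internal state.
const (
	taskStateNew       = "New"
	taskStateCompleted = "Completed"
	taskStateException = "Exception"
	taskStateCancelled = "Cancelled"
	taskStateKilled    = "Killed"
	taskStateFailed    = "Failed"
)
```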
+
+// initiateBMCCleaning initiates cleaning operations directly via BMC and stores task information
+func (r *ServerCleaningReconciler) initiateBMCCleaning(ctx context.Context, cleaning *metalv1alpha1.ServerCleaning, server *metalv1alpha1.Server) error {
+	log := ctrl.LoggerFrom(ctx)
+
+	// Get BMC client for this server
+	bmcClient, err := bmcutils.GetBMCClientForServer(ctx, r.Client, server, false, bmc.Options{})
+	if err != nil {
+		return fmt.Errorf("failed to get BMC client: %w", err)
+	}
+	defer bmcClient.Logout()
+
+	systemURI := server.Spec.SystemURI
+	if systemURI == "" {
+		return fmt.Errorf("server %s has no system URI", server.Name)
+	}
+
+	// Get BMC reference for adding tasks
+	if server.Spec.BMCRef == nil {
+		return fmt.Errorf("server %s has no BMCRef", server.Name)
+	}
+	bmcName := server.Spec.BMCRef.Name
+	taskCount := 0
+
+	// Initiate disk wipe if requested
+	if cleaning.Spec.DiskWipe != nil {
+		log.V(1).Info("Initiating disk erase", "server", server.Name, "method", cleaning.Spec.DiskWipe.Method)
+		tasks, err := bmcClient.EraseDisk(ctx, systemURI, bmc.DiskWipeMethod(cleaning.Spec.DiskWipe.Method))
+		if err != nil {
+			return fmt.Errorf("failed to initiate disk wipe: %w", err)
+		}
+		// Add each disk erase task to BMC.Status.Tasks
+		for _, task := range tasks {
+			bmcTask := metalv1alpha1.BMCTask{
+				TaskURI:         task.TaskURI,
+				TaskType:        metalv1alpha1.BMCTaskTypeDiskErase,
+				TargetID:        task.TargetID,
+				State:           taskStateNew,
+				PercentComplete: 0,
+				LastUpdateTime:  metav1.Now(),
+			}
+			if err := r.addTaskToBMC(ctx, bmcName, bmcTask); err != nil {
+				return fmt.Errorf("failed to add disk erase task to BMC: %w", err)
+			}
+			taskCount++
+		}
+		log.V(1).Info("Disk wipe tasks created", "server", server.Name, "count", len(tasks))
+	}
+
+	// Initiate BIOS reset if requested
+	if cleaning.Spec.BIOSReset {
+		log.V(1).Info("Initiating BIOS reset", "server", server.Name)
+		task, err := bmcClient.ResetBIOSToDefaults(ctx, systemURI)
+		if err != nil {
+			return fmt.Errorf("failed to initiate BIOS reset: %w", err)
+		}
+		if task != nil {
+			bmcTask := metalv1alpha1.BMCTask{
+				TaskURI:         task.TaskURI,
+				TaskType:        metalv1alpha1.BMCTaskTypeBIOSReset,
+				TargetID:        task.TargetID,
+				State:           taskStateNew,
+				PercentComplete: 0,
+				LastUpdateTime:  metav1.Now(),
+			}
+			if err := r.addTaskToBMC(ctx, bmcName, bmcTask); err != nil {
+				return fmt.Errorf("failed to add BIOS reset task to BMC: %w", err)
+			}
+			taskCount++
+			log.V(1).Info("BIOS reset task created", "server", server.Name, "taskURI", task.TaskURI)
+		}
+	}
+
+	// Initiate BMC reset if requested
+	// TODO: BMC reset requires a manager UUID, which is not readily available from the server spec.
+	// For now, BMC reset will be handled via ServerMaintenance or manual intervention.
+	if cleaning.Spec.BMCReset {
+		log.V(1).Info("BMC reset requested but not yet implemented via direct BMC access", "server", server.Name)
+		// Note: BMC reset is a critical operation that may disconnect the BMC client,
+		// so it should be done last or via ServerMaintenance with proper handling.
+	}
+
+	// Initiate network config clear if requested
+	if cleaning.Spec.NetworkCleanup {
+		log.V(1).Info("Initiating network configuration clear", "server", server.Name)
+		task, err := bmcClient.ClearNetworkConfiguration(ctx, systemURI)
+		if err != nil {
+			// Network cleanup is non-critical, log and continue
+			log.Error(err, "Failed to initiate network config clear (non-critical)", "server", server.Name)
+		} else if task != nil {
+			bmcTask := metalv1alpha1.BMCTask{
+				TaskURI:         task.TaskURI,
+				TaskType:        metalv1alpha1.BMCTaskTypeNetworkClear,
+				TargetID:        task.TargetID,
+				State:           taskStateNew,
+				PercentComplete: 0,
+				LastUpdateTime:  metav1.Now(),
+			}
+			if err := r.addTaskToBMC(ctx, bmcName, bmcTask); err != nil {
+				log.Error(err, "Failed to add network clear task to BMC (non-critical)", "server", server.Name)
+			} else {
+				taskCount++
+				log.V(1).Info("Network config clear task created", "server", server.Name, "taskURI", task.TaskURI)
+			}
+		}
+	}
+
+	// Tasks are now in BMC.Status.Tasks and will be monitored by the BMCTask controller
+	if taskCount > 0 {
+		log.Info("Cleaning tasks initiated and added to BMC", "server", server.Name, "bmc", bmcName, "taskCount", taskCount)
+	} else {
+		log.Info("No cleaning tasks created (all operations completed synchronously)", "server", server.Name)
+	}
+
+	return nil
+}
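`initiateBMCCleaning` relies on three asynchronous operations of the BMC client. Their definitions live in the `bmc` package and are not shown in this diff; judging from the call sites, the relied-upon surface is roughly the following sketch (the interface name, and any `bmc.Task` fields beyond `TaskURI` and `TargetID`, are assumptions):

```go
// Sketch of the bmc client surface assumed by initiateBMCCleaning.
type cleaningBMCClient interface {
	// EraseDisk starts an erase per drive and returns one task handle per drive.
	EraseDisk(ctx context.Context, systemURI string, method bmc.DiskWipeMethod) ([]bmc.Task, error)
	// ResetBIOSToDefaults may return a nil task when the reset completes synchronously.
	ResetBIOSToDefaults(ctx context.Context, systemURI string) (*bmc.Task, error)
	// ClearNetworkConfiguration may likewise complete without a task.
	ClearNetworkConfiguration(ctx context.Context, systemURI string) (*bmc.Task, error)
	// Logout releases the BMC session (deferred at the top of initiateBMCCleaning).
	Logout()
}
```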
"sigs.k8s.io/controller-runtime/pkg/envtest/komega" +) + +var _ = Describe("ServerCleaning Controller", func() { + ns := SetupTest(nil) + + AfterEach(func(ctx SpecContext) { + EnsureCleanState() + }) + + It("Should successfully create and reconcile a ServerCleaning resource with serverRef", func(ctx SpecContext) { + By("Creating a Server resource in Tainted state") + server := &metalv1alpha1.Server{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-server-", + Namespace: ns.Name, + Labels: map[string]string{ + "test": "cleaning", + }, + }, + Spec: metalv1alpha1.ServerSpec{ + SystemUUID: "test-system-uuid-1", + SystemURI: "/redfish/v1/Systems/1", + BMCRef: &corev1.LocalObjectReference{ + Name: "test-bmc", + }, + Taints: []corev1.Taint{ + { + Key: "metal.ironcore.dev/tainted", + Effect: corev1.TaintEffectNoSchedule, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, server)).To(Succeed()) + + By("Setting Server state to Tainted") + Eventually(UpdateStatus(server, func() { + server.Status.State = metalv1alpha1.ServerStateTainted + })).Should(Succeed()) + + By("Creating a ServerCleaning resource") + cleaning := &metalv1alpha1.ServerCleaning{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-cleaning-", + Namespace: ns.Name, + }, + Spec: metalv1alpha1.ServerCleaningSpec{ + ServerRef: &corev1.LocalObjectReference{ + Name: server.Name, + }, + DiskWipe: &metalv1alpha1.DiskWipeConfig{ + Method: metalv1alpha1.DiskWipeMethodQuick, + IncludeBootDrives: true, + }, + BIOSReset: true, + NetworkCleanup: true, + }, + } + Expect(k8sClient.Create(ctx, cleaning)).To(Succeed()) + + By("Ensuring ServerCleaning transitions to Pending state") + Eventually(Object(cleaning)).Should(SatisfyAll( + HaveField("Status.State", metalv1alpha1.ServerCleaningStatePending), + )) + + By("Ensuring ServerCleaning has finalizer") + Eventually(Object(cleaning)).Should(SatisfyAll( + HaveField("Finalizers", ContainElement(ServerCleaningFinalizer)), + )) + + By("Ensuring ServerCleaning transitions to InProgress state") + Eventually(Object(cleaning)).WithTimeout(2 * time.Minute).Should(SatisfyAll( + HaveField("Status.State", metalv1alpha1.ServerCleaningStateInProgress), + HaveField("Status.SelectedServers", BeNumerically(">", 0)), + )) + + By("Ensuring ServerCleaning status has server status entry") + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed()) + g.Expect(cleaning.Status.ServerCleaningStatuses).NotTo(BeEmpty()) + g.Expect(cleaning.Status.ServerCleaningStatuses[0].ServerName).To(Equal(server.Name)) + }).Should(Succeed()) + + // Cleanup + Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed()) + Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound)) + Expect(k8sClient.Delete(ctx, server)).To(Succeed()) + }) + + It("Should successfully create and reconcile a ServerCleaning resource with serverSelector", func(ctx SpecContext) { + By("Creating multiple Server resources in Tainted state") + server1 := &metalv1alpha1.Server{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "test-server-", + Namespace: ns.Name, + Labels: map[string]string{ + "cleanup-group": "staging", + "region": "us-west", + }, + }, + Spec: metalv1alpha1.ServerSpec{ + SystemUUID: "test-system-uuid-1", + SystemURI: "/redfish/v1/Systems/1", + BMCRef: &corev1.LocalObjectReference{ + Name: "test-bmc", + }, + Taints: []corev1.Taint{ + { + Key: "metal.ironcore.dev/tainted", + Effect: corev1.TaintEffectNoSchedule, + }, + }, + }, + } + Expect(k8sClient.Create(ctx, server1)).To(Succeed()) + 
+
+	It("Should successfully create and reconcile a ServerCleaning resource with serverSelector", func(ctx SpecContext) {
+		By("Creating multiple Server resources in Tainted state")
+		server1 := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-server-",
+				Namespace:    ns.Name,
+				Labels: map[string]string{
+					"cleanup-group": "staging",
+					"region":        "us-west",
+				},
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-1",
+				SystemURI:  "/redfish/v1/Systems/1",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+				Taints: []corev1.Taint{
+					{
+						Key:    "metal.ironcore.dev/tainted",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, server1)).To(Succeed())
+
+		server2 := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-server-",
+				Namespace:    ns.Name,
+				Labels: map[string]string{
+					"cleanup-group": "staging",
+					"region":        "us-east",
+				},
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-2",
+				SystemURI:  "/redfish/v1/Systems/2",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+				Taints: []corev1.Taint{
+					{
+						Key:    "metal.ironcore.dev/tainted",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, server2)).To(Succeed())
+
+		By("Setting Server states to Tainted")
+		Eventually(UpdateStatus(server1, func() {
+			server1.Status.State = metalv1alpha1.ServerStateTainted
+		})).Should(Succeed())
+		Eventually(UpdateStatus(server2, func() {
+			server2.Status.State = metalv1alpha1.ServerStateTainted
+		})).Should(Succeed())
+
+		By("Creating a ServerCleaning resource with serverSelector")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerSelector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{
+						"cleanup-group": "staging",
+					},
+				},
+				DiskWipe: &metalv1alpha1.DiskWipeConfig{
+					Method:            metalv1alpha1.DiskWipeMethodSecure,
+					IncludeBootDrives: false,
+				},
+				NetworkCleanup: true,
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring ServerCleaning transitions to InProgress state")
+		Eventually(Object(cleaning)).WithTimeout(2 * time.Minute).Should(SatisfyAll(
+			HaveField("Status.State", metalv1alpha1.ServerCleaningStateInProgress),
+			HaveField("Status.SelectedServers", BeNumerically("==", 2)),
+		))
+
+		By("Ensuring ServerCleaning status has entries for both servers")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed())
+			g.Expect(cleaning.Status.ServerCleaningStatuses).To(HaveLen(2))
+		}).Should(Succeed())
+
+		// Cleanup
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+		Expect(k8sClient.Delete(ctx, server1)).To(Succeed())
+		Expect(k8sClient.Delete(ctx, server2)).To(Succeed())
+	})
+
+	It("Should track cleaning tasks in status", func(ctx SpecContext) {
+		By("Creating a Server resource in Tainted state")
+		server := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-server-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-1",
+				SystemURI:  "/redfish/v1/Systems/1",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+				Taints: []corev1.Taint{
+					{
+						Key:    "metal.ironcore.dev/tainted",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, server)).To(Succeed())
+
+		By("Setting Server state to Tainted")
+		Eventually(UpdateStatus(server, func() {
+			server.Status.State = metalv1alpha1.ServerStateTainted
+		})).Should(Succeed())
+
+		By("Creating a ServerCleaning resource with multiple cleaning operations")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerRef: &corev1.LocalObjectReference{
+					Name: server.Name,
+				},
+				DiskWipe: &metalv1alpha1.DiskWipeConfig{
+					Method:            metalv1alpha1.DiskWipeMethodDoD,
+					IncludeBootDrives: true,
+				},
+				BIOSReset:      true,
+				NetworkCleanup: true,
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring cleaning tasks are tracked in BMC status")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed())
+			// Check that cleaning is in progress
+			g.Expect(cleaning.Status.State).To(Equal(metalv1alpha1.ServerCleaningStateInProgress))
+
+			// Verify tasks are created in BMC.Status.Tasks (not in ServerCleaning status)
+			bmcObj := &metalv1alpha1.BMC{}
+			g.Expect(k8sClient.Get(ctx, client.ObjectKey{Name: server.Spec.BMCRef.Name}, bmcObj)).To(Succeed())
+			// Should have at least one task from the cleaning operations
+			g.Expect(bmcObj.Status.Tasks).ToNot(BeEmpty())
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		// Cleanup
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+		Expect(k8sClient.Delete(ctx, server)).To(Succeed())
+	})
+
+	It("Should update cleaning counts correctly", func(ctx SpecContext) {
+		By("Creating multiple Server resources")
+		servers := make([]*metalv1alpha1.Server, 3)
+		for i := range 3 {
+			servers[i] = &metalv1alpha1.Server{
+				ObjectMeta: metav1.ObjectMeta{
+					GenerateName: "test-server-",
+					Namespace:    ns.Name,
+					Labels: map[string]string{
+						"batch": "test",
+					},
+				},
+				Spec: metalv1alpha1.ServerSpec{
+					// strconv.Itoa is used here; string(rune(i)) would yield
+					// control characters instead of decimal digits.
+					SystemUUID: "test-system-uuid-" + strconv.Itoa(i),
+					SystemURI:  "/redfish/v1/Systems/" + strconv.Itoa(i),
+					BMCRef: &corev1.LocalObjectReference{
+						Name: "test-bmc",
+					},
+				},
+			}
+			Expect(k8sClient.Create(ctx, servers[i])).To(Succeed())
+
+			Eventually(UpdateStatus(servers[i], func() {
+				servers[i].Status.State = metalv1alpha1.ServerStateTainted
+			})).Should(Succeed())
+		}
+
+		By("Creating a ServerCleaning resource for all servers")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerSelector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{
+						"batch": "test",
+					},
+				},
+				DiskWipe: &metalv1alpha1.DiskWipeConfig{
+					Method: metalv1alpha1.DiskWipeMethodQuick,
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring cleaning counts are updated")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed())
+			// Should have selected all 3 servers
+			g.Expect(cleaning.Status.SelectedServers).To(BeNumerically("==", 3))
+			// Should have counts tracking progress
+			totalProcessed := cleaning.Status.InProgressCleanings +
+				cleaning.Status.CompletedCleanings +
+				cleaning.Status.FailedCleanings
+			g.Expect(totalProcessed).To(BeNumerically(">", 0))
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		// Cleanup
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+		for _, server := range servers {
+			Expect(k8sClient.Delete(ctx, server)).To(Succeed())
+		}
+	})
+
+	It("Should set proper conditions during cleaning lifecycle", func(ctx SpecContext) {
+		By("Creating a Server resource in Tainted state")
+		server := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-server-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-1",
+				SystemURI:  "/redfish/v1/Systems/1",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+				Taints: []corev1.Taint{
+					{
+						Key:    "metal.ironcore.dev/tainted",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, server)).To(Succeed())
+
+		By("Setting Server state to Tainted")
+		Eventually(UpdateStatus(server, func() {
+			server.Status.State = metalv1alpha1.ServerStateTainted
+		})).Should(Succeed())
+
+		By("Creating a ServerCleaning resource")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerRef: &corev1.LocalObjectReference{
+					Name: server.Name,
+				},
+				DiskWipe: &metalv1alpha1.DiskWipeConfig{
+					Method: metalv1alpha1.DiskWipeMethodQuick,
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring Cleaning condition is set when in progress")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed())
+			if cleaning.Status.State == metalv1alpha1.ServerCleaningStateInProgress {
+				g.Expect(cleaning.Status.Conditions).NotTo(BeEmpty())
+				condition := findCondition(cleaning.Status.Conditions, ServerCleaningConditionTypeCleaning)
+				g.Expect(condition).NotTo(BeNil())
+				g.Expect(condition.Status).To(Equal(metav1.ConditionTrue))
+				g.Expect(condition.Reason).To(Equal(ServerCleaningConditionReasonInProgress))
+			}
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		// Cleanup
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+		Expect(k8sClient.Delete(ctx, server)).To(Succeed())
+	})
+
+	It("Should skip servers not in Tainted state", func(ctx SpecContext) {
+		By("Creating servers in different states")
+		taintedServer := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "tainted-server-",
+				Namespace:    ns.Name,
+				Labels: map[string]string{
+					"group": "mixed",
+				},
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-1",
+				SystemURI:  "/redfish/v1/Systems/1",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+				Taints: []corev1.Taint{
+					{
+						Key:    "metal.ironcore.dev/tainted",
+						Effect: corev1.TaintEffectNoSchedule,
+					},
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, taintedServer)).To(Succeed())
+		Eventually(UpdateStatus(taintedServer, func() {
+			taintedServer.Status.State = metalv1alpha1.ServerStateTainted
+		})).Should(Succeed())
+
+		availableServer := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "available-server-",
+				Namespace:    ns.Name,
+				Labels: map[string]string{
+					"group": "mixed",
+				},
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-2",
+				SystemURI:  "/redfish/v1/Systems/2",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, availableServer)).To(Succeed())
+		Eventually(UpdateStatus(availableServer, func() {
+			availableServer.Status.State = metalv1alpha1.ServerStateAvailable
+		})).Should(Succeed())
+
+		By("Creating a ServerCleaning resource targeting both servers")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerSelector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{
+						"group": "mixed",
+					},
+				},
+				DiskWipe: &metalv1alpha1.DiskWipeConfig{
+					Method: metalv1alpha1.DiskWipeMethodQuick,
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring only tainted server gets cleaning status entry")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed())
+			// Should select 2 servers but only process the tainted one
+			g.Expect(cleaning.Status.SelectedServers).To(BeNumerically("==", 2))
+			// Only tainted server should have a status entry
+			if len(cleaning.Status.ServerCleaningStatuses) > 0 {
+				for _, status := range cleaning.Status.ServerCleaningStatuses {
+					g.Expect(status.ServerName).To(Equal(taintedServer.Name))
+				}
+			}
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		// Cleanup
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+		Expect(k8sClient.Delete(ctx, taintedServer)).To(Succeed())
+		Expect(k8sClient.Delete(ctx, availableServer)).To(Succeed())
+	})
+
+	It("Should clean tainted server and transition from Reserved to Available", func(ctx SpecContext) {
+		By("Creating a ServerClaim resource")
+		claim := &metalv1alpha1.ServerClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-claim-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerClaimSpec{
+				Power: metalv1alpha1.PowerOn,
+				ServerSelector: &metav1.LabelSelector{
+					MatchLabels: map[string]string{
+						"claim-test": "transition",
+					},
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, claim)).To(Succeed())
+
+		By("Creating a Server resource that will be claimed")
+		server := &metalv1alpha1.Server{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-server-",
+				Namespace:    ns.Name,
+				Labels: map[string]string{
+					"claim-test": "transition",
+				},
+			},
+			Spec: metalv1alpha1.ServerSpec{
+				SystemUUID: "test-system-uuid-claim",
+				SystemURI:  "/redfish/v1/Systems/claim",
+				BMCRef: &corev1.LocalObjectReference{
+					Name: "test-bmc",
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, server)).To(Succeed())
+
+		By("Setting Server state to Available initially")
+		Eventually(UpdateStatus(server, func() {
+			server.Status.State = metalv1alpha1.ServerStateAvailable
+		})).Should(Succeed())
+
+		By("Waiting for Server to be claimed")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(server), server)).To(Succeed())
+			g.Expect(server.Spec.ServerClaimRef).NotTo(BeNil())
+			g.Expect(server.Spec.ServerClaimRef.Name).To(Equal(claim.Name))
+		}).Should(Succeed())
+
+		By("Setting Server state to Reserved")
+		Eventually(UpdateStatus(server, func() {
+			server.Status.State = metalv1alpha1.ServerStateReserved
+		})).Should(Succeed())
+
+		By("Adding taints to the Server before releasing")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(server), server)).To(Succeed())
+			serverBase := server.DeepCopy()
+			server.Spec.Taints = []corev1.Taint{
+				{
+					Key:    "metal.ironcore.dev/tainted",
+					Effect: corev1.TaintEffectNoSchedule,
+				},
+			}
+			g.Expect(k8sClient.Patch(ctx, server, client.MergeFrom(serverBase))).To(Succeed())
+		}).Should(Succeed())
+
+		By("Deleting the ServerClaim to release the server")
+		Expect(k8sClient.Delete(ctx, claim)).To(Succeed())
+		Eventually(Get(claim)).Should(Satisfy(apierrors.IsNotFound))
+
+		By("Ensuring ServerClaimRef is removed from Server")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(server), server)).To(Succeed())
+			g.Expect(server.Spec.ServerClaimRef).To(BeNil())
+		}).Should(Succeed())
+
+		By("Ensuring Server transitions to Tainted state")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(server), server)).To(Succeed())
+			g.Expect(server.Status.State).To(Equal(metalv1alpha1.ServerStateTainted))
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		By("Creating a ServerCleaning resource for the tainted server")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerRef: &corev1.LocalObjectReference{
+					Name: server.Name,
+				},
+				DiskWipe: &metalv1alpha1.DiskWipeConfig{
+					Method:            metalv1alpha1.DiskWipeMethodQuick,
+					IncludeBootDrives: true,
+				},
+				BIOSReset:      true,
+				NetworkCleanup: true,
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring ServerCleaning transitions through states")
+		Eventually(Object(cleaning)).Should(SatisfyAll(
+			HaveField("Status.State", metalv1alpha1.ServerCleaningStatePending),
+		))
+
+		Eventually(Object(cleaning)).WithTimeout(2 * time.Minute).Should(SatisfyAll(
+			HaveField("Status.State", metalv1alpha1.ServerCleaningStateInProgress),
+		))
+
+		By("Simulating cleaning completion by updating ServerCleaning status")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(cleaning), cleaning)).To(Succeed())
+			cleaningBase := cleaning.DeepCopy()
+			cleaning.Status.State = metalv1alpha1.ServerCleaningStateCompleted
+			if len(cleaning.Status.ServerCleaningStatuses) > 0 {
+				cleaning.Status.ServerCleaningStatuses[0].State = metalv1alpha1.ServerCleaningStateCompleted
+			}
+			cleaning.Status.CompletedCleanings = 1
+			cleaning.Status.InProgressCleanings = 0
+			g.Expect(k8sClient.Status().Patch(ctx, cleaning, client.MergeFrom(cleaningBase))).To(Succeed())
+		}).Should(Succeed())
+
+		By("Ensuring Server taints are removed after cleaning completion")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(server), server)).To(Succeed())
+			g.Expect(server.Spec.Taints).To(BeEmpty())
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		By("Ensuring Server transitions to Available state")
+		Eventually(func(g Gomega) {
+			g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(server), server)).To(Succeed())
+			g.Expect(server.Status.State).To(Equal(metalv1alpha1.ServerStateAvailable))
+		}).WithTimeout(2 * time.Minute).Should(Succeed())
+
+		// Cleanup
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+		Expect(k8sClient.Delete(ctx, server)).To(Succeed())
+	})
+
+	It("Should handle deletion with finalizer", func(ctx SpecContext) {
+		By("Creating a ServerCleaning resource")
+		cleaning := &metalv1alpha1.ServerCleaning{
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "test-cleaning-",
+				Namespace:    ns.Name,
+			},
+			Spec: metalv1alpha1.ServerCleaningSpec{
+				ServerRef: &corev1.LocalObjectReference{
+					Name: "non-existent-server",
+				},
+			},
+		}
+		Expect(k8sClient.Create(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring finalizer is added")
+		Eventually(Object(cleaning)).Should(SatisfyAll(
+			HaveField("Finalizers", ContainElement(ServerCleaningFinalizer)),
+		))
+
+		By("Deleting the ServerCleaning resource")
+		Expect(k8sClient.Delete(ctx, cleaning)).To(Succeed())
+
+		By("Ensuring the resource is eventually deleted")
+		Eventually(Get(cleaning)).Should(Satisfy(apierrors.IsNotFound))
+	})
+})
+
+// Helper function to find a condition by type
+func findCondition(conditions []metav1.Condition, conditionType string) *metav1.Condition {
+	for i := range conditions {
+		if conditions[i].Type == conditionType {
+			return &conditions[i]
+		}
+	}
+	return nil
+}
diff --git a/internal/controller/suite_test.go b/internal/controller/suite_test.go
index 9db3050f7..317e2ea95 100644
--- a/internal/controller/suite_test.go
+++ b/internal/controller/suite_test.go
@@ -219,6 +219,11 @@ func SetupTest(redfishMockServers []netip.AddrPort) *corev1.Namespace {
 		Scheme: k8sManager.GetScheme(),
 	}).SetupWithManager(k8sManager)).To(Succeed())
 
+	Expect((&ServerCleaningReconciler{
+		Client: k8sManager.GetClient(),
+		Scheme: k8sManager.GetScheme(),
+	}).SetupWithManager(k8sManager)).To(Succeed())
+
 	Expect((&BIOSSettingsReconciler{
 		Client:           k8sManager.GetClient(),
 		ManagerNamespace: ns.Name,
@@ -311,6 +316,16 @@ func SetupTest(redfishMockServers []netip.AddrPort) *corev1.Namespace {
 		},
 	}).SetupWithManager(k8sManager)).To(Succeed())
 
+	Expect((&BMCTaskReconciler{
+		Client:       k8sManager.GetClient(),
+		Scheme:       k8sManager.GetScheme(),
+		Insecure:     true,
+		PollInterval: 50 * time.Millisecond,
+		BMCOptions: bmc.Options{
+			BasicAuth: true,
+		},
+	}).SetupWithManager(k8sManager)).To(Succeed())
+
 	By("Starting the registry server")
 	Expect(k8sManager.Add(manager.RunnableFunc(func(ctx context.Context) error {
 		registryServer := registry.NewServer(GinkgoLogr, ":30000", k8sManager.GetClient())