From 6767e7ee62aa268faac2d192a2cec8543785040d Mon Sep 17 00:00:00 2001 From: Drew Walters Date: Tue, 23 Feb 2021 16:46:23 +0000 Subject: [PATCH] Fix vBMH Secret extrapolation The scheduling algorithm subtracts from the number of hosts available for scheduling each time it encounters a problem retrieving BMC auth credentials or interface IP addresses, sometimes multiple times for the same host, causing SIP to display an error that not enough hosts are available for scheduling when there are. This change fixes the logic by halting processing of a host when SIP encounters a problem. This change also aggregates the errors so all problem hosts are identified in the logs. Fixes #2 Signed-off-by: Drew Walters Change-Id: Icd45dff7c420abfc91593edf145807cb05f3b472 --- pkg/controllers/sipcluster_controller.go | 25 ++-- pkg/vbmh/machines.go | 103 ++++++++++------ pkg/vbmh/vbmh_test.go | 151 ++++++++++++++++++++++- 3 files changed, 227 insertions(+), 52 deletions(-) diff --git a/pkg/controllers/sipcluster_controller.go b/pkg/controllers/sipcluster_controller.go index 0e91e94..b5b5667 100644 --- a/pkg/controllers/sipcluster_controller.go +++ b/pkg/controllers/sipcluster_controller.go @@ -278,20 +278,29 @@ func (r *SIPClusterReconciler) gatherVBMH(ctx context.Context, sip airshipv1.SIP // TODO : this is a loop until we succeed or cannot find a schedule for { logger.Info("gathering machines", "machines", machines.String()) + + // NOTE: Schedule executes the scheduling algorithm to find hosts that meet the topology and role + // constraints. err := machines.Schedule(sip, r.Client) if err != nil { return machines, err } - // we extract the information in a generic way - // So that LB , Jump and Ath POD all leverage the same - // If there are some issues finnding information the vBMH - // Are flagged Unschedulable - // Loop and Try to find new vBMH to complete tge schedule - if machines.Extrapolate(sip, r.Client) { - logger.Info("successfully extrapolated machines") - break + if err = machines.ExtrapolateServiceAddresses(sip, r.Client); err != nil { + logger.Error(err, "unable to retrieve infrastructure service IP addresses from selected BMHs."+ + "Selecting replacement hosts.") + + continue } + + if err = machines.ExtrapolateBMCAuth(sip, r.Client); err != nil { + logger.Error(err, "unable to retrieve BMC auth info from selected BMHs. Selecting replacement"+ + "hosts.") + + continue + } + + break } return machines, nil diff --git a/pkg/vbmh/machines.go b/pkg/vbmh/machines.go index 2d34fe8..53d3c47 100644 --- a/pkg/vbmh/machines.go +++ b/pkg/vbmh/machines.go @@ -30,10 +30,10 @@ import ( metal3 "github.com/metal3-io/baremetal-operator/apis/metal3.io/v1alpha1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" + kerror "k8s.io/apimachinery/pkg/util/errors" "sigs.k8s.io/controller-runtime/pkg/client" ) -// ScheduledState type ScheduledState string // Possible Node or VM Roles for a Tenant @@ -396,69 +396,92 @@ func (ml *MachineList) scheduleIt(nodeRole airshipv1.VMRole, nodeCfg airshipv1.N return nil } -// Extrapolate intention is to extract the IP information from the referenced networkData field for the BareMetalHost -func (ml *MachineList) Extrapolate(sip airshipv1.SIPCluster, c client.Client) bool { - // Lets get the data for all selected BMH's. - extrapolateSuccess := true - fmt.Printf("Schedule.Extrapolate ml.Vbmhs:%d\n", len(ml.Machines)) - for _, machine := range ml.Machines { - fmt.Printf("Schedule.Extrapolate machine.Data.IPOnInterface len:%d machine:%v\n", - len(machine.Data.IPOnInterface), machine) +// ExtrapolateServiceAddresses extracts the IP addresses of each network interface mapped to a service in the SIPCluster +// CR by inspecting each BMH's Network Data Secret. +func (ml *MachineList) ExtrapolateServiceAddresses(sip airshipv1.SIPCluster, c client.Client) error { + // NOTE: At this point in the scheduling algorithm, the list of Machines in the MachineList each have BMH + // objects that meet the SIPCluster CR topology and role constraints. - // Skip if I alread extrapolated tehh data for this machine + var extrapolateErrs error + for _, machine := range ml.Machines { + // Skip machines whose service addresses have been extracted if len(machine.Data.IPOnInterface) > 0 { continue } - bmh := machine.BMH - // Identify Network Data Secret name + // Retrieve BMH Network Data Secret networkDataSecret := &corev1.Secret{} - fmt.Printf("Schedule.Extrapolate Namespace:%s Name:%s\n", bmh.Spec.NetworkData.Namespace, - bmh.Spec.NetworkData.Name) - // c is a created client. err := c.Get(context.Background(), client.ObjectKey{ - Namespace: bmh.Spec.NetworkData.Namespace, - Name: bmh.Spec.NetworkData.Name, + Namespace: machine.BMH.Spec.NetworkData.Namespace, + Name: machine.BMH.Spec.NetworkData.Name, }, networkDataSecret) if err != nil { + ml.Log.Error(err, "unable to retrieve BMH Network Data Secret", "BMH", machine.BMH.Name, + "Secret", machine.BMH.Spec.NetworkData.Name, + "Secret Namespace", machine.BMH.Spec.NetworkData.Namespace) + machine.ScheduleStatus = UnableToSchedule ml.ReadyForScheduleCount[machine.VMRole]-- - extrapolateSuccess = false - } - //fmt.Printf("Schedule.Extrapolate networkDataSecret:%v\n", networkDataSecret) - // Assuming there might be other data - // Retrieve IP's for Service defined Network Interfaces - err = ml.getIP(machine, networkDataSecret, sip.Spec.Services) - if err != nil { - // Lets mark the machine as NotScheduleable. - // Update the count of what I have found so far, - machine.ScheduleStatus = UnableToSchedule - ml.ReadyForScheduleCount[machine.VMRole]-- - extrapolateSuccess = false + extrapolateErrs = kerror.NewAggregate([]error{extrapolateErrs, err}) + + continue } - // Retrieve BMC credentials - mgmtCredsSecret := &corev1.Secret{} - err = c.Get(context.Background(), client.ObjectKey{ - Namespace: bmh.Namespace, - Name: bmh.Spec.BMC.CredentialsName, - }, mgmtCredsSecret) + // Parse the interface IP addresses from the BMH's Network Data + err = ml.getIP(machine, networkDataSecret, sip.Spec.Services) if err != nil { + ml.Log.Error(err, "unable to parse BMH Network Data Secret", "BMH", machine.BMH.Name, + "Secret", machine.BMH.Spec.NetworkData.Name, + "Secret Namespace", machine.BMH.Spec.NetworkData.Namespace) + machine.ScheduleStatus = UnableToSchedule ml.ReadyForScheduleCount[machine.VMRole]-- - extrapolateSuccess = false + extrapolateErrs = kerror.NewAggregate([]error{extrapolateErrs, err}) + } + } + + return extrapolateErrs +} + +// ExtrapolateBMCAuth extracts the BMC authentication information in each BMH's BMC Credentials Secret. +func (ml *MachineList) ExtrapolateBMCAuth(sip airshipv1.SIPCluster, c client.Client) error { + // NOTE: At this point in the scheduling algorithm, the list of Machines in the MachineList each have BMH + // objects that meet the SIPCluster CR topology and role constraints. + + var extrapolateErrs error + for _, machine := range ml.Machines { + // Retrieve BMC credentials Secret + bmcCredsSecret := &corev1.Secret{} + err := c.Get(context.Background(), client.ObjectKey{ + Namespace: machine.BMH.Namespace, + Name: machine.BMH.Spec.BMC.CredentialsName, + }, bmcCredsSecret) + if err != nil { + ml.Log.Error(err, "unable to retrieve BMH BMC credentials Secret", "BMH", machine.BMH.Name, + "Secret", machine.BMH.Spec.BMC.CredentialsName, + "Secret Namespace", machine.BMH.Namespace) + + machine.ScheduleStatus = UnableToSchedule + ml.ReadyForScheduleCount[machine.VMRole]-- + extrapolateErrs = kerror.NewAggregate([]error{extrapolateErrs, err}) + + continue } // Parse BMC credentials from Secret - err = ml.getMangementCredentials(machine, mgmtCredsSecret) + err = ml.getMangementCredentials(machine, bmcCredsSecret) if err != nil { + ml.Log.Error(err, "unable to parse BMH BMC credentials Secret", "BMH", machine.BMH.Name, + "Secret", machine.BMH.Spec.BMC.CredentialsName, + "Secret Namespace", machine.BMH.Namespace) + machine.ScheduleStatus = UnableToSchedule ml.ReadyForScheduleCount[machine.VMRole]-- - extrapolateSuccess = false + extrapolateErrs = kerror.NewAggregate([]error{extrapolateErrs, err}) } } - fmt.Printf("Schedule.Extrapolate extrapolateSuccess:%t\n", extrapolateSuccess) - return extrapolateSuccess + + return extrapolateErrs } /*** diff --git a/pkg/vbmh/vbmh_test.go b/pkg/vbmh/vbmh_test.go index d42fa6e..a856d1f 100644 --- a/pkg/vbmh/vbmh_test.go +++ b/pkg/vbmh/vbmh_test.go @@ -138,7 +138,7 @@ var _ = Describe("MachineList", func() { }, } k8sClient := mockClient.NewFakeClient(objsToApply...) - Expect(ml.Extrapolate(*sipCluster, k8sClient)).To(BeTrue()) + Expect(ml.ExtrapolateServiceAddresses(*sipCluster, k8sClient)).To(BeNil()) Expect(ml.Machines[bmh.Name].Data.IPOnInterface).To(Equal(map[string]string{"oam-ipv4": "32.68.51.139"})) }) @@ -188,7 +188,7 @@ var _ = Describe("MachineList", func() { }, } k8sClient := mockClient.NewFakeClient(objsToApply...) - Expect(ml.Extrapolate(*sipCluster, k8sClient)).To(BeTrue()) + Expect(ml.ExtrapolateBMCAuth(*sipCluster, k8sClient)).To(BeNil()) Expect(ml.Machines[bmh.Name].Data.BMCUsername).To(Equal(username)) Expect(ml.Machines[bmh.Name].Data.BMCPassword).To(Equal(password)) @@ -236,7 +236,150 @@ var _ = Describe("MachineList", func() { }, } k8sClient := mockClient.NewFakeClient(objsToApply...) - Expect(ml.Extrapolate(*sipCluster, k8sClient)).To(BeFalse()) + Expect(ml.ExtrapolateBMCAuth(*sipCluster, k8sClient)).ToNot(BeNil()) + }) + + It("Should not process a BMH when its BMC secret is incorrectly formatted", func() { + var objsToApply []runtime.Object + + // Create BMH and NetworkData secret + bmh, networkData := testutil.CreateBMH(1, "default", "master", 6) + objsToApply = append(objsToApply, bmh) + objsToApply = append(objsToApply, networkData) + + // Create improperly formatted BMC credential secret + username := "root" + password := "test" + bmcSecret := testutil.CreateBMCAuthSecret(bmh.Name, bmh.Namespace, username, password) + bmcSecret.Data = map[string][]byte{"foo": []byte("bad data!")} + + bmh.Spec.BMC.CredentialsName = bmcSecret.Name + objsToApply = append(objsToApply, bmcSecret) + + m, err := NewMachine(*bmh, airshipv1.VMControlPlane, NotScheduled) + Expect(err).To(BeNil()) + + ml := &MachineList{ + NamespacedName: types.NamespacedName{ + Name: "vbmh", + Namespace: "default", + }, + Machines: map[string]*Machine{ + bmh.Name: m, + }, + ReadyForScheduleCount: map[airshipv1.VMRole]int{ + airshipv1.VMControlPlane: 1, + airshipv1.VMWorker: 0, + }, + Log: ctrl.Log.WithName("controllers").WithName("SIPCluster"), + } + + sipCluster := testutil.CreateSIPCluster("subcluster-1", "default", 1, 3) + sipCluster.Spec.Services = airshipv1.SIPClusterServices{ + LoadBalancer: []airshipv1.SIPClusterService{ + { + Image: "haproxy:latest", + NodeLabels: map[string]string{ + "test": "true", + }, + NodePort: 30000, + NodeInterface: "oam-ipv4", + }, + }, + } + k8sClient := mockClient.NewFakeClient(objsToApply...) + Expect(ml.ExtrapolateBMCAuth(*sipCluster, k8sClient)).ToNot(BeNil()) + }) + + It("Should not process a BMH when its Network Data secret is missing", func() { + var objsToApply []runtime.Object + + // Create BMH and NetworkData secret + bmh, networkData := testutil.CreateBMH(1, "default", "master", 6) + objsToApply = append(objsToApply, bmh) + objsToApply = append(objsToApply, networkData) + + bmh.Spec.NetworkData.Name = "foo-does-not-exist" + bmh.Spec.NetworkData.Namespace = "foo-does-not-exist" + + m, err := NewMachine(*bmh, airshipv1.VMControlPlane, NotScheduled) + Expect(err).To(BeNil()) + + ml := &MachineList{ + NamespacedName: types.NamespacedName{ + Name: "vbmh", + Namespace: "default", + }, + Machines: map[string]*Machine{ + bmh.Name: m, + }, + ReadyForScheduleCount: map[airshipv1.VMRole]int{ + airshipv1.VMControlPlane: 1, + airshipv1.VMWorker: 0, + }, + Log: ctrl.Log.WithName("controllers").WithName("SIPCluster"), + } + + sipCluster := testutil.CreateSIPCluster("subcluster-1", "default", 1, 3) + sipCluster.Spec.Services = airshipv1.SIPClusterServices{ + LoadBalancer: []airshipv1.SIPClusterService{ + { + Image: "haproxy:latest", + NodeLabels: map[string]string{ + "test": "true", + }, + NodePort: 30000, + NodeInterface: "oam-ipv4", + }, + }, + } + k8sClient := mockClient.NewFakeClient(objsToApply...) + Expect(ml.ExtrapolateServiceAddresses(*sipCluster, k8sClient)).ToNot(BeNil()) + }) + + It("Should not process a BMH when its Network Data secret is incorrectly formatted", func() { + var objsToApply []runtime.Object + + // Create BMH and NetworkData secret + bmh, networkData := testutil.CreateBMH(1, "default", "master", 6) + objsToApply = append(objsToApply, bmh) + objsToApply = append(objsToApply, networkData) + + networkData.Data = map[string][]byte{"foo": []byte("bad data!")} + + m, err := NewMachine(*bmh, airshipv1.VMControlPlane, NotScheduled) + Expect(err).To(BeNil()) + + ml := &MachineList{ + NamespacedName: types.NamespacedName{ + Name: "vbmh", + Namespace: "default", + }, + Machines: map[string]*Machine{ + bmh.Name: m, + }, + ReadyForScheduleCount: map[airshipv1.VMRole]int{ + airshipv1.VMControlPlane: 1, + airshipv1.VMWorker: 0, + }, + Log: ctrl.Log.WithName("controllers").WithName("SIPCluster"), + } + + sipCluster := testutil.CreateSIPCluster("subcluster-1", "default", 1, 3) + sipCluster.Spec.Services = airshipv1.SIPClusterServices{ + LoadBalancer: []airshipv1.SIPClusterService{ + { + Image: "haproxy:latest", + NodeLabels: map[string]string{ + "test": "true", + }, + NodePort: 30000, + NodeInterface: "oam-ipv4", + }, + }, + } + k8sClient := mockClient.NewFakeClient(objsToApply...) + Expect(ml.ExtrapolateServiceAddresses(*sipCluster, k8sClient)).ToNot(BeNil()) }) It("Should not retrieve the BMH IP if it has been previously extrapolated", func() { @@ -251,7 +394,7 @@ var _ = Describe("MachineList", func() { k8sClient := mockClient.NewFakeClient(objs...) sipCluster := testutil.CreateSIPCluster("subcluster-1", "default", 1, 3) - Expect(machineList.Extrapolate(*sipCluster, k8sClient)).To(BeTrue()) + Expect(machineList.ExtrapolateServiceAddresses(*sipCluster, k8sClient)).To(BeNil()) }) It("Should not schedule BMH if it is missing networkdata", func() {