Skip to content

Commit b79db97

Browse files
authored
Merge pull request #4095 from zac-nixon/main
[bug fix] handle ram shared VPCs for cross account tgb
2 parents bee5f8c + d85426e commit b79db97

File tree

3 files changed

+313
-24
lines changed

3 files changed

+313
-24
lines changed

helm/aws-load-balancer-controller/crds/crds.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ spec:
5454
spec:
5555
description: IngressClassParamsSpec defines the desired state of IngressClassParams
5656
properties:
57+
PrefixListsIDs:
58+
description: PrefixListsIDs defines the security group prefix lists
59+
for all Ingresses that belong to IngressClass with this IngressClassParams.
60+
items:
61+
type: string
62+
type: array
5763
certificateArn:
5864
description: CertificateArn specifies the ARN of the certificates
5965
for all Ingresses that belong to IngressClass with this IngressClassParams.

pkg/targetgroupbinding/resource_manager.go

+86-24
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ package targetgroupbinding
33
import (
44
"context"
55
"fmt"
6+
"k8s.io/apimachinery/pkg/util/cache"
67
"net/netip"
78
lbcmetrics "sigs.k8s.io/aws-load-balancer-controller/pkg/metrics/lbc"
9+
"sync"
810
"time"
911

1012
elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types"
@@ -28,7 +30,10 @@ import (
2830
"sigs.k8s.io/controller-runtime/pkg/client"
2931
)
3032

31-
const defaultRequeueDuration = 15 * time.Second
33+
const (
34+
defaultRequeueDuration = 15 * time.Second
35+
invalidVPCTTL = 60 * time.Minute
36+
)
3237

3338
// ResourceManager manages the TargetGroupBinding resource.
3439
type ResourceManager interface {
@@ -64,6 +69,9 @@ func NewDefaultResourceManager(k8sClient client.Client, elbv2Client services.ELB
6469
multiClusterManager: multiClusterManager,
6570
metricsCollector: metricsCollector,
6671

72+
invalidVpcCache: cache.NewExpiring(),
73+
invalidVpcCacheTTL: defaultTargetsCacheTTL,
74+
6775
requeueDuration: defaultRequeueDuration,
6876
}
6977
}
@@ -84,6 +92,10 @@ type defaultResourceManager struct {
8492
metricsCollector lbcmetrics.MetricCollector
8593
vpcID string
8694

95+
invalidVpcCache *cache.Expiring
96+
invalidVpcCacheTTL time.Duration
97+
invalidVpcCacheMutex sync.RWMutex
98+
8799
requeueDuration time.Duration
88100
}
89101

@@ -550,29 +562,10 @@ func (m *defaultResourceManager) registerPodEndpoints(ctx context.Context, tgb *
550562
"registering endpoints using the targetGroup's vpcID %s which is different from the cluster's vpcID %s", tgb.Spec.VpcID, m.vpcID))
551563
}
552564

553-
var overrideAzFn func(addr netip.Addr) bool
554-
if tgb.Spec.IamRoleArnToAssume != "" {
555-
// If we're interacting with another account, then we should always be setting "all" AZ to allow this
556-
// target to get registered by the ELB API.
557-
overrideAzFn = func(_ netip.Addr) bool {
558-
return true
559-
}
560-
} else {
561-
vpcInfo, err := m.vpcInfoProvider.FetchVPCInfo(ctx, vpcID)
562-
if err != nil {
563-
return err
564-
}
565-
var vpcRawCIDRs []string
566-
vpcRawCIDRs = append(vpcRawCIDRs, vpcInfo.AssociatedIPv4CIDRs()...)
567-
vpcRawCIDRs = append(vpcRawCIDRs, vpcInfo.AssociatedIPv6CIDRs()...)
568-
vpcCIDRs, err := networking.ParseCIDRs(vpcRawCIDRs)
569-
if err != nil {
570-
return err
571-
}
572-
// If the pod ip resides out of all the VPC CIDRs, then the only way to force the ELB API is to use "all" AZ.
573-
overrideAzFn = func(addr netip.Addr) bool {
574-
return !networking.IsIPWithinCIDRs(addr, vpcCIDRs)
575-
}
565+
overrideAzFn, err := m.generateOverrideAzFn(ctx, vpcID, tgb.Spec.IamRoleArnToAssume)
566+
567+
if err != nil {
568+
return err
576569
}
577570

578571
sdkTargets, err := m.prepareRegistrationCall(endpoints, overrideAzFn)
@@ -626,6 +619,66 @@ func (m *defaultResourceManager) updateTGBCheckPoint(ctx context.Context, tgb *e
626619
return nil
627620
}
628621

622+
func (m *defaultResourceManager) generateOverrideAzFn(ctx context.Context, vpcID string, assumeRole string) (func(addr netip.Addr) bool, error) {
623+
// Cross-Account is configured by assuming a role.
624+
usingCrossAccount := assumeRole != ""
625+
626+
// We need to cache the vpc response for the various assume roles.
627+
// There are two cases to consider when using assuming a role:
628+
// 1. Using a peered VPC connection to provide connectivity among accounts.
629+
// 2. Using RAM shared subnet(s) to provide connectivity among accounts.
630+
// We need to handle the case where the user is potentially using the same VPC in the peered context
631+
// as well as the RAM shared context.
632+
// Using peered VPC connection, we will always need to override the AZ.
633+
// Using a RAM shared subnet / VPC means that we follow the standard logic of checking the pod ip against the VPC CIDRs.
634+
635+
invalidVPCCacheKey := fmt.Sprintf("%s-%s", assumeRole, vpcID)
636+
637+
if usingCrossAccount {
638+
// Prevent spamming EC2 with requests.
639+
// We can use the cached result for this VPC ID given for the current assume role ARN
640+
m.invalidVpcCacheMutex.RLock()
641+
_, invalidVPC := m.invalidVpcCache.Get(invalidVPCCacheKey)
642+
m.invalidVpcCacheMutex.RUnlock()
643+
644+
// In this case, we already received that this VPC was invalid, we can shortcut the EC2 call and just override the AZ.
645+
if invalidVPC {
646+
return func(addr netip.Addr) bool {
647+
return true
648+
}, nil
649+
}
650+
}
651+
652+
vpcInfo, err := m.vpcInfoProvider.FetchVPCInfo(ctx, vpcID)
653+
if err != nil {
654+
// A VPC Not Found Error along with cross-account usage means that the VPC either, is not shared with the assume
655+
// role account OR this falls into case (1) from above where the VPC is just peered but not shared with RAM.
656+
// As we can't differentiate if RAM sharing wasn't set up correctly OR the VPC is set up via peering, we will
657+
// just default to assume that the VPC is peered but not shared.
658+
if isVPCNotFoundError(err) && usingCrossAccount {
659+
m.invalidVpcCacheMutex.Lock()
660+
m.invalidVpcCache.Set(invalidVPCCacheKey, true, m.invalidVpcCacheTTL)
661+
m.invalidVpcCacheMutex.Unlock()
662+
return func(addr netip.Addr) bool {
663+
return true
664+
}, nil
665+
}
666+
return nil, err
667+
}
668+
var vpcRawCIDRs []string
669+
vpcRawCIDRs = append(vpcRawCIDRs, vpcInfo.AssociatedIPv4CIDRs()...)
670+
vpcRawCIDRs = append(vpcRawCIDRs, vpcInfo.AssociatedIPv6CIDRs()...)
671+
vpcCIDRs, err := networking.ParseCIDRs(vpcRawCIDRs)
672+
if err != nil {
673+
return nil, err
674+
}
675+
// By getting here, we have a valid VPC for whatever credential was used. We return "true" in the function below
676+
// when the pod ip falls outside the VPCs configured CIDRs, other we return "false" to ensure that the "all" is NOT injected.
677+
return func(addr netip.Addr) bool {
678+
return !networking.IsIPWithinCIDRs(addr, vpcCIDRs)
679+
}, nil
680+
}
681+
629682
type podEndpointAndTargetPair struct {
630683
endpoint backend.PodEndpoint
631684
target TargetInfo
@@ -747,3 +800,12 @@ func isELBV2TargetGroupARNInvalidError(err error) bool {
747800
}
748801
return false
749802
}
803+
804+
func isVPCNotFoundError(err error) bool {
805+
var apiErr smithy.APIError
806+
if errors.As(err, &apiErr) {
807+
code := apiErr.ErrorCode()
808+
return code == "InvalidVpcID.NotFound"
809+
}
810+
return false
811+
}

0 commit comments

Comments
 (0)