Skip to content

Commit 24070d9

Browse files
authored
Resolve race condition between CARM ConfigMap and reconciler for annotated namespaces (#138)
Addresses aws-controllers-k8s/community#2011 In certain scenarios, where a user deploys a resource to a namespace annotated with a specific ownner accountID, a race condition was identified between the reconciler and the CARM (Cross Account Resource Management) `ConfigMap`. This race condition resulted in the controller setting an empty roleARN, preventing the aws-sdk-go client from pivoting (calling `STS::AssumeRole`) and managing resourecs in the correct account. Instead, resources were inadvertently managed in the default account instead of the namespace assigned account. This issue stemmed from the initial implementation of the CARM feature, where the method responsible for retrieving the accountID from the cache, didn't not properly verify the existance and content of the CARM configMap and instead returned an empty stringy when these conditions were not satisfied. This led to selection of the default account (when an empty `RoleARN` is returned )for resource management. Although these scenarios are rare, they can occur in clusters with a significantly high number of namespaces, causing a delay between naemsapce/configmap events and the informer's event handlers. This patch addresses the race issue by implementing two main things: - Proper error propagation: an error is no propagated when a `ConfigMap` is missing or when an accountID entry is missing in the `ConfigMap`. This helps the reconciler make the right decision on how to handle these cases. - Improved error handling: The reconciler now carefully handles these errors and requeues whenever a user has issued an owneraccountid-annotated namespace but the Configmap is not create or properly propagated. Signed-off-by: Amine Hilaly <[email protected]> By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent eb13154 commit 24070d9

File tree

4 files changed

+185
-45
lines changed

4 files changed

+185
-45
lines changed

pkg/runtime/adoption_reconciler.go

+31-9
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ package runtime
1515

1616
import (
1717
"context"
18+
"fmt"
1819

1920
"github.com/go-logr/logr"
2021
"github.com/pkg/errors"
@@ -107,10 +108,28 @@ func (r *adoptionReconciler) reconcile(ctx context.Context, req ctrlrt.Request)
107108
return ackerr.NotAdoptable
108109
}
109110

110-
targetDescriptor := rmf.ResourceDescriptor()
111-
acctID := r.getOwnerAccountID(res)
111+
// If a user has specified a namespace that is annotated with the
112+
// an owner account ID, we need an appropriate role ARN to assume
113+
// in order to perform the reconciliation. The roles ARN are typically
114+
// stored in a ConfigMap in the ACK system namespace.
115+
// If the ConfigMap is not created, or not populated with an
116+
// accountID to roleARN mapping, we need to properly requeue with a
117+
// helpful message to the user.
118+
var roleARN ackv1alpha1.AWSResourceName
119+
acctID, needCARMLookup := r.getOwnerAccountID(res)
120+
if needCARMLookup {
121+
// This means that the user is specifying a namespace that is
122+
// annotated with an owner account ID. We need to retrieve the
123+
// roleARN from the ConfigMap and properly requeue if the roleARN
124+
// is not available.
125+
roleARN, err = r.getRoleARN(acctID)
126+
if err != nil {
127+
// r.getRoleARN errors are not terminal, we should requeue.
128+
return requeue.NeededAfter(err, roleARNNotAvailableRequeueDelay)
129+
}
130+
}
112131
region := r.getRegion(res)
113-
roleARN := r.getRoleARN(acctID)
132+
targetDescriptor := rmf.ResourceDescriptor()
114133
endpointURL := r.getEndpointURL(res)
115134
gvk := targetDescriptor.GroupVersionKind()
116135

@@ -428,16 +447,16 @@ func (r *adoptionReconciler) handleReconcileError(err error) (ctrlrt.Result, err
428447
// that the service controller is in.
429448
func (r *adoptionReconciler) getOwnerAccountID(
430449
res *ackv1alpha1.AdoptedResource,
431-
) ackv1alpha1.AWSAccountID {
450+
) (ackv1alpha1.AWSAccountID, bool) {
432451
// look for owner account id in the namespace annotations
433452
namespace := res.GetNamespace()
434453
accID, ok := r.cache.Namespaces.GetOwnerAccountID(namespace)
435454
if ok {
436-
return ackv1alpha1.AWSAccountID(accID)
455+
return ackv1alpha1.AWSAccountID(accID), true
437456
}
438457

439458
// use controller configuration
440-
return ackv1alpha1.AWSAccountID(r.cfg.AccountID)
459+
return ackv1alpha1.AWSAccountID(r.cfg.AccountID), false
441460
}
442461

443462
// getEndpointURL returns the AWS account that owns the supplied resource.
@@ -462,9 +481,12 @@ func (r *adoptionReconciler) getEndpointURL(
462481
// the resources.
463482
func (r *adoptionReconciler) getRoleARN(
464483
acctID ackv1alpha1.AWSAccountID,
465-
) ackv1alpha1.AWSResourceName {
466-
roleARN, _ := r.cache.Accounts.GetAccountRoleARN(string(acctID))
467-
return ackv1alpha1.AWSResourceName(roleARN)
484+
) (ackv1alpha1.AWSResourceName, error) {
485+
roleARN, err := r.cache.Accounts.GetAccountRoleARN(string(acctID))
486+
if err != nil {
487+
return "", fmt.Errorf("unable to retrieve role ARN for account %s: %v", acctID, err)
488+
}
489+
return ackv1alpha1.AWSResourceName(roleARN), nil
468490
}
469491

470492
// getRegion returns the AWS region that the given resource is in or should be

pkg/runtime/cache/account.go

+48-11
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
package cache
1515

1616
import (
17+
"errors"
1718
"sync"
1819

1920
"github.com/go-logr/logr"
@@ -23,6 +24,18 @@ import (
2324
k8scache "k8s.io/client-go/tools/cache"
2425
)
2526

27+
var (
28+
// ErrCARMConfigMapNotFound is an error that is returned when the CARM
29+
// configmap is not found.
30+
ErrCARMConfigMapNotFound = errors.New("CARM configmap not found")
31+
// ErrAccountIDNotFound is an error that is returned when the account ID
32+
// is not found in the CARM configmap.
33+
ErrAccountIDNotFound = errors.New("account ID not found in CARM configmap")
34+
// ErrEmptyRoleARN is an error that is returned when the role ARN is empty
35+
// in the CARM configmap.
36+
ErrEmptyRoleARN = errors.New("role ARN is empty in CARM configmap")
37+
)
38+
2639
const (
2740
// ACKRoleAccountMap is the name of the configmap map object storing
2841
// all the AWS Account IDs associated with their AWS Role ARNs.
@@ -34,15 +47,17 @@ const (
3447
// make the changes accordingly.
3548
type AccountCache struct {
3649
sync.RWMutex
37-
log logr.Logger
38-
roleARNs map[string]string
50+
log logr.Logger
51+
roleARNs map[string]string
52+
configMapCreated bool
3953
}
4054

4155
// NewAccountCache instanciate a new AccountCache.
4256
func NewAccountCache(log logr.Logger) *AccountCache {
4357
return &AccountCache{
44-
log: log.WithName("cache.account"),
45-
roleARNs: make(map[string]string),
58+
log: log.WithName("cache.account"),
59+
roleARNs: make(map[string]string),
60+
configMapCreated: false,
4661
}
4762
}
4863

@@ -55,6 +70,7 @@ func resourceMatchACKRoleAccountsConfigMap(raw interface{}) bool {
5570

5671
// Run instantiate a new SharedInformer for ConfigMaps and runs it to begin processing items.
5772
func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{}) {
73+
c.log.V(1).Info("Starting shared informer for accounts cache", "targetConfigMap", ACKRoleAccountMap)
5874
informer := informersv1.NewConfigMapInformer(
5975
clientSet,
6076
ackSystemNamespace,
@@ -66,7 +82,10 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{
6682
if resourceMatchACKRoleAccountsConfigMap(obj) {
6783
cm := obj.(*corev1.ConfigMap)
6884
object := cm.DeepCopy()
69-
c.updateAccountRoleData(object.Data)
85+
// To avoid multiple mutex locks, we are updating the cache
86+
// and the configmap existence flag in the same function.
87+
configMapCreated := true
88+
c.updateAccountRoleData(configMapCreated, object.Data)
7089
c.log.V(1).Info("created account config map", "name", cm.ObjectMeta.Name)
7190
}
7291
},
@@ -75,15 +94,18 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{
7594
cm := desired.(*corev1.ConfigMap)
7695
object := cm.DeepCopy()
7796
//TODO(a-hilaly): compare data checksum before updating the cache
78-
c.updateAccountRoleData(object.Data)
97+
c.updateAccountRoleData(true, object.Data)
7998
c.log.V(1).Info("updated account config map", "name", cm.ObjectMeta.Name)
8099
}
81100
},
82101
DeleteFunc: func(obj interface{}) {
83102
if resourceMatchACKRoleAccountsConfigMap(obj) {
84103
cm := obj.(*corev1.ConfigMap)
85104
newMap := make(map[string]string)
86-
c.updateAccountRoleData(newMap)
105+
// To avoid multiple mutex locks, we are updating the cache
106+
// and the configmap existence flag in the same function.
107+
configMapCreated := false
108+
c.updateAccountRoleData(configMapCreated, newMap)
87109
c.log.V(1).Info("deleted account config map", "name", cm.ObjectMeta.Name)
88110
}
89111
},
@@ -92,17 +114,32 @@ func (c *AccountCache) Run(clientSet kubernetes.Interface, stopCh <-chan struct{
92114
}
93115

94116
// GetAccountRoleARN queries the AWS accountID associated Role ARN
95-
// from the cached CARM configmap. This function is thread safe.
96-
func (c *AccountCache) GetAccountRoleARN(accountID string) (string, bool) {
117+
// from the cached CARM configmap. It will return an error if the
118+
// configmap is not found, the accountID is not found or the role ARN
119+
// is empty.
120+
//
121+
// This function is thread safe.
122+
func (c *AccountCache) GetAccountRoleARN(accountID string) (string, error) {
97123
c.RLock()
98124
defer c.RUnlock()
125+
126+
if !c.configMapCreated {
127+
return "", ErrCARMConfigMapNotFound
128+
}
99129
roleARN, ok := c.roleARNs[accountID]
100-
return roleARN, ok && roleARN != ""
130+
if !ok {
131+
return "", ErrAccountIDNotFound
132+
}
133+
if roleARN == "" {
134+
return "", ErrEmptyRoleARN
135+
}
136+
return roleARN, nil
101137
}
102138

103139
// updateAccountRoleData updates the CARM map. This function is thread safe.
104-
func (c *AccountCache) updateAccountRoleData(data map[string]string) {
140+
func (c *AccountCache) updateAccountRoleData(exist bool, data map[string]string) {
105141
c.Lock()
106142
defer c.Unlock()
107143
c.roleARNs = data
144+
c.configMapCreated = exist
108145
}

pkg/runtime/cache/account_test.go

+58-16
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,14 @@ const (
3737
testAccountARN1 = "arn:aws:iam::012345678912:role/S3Access"
3838
testAccount2 = "219876543210"
3939
testAccountARN2 = "arn:aws:iam::012345678912:role/root"
40+
testAccount3 = "321987654321"
41+
testAccountARN3 = ""
4042
)
4143

4244
func TestAccountCache(t *testing.T) {
4345
accountsMap1 := map[string]string{
4446
testAccount1: testAccountARN1,
47+
testAccount3: testAccountARN3,
4548
}
4649

4750
accountsMap2 := map[string]string{
@@ -65,8 +68,14 @@ func TestAccountCache(t *testing.T) {
6568
stopCh := make(chan struct{})
6669
accountCache.Run(k8sClient, stopCh)
6770

71+
// Before creating the configmap, the accountCache should error for any
72+
// GetAccountRoleARN call.
73+
_, err := accountCache.GetAccountRoleARN(testAccount1)
74+
require.NotNil(t, err)
75+
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
76+
6877
// Test create events
69-
_, err := k8sClient.CoreV1().ConfigMaps(testNamespace).Create(
78+
_, err = k8sClient.CoreV1().ConfigMaps(testNamespace).Create(
7079
context.Background(),
7180
&corev1.ConfigMap{
7281
ObjectMeta: metav1.ObjectMeta{
@@ -80,8 +89,15 @@ func TestAccountCache(t *testing.T) {
8089

8190
time.Sleep(time.Second)
8291

83-
_, ok := accountCache.GetAccountRoleARN("random-account")
84-
require.False(t, ok)
92+
// Test with non existing account
93+
_, err = accountCache.GetAccountRoleARN("random-account-not-exist")
94+
require.NotNil(t, err)
95+
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
96+
97+
// Test with existing account
98+
_, err = accountCache.GetAccountRoleARN(testAccount1)
99+
require.NotNil(t, err)
100+
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
85101

86102
k8sClient.CoreV1().ConfigMaps(testNamespace).Create(
87103
context.Background(),
@@ -97,12 +113,20 @@ func TestAccountCache(t *testing.T) {
97113

98114
time.Sleep(time.Second)
99115

100-
roleARN, ok := accountCache.GetAccountRoleARN(testAccount1)
101-
require.True(t, ok)
102-
require.Equal(t, roleARN, testAccountARN1)
116+
// Test with non existing account
117+
_, err = accountCache.GetAccountRoleARN("random-account-not-exist")
118+
require.NotNil(t, err)
119+
require.Equal(t, err, ackrtcache.ErrAccountIDNotFound)
103120

104-
_, ok = accountCache.GetAccountRoleARN(testAccount2)
105-
require.False(t, ok)
121+
// Test with existing account - but role ARN is empty
122+
_, err = accountCache.GetAccountRoleARN(testAccount3)
123+
require.NotNil(t, err)
124+
require.Equal(t, err, ackrtcache.ErrEmptyRoleARN)
125+
126+
// Test with existing account
127+
roleARN, err := accountCache.GetAccountRoleARN(testAccount1)
128+
require.Nil(t, err)
129+
require.Equal(t, roleARN, testAccountARN1)
106130

107131
// Test update events
108132
k8sClient.CoreV1().ConfigMaps("ack-system").Update(
@@ -119,12 +143,23 @@ func TestAccountCache(t *testing.T) {
119143

120144
time.Sleep(time.Second)
121145

122-
roleARN, ok = accountCache.GetAccountRoleARN(testAccount1)
123-
require.True(t, ok)
146+
// Test with non existing account
147+
_, err = accountCache.GetAccountRoleARN("random-account-not-exist")
148+
require.NotNil(t, err)
149+
require.Equal(t, err, ackrtcache.ErrAccountIDNotFound)
150+
151+
// Test that account was removed
152+
_, err = accountCache.GetAccountRoleARN(testAccount3)
153+
require.NotNil(t, err)
154+
require.Equal(t, err, ackrtcache.ErrAccountIDNotFound)
155+
156+
// Test with existing account
157+
roleARN, err = accountCache.GetAccountRoleARN(testAccount1)
158+
require.Nil(t, err)
124159
require.Equal(t, roleARN, testAccountARN1)
125160

126-
roleARN, ok = accountCache.GetAccountRoleARN(testAccount2)
127-
require.True(t, ok)
161+
roleARN, err = accountCache.GetAccountRoleARN(testAccount2)
162+
require.Nil(t, err)
128163
require.Equal(t, roleARN, testAccountARN2)
129164

130165
// Test delete events
@@ -136,9 +171,16 @@ func TestAccountCache(t *testing.T) {
136171

137172
time.Sleep(time.Second)
138173

139-
_, ok = accountCache.GetAccountRoleARN(testAccount1)
140-
require.False(t, ok)
141-
_, ok = accountCache.GetAccountRoleARN(testAccount2)
142-
require.False(t, ok)
174+
// Test that accounts ware removed
175+
_, err = accountCache.GetAccountRoleARN(testAccount1)
176+
require.NotNil(t, err)
177+
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
178+
179+
_, err = accountCache.GetAccountRoleARN(testAccount2)
180+
require.NotNil(t, err)
181+
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
143182

183+
_, err = accountCache.GetAccountRoleARN(testAccount3)
184+
require.NotNil(t, err)
185+
require.Equal(t, err, ackrtcache.ErrCARMConfigMapNotFound)
144186
}

0 commit comments

Comments
 (0)