
Commit 9102251

fix TestRerun flakiness

1 parent 4a1667b

File tree

3 files changed: +145 -106


test/integration/test/deck_test.go
+95 -93
@@ -22,7 +22,6 @@ import (
 	"io"
 	"net/http"
 	"sort"
-	"strconv"
 	"strings"
 	"testing"
 	"time"
@@ -358,7 +357,10 @@ func TestDeckTenantIDs(t *testing.T) {
 func TestRerun(t *testing.T) {
 	t.Parallel()
 	const rerunJobConfigFile = "rerun-test.yaml"
+
 	jobName := "rerun-test-job-" + RandomString(t)
+	prowJobSelector := labels.SelectorFromSet(map[string]string{kube.ProwJobAnnotation: jobName})
+
 	var rerunJobConfigTemplate = `periodics:
 - interval: 1h
   name: %s
@@ -379,137 +381,137 @@ func TestRerun(t *testing.T) {
 		t.Fatalf("Failed creating clients for cluster %q: %v", clusterContext, err)
 	}
 
-	rerunJobConfig := fmt.Sprintf(rerunJobConfigTemplate, jobName, "foo")
-	if err := updateJobConfig(context.Background(), kubeClient, rerunJobConfigFile, []byte(rerunJobConfig)); err != nil {
-		t.Fatalf("Failed update job config: %v", err)
-	}
+	ctx := context.Background()
 
-	// Now we are waiting on Horologium to create the first prow job so that we
-	// can rerun from.
-	// Horologium itself is pretty good at handling the configmap update, but
-	// not kubelet, according to
-	// https://github.com/kubernetes/kubernetes/issues/30189 kubelet syncs
-	// configmap updates on existing pods every minute, which is a long wait.
-	// The proposed fix in the issue was updating the deployment, which imo
-	// should be better handled by just refreshing pods.
-	// So here comes forcing restart of horologium pods.
-	if err := refreshProwPods(kubeClient, context.Background(), "horologium"); err != nil {
-		t.Fatalf("Failed refreshing horologium pods: %v", err)
-	}
-	// Same with deck
-	if err := refreshProwPods(kubeClient, context.Background(), "deck"); err != nil {
-		t.Fatalf("Failed refreshing deck pods: %v", err)
+	redeployJobConfig := func(jobConfig string) {
+		if err := updateJobConfig(ctx, kubeClient, rerunJobConfigFile, []byte(jobConfig)); err != nil {
+			t.Fatalf("Failed update job config: %v", err)
+		}
+
+		// Now we are waiting on Horologium to create the first prow job so that we
+		// can rerun from.
+		// Horologium itself is pretty good at handling the configmap update, but
+		// not kubelet, according to
+		// https://github.com/kubernetes/kubernetes/issues/30189 kubelet syncs
+		// configmap updates on existing pods every minute, which is a long wait.
+		// It's quicker to rollout the affected Deployments.
+		if err := rolloutDeployment(t, ctx, kubeClient, "horologium"); err != nil {
+			t.Fatalf("Failed rolling out Horologium: %v", err)
+		}
+		// Same with deck
+		if err := rolloutDeployment(t, ctx, kubeClient, "deck"); err != nil {
+			t.Fatalf("Failed rolling out Deck: %v", err)
+		}
 	}
 
+	// Deploy the initial template with "foo" as the label.
+	redeployJobConfig(fmt.Sprintf(rerunJobConfigTemplate, jobName, "foo"))
+
 	t.Cleanup(func() {
-		if err := updateJobConfig(context.Background(), kubeClient, rerunJobConfigFile, []byte{}); err != nil {
+		if err := updateJobConfig(ctx, kubeClient, rerunJobConfigFile, []byte{}); err != nil {
 			t.Logf("ERROR CLEANUP: %v", err)
 		}
-		labels, err := labels.Parse("prow.k8s.io/job = " + jobName)
-		if err != nil {
-			t.Logf("Skip cleaning up jobs, as failed parsing label: %v", err)
-			return
+		// Prevent horologium from immediately creating the "missing" ProwJob after the
+		// DeleteAll call further down, because horologium still runs with the last
+		// non-empty configuration (foo=bar).
+		if err := rolloutDeployment(t, ctx, kubeClient, "horologium"); err != nil {
+			t.Logf("Failed rolling out Horologium: %v", err)
 		}
-		if err := kubeClient.DeleteAllOf(context.Background(), &prowjobv1.ProwJob{}, &ctrlruntimeclient.DeleteAllOfOptions{
-			ListOptions: ctrlruntimeclient.ListOptions{LabelSelector: labels},
+		if err := kubeClient.DeleteAllOf(ctx, &prowjobv1.ProwJob{}, &ctrlruntimeclient.DeleteAllOfOptions{
+			ListOptions: ctrlruntimeclient.ListOptions{
+				Namespace:     defaultNamespace,
+				LabelSelector: prowJobSelector,
+			},
 		}); err != nil {
 			t.Logf("ERROR CLEANUP: %v", err)
 		}
 	})
-	ctx := context.Background()
+
 	getLatestJob := func(t *testing.T, jobName string, lastRun *v1.Time) *prowjobv1.ProwJob {
 		var res *prowjobv1.ProwJob
 		if err := wait.PollUntilContextTimeout(ctx, time.Second, 90*time.Second, true, func(ctx context.Context) (bool, error) {
 			pjs := &prowjobv1.ProwJobList{}
-			err = kubeClient.List(ctx, pjs, &ctrlruntimeclient.ListOptions{
-				LabelSelector: labels.SelectorFromSet(map[string]string{kube.ProwJobAnnotation: jobName}),
+			err := kubeClient.List(ctx, pjs, &ctrlruntimeclient.ListOptions{
+				LabelSelector: prowJobSelector,
 				Namespace:     defaultNamespace,
 			})
 			if err != nil {
 				return false, fmt.Errorf("failed listing prow jobs: %w", err)
 			}
+
 			sort.Slice(pjs.Items, func(i, j int) bool {
-				revi, _ := strconv.Atoi(pjs.Items[i].ResourceVersion)
-				revj, _ := strconv.Atoi(pjs.Items[j].ResourceVersion)
-				return revi > revj
+				createdi := pjs.Items[i].CreationTimestamp
+				createdj := pjs.Items[j].CreationTimestamp
+				return createdj.Before(&createdi)
 			})
+
 			if len(pjs.Items) > 0 {
 				if lastRun != nil && pjs.Items[0].CreationTimestamp.Before(lastRun) {
 					return false, nil
 				}
 				res = &pjs.Items[0]
 			}
+
 			return res != nil, nil
 		}); err != nil {
 			t.Fatalf("Failed waiting for job %q: %v", jobName, err)
 		}
 		return res
 	}
-	rerun := func(t *testing.T, jobName string, mode string) {
-		req, err := http.NewRequest("POST", fmt.Sprintf("http://localhost/rerun?mode=%v&prowjob=%v", mode, jobName), nil)
-		if err != nil {
-			t.Fatalf("Could not generate a request %v", err)
-		}
 
-		// Deck might not have been informed about the job config update, retry
-		// for this case.
-		waitDur := time.Second * 5
-		var lastErr error
-		for i := 0; i < 3; i++ {
-			lastErr = nil
-			res, err := http.DefaultClient.Do(req)
-			if err != nil {
-				lastErr = fmt.Errorf("could not make post request %v", err)
-				res.Body.Close()
-				break
-			}
-			// The only retry condition is status not ok
-			if res.StatusCode != http.StatusOK {
-				lastErr = fmt.Errorf("status not expected: %d", res.StatusCode)
-				res.Body.Close()
-				waitDur *= 2
-				time.Sleep(waitDur)
-				continue
-			}
-			body, err := io.ReadAll(res.Body)
-			if err != nil {
-				lastErr = fmt.Errorf("could not read body response %v", err)
-				res.Body.Close()
-				break
-			}
-			t.Logf("Response body: %s", string(body))
-			break
-		}
-		if lastErr != nil {
-			t.Fatalf("Failed trigger rerun: %v", lastErr)
-		}
-	}
-	jobToRerun := getLatestJob(t, jobName, nil)
-	rerunJobConfig = fmt.Sprintf(rerunJobConfigTemplate, jobName, "bar")
-	if err := updateJobConfig(context.Background(), kubeClient, rerunJobConfigFile, []byte(rerunJobConfig)); err != nil {
-		t.Fatalf("Failed update job config: %v", err)
-	}
-	var passed bool
-	// It may take some time for the new ProwJob to show up, so we will
-	// check every 30s interval three times for it to appear
-	latestRun := jobToRerun
-	for i := 0; i < 3; i++ {
-		time.Sleep(30 * time.Second)
-		rerun(t, jobToRerun.Name, "latest")
-		if latestRun = getLatestJob(t, jobName, &latestRun.CreationTimestamp); latestRun.Labels["foo"] == "bar" {
-			passed = true
-			break
-		}
-	}
-	if !passed {
-		t.Fatal("Expected updated job.")
+	// Wait for the first job to be created by horologium.
+	initialJob := getLatestJob(t, jobName, nil)
+
+	// Update the job configuration with a new label.
+	redeployJobConfig(fmt.Sprintf(rerunJobConfigTemplate, jobName, "bar"))
+
+	// Rerun the job using the latest config.
+	rerunJob(t, ctx, initialJob.Name, "latest")
+
+	// Wait until the desired ProwJob shows up.
+	latestJob := getLatestJob(t, jobName, &initialJob.CreationTimestamp)
+	if latestJob.Labels["foo"] != "bar" {
+		t.Fatalf("Failed waiting for ProwJob %q using latest config with foo=bar.", jobName)
 	}
 
+	// Prevent Deck from being too fast and recreating the new job in the same second
+	// as the previous one.
+	time.Sleep(1 * time.Second)
+
 	// Deck scheduled job from latest configuration, rerun with "original"
 	// should still go with original configuration.
-	rerun(t, jobToRerun.Name, "original")
-	if latestRun := getLatestJob(t, jobName, &latestRun.CreationTimestamp); latestRun.Labels["foo"] != "foo" {
-		t.Fatalf("Job label mismatch. Want: 'foo', got: '%s'", latestRun.Labels["foo"])
+	rerunJob(t, ctx, initialJob.Name, "original")
+
+	originalJob := getLatestJob(t, jobName, &latestJob.CreationTimestamp)
+	if originalJob.Labels["foo"] != "foo" {
+		t.Fatalf("Failed waiting for ProwJob %q using original config with foo=foo.", jobName)
+	}
+}
+
+func rerunJob(t *testing.T, ctx context.Context, jobName string, mode string) {
+	req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost/rerun?mode=%v&prowjob=%v", mode, jobName), nil)
+	if err != nil {
+		t.Fatalf("Could not generate a request %v", err)
+	}
+
+	// Deck might not be fully ready yet, so we must retry.
+	if err := wait.PollUntilContextTimeout(ctx, time.Second, 10*time.Second, true, func(ctx context.Context) (bool, error) {
+		res, err := http.DefaultClient.Do(req)
+		if err != nil {
+			return false, fmt.Errorf("could not make post request: %w", err)
+		}
+		defer res.Body.Close()

+		body, err := io.ReadAll(res.Body)
+		if err != nil {
+			t.Logf("Failed to read response body: %v", err)
+			return false, nil
+		}
+		t.Logf("Response body: %s", string(body))
+
+		return res.StatusCode == http.StatusOK, nil
+	}); err != nil {
+		t.Fatalf("Failed to rerun job %q with %s config: %v", jobName, mode, err)
	}
 }
 
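The new rerunJob helper replaces the hand-rolled backoff loop with the same k8s.io/apimachinery/pkg/util/wait polling primitive the test already uses in getLatestJob. As a minimal, self-contained sketch of that retry pattern against an arbitrary HTTP endpoint (the function name postUntilOK, the URL, and the timeout below are illustrative placeholders, not part of the commit):

package main

import (
	"context"
	"fmt"
	"net/http"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// postUntilOK keeps POSTing to url until the server answers 200 OK or the
// timeout expires. Transport errors abort immediately; a non-200 status is
// treated as "not ready yet" and retried on the next tick.
func postUntilOK(ctx context.Context, url string, timeout time.Duration) error {
	return wait.PollUntilContextTimeout(ctx, time.Second, timeout, true, func(ctx context.Context) (bool, error) {
		req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, nil)
		if err != nil {
			return false, err // malformed URL, retrying will not help
		}
		res, err := http.DefaultClient.Do(req)
		if err != nil {
			return false, fmt.Errorf("could not make post request: %w", err)
		}
		defer res.Body.Close()
		return res.StatusCode == http.StatusOK, nil
	})
}

func main() {
	err := postUntilOK(context.Background(), "http://localhost/rerun?mode=latest&prowjob=example", 10*time.Second)
	fmt.Println("rerun result:", err)
}

Treating non-200 responses as retryable while aborting on transport errors mirrors the behavior of the helper added in the diff above.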
test/integration/test/horologium_test.go
+2 -4
@@ -66,10 +66,8 @@ func TestLaunchProwJob(t *testing.T) {
 	// not kubelet, according to
 	// https://github.com/kubernetes/kubernetes/issues/30189 kubelet syncs
 	// configmap updates on existing pods every minute, which is a long wait.
-	// The proposed fix in the issue was updating the deployment, which imo
-	// should be better handled by just refreshing pods.
-	// So here comes forcing restart of horologium pods.
-	if err := refreshProwPods(kubeClient, context.Background(), "horologium"); err != nil {
+	// It's quicker to rollout the affected Deployments.
+	if err := rolloutDeployment(t, context.Background(), kubeClient, "horologium"); err != nil {
 		t.Fatalf("Failed refreshing horologium pods: %v", err)
 	}
 
test/integration/test/setup.go
+48 -9
@@ -21,14 +21,18 @@ import (
 	"context"
 	"crypto/rand"
 	"crypto/sha256"
+	"errors"
 	"flag"
 	"fmt"
 	"io"
 	"sync"
 	"testing"
+	"time"
 
+	appsv1 "k8s.io/api/apps/v1"
 	coreapi "k8s.io/api/core/v1"
-	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/kubernetes/scheme"
 	_ "k8s.io/client-go/plugin/pkg/client/auth/oidc"
@@ -106,20 +110,55 @@ func getPodLogs(clientset *kubernetes.Clientset, namespace, podName string, opts
 	return str, nil
 }
 
-func refreshProwPods(client ctrlruntimeclient.Client, ctx context.Context, name string) error {
+func rolloutDeployment(t *testing.T, ctx context.Context, client ctrlruntimeclient.Client, name string) error {
 	prowComponentsMux.Lock()
 	defer prowComponentsMux.Unlock()
 
-	var pods coreapi.PodList
-	labels, _ := labels.Parse("app = " + name)
-	if err := client.List(ctx, &pods, &ctrlruntimeclient.ListOptions{LabelSelector: labels}); err != nil {
-		return err
+	var depl appsv1.Deployment
+	if err := client.Get(ctx, types.NamespacedName{Name: name, Namespace: defaultNamespace}, &depl); err != nil {
+		return fmt.Errorf("failed to get Deployment: %w", err)
+	}
+
+	if replicas := depl.Spec.Replicas; replicas == nil || *replicas < 1 {
+		return errors.New("cannot restart a Deployment with zero replicas.")
+	}
+
+	labels := depl.Spec.Template.Labels
+	if labels == nil {
+		// This should never happen.
+		labels = map[string]string{}
+	}
+	labels["restart"] = RandomString(t)
+
+	t.Logf("Restarting %s...", name)
+	if err := client.Update(ctx, &depl); err != nil {
+		return fmt.Errorf("failed to update Deployment: %w", err)
 	}
-	for _, pod := range pods.Items {
-		if err := client.Delete(ctx, &pod); err != nil {
-			return err
+
+	timeout := 30 * time.Second
+	if err := wait.PollUntilContextTimeout(ctx, time.Second, timeout, false, func(ctx context.Context) (bool, error) {
+		var current appsv1.Deployment
+		if err := client.Get(ctx, ctrlruntimeclient.ObjectKeyFromObject(&depl), &current); err != nil {
+			return false, fmt.Errorf("failed to get current Deployment: %w", err)
+		}
+
+		replicas := current.Spec.Replicas
+		if replicas == nil || *replicas < 1 {
+			// This should never happen.
+			return false, errors.New("Deployment has no replicas defined")
 		}
+
+		ready := true &&
+			current.Status.AvailableReplicas == *replicas &&
+			current.Status.ReadyReplicas == *replicas &&
+			current.Status.UpdatedReplicas == *replicas &&
+			current.Status.UnavailableReplicas == 0
+
+		return ready, nil
+	}); err != nil {
+		return fmt.Errorf("Deployment did not fully roll out after %v: %w", timeout, err)
 	}
+
 	return nil
 }
 
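The rolloutDeployment helper works by mutating the Deployment's pod template (a random "restart" label), which makes the Deployment controller create a new ReplicaSet and replace every pod, and then polling the status until the rollout finishes. For comparison, a minimal sketch of the same restart-by-template-mutation idea using the annotation that `kubectl rollout restart` writes; the helper name restartDeployment and the namespace/name arguments in main are illustrative placeholders, not from the commit:

package main

import (
	"context"
	"fmt"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	"k8s.io/apimachinery/pkg/types"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/config"
)

// restartDeployment bumps the pod-template annotation that `kubectl rollout
// restart` uses, forcing the Deployment controller to roll out a fresh
// ReplicaSet (the commit achieves the same effect with a "restart" label).
func restartDeployment(ctx context.Context, c ctrlruntimeclient.Client, namespace, name string) error {
	var depl appsv1.Deployment
	if err := c.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, &depl); err != nil {
		return err
	}
	if depl.Spec.Template.Annotations == nil {
		depl.Spec.Template.Annotations = map[string]string{}
	}
	// Any change to the pod template triggers a rollout; this annotation is the
	// conventional one.
	depl.Spec.Template.Annotations["kubectl.kubernetes.io/restartedAt"] = time.Now().Format(time.RFC3339)
	return c.Update(ctx, &depl)
}

func main() {
	cfg, err := config.GetConfig() // uses KUBECONFIG or in-cluster config
	if err != nil {
		fmt.Println("could not load kubeconfig:", err)
		return
	}
	c, err := ctrlruntimeclient.New(cfg, ctrlruntimeclient.Options{})
	if err != nil {
		fmt.Println("could not build client:", err)
		return
	}
	// Placeholder namespace and Deployment name.
	fmt.Println(restartDeployment(context.Background(), c, "default", "horologium"))
}

Polling the Deployment status afterwards, as rolloutDeployment does, is what makes the restart synchronous enough for an integration test to rely on.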