Skip to content

Commit b16f06f

Browse files
authored
fix troubleshooting script to skip DCE connected to DCRA (#1048)
[comment]: # (Note that your PR title should follow the conventional commit format: https://conventionalcommits.org/en/v1.0.0/#summary)
# PR Description
fix troubleshooting script to skip DCE connected to DCRA

[comment]: # (The below checklist is for PRs adding new features. If a box is not checked, add a reason why it's not needed.)
# New Feature Checklist
- [ ] List telemetry added about the feature.
- [ ] Link to the one-pager about the feature.
- [ ] List any tasks necessary for release (3P docs, AKS RP chart changes, etc.) after merging the PR.
- [ ] Attach results of scale and perf testing.

[comment]: # (The below checklist is for code changes. Not all boxes necessarily need to be checked. Build, doc, and template changes do not need to fill out the checklist.)
# Tests Checklist
- [ ] Have end-to-end Ginkgo tests been run on your cluster and passed? To bootstrap your cluster to run the tests, follow [these instructions](/otelcollector/test/README.md#bootstrap-a-dev-cluster-to-run-ginkgo-tests).
  - Labels used when running the tests on your cluster:
    - [ ] `operator`
    - [ ] `windows`
    - [ ] `arm64`
    - [ ] `arc-extension`
    - [ ] `fips`
- [ ] Have new tests been added? For features, have tests been added for this feature? For fixes, is there a test that could have caught this issue and could validate that the fix works?
  - [ ] Is a new scrape job needed?
    - [ ] The scrape job was added to the folder [test-cluster-yamls](/otelcollector/test/test-cluster-yamls/) in the correct configmap or as a CR.
  - [ ] Was a new test label added?
    - [ ] A string constant for the label was added to [constants.go](/otelcollector/test/utils/constants.go).
    - [ ] The label and description was added to the [test README](/otelcollector/test/README.md).
    - [ ] The label was added to this [PR checklist](/.github/pull_request_template).
    - [ ] The label was added as needed to [testkube-test-crs.yaml](/otelcollector/test/testkube/testkube-test-crs.yaml).
  - [ ] Are additional API server permissions needed for the new tests?
    - [ ] These permissions have been added to [api-server-permissions.yaml](/otelcollector/test/testkube/api-server-permissions.yaml).
  - [ ] Was a new test suite (a new folder under `/tests`) added?
    - [ ] The new test suite is included in [testkube-test-crs.yaml](/otelcollector/test/testkube/testkube-test-crs.yaml).
1 parent 8d52acf commit b16f06f

File tree

1 file changed

+121
-38
lines changed

1 file changed

+121
-38
lines changed

internal/scripts/troubleshoot/TroubleshootError.ps1

+121-38
Original file line number | Diff line number | Diff line change
@@ -346,19 +346,38 @@ catch {
346346

347347
# Get all DC* objects
348348
try {
349-
$dcraList = Get-AzDataCollectionRuleAssociation -TargetResourceId $ClusterResourceId -ErrorAction Stop -WarningAction silentlyContinue
349+
$dcraList = Get-AzDataCollectionRuleAssociation -TargetResourceId $ClusterResourceId -ErrorAction Stop -WarningAction SilentlyContinue
350350
$prometheusMetricsTuples = @()
351351

352352
foreach ($dcra in $dcraList) {
353+
354+
# Filter out "configurationAccessEndpoint" entries
355+
if ($dcra.Name -eq "configurationAccessEndpoint") {
356+
Write-Host "Skipping configurationAccessEndpoint DCRA: $($dcra.Name)" -ForegroundColor Yellow
357+
continue
358+
}
359+
353360
Write-Output "DCRA ID: $($dcra.Id)"
354361
Write-Output "DCRA Name: $($dcra.Name)"
355362
Write-Output "Data Collection Rule ID: $($dcra.DataCollectionRuleId)"
356363
Write-Output "Target Resource ID: $($dcra.TargetResourceId)"
357364
Write-Output "Provisioning State: $($dcra.ProvisioningState)"
358365
Write-Output "Additional Properties:"
359366
$dcra.Properties | Format-Table -AutoSize
367+
368+
# Check if DataCollectionRuleId is not null or empty
369+
if ([string]::IsNullOrWhiteSpace($dcra.DataCollectionRuleId)) {
370+
Write-Host "Skipping DCRA with no DataCollectionRuleId: $($dcra.Name)" -ForegroundColor Yellow
371+
continue
372+
}
373+
360374
# Get the Data Collection Rule details based on its ID
361-
$dataCollectionRule = Get-AzResource -ResourceId $dcra.DataCollectionRuleId -ErrorAction silentlyContinue
375+
$dataCollectionRule = Get-AzResource -ResourceId $dcra.DataCollectionRuleId -ErrorAction SilentlyContinue
376+
if ($null -eq $dataCollectionRule) {
377+
Write-Host "Unable to fetch Data Collection Rule details for ID: $($dcra.DataCollectionRuleId)" -ForegroundColor Yellow
378+
continue
379+
}
380+
362381
$dataflows = $dataCollectionRule.Properties.DataFlows
363382
foreach ($dataflow in $dataflows) {
364383
$dataflowstream = $dataflow.streams
@@ -377,7 +396,7 @@ try {
377396
# Check if the map is empty
378397
if ($prometheusMetricsTuples.Count -eq 0) {
379398
Write-Host "No entries with Microsoft-PrometheusMetrics found in the Data Collection Rule" -ForegroundColor Red
380-
Write-Host("");
399+
Write-Host("")
381400
Stop-Transcript
382401
exit 1
383402
}
@@ -390,8 +409,9 @@ catch {
390409
exit 1
391410
}
392411

412+
393413
#
394-
# Check Agent pods running as expected with HPA
414+
# Check Agent pods running as expected with or without HPA
395415
#
396416
try {
397417
Write-Host("Getting Kubeconfig of the cluster...")
@@ -402,53 +422,115 @@ try {
402422
kubectl config use-context $ClusterName
403423
Write-Host("Successfully switched current context of the k8s cluster to:", $ClusterName)
404424

405-
Write-Host("Fetching ama-metrics deployment status with HPA...")
406-
$hpa = kubectl get hpa ama-metrics-hpa -n kube-system -o json | ConvertFrom-Json
425+
Write-Host("Checking if HPA is configured for ama-metrics deployment...")
426+
$hpa = kubectl get hpa ama-metrics-hpa -n kube-system -o json 2>$null | ConvertFrom-Json
427+
407428
if ($null -eq $hpa) {
408-
Write-Host("HPA configuration for ama-metrics not found.") -ForegroundColor Red
409-
Write-Host("Please ensure HPA is enabled and properly configured.") -ForegroundColor Red
410-
exit 1
411-
}
429+
$rsPod = kubectl get deployments ama-metrics -n kube-system -o json | ConvertFrom-Json
430+
if ($null -eq $rsPod) {
431+
Write-Host("ama-metrics replicaset pod not scheduled or failed to schedule.") -ForegroundColor Red
432+
Write-Host("Please refer to the following documentation to onboard and validate:") -ForegroundColor Red
433+
Write-Host($AksOptInLink) -ForegroundColor Red
434+
Write-Host($contactUSMessage)
435+
Stop-Transcript
436+
exit 1
437+
}
412438

413-
$hpaStatus = $hpa.status
414-
$currentReplicas = $hpaStatus.currentReplicas
415-
$desiredReplicas = $hpaStatus.desiredReplicas
439+
$rsPodStatus = $rsPod.status
440+
if ((($rsPodStatus.availableReplicas -ge 2) -and
441+
($rsPodStatus.readyReplicas -ge 2) -and
442+
($rsPodStatus.replicas -ge 2)) -eq $false
443+
) {
444+
Write-Host("ama-metrics replicaset pods not scheduled or failed to schedule.") -ForegroundColor Red
445+
Write-Host("Available ama-metrics replicas:", $rsPodStatus.availableReplicas)
446+
Write-Host("Ready ama-metrics replicas:", $rsPodStatus.readyReplicas)
447+
Write-Host("Total ama-metrics replicas:", $rsPodStatus.replicas)
448+
Write-Host($rsPod) -ForegroundColor Red
449+
Write-Host("get ama-metrics rs pod details ...")
450+
$amaMetricsRsPods = kubectl get pods -n kube-system -l rsName=ama-metrics -o json | ConvertFrom-Json
451+
foreach ($pod in $amaMetricsRsPods.items) {
452+
Write-Host("status of the ama-metrics rs pod is:", $pod.status.conditions) -ForegroundColor Red
453+
}
454+
Write-Host("successfully got ama-metrics rs pod details ...")
455+
Write-Host("Please refer to the following documentation to onboard and validate:") -ForegroundColor Red
456+
Write-Host($AksOptInLink) -ForegroundColor Red
457+
Write-Host($contactUSMessage)
458+
Stop-Transcript
459+
exit 1
460+
}
461+
462+
# Fetch all ama-metrics pods
463+
$amaMetricsRsPods = kubectl get pods -n kube-system -l rsName=ama-metrics -o json | ConvertFrom-Json
464+
foreach ($pod in $amaMetricsRsPods.items) {
465+
$podName = $pod.metadata.name
466+
467+
# Copy MetricsExtensionConsoleDebugLog.log from container to debuglogs directory
468+
kubectl cp kube-system/$($podName):/MetricsExtensionConsoleDebugLog.log ./$debuglogsDir/MetricsExtensionConsoleDebugLog_$($podName).log
469+
Write-Host("MetricsExtensionConsoleDebugLog_$($podName).log copied to debuglogs directory.") -ForegroundColor Green
416470

417-
Write-Host("Current replicas:", $currentReplicas)
418-
Write-Host("Desired replicas:", $desiredReplicas)
471+
# Copy MDSD logs from container to debuglogs directory
472+
$logFiles = @("mdsd.qos", "mdsd.info", "mdsd.warn", "mdsd.err")
473+
foreach ($logFile in $logFiles) {
474+
kubectl cp kube-system/$($podName):/opt/microsoft/linuxmonagent/$logFile ./$debuglogsDir/$($logFile)_$($podName).log
475+
Write-Host("$($logFile)_$($podName).log copied to debuglogs directory.") -ForegroundColor Green
476+
}
419477

420-
# Check if current replicas do not match desired replicas
421-
if ($currentReplicas -ne $desiredReplicas) {
422-
Write-Error "Mismatch detected! Current replicas ($currentReplicas) do not match desired replicas ($desiredReplicas)."
478+
# Get logs from prometheus-collector container and store in a file
479+
$promCollectorLogPath = "$debuglogsDir/$($podName)_promcollector.log"
480+
kubectl logs $($podName) -n kube-system -c prometheus-collector > $promCollectorLogPath
481+
482+
# Get logs from addon-token-adapter container and store in a file
483+
$addonTokenAdapterLogPath = "$debuglogsDir/$($podName)_addontokenadapter.log"
484+
kubectl logs $($podName) -n kube-system -c addon-token-adapter > $addonTokenAdapterLogPath
485+
486+
Write-Host("Logs for pod $($podName) copied successfully.") -ForegroundColor Green
487+
}
488+
489+
Write-Host("All ama-metrics replicaset pods are running OK.") -ForegroundColor Green
423490
}
424491
else {
425-
Write-Host "Replica counts match. No issues detected."
426-
}
492+
Write-Host("Fetching HPA status for ama-metrics...")
493+
$hpaStatus = $hpa.status
494+
$currentReplicas = $hpaStatus.currentReplicas
495+
$desiredReplicas = $hpaStatus.desiredReplicas
427496

428-
if ($currentReplicas -lt $hpa.spec.minReplicas) {
429-
Write-Host("Current replicas are less than the minimum replicas configured.") -ForegroundColor Red
430-
exit 1
431-
}
497+
Write-Host("Current replicas:", $currentReplicas)
498+
Write-Host("Desired replicas:", $desiredReplicas)
432499

433-
Write-Host("Checking the status of pods for ama-metrics deployment...")
434-
$rsPods = kubectl get pods -n kube-system -l rsName=ama-metrics -o json | ConvertFrom-Json
435-
if ($null -eq $rsPods.Items -or $rsPods.Items.Count -lt $currentReplicas) {
436-
Write-Host("Not all ama-metrics pods are scheduled or running.") -ForegroundColor Red
437-
Write-Host("Expected replicas:", $currentReplicas)
438-
Write-Host("Scheduled pods:", $rsPods.Items.Count)
439-
exit 1
440-
}
500+
# Check if current replicas do not match desired replicas
501+
if ($currentReplicas -ne $desiredReplicas) {
502+
Write-Error "Mismatch detected! Current replicas ($currentReplicas) do not match desired replicas ($desiredReplicas)."
503+
}
504+
else {
505+
Write-Host "Replica counts match. No issues detected."
506+
}
441507

442-
foreach ($pod in $rsPods.Items) {
443-
$podStatus = $pod.status.conditions
444-
if (-not ($podStatus | Where-Object { $_.type -eq "Ready" -and $_.status -eq "True" })) {
445-
Write-Host("Pod $($pod.metadata.name) is not ready.") -ForegroundColor Red
508+
if ($currentReplicas -lt $hpa.spec.minReplicas) {
509+
Write-Host("Current replicas are less than the minimum replicas configured.") -ForegroundColor Red
446510
exit 1
447511
}
448-
}
449512

450-
Write-Host("All ama-metrics pods are running as expected.") -ForegroundColor Green
513+
Write-Host("Checking the status of pods for ama-metrics deployment...")
514+
$rsPods = kubectl get pods -n kube-system -l rsName=ama-metrics -o json | ConvertFrom-Json
515+
if ($null -eq $rsPods.Items -or $rsPods.Items.Count -lt $currentReplicas) {
516+
Write-Host("Not all ama-metrics pods are scheduled or running.") -ForegroundColor Red
517+
Write-Host("Expected replicas:", $currentReplicas)
518+
Write-Host("Scheduled pods:", $rsPods.Items.Count)
519+
exit 1
520+
}
521+
522+
foreach ($pod in $rsPods.Items) {
523+
$podStatus = $pod.status.conditions
524+
if (-not ($podStatus | Where-Object { $_.type -eq "Ready" -and $_.status -eq "True" })) {
525+
Write-Host("Pod $($pod.metadata.name) is not ready.") -ForegroundColor Red
526+
exit 1
527+
}
528+
}
529+
530+
Write-Host("All ama-metrics pods are running as expected.") -ForegroundColor Green
531+
}
451532

533+
Write-Host("Collecting logs for debugging...")
452534
foreach ($pod in $rsPods.Items) {
453535
$podName = $pod.metadata.name
454536

@@ -476,6 +558,7 @@ catch {
476558
}
477559

478560

561+
479562
Write-Host("Checking whether the ama-metrics-node linux daemonset pod running correctly ...")
480563
try {
481564
$ds = kubectl get ds -n kube-system -o json --field-selector metadata.name=ama-metrics-node | ConvertFrom-Json

0 commit comments

Comments
 (0)