Skip to content

Commit 23bc08a

Browse files
authored
Merge pull request #1165 from elezar/ignore-xid-109
Ignore XID error 109
2 parents 831c31e + f74a958 commit 23bc08a

File tree

1 file changed

+8
-7
lines changed

1 file changed

+8
-7
lines changed

internal/rm/health.go

+8 -7
Original file line number | Diff line number | Diff line change
@@ -62,16 +62,17 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
62 62       // FIXME: formalize the full list and document it.
63 63       // http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
64 64       // Application errors: the GPU should still be healthy
65    -     applicationErrorXids := []uint64{
66    -         13, // Graphics Engine Exception
67    -         31, // GPU memory page fault
68    -         43, // GPU stopped processing
69    -         45, // Preemptive cleanup, due to previous errors
70    -         68, // Video processor exception
   65 +     ignoredXids := []uint64{
   66 +         13,  // Graphics Engine Exception
   67 +         31,  // GPU memory page fault
   68 +         43,  // GPU stopped processing
   69 +         45,  // Preemptive cleanup, due to previous errors
   70 +         68,  // Video processor exception
   71 +         109, // Context Switch Timeout Error
71 72       }
72 73
73 74       skippedXids := make(map[uint64]bool)
74    -     for _, id := range applicationErrorXids {
   75 +     for _, id := range ignoredXids {
75 76           skippedXids[id] = true
76 77       }
77 78

0 commit comments

Comments
 (0)