@@ -62,16 +62,17 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
62
62
// FIXME: formalize the full list and document it.
63
63
// http://docs.nvidia.com/deploy/xid-errors/index.html#topic_4
64
64
// XIDs for application-level errors: the GPU itself should still be healthy.
65
- applicationErrorXids := []uint64 {
66
- 13 , // Graphics Engine Exception
67
- 31 , // GPU memory page fault
68
- 43 , // GPU stopped processing
69
- 45 , // Preemptive cleanup, due to previous errors
70
- 68 , // Video processor exception
65
+ ignoredXids := []uint64 {
66
+ 13 , // Graphics Engine Exception
67
+ 31 , // GPU memory page fault
68
+ 43 , // GPU stopped processing
69
+ 45 , // Preemptive cleanup, due to previous errors
70
+ 68 , // Video processor exception
71
+ 109 , // Context Switch Timeout Error
71
72
}
72
73
73
74
skippedXids := make (map [uint64 ]bool )
74
- for _ , id := range applicationErrorXids {
75
+ for _ , id := range ignoredXids {
75
76
skippedXids [id ] = true
76
77
}
77
78
0 commit comments