"""
Script to find outliers for 1 contrast, 1 atlas, using mahalanobis distance.
Author: Virgile Fritsch, 2010
from original Benjamin Thyreau's scripts
"""
import numpy as np
from scipy import linalg
from scipy.stats import chi2
import glob
from nipy.io.imageformats import load
import matplotlib.pyplot as plt
from fast_mcd import fast_mcd
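# fast_mcd is a local module; it is assumed here to implement the FastMCD
# robust location/covariance estimator (Rousseeuw & Van Driessen).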
#---------------------------------------------------------------
#----- Parameters ----------------------------------------------
#---------------------------------------------------------------
#----- Database path
DB_PATH = "/volatile/imagen/data"
#----- Functional Task
FUNCTIONAL_TASK = "GCA"
#----- Contrast number
CONTRAST = 13
#----- Atlas type ("cortical", "subcortical", or "functional")
ATLAS_TYPE = "subcortical"
#---------------------------------------------------------------
#----- Routines ------------------------------------------------
#---------------------------------------------------------------
def precollect(labels, skipfirst=False):
    """Precompute the ROI layout of an atlas for collect().

    Sorts the flattened label volume and returns the ROI labels, the
    permutation that sorts the flattened data, and the index boundaries
    between consecutive ROIs in the sorted array. With skipfirst=True,
    the first (background) label is skipped.
    """
    labelsvec = np.ravel(labels)
    argposlabels = np.argsort(labelsvec)
    b = labelsvec[argposlabels]
    bounds = [0] + list(np.where(b[1:] != b[:-1])[0] + 1) + [len(b)]
    if skipfirst:
        bounds = bounds[1:]
    return np.unique(b)[skipfirst:-1], argposlabels, bounds[skipfirst:]
def collect(signal, argposlabels, bounds):
    """Extract the values of each ROI.

    Parameters
    ----------
    - signal: data from the nifti image.
    - argposlabels: a rearrangement of the indices so that the values
      of a given ROI are consecutive in the flat data array.
    - bounds: locations of the boundaries between ROI values (as indices).
    """
    sorted_signal = signal.ravel()[argposlabels]
    return [sorted_signal[b:e] for b, e in zip(bounds[:-1], bounds[1:])]
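# Typical usage in this script: precollect() is run once on the atlas to get
# the ROI layout, then collect() is applied to each subject's volume with the
# precomputed argposlabels and bounds.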
"""
def enlarge_trick(img, targetimg):
Returns data from a niftiimage,
optionnaly broadcasting up to len(targetimg) on newaxis
if img.extent == target_img.extent:
enlarged_data = img.get_data()
else:
enlarged_data = as_strided(
img.get_data(), shape=(len(targetimg.data),)+img.get_data().shape,
strides=(0,)+img.get_data().strides)
return enlarged_data
"""
#---------------------------------------------------------------
#----- Script starts -------------------------------------------
#---------------------------------------------------------------
# Subjects directory
subjects_dir = "%s/%s" %(DB_PATH, FUNCTIONAL_TASK)
### Load atlas
# get (or compute) atlas data
if ATLAS_TYPE == "cortical":
    ATLAS = "/volatile/imagen/templateEPI/cort_333_LR.nii.gz"
    atlas = load(ATLAS)
elif ATLAS_TYPE == "subcortical":
    ATLAS = "/volatile/imagen/templateEPI/subcort_333.nii.gz"
    atlas = load(ATLAS)
elif ATLAS_TYPE == "functional":
    # /!\ NOT IMPLEMENTED YET
    pass
else:
    raise Exception("Wrong atlas type.")
# load ROIs structures
labels, argposlabels, bounds = precollect(atlas.get_data(), skipfirst=True)
print "Computing the covariance matrix of ROIs means for the %s atlas" \
%(ATLAS_TYPE)
# get all subjects' data (estimable contrasts only)
filenames = sorted(
    glob.glob("%s/0000*/con_%04d.nii.gz" % (subjects_dir, CONTRAST)))
actual_files = [f for f in filenames
                if "unestimable" not in load(f).get_header()['descrip']]
images = [load(f) for f in actual_files]
# find unestimable contrasts
unestimable_files = sorted(set(filenames).difference(actual_files))
print "unestimable: ", unestimable_files
unestimable_subjects = [x[42:54] for x in unestimable_files]
# get ROI mean values for each subject
allsummary = np.zeros((len(actual_files), len(labels)))
for i, img in enumerate(images):
    values = collect(img.get_data(), argposlabels, bounds)
    allsummary[i, :] = np.asarray([v[~np.isnan(v)].mean() for v in values])
    del img
# deal with the NaNs, since they confuse argsort
nonan_mask = np.isnan(allsummary).sum(1) == 0
fallsummary = allsummary[nonan_mask,:]
# --------> /!\ fixme: find a better way to trim
# per-ROI trimmed list of subjects (drop 10 on each side)
M = np.argsort(fallsummary, 0)[10:-10]
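# M[:, j] lists the subjects whose ROI-j mean is neither among the 10 smallest
# nor the 10 largest; intersecting the columns keeps only subjects that are
# non-extreme in every ROI.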
trimmed_ind = reduce(np.intersect1d, M.T)
trimmed_allsummary = fallsummary[trimmed_ind]
del M
#trimmed_allsummary = fallsummary
# SVD decomposition of the covariance matrix
covariance = np.cov(trimmed_allsummary.T)
robust_location, robust_covariance = fast_mcd(fallsummary)
u, s, vh = linalg.svd(robust_covariance)
# --------> /!\ fixme: look at that criterion (95%)
# keep only the leading components covering 95% of the total variance
inv_s = (1. / s) * \
    ((np.cumsum(s) < np.sum(s) * .95) | ([True] + [False] * (len(s) - 1)))
inv_sigma = np.dot(np.dot(vh.T, np.diag(inv_s)), u.T)
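# inv_sigma is a truncated pseudo-inverse of the robust covariance:
# Sigma^+ = V . diag(inv_s) . U^T, with 1/s zeroed out for the components
# beyond the 95% cumulative-variance cutoff.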
# --------> /!\ fixme: median ?
# compute Mahalanobis distances
Y = fallsummary - robust_location
#Y = fallsummary - np.mean(fallsummary, 0)
R = np.sqrt((np.dot(Y, inv_sigma) * Y).sum(1))
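# R[i] = sqrt(y_i^T . Sigma^+ . y_i): Mahalanobis-type distance of subject i
# from the robust center, computed with the truncated inverse covariance.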
# find outliers threshold
sortedR = R[~np.isnan(R)].copy()
sortedR.sort()
qi, qe, qa = (len(sortedR) * np.array([0.25, 0.5, 0.75])).astype(int)
bnd = (sortedR[qa] - sortedR[qi])*3 + sortedR[qe]
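# Tukey-like fence on the distances: threshold = median + 3 * (Q3 - Q1).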
### Estimate the density with a gaussian kernel
nonnan_subjects_arg = np.where(~np.isnan(R))[0]
R = R[nonnan_subjects_arg]
x = np.arange(0., 1.2*np.amax(R), 0.0012*np.amax(R))
n = R.size
sigma = 1.05 * np.std(R) * n**(-0.2)
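# rule-of-thumb bandwidth, close to Silverman's 1.06 * std(R) * n**(-1/5)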
kernel_arg = (np.tile(x, (n,1)).T - R) / sigma
fh = ((1/np.sqrt(2*np.pi)) * np.exp(-0.5*kernel_arg**2)).sum(1) / (n*sigma)
# plot the density estimate, the median and the outlier threshold
plt.figure()
plt.plot(x, fh)
plt.vlines(sortedR[qe], 0, np.amax(fh))
plt.vlines(bnd, 0, np.amax(fh))
# Chi-squared distribution
p = labels.size
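# For Gaussian data, squared Mahalanobis distances follow a chi2 law with
# p degrees of freedom; here a chi2(p) is simply moment-matched (mean and
# variance, via loc/scale) to the observed distances R as a reference curve.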
diff_scale = np.sqrt(R.var() / float(chi2.stats(p, moments='v')))
diff_loc = R.mean() - float(chi2.stats(p, scale=diff_scale, moments='m'))
template = chi2(p, loc=diff_loc, scale=diff_scale)
plt.plot(x, template.pdf(x), linestyle='--', color='green')
plt.show()
# report the subjects whose distance exceeds the threshold
for i in np.where(R > bnd)[0]:
    print actual_files[nonnan_subjects_arg[i]][26:38]