def reduce_dimension(vectors):
+    """
+    Perform a PCA transformation on the given vectors (n_components=0.95).
+
+    :param vectors: the matrix to be transformed
+    :return: the transformed matrix
+    """
    pca = PCA(n_components=0.95)
    return pca.fit_transform(vectors)

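For reference, a minimal self-contained sketch (with made-up random data) of how a fractional n_components behaves in scikit-learn's PCA: fit_transform keeps the smallest number of components whose cumulative explained variance reaches that fraction.

import numpy as np
from sklearn.decomposition import PCA

X = np.random.rand(500, 200)                 # e.g. 500 averaged 200-d word vectors (made up)
pca = PCA(n_components=0.95)                 # keep enough components for 95% of the variance
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)                       # (500, k) with k <= 200
print(pca.explained_variance_ratio_.sum())   # >= 0.95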
-def averaging_vectors(namelist, default_value=True, fill_na=False):
+def averaging_vectors(namelist, default_value=True):
+    """
+    Merge each group of word vectors into a single vector by averaging.
+
+    :param namelist: the list of names (strings) to be embedded
+    :param default_value: if True, initialise each row with the 'unknown' word vector
+                          (used for out-of-vocabulary words); otherwise initialise with NaN
+    :return: the averaged vectors
+    """
    if default_value:
        unk = w2v_model.get_vector('unknown')
        vectors = np.array([unk for _ in range(len(namelist))])
    else:
        vectors = np.array([[np.nan] * vector_size for _ in range(len(namelist))])
+
    i = 0
    for e in namelist:
+        # Keep only letters and spaces (drop digits, punctuation and other symbols)
        name = ''.join(filter(whitelist.__contains__, e.replace('-', ' ')))
        vec = np.zeros(vector_size)
        count = 0
@@ -44,10 +59,18 @@ def averaging_vectors(namelist, default_value=True, fill_na=False):


def fixed_length_vectors(namelist, embedding_size=entity_embedding_size):
+    """
+    Turn each input word group into a fixed-length block of embedding_size word vectors.
+
+    :param namelist: the list of names (strings) to be embedded
+    :param embedding_size: the number of word-vector rows per name
+    :return: the embedded vectors
+    """
    vectors = np.zeros((len(namelist), embedding_size, vector_size))
-    i = 0
+    i = 0  # index of the current word group (row in vectors)
    for e in namelist:
        name = ''.join(filter(whitelist.__contains__, e.replace('-', ' ')))
+        # If the name has more words than embedding_size, average neighboring word vectors into one vector first
        if len(name.split()) > embedding_size:
            tmp = list()
            for w in name.split():
@@ -56,10 +79,12 @@ def fixed_length_vectors(namelist, embedding_size=entity_embedding_size):
                elif w.lower() in w2v_model.vocab:
                    tmp.append(w2v_model.get_vector(w.lower()))
            avg_factor = np.ceil(len(tmp) / embedding_size).astype(int)
+
+            # Concatenate the averaged vectors
            for k in range(0, len(tmp), avg_factor):
                vectors[i][k // avg_factor] = np.mean(tmp[k:k + avg_factor])
        else:
-            j = 0
+            j = 0  # index of the current word within the group
            for w in name.split():
                if w in w2v_model.vocab:
                    vectors[i][j] = w2v_model.get_vector(w)
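To make the long-name branch concrete, a small stand-alone sketch with assumed sizes (7 words, embedding_size=4, hence avg_factor=2). Note that this sketch averages per dimension with axis=0, whereas the loop above calls np.mean without an axis and therefore writes a single scalar into each row.

import numpy as np

embedding_size = 4
tmp = [np.full(200, float(i)) for i in range(7)]       # 7 fake 200-d word vectors
avg_factor = int(np.ceil(len(tmp) / embedding_size))   # ceil(7 / 4) = 2
compressed = [np.mean(tmp[k:k + avg_factor], axis=0)   # average neighboring vectors per dimension
              for k in range(0, len(tmp), avg_factor)]
print(len(compressed), compressed[0].shape)            # 4 (200,) -- fits into embedding_size rows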
@@ -76,9 +101,20 @@ def weighted_vectors():


def fixed_length_vectors_by_text(names):
+    """
+    Turn the input word groups, together with the word groups of the other entities from the same text,
+    into fixed-length vectors. Each entity (word group) takes entity_embedding_size rows,
+    and the total embedding takes context_embedding_size rows.
+
+    :param names: the DataFrame with entity information
+    :return: DataFrame of embedded vectors with context entity information,
+             with columns ['text_id', 'vec']
+    """
    # print(names.groupby('text_id').count())
+
    names_by_text = names.groupby('text_id').aggregate(lambda x: set(x))
    names_by_text['vec'] = None
+
    # for idx in names_by_text.index:
    #     tmp = {}
    #     for n in names_by_text.name[idx]:
@@ -96,6 +132,9 @@ def fixed_length_vectors_by_text(names):


def process_biotope_dict():
+    """
+    Convert biotope dictionary terms to vectors by averaging the word vectors of the individual words in each term.
+    """
    ref = parse_biotope_dict()

    vectors = averaging_vectors(ref.name)
@@ -106,23 +145,43 @@ def process_biotope_dict():


def process_entity_and_label_table(tablename):
+    """
+    Generate embedded entity and label vectors with the native methods
+    (entity vectors: fixed_length_vectors; label vectors: averaging_vectors)
+    and save the resulting numpy arrays locally.
+
+    :param tablename: entity_and_label table generated by generate_tables.py
+    """
+    # Get the dataset suffix ("train", "dev" or "test") from the table name
    prefix = tablename.split('_', -1)[-1]
+
    names_and_labels = parse_entity_and_label_table(tablename)
    names_vec = fixed_length_vectors(names_and_labels.name)
    labels_vec = averaging_vectors(names_and_labels.dict_name)
+
+    # Save to local files
    names_and_labels.to_csv(os.path.join(path, '%s_names_and_labels.tsv' % prefix), sep='\t')
    np.save(os.path.join(path, '%s_names_vectors.npy' % prefix), names_vec)
    np.save(os.path.join(path, '%s_labels_vectors.npy' % prefix), labels_vec)
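For orientation, a hypothetical usage sketch of the function above that loads back the arrays it saves; it assumes the module-level np, os, path and w2v_model set up in the __main__ block below, and the shapes follow from fixed_length_vectors and averaging_vectors.

process_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')

prefix = 'entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv'.split('_', -1)[-1]
names_vec = np.load(os.path.join(path, '%s_names_vectors.npy' % prefix))
labels_vec = np.load(os.path.join(path, '%s_labels_vectors.npy' % prefix))
print(names_vec.shape)    # (n_entities, entity_embedding_size, vector_size)
print(labels_vec.shape)   # (n_entities, vector_size)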


def generated_normalized_dict_and_labels():
+    """
+    Run PCA on the reference word space (n_components=0.95 by default), reducing it to 139 dimensions.
+    Generate embedded label vectors by looking up the PCA-reduced reference word vectors.
+    Generate embedded entity and label vectors with the fixed-length method.
+    Save the resulting numpy arrays locally.
+    """
+    # Reduce the dimensionality of the reference vectors
    ref = parse_biotope_dict()
    vectors = averaging_vectors(ref.name)
    vectors = reduce_dimension(vectors)
    np.save(os.path.join(path, 'OBT_VSM_norm.npy'), vectors)
    ref['vec'] = list(vectors)
    ref.to_csv(os.path.join(path, 'OBT_VSM_norm.tsv'), sep='\t')

+    # Parse the entity_and_label tables
    for tablename in ['entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv',
                      'entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv']:
        labels_id_and_labels = parse_entity_and_label_table(tablename)
@@ -132,28 +191,53 @@ def generated_normalized_dict_and_labels():


def generate_context_entity_list(tablename):
+    """
+    Include the entities that appear in the same article as additional input information.
+    Target entity: embedded with size entity_embedding_size * 200.
+    Other context entities: each embedded with size entity_embedding_size * 200.
+    Total input: padded to size context_embedding_size * 200.
+
+    :param tablename: entity_and_label table generated by generate_tables.py
+    """
+    # Target entity vectors
    names_and_labels = parse_entity_and_label_table(tablename)
    names_vec = fixed_length_vectors(names_and_labels.name)

+    # Context entity vectors
    names_by_text = fixed_length_vectors_by_text(names_and_labels[['text_id', 'name']])
    concat_vec = np.stack(names_by_text.loc[names_and_labels.text_id, 'vec'], axis=0)
    names_vec = np.concatenate((names_vec, concat_vec), axis=1)
-    # print(names_vec.shape)
-    names_and_labels.to_csv(os.path.join(path, '%s_names_and_labels_with_context.tsv' % tablename.split('_', -1)[-1]), sep='\t')
-    np.save(os.path.join(path, '%s_names_vectors_with_context.npy' % tablename.split('_', -1)[-1]), names_vec)
+
+    # Save the results
+    names_and_labels.to_csv(os.path.join(path, '%s_names_and_labels_with_context.tsv'
+                                         % tablename.split('_', -1)[-1]), sep='\t')
+    np.save(os.path.join(path, '%s_names_vectors_with_context.npy' % tablename.split('_', -1)[-1]), names_vec)
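A minimal sketch (with toy sizes standing in for entity_embedding_size, context_embedding_size and vector_size) of the concatenation along axis=1 performed above:

import numpy as np

target = np.zeros((3, 4, 200))     # one fixed-length block per target entity
context = np.zeros((3, 10, 200))   # the context block looked up via each entity's text_id
combined = np.concatenate((target, context), axis=1)
print(combined.shape)              # (3, 14, 200)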


def generate_five_fold_dataset(prediction=False):
+    """
+    Randomly select a fraction of the data as the test dataset (frac=0.17),
+    or use the real test dataset for testing when prediction=True.
+    Generate embedded label vectors by looking up the PCA-reduced reference word vectors.
+    Generate embedded entity and label vectors with the fixed-length method.
+
+    :param prediction: if True, use the pre-assigned test data set;
+                       if False, randomly select the test data set (frac=0.17)
+    """
+    # Generate the PCA-reduced reference vectors
    ref = parse_biotope_dict()
    vectors = averaging_vectors(ref.name)
    vectors = reduce_dimension(vectors)
    np.save(os.path.join(path, 'OBT_VSM_norm.npy'), vectors)
    ref['vec'] = list(vectors)
    ref.to_csv(os.path.join(path, 'OBT_VSM_norm.tsv'), sep='\t')

+    # Get the full entity_and_label list
    names_and_labels = parse_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')
    names_and_labels = pd.concat([names_and_labels,
                                  parse_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv')])
+
+    # If prediction: the test data set has no labels
    if prediction:
        training_size = len(names_and_labels)
        names_and_labels = pd.concat([names_and_labels,
@@ -183,14 +267,24 @@ def generate_five_fold_dataset(prediction=False):
if __name__ == "__main__":
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
        '../input_data/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
-    vector_size = w2v_model.vector_size
+    vector_size = w2v_model.vector_size  # 200
+
+
    whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
    path = os.path.join(os.getcwd(), '../input_data/vsm/')

+    """
+    Note: the functions above use different methods to generate the word vectors.
+    Select only the desired subset of them to generate the dictionary, entity and label word vectors.
+    """
    # process_biotope_dict(default_value=False)
    # process_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')
    # process_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv')
+
    # generated_normalized_dict_and_labels()
+
    # generate_context_entity_list('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')
    # generate_context_entity_list('entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv')
-    generate_five_fold_dataset(prediction=True)
+
+    generate_five_fold_dataset(prediction=False)
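Since several functions above rely on w2v_model.vocab and w2v_model.get_vector, here is a minimal stand-alone sketch of those lookups. It assumes a gensim version below 4.0, where KeyedVectors still exposes .vocab (newer releases use .key_to_index instead), and a hypothetical query word.

import gensim

w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../input_data/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
print(w2v_model.vector_size)                  # 200 for this model

word = 'bacterium'                            # hypothetical query word
if word in w2v_model.vocab:                   # use w2v_model.key_to_index in gensim >= 4.0
    print(w2v_model.get_vector(word).shape)   # (200,)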