Commit 5f3b3a5

Add docstrings and cleanup

1 parent 54670c4 commit 5f3b3a5

5 files changed: +223 −23 lines

.gitignore (+4)

@@ -5,3 +5,7 @@ src/preprocessing/.ipynb_checkpoints/*
 .idea/*
 src/__pycache__/*
 *.npy
+Combine_result/*
+tmp_output/*
+ExampleCode.Python.zip
+exps

src/generate_vsm.py (+102 −8)
@@ -14,18 +14,33 @@
 
 
 def reduce_dimension(vectors):
+    """
+    Perform a PCA transformation on the given vectors, n_components=0.95
+
+    :param vectors: the matrix to be transformed
+    :return: the transformed matrix
+    """
     pca = PCA(n_components=0.95)
     return pca.fit_transform(vectors)
 
 
-def averaging_vectors(namelist, default_value=True, fill_na=False):
+def averaging_vectors(namelist, default_value=True):
+    """
+    Merge a list of word vectors into a single vector by averaging
+
+    :param namelist: the list of word vectors
+    :param default_value: whether to initialize OOV words with the 'unknown' vector
+    :return: the averaged vectors
+    """
     if default_value:
         unk = w2v_model.get_vector('unknown')
         vectors = np.array([unk for _ in range(len(namelist))])
     else:
         vectors = np.array([[np.nan] * vector_size for _ in range(len(namelist))])
+
     i = 0
     for e in namelist:
+        # Keep only letters and spaces
         name = ''.join(filter(whitelist.__contains__, e.replace('-', ' ')))
         vec = np.zeros(vector_size)
         count = 0
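
For readers unfamiliar with fractional n_components: when scikit-learn's PCA receives a float between 0 and 1, it keeps just enough components to explain that share of the variance. A minimal standalone sketch (the input shape is made up for illustration):

import numpy as np
from sklearn.decomposition import PCA

# Hypothetical input: 1000 averaged word vectors of dimension 200
vectors = np.random.rand(1000, 200)

# n_components=0.95 keeps enough components to explain 95% of the variance
pca = PCA(n_components=0.95)
reduced = pca.fit_transform(vectors)

print(reduced.shape)      # (1000, k) with k <= 200
print(pca.n_components_)  # number of components actually kept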
@@ -44,10 +59,18 @@ def averaging_vectors(namelist, default_value=True, fill_na=False):
 
 
 def fixed_length_vectors(namelist, embedding_size=entity_embedding_size):
+    """
+    Turn the input word vector groups into vectors of fixed length: embedding_size
+
+    :param namelist: the list of word vectors
+    :param embedding_size: the length of the embedding
+    :return: the embedded vectors
+    """
     vectors = np.zeros((len(namelist), embedding_size, vector_size))
-    i = 0
+    i = 0  # tracks the index of the current word vector group
     for e in namelist:
         name = ''.join(filter(whitelist.__contains__, e.replace('-', ' ')))
+        # If the input name is too long, average neighboring word vectors into one vector first
         if len(name.split()) > embedding_size:
             tmp = list()
             for w in name.split():
@@ -56,10 +79,12 @@ def fixed_length_vectors(namelist, embedding_size=entity_embedding_size):
             elif w.lower() in w2v_model.vocab:
                 tmp.append(w2v_model.get_vector(w.lower()))
             avg_factor = np.ceil(len(tmp) / embedding_size).astype(int)
+
+            # Concatenate the averaged vectors
             for k in range(0, len(tmp), avg_factor):
                 vectors[i][k//avg_factor] = np.mean(tmp[k:k+avg_factor])
         else:
-            j = 0
+            j = 0  # tracks the word position within the current group
             for w in name.split():
                 if w in w2v_model.vocab:
                     vectors[i][j] = w2v_model.get_vector(w)
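
The compression branch above can be exercised in isolation: every avg_factor neighboring word vectors are averaged so that a long name fits into embedding_size rows. A minimal sketch with made-up sizes; note that np.mean over a list of vectors needs axis=0 to produce a vector rather than a single scalar:

import numpy as np

vector_size = 200
embedding_size = 8  # illustrative fixed length
tmp = [np.random.rand(vector_size) for _ in range(19)]  # 19 words: too many

avg_factor = np.ceil(len(tmp) / embedding_size).astype(int)  # ceil(19/8) = 3

compressed = np.zeros((embedding_size, vector_size))
for k in range(0, len(tmp), avg_factor):
    # axis=0 averages the chunk element-wise
    compressed[k // avg_factor] = np.mean(tmp[k:k + avg_factor], axis=0)

print(compressed.shape)  # (8, 200)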
@@ -76,9 +101,20 @@ def weighted_vectors():
 
 
 def fixed_length_vectors_by_text(names):
+    """
+    Turn the input word vector groups, together with the context entities' word vector groups,
+    into a vector of fixed length. Each entity (word group) takes entity_embedding_size rows,
+    and the total embedding result takes context_embedding_size rows.
+
+    :param names: the DataFrame with entity information
+    :return: DataFrame of embedded vectors with context entity information.
+             Columns: ['text_id', 'vec']
+    """
     # print(names.groupby('text_id').count())
+
     names_by_text = names.groupby('text_id').aggregate(lambda x: set(x))
     names_by_text['vec'] = None
+
     # for idx in names_by_text.index:
     #     tmp = {}
     #     for n in names_by_text.name[idx]:
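
For context, the groupby/aggregate line collapses each article's rows into one row whose cells are sets of the values seen in that article. A small self-contained pandas sketch with made-up entity names:

import pandas as pd

names = pd.DataFrame({
    'text_id': ['doc1', 'doc1', 'doc2'],
    'name': ['gut microbiota', 'mouse', 'soil bacterium'],
})

# One row per text_id; each cell becomes the set of values from that article
names_by_text = names.groupby('text_id').aggregate(lambda x: set(x))
# doc1 -> {'gut microbiota', 'mouse'}; doc2 -> {'soil bacterium'}
print(names_by_text)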
@@ -96,6 +132,9 @@ def fixed_length_vectors_by_text(names):
 
 
 def process_biotope_dict():
+    """
+    Convert biotope dictionary terms to vectors by averaging the individual words of each term
+    """
     ref = parse_biotope_dict()
 
     vectors = averaging_vectors(ref.name)
@@ -106,23 +145,43 @@
 
 
 def process_entity_and_label_table(tablename):
+    """
+    Generate embedded entity and label vectors with the native methods:
+    entity vectors: fixed_length_vectors
+    label vectors: averaging_vectors
+    Save the numpy arrays to local files
+
+    :param tablename: entity_and_label table generated by generate_tables.py
+    """
+    # Get "train", "test" or "dev"
     prefix = tablename.split('_', -1)[-1]
+
     names_and_labels = parse_entity_and_label_table(tablename)
     names_vec = fixed_length_vectors(names_and_labels.name)
     labels_vec = averaging_vectors(names_and_labels.dict_name)
+
+    # Save to local files
     names_and_labels.to_csv(os.path.join(path, '%s_names_and_labels.tsv' %prefix), sep='\t')
     np.save(os.path.join(path, '%s_names_vectors.npy' %prefix), names_vec)
     np.save(os.path.join(path, '%s_labels_vectors.npy' %prefix), labels_vec)
 
 
 def generated_normalized_dict_and_labels():
+    """
+    Run PCA on the reference word space, n_components=0.95 (default), reducing it to 139 dimensions.
+    Generate embedded label vectors by looking up the PCA-reduced reference word vectors.
+    Generate embedded entity and label vectors with the fixed-length methods.
+    Save the numpy arrays to local files
+    """
+    # Reduce the dimensions of the reference vectors
     ref = parse_biotope_dict()
     vectors = averaging_vectors(ref.name)
     vectors = reduce_dimension(vectors)
     np.save(os.path.join(path, 'OBT_VSM_norm.npy'), vectors)
     ref['vec'] = list(vectors)
     ref.to_csv(os.path.join(path, 'OBT_VSM_norm.tsv'), sep='\t')
 
+    # Parse the entity_and_label tables
     for tablename in ['entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv',
                       'entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv']:
         labels_id_and_labels = parse_entity_and_label_table(tablename)
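
One caveat worth verifying: str.split('_', -1) is equivalent to splitting on every underscore (maxsplit=-1 is the default), so the last element still carries the file extension, and the saved files end up named e.g. train.tsv_names_vectors.npy. A quick check:

tablename = 'entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv'

prefix = tablename.split('_', -1)[-1]
print(prefix)  # 'train.tsv' -- the extension is still attached

# If a bare 'train'/'dev'/'test' token were wanted, the suffix would also need stripping:
print(prefix.split('.')[0])  # 'train'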
@@ -132,28 +191,53 @@ def generated_normalized_dict_and_labels():
 
 
 def generate_context_entity_list(tablename):
+    """
+    Include entities that appear in the same article as input information.
+    Target entity: size entity_embedding_size*200
+    Other context entities: each of size entity_embedding_size*200
+    Total input: padded to size context_embedding_size*200
+
+    :param tablename: entity_and_label table generated by generate_tables.py
+    """
+    # Target entity vectors
     names_and_labels = parse_entity_and_label_table(tablename)
     names_vec = fixed_length_vectors(names_and_labels.name)
 
+    # Context entity vectors
     names_by_text = fixed_length_vectors_by_text(names_and_labels[['text_id', 'name']])
     concat_vec = np.stack(names_by_text.loc[names_and_labels.text_id, 'vec'], axis=0)
     names_vec = np.concatenate((names_vec, concat_vec), axis=1)
-    # print(names_vec.shape)
-    names_and_labels.to_csv(os.path.join(path, '%s_names_and_labels_with_context.tsv' %tablename.split('_', -1)[-1]), sep='\t')
-    np.save(os.path.join(path, '%s_names_vectors_with_context.npy' %tablename.split('_', -1)[-1]), names_vec)
+
+    # Save to local files
+    names_and_labels.to_csv(os.path.join(path, '%s_names_and_labels_with_context.tsv'
+                                         % tablename.split('_', -1)[-1]), sep='\t')
+    np.save(os.path.join(path, '%s_names_vectors_with_context.npy' % tablename.split('_', -1)[-1]), names_vec)
 
 
 def generate_five_fold_dataset(prediction=False):
+    """
+    Randomly select a fraction of the data as the test dataset (frac=0.17 of the training data;
+    the real test dataset is used for prediction).
+    Generate embedded label vectors by looking up the PCA-reduced reference word vectors.
+    Generate embedded entity and label vectors with the fixed-length methods.
+
+    :param prediction: if True, use the pre-assigned test data set;
+                       if False, randomly select the test data set, frac=0.17
+    """
+    # Generate the PCA-reduced reference vectors
     ref = parse_biotope_dict()
     vectors = averaging_vectors(ref.name)
     vectors = reduce_dimension(vectors)
     np.save(os.path.join(path, 'OBT_VSM_norm.npy'), vectors)
     ref['vec'] = list(vectors)
     ref.to_csv(os.path.join(path, 'OBT_VSM_norm.tsv'), sep='\t')
 
+    # Get the total entity_and_label list
     names_and_labels = parse_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')
     names_and_labels = pd.concat([names_and_labels,
                                   parse_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv')])
+
+    # If prediction: there are no labels for the test data set
     if prediction:
         training_size = len(names_and_labels)
         names_and_labels = pd.concat([names_and_labels,
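
The target/context concatenation can be sanity-checked with made-up shapes: np.stack adds a leading axis over the looked-up context blocks, and np.concatenate along axis=1 appends the context rows to each entity's fixed-length block. A sketch under assumed sizes:

import numpy as np

n_entities, entity_rows, context_rows, vector_size = 5, 10, 40, 200

# Hypothetical target entity embeddings: one fixed-length block per entity
names_vec = np.random.rand(n_entities, entity_rows, vector_size)

# Hypothetical context blocks, one looked up per entity's text_id
context_blocks = [np.random.rand(context_rows, vector_size) for _ in range(n_entities)]
concat_vec = np.stack(context_blocks, axis=0)  # (5, 40, 200)

combined = np.concatenate((names_vec, concat_vec), axis=1)
print(combined.shape)  # (5, 50, 200): entity rows followed by context rows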
@@ -183,14 +267,24 @@
 if __name__=="__main__":
     w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
         '../input_data/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
-    vector_size = w2v_model.vector_size
+    vector_size = w2v_model.vector_size  # 200
+
+
     whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
+
     path = os.path.join(os.getcwd(), '../input_data/vsm/')
 
+    """
+    Note: the methods above generate word vectors in different ways.
+    Select only the desired subset of methods to generate the dictionary, entity and label word vectors.
+    """
     # process_biotope_dict(default_value=False)
     # process_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')
     # process_entity_and_label_table('entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv')
+
     # generated_normalized_dict_and_labels()
+
     # generate_context_entity_list('entity_and_label_list_BioNLP-OST-2019_BB-norm_train.tsv')
     # generate_context_entity_list('entity_and_label_list_BioNLP-OST-2019_BB-norm_dev.tsv')
-    generate_five_fold_dataset(prediction=True)
+
+    generate_five_fold_dataset(prediction=False)
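
For anyone reproducing the __main__ setup: the vocab attribute and get_vector call follow the pre-4.0 gensim KeyedVectors API (gensim >= 4.0 replaced vocab with key_to_index). A minimal loading sketch, assuming the binary model file has been downloaded to the path used in the script:

import gensim

w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    '../input_data/wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

print(w2v_model.vector_size)        # 200 for this model
if 'bacterium' in w2v_model.vocab:  # gensim < 4.0; use key_to_index in >= 4.0
    print(w2v_model.get_vector('bacterium').shape)  # (200,)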
