7
7
import csv
8
8
import glob
9
9
import json
10
+ import pandas
10
11
import re
11
12
import os
12
13
import sys
@@ -48,7 +49,7 @@ def create_all_files(self):
48
49
self .create_concat_consensus ()
49
50
self .create_deliveryfile ()
50
51
self .create_fohm_csv ()
51
- self .load_vanilla_artic_results ()
52
+ self .load_lookup_dict ()
52
53
self .create_sarscov2_resultfile ()
53
54
self .create_sarscov2_variantfile ()
54
55
self .create_jsonfile ()
@@ -134,7 +135,6 @@ def create_sarscov2_resultfile(self):
134
135
if self .articdata == dict ():
135
136
print ("No artic results loaded. Quitting sarscov2_resultfile" )
136
137
sys .exit (- 1 )
137
- results = self .articdata
138
138
indir = self .indir
139
139
140
140
summaryfile = os .path .join (
@@ -153,14 +153,15 @@ def create_sarscov2_resultfile(self):
153
153
"Lineage" ,
154
154
"PangoLEARN_version" ,
155
155
"VOC" ,
156
- "Variants" ,
156
+ "Mutations" ,
157
+ "Region Code" ,
157
158
]
158
159
)
159
- for sample , data in results .items ():
160
+ for sample , data in self . articdata .items ():
160
161
selection = "-"
161
162
row = [
162
163
sample ,
163
- selection ,
164
+ data [ "selection_criteria" ] ,
164
165
ticket ,
165
166
data ["pct_n_bases" ],
166
167
data ["pct_10X_bases" ],
@@ -169,6 +170,7 @@ def create_sarscov2_resultfile(self):
169
170
data ["pangoLEARN_version" ],
170
171
data ["VOC" ],
171
172
data ["VOC_aa" ],
173
+ data ["region_code" ],
172
174
]
173
175
summary .writerow (row )
174
176
@@ -210,12 +212,45 @@ def create_jsonfile(self):
210
212
) as outfile :
211
213
json .dump (self .articdata , outfile )
212
214
213
- def load_vanilla_artic_results (self ):
214
- """Parse artic output directory for analysis results. Returns dictionary data object"""
215
+
216
+
217
def load_lookup_dict(self):
    """Fill articdata from every supported data source.

    At the moment the sources are the artic output and the case config
    input file. The artic loader runs first, then the case config loader.
    """
    # Resolve each loader by name just before calling it, in order.
    for step in ("load_artic_results", "load_case_config"):
        getattr(self, step)()
221
+
222
def load_case_config(self):
    """Merge case config fields into every sample record in articdata.

    Every sample record first receives a "-" placeholder for each key found
    in the first case config entry, so all records expose the same keys.
    Each case config entry whose 'Customer_ID_sample' value matches a sample
    in articdata then overwrites that sample's fields with its own values.

    Does nothing when caseinfo is empty, because there are no keys to add.

    Raises:
        KeyError: if a case config entry has no 'Customer_ID_sample' key.
    """
    if not self.caseinfo:
        # Without this guard, indexing the first entry raises IndexError.
        return

    # One "-" per key name, taken from the first case config entry.
    placeholders = dict.fromkeys(self.caseinfo[0], "-")
    for record in self.articdata.values():
        record.update(placeholders)

    # Write case config data for samples that are present in articdata.
    for entry in self.caseinfo:
        record = self.articdata.get(entry['Customer_ID_sample'])
        if record is not None:
            record.update(entry)
236
+
237
+
238
+ def load_artic_results (self ):
239
+ """Parse artic output directory for analysis results. Returns dictionary data object """
215
240
indir = self .indir
216
241
voc_pos = range (475 , 486 )
217
- voc_pos_aa = get_json ("{0}/standalone/voc_strains.json" .format (WD ))['voc_pos_aa' ]
218
- voc_strains = get_json ("{0}/standalone/voc_strains.json" .format (WD ))['voc_strains' ]
242
+ muts = pandas .read_csv ("{0}/standalone/spike_mutations.csv" .format (WD ), sep = "," )
243
+ # Magical unpacking into single list
244
+ voc_pos_aa = sum (muts .values .tolist (), [])
245
+
246
+ classifications = pandas .read_csv ("{0}/standalone/classifications.csv" .format (WD ), sep = "," )
247
+ voc_strains = { 'lineage' :'' ,'spike' :'' ,'class' :'' }
248
+ voc_strains ['lineage' ] = classifications ['lineage' ].tolist ()
249
+ voc_strains ['spike' ] = classifications ['spike' ].tolist ()
250
+ voc_strains ['class' ] = classifications ['class' ].tolist ()
251
+
252
+ #voc_pos_aa = get_json("{0}/standalone/voc_strains.json".format(WD))['voc_pos_aa']
253
+ #voc_strains = get_json("{0}/standalone/voc_strains.json".format(WD))['voc_strains']
219
254
220
255
artic_data = dict ()
221
256
var_all = dict ()
@@ -251,16 +286,16 @@ def load_vanilla_artic_results(self):
251
286
for line in content :
252
287
sample = line [0 ].split ("_" )[- 1 ]
253
288
if float (line [2 ]) > 95 :
254
- passed = "TRUE"
289
+ qc_flag = "TRUE"
255
290
else :
256
- passed = "FALSE"
291
+ qc_flag = "FALSE"
257
292
artic_data [sample ] = {
258
293
"pct_n_bases" : line [1 ],
259
294
"pct_10X_bases" : line [2 ],
260
295
"longest_no_N_run" : line [3 ],
261
296
"num_aligned_reads" : line [4 ],
262
297
"artic_qc" : line [7 ],
263
- "qc" : passed ,
298
+ "qc" : qc_flag ,
264
299
}
265
300
# Parse Pangolin report data
266
301
with open (paths [2 ]) as f :
@@ -269,21 +304,16 @@ def load_vanilla_artic_results(self):
269
304
for line in content :
270
305
sample = line [0 ].split ("." )[0 ].split ("_" )[- 1 ]
271
306
lineage = line [1 ]
272
- if lineage in voc_strains :
273
- voc = "Yes"
274
- elif lineage == "None" :
275
- voc = "-"
276
- else :
277
- voc = "No"
307
+
278
308
artic_data [sample ].update (
279
309
{
280
310
"lineage" : lineage ,
281
311
"pangolin_probability" : line [2 ],
282
312
"pangoLEARN_version" : line [3 ],
283
313
"pangolin_qc" : line [4 ],
284
- "VOC" : voc ,
285
314
}
286
315
)
316
+
287
317
# Parse Variant report data
288
318
if os .stat (paths [1 ]).st_size != 0 :
289
319
with open (paths [1 ]) as f :
@@ -317,7 +347,23 @@ def load_vanilla_artic_results(self):
317
347
artic_data [sample ].update ({"variants" : var_all [sample ]})
318
348
else :
319
349
artic_data [sample ].update ({"variants" : "-" })
320
- self .articdata = artic_data
350
+
351
+
352
+ #Classification
353
+ for key , vals in artic_data .items ():
354
+ #Packing
355
+ artic_data [key ].update ( {"VOC" : "No" } )
356
+
357
+ #Check for lineage
358
+ if artic_data [key ]["lineage" ] in voc_strains ['lineage' ]:
359
+ index = voc_strains ['lineage' ].index (artic_data [key ]['lineage' ])
360
+ #Check for spike
361
+ if pandas .isna (voc_strains ['spike' ][index ]) or voc_strains ['spike' ][index ] in artic_data [key ]['VOC_aa' ]:
362
+ artic_data [key ].update ( {"VOC" :voc_strains ['class' ][index ]} )
363
+
364
+
365
+
366
+ self .articdata .update (artic_data )
321
367
322
368
def create_deliveryfile (self ):
323
369
0 commit comments