Skip to content

Commit dba3db8

Browse files
author
sylvinite
committed
Rewrote classifier, fixed a bug with input data
1 parent 08ca723 commit dba3db8

10 files changed

+124
-42
lines changed

mutant/modules/sarscov2_report.py

+66-20
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import csv
88
import glob
99
import json
10+
import pandas
1011
import re
1112
import os
1213
import sys
@@ -48,7 +49,7 @@ def create_all_files(self):
4849
self.create_concat_consensus()
4950
self.create_deliveryfile()
5051
self.create_fohm_csv()
51-
self.load_vanilla_artic_results()
52+
self.load_lookup_dict()
5253
self.create_sarscov2_resultfile()
5354
self.create_sarscov2_variantfile()
5455
self.create_jsonfile()
@@ -134,7 +135,6 @@ def create_sarscov2_resultfile(self):
134135
if self.articdata == dict():
135136
print("No artic results loaded. Quitting sarscov2_resultfile")
136137
sys.exit(-1)
137-
results = self.articdata
138138
indir = self.indir
139139

140140
summaryfile = os.path.join(
@@ -153,14 +153,15 @@ def create_sarscov2_resultfile(self):
153153
"Lineage",
154154
"PangoLEARN_version",
155155
"VOC",
156-
"Variants",
156+
"Mutations",
157+
"Region Code",
157158
]
158159
)
159-
for sample, data in results.items():
160+
for sample, data in self.articdata.items():
160161
selection = "-"
161162
row = [
162163
sample,
163-
selection,
164+
data["selection_criteria"],
164165
ticket,
165166
data["pct_n_bases"],
166167
data["pct_10X_bases"],
@@ -169,6 +170,7 @@ def create_sarscov2_resultfile(self):
169170
data["pangoLEARN_version"],
170171
data["VOC"],
171172
data["VOC_aa"],
173+
data["region_code"],
172174
]
173175
summary.writerow(row)
174176

@@ -210,12 +212,45 @@ def create_jsonfile(self):
210212
) as outfile:
211213
json.dump(self.articdata, outfile)
212214

213-
def load_vanilla_artic_results(self):
214-
"""Parse artic output directory for analysis results. Returns dictionary data object"""
215+
216+
217+
def load_lookup_dict(self):
218+
""" Loads articdata with data from various sources. Atm, artic output and the case config input file """
219+
self.load_artic_results()
220+
self.load_case_config()
221+
222+
def load_case_config(self):
223+
""" Appends additional data to articdata dictionary """
224+
casekeys = self.caseinfo[0].keys()
225+
226+
packing = dict(zip(casekeys, "-"*len(casekeys)))
227+
228+
#Packs with keys. Time consuming but not really
229+
for k, v in self.articdata.items():
230+
self.articdata[k].update(packing)
231+
#Writes caseconfig data where relevant
232+
for entry in self.caseinfo:
233+
k = entry['Customer_ID_sample']
234+
if k in self.articdata.keys():
235+
self.articdata[k].update(entry)
236+
237+
238+
def load_artic_results(self):
239+
"""Parse artic output directory for analysis results. Returns dictionary data object """
215240
indir = self.indir
216241
voc_pos = range(475, 486)
217-
voc_pos_aa = get_json("{0}/standalone/voc_strains.json".format(WD))['voc_pos_aa']
218-
voc_strains = get_json("{0}/standalone/voc_strains.json".format(WD))['voc_strains']
242+
muts = pandas.read_csv("{0}/standalone/spike_mutations.csv".format(WD), sep=",")
243+
# Flatten the DataFrame's list of single-column rows into one flat list of mutation strings
244+
voc_pos_aa = sum(muts.values.tolist(), [])
245+
246+
classifications = pandas.read_csv("{0}/standalone/classifications.csv".format(WD), sep=",")
247+
voc_strains = { 'lineage':'','spike':'','class':''}
248+
voc_strains['lineage'] = classifications['lineage'].tolist()
249+
voc_strains['spike'] = classifications['spike'].tolist()
250+
voc_strains['class'] = classifications['class'].tolist()
251+
252+
#voc_pos_aa = get_json("{0}/standalone/voc_strains.json".format(WD))['voc_pos_aa']
253+
#voc_strains = get_json("{0}/standalone/voc_strains.json".format(WD))['voc_strains']
219254

220255
artic_data = dict()
221256
var_all = dict()
@@ -251,16 +286,16 @@ def load_vanilla_artic_results(self):
251286
for line in content:
252287
sample = line[0].split("_")[-1]
253288
if float(line[2]) > 95:
254-
passed = "TRUE"
289+
qc_flag = "TRUE"
255290
else:
256-
passed = "FALSE"
291+
qc_flag = "FALSE"
257292
artic_data[sample] = {
258293
"pct_n_bases": line[1],
259294
"pct_10X_bases": line[2],
260295
"longest_no_N_run": line[3],
261296
"num_aligned_reads": line[4],
262297
"artic_qc": line[7],
263-
"qc": passed,
298+
"qc": qc_flag,
264299
}
265300
# Parse Pangolin report data
266301
with open(paths[2]) as f:
@@ -269,21 +304,16 @@ def load_vanilla_artic_results(self):
269304
for line in content:
270305
sample = line[0].split(".")[0].split("_")[-1]
271306
lineage = line[1]
272-
if lineage in voc_strains:
273-
voc = "Yes"
274-
elif lineage == "None":
275-
voc = "-"
276-
else:
277-
voc = "No"
307+
278308
artic_data[sample].update(
279309
{
280310
"lineage": lineage,
281311
"pangolin_probability": line[2],
282312
"pangoLEARN_version": line[3],
283313
"pangolin_qc": line[4],
284-
"VOC": voc,
285314
}
286315
)
316+
287317
# Parse Variant report data
288318
if os.stat(paths[1]).st_size != 0:
289319
with open(paths[1]) as f:
@@ -317,7 +347,23 @@ def load_vanilla_artic_results(self):
317347
artic_data[sample].update({"variants": var_all[sample]})
318348
else:
319349
artic_data[sample].update({"variants": "-"})
320-
self.articdata = artic_data
350+
351+
352+
#Classification
353+
for key, vals in artic_data.items():
354+
#Packing
355+
artic_data[key].update( {"VOC": "No" } )
356+
357+
#Check for lineage
358+
if artic_data[key]["lineage"] in voc_strains['lineage']:
359+
index = voc_strains['lineage'].index(artic_data[key]['lineage'])
360+
#Check for spike
361+
if pandas.isna(voc_strains['spike'][index]) or voc_strains['spike'][index] in artic_data[key]['VOC_aa']:
362+
artic_data[key].update( {"VOC":voc_strains['class'][index]} )
363+
364+
365+
366+
self.articdata.update(artic_data)
321367

322368
def create_deliveryfile(self):
323369

mutant/standalone/classifications.csv

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
lineage,spike,class
2+
B.1.1.7,,VOC
3+
B.1.1.7,E484,VOC
4+
B.1.351,,VOC
5+
P.1,,VOC
6+
B.1.525,,VOI
7+
B.1.427,,VOI
8+
B.1.429,,VOI
9+
P.3,,VOI
10+
B.1.616,,VOI
11+
B.1.617.1,,VOI
12+
B.1.617.2,,VOI
13+
B.1.617.3,,VOI
14+
B.1.620,,VOI
15+
B.1.621,,VOI
16+
B.1.214.2,,VUM
17+
A.23.1,E484K,VUM
18+
A.27,,VUM
19+
A.28,,VUM
20+
C.16,,VUM
21+
C.37,,VUM
22+
B.1.351,P384L,VUM
23+
B.1.351,E516Q,VUM
24+
B.1.1.7,L452R,VUM
25+
C.36,L452R,VUM
26+
AT.1,,VUM
27+
B.1.526,,VUM
28+
B.1.526.1,,VUM
29+
B.1.526.2,,VUM
30+
B.1.1.318,,VUM
31+
P.2,,VUM
32+

mutant/standalone/spike_mutations.csv

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
Spike_mutations_of_interest
2+
N501Y
3+
D614G
4+
E484K
5+
K417N
6+
K417T
7+
Q677H
8+
L452R
9+
V482A
10+
H655Y
11+
G669S
12+
E484Q
13+
T478K
14+
S477N
15+
R346K
16+
Q414K
17+
N450K
18+
ins214TDR
19+
Q613H
20+
N501T
21+
L452Q
22+
F490S
23+
P384L
24+
E516Q

mutant/standalone/voc_strains.json

-21
This file was deleted.

requirements-pip.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
black
22
click==7.1.2
3+
pandas
34
pyyaml

tests/testdata/MIC3109_artic.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
"lab_code": "SE999 Langistan",
2727
"priority": "standard",
2828
"Customer_ID_project": 123456,
29-
"Customer_ID_sample": "12CS123456",
29+
"Customer_ID_sample": "34CS123456",
3030
"customer_id": "cust001",
3131
"application_tag": "VWGDPTR001",
3232
"date_arrival": "2000-03-16 00:00:00",

0 commit comments

Comments
 (0)