-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake-blast-database.pbs.sh
161 lines (123 loc) · 3.66 KB
/
make-blast-database.pbs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/bin/bash
#PBS -W group_list=dp_00007 -A dp_00007
#PBS -N Make_BLAST_database
#PBS -o make-blast-database.out
#PBS -e make-blast-database.err
#PBS -m n
#PBS -l nodes=1:thinnode:ppn=1
#PBS -l mem=50gb
#PBS -l walltime=1:00:00
set -e
module purge
module load tools
module load perl/5.36.1
module load python/3.12.0
module load ncbi-blast/2.16.0+
module load seqkit/2.4.0
# Directory and file prefix of the MIDORI database (must be in QIIME format)
midori_dir="sample_data"
midori_dir=$(realpath "$midori_dir")
prefix="MIDORI2_UNIQ_NUC_GB263_srRNA_QIIME"
# Table containing a list of acceptable taxa IDs
acceptable_species_file="acceptable_species.tsv"
acceptable_species_file=$(realpath "$acceptable_species_file")
# Output directory for taxonomic information
taxonomy_dir="taxonomy"
taxonomy_dir=$(realpath "$taxonomy_dir")
# Folder containing "genbank_nodes_and_names_to_taxonomy.py"
blast_script_dir="scripts"
blast_script_dir=$(realpath "$blast_script_dir")
blast_taxonomy_script="$blast_script_dir/genbank_nodes_and_names_to_taxonomy.py"
# Output directory for the BLAST database
blast_out_dir="blast"
mkdir --parents "$blast_out_dir"
blast_out_dir=$(realpath "$blast_out_dir")
# Write a list of all haplotypes and their respective taxa IDs
cd "$midori_dir"
zcat "$prefix.taxon.gz" \
| awk \
--field-separator ';' \
'{print $1, $NF}' \
| awk \
--field-separator ' ' \
'{print $1, $NF}' \
| awk \
'{
split($1, a, ".")
first_field = a[1] "." a[2]
split($2, b, "_")
second_field = b[2]
print first_field, second_field
}' \
> "all_haplotypes_taxid.txt"
# Keep only the taxa ID belonging to acceptable species.
# In this case, skip the first row (header) and extract the first column (TaxId).
tail --lines +2 "$acceptable_species_file" \
| cut --fields 1 \
| sort --numeric-sort \
| uniq \
> "acceptable_taxid.txt"
# Match and filter the haplotype file with the list of acceptable taxa IDs
awk \
'NR==FNR {nums[$1]; next} $2 in nums' \
"acceptable_taxid.txt" \
"all_haplotypes_taxid.txt" \
| awk \
--field-separator ' ' \
'{print $1, "taxid="$NF";"}' \
> "acceptable_haplotypes_taxid.txt"
# Extract only the list of acceptable haplotypes
cut \
--fields 1 \
--delimiter ' ' \
"acceptable_haplotypes_taxid.txt" \
> "acceptable_haplotypes.txt"
# Extract acceptable FASTA reads
zcat "$prefix.fasta.gz" \
| awk \
'/^>/{split($0, a, "."); print a[1] "." a[2]; next}1' \
| seqkit grep \
--pattern-file "acceptable_haplotypes.txt" \
--out-file "$prefix.temp.fasta"
awk \
--field-separator ' ' \
'NR==FNR {info[$1]=$2; next} /^>/ {id=substr($1, 2); print $1" "info[id]; next} 1' \
"acceptable_haplotypes.txt" \
"$prefix.temp.fasta" \
> "$prefix.filtered.fasta"
rm "$prefix.temp.fasta"
echo 'FASTA file filtered successfully'
# Download and prepare NCBI taxonomy dump
mkdir --parents "$taxonomy_dir"
cd "$taxonomy_dir"
wget 'ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz'
tar \
--extract \
--overwrite \
--file \
'taxdump.tar.gz'
rm --force 'taxdump.tar.gz'
python3 \
"$blast_taxonomy_script" \
names.dmp \
nodes.dmp
echo "Taxonomy database ready"
# Make BLAST database
cd "$blast_out_dir"
# Map all haplotypes to their corresponding taxa ID, removing unwanted characters
sed 's/\(taxid=\|;\)//g' \
"$midori_dir/acceptable_haplotypes_taxid.txt" \
> "taxid_map.txt"
# Generate the BLAST database from the filtered FASTA file and taxonomic map
makeblastdb \
-in "$midori_dir/$prefix.filtered.fasta" \
-out "$blast_out_dir/$prefix" \
-dbtype nucl \
-parse_seqids \
-taxid_map "taxid_map.txt" \
-blastdb_version 5 \
-title "$prefix"
echo "BLAST databases generated successfully"
# Print some debug info (optional)
echo -e "\n\n\n---------- JOB INFO ----------\n\n"
qstat -f -1 "$PBS_JOBID"