Skip to content

Commit 79443ec

Browse files
authored
Merge pull request #1123 from nf-core/improve_tximport
Overhaul tximport.r, output length tables
2 parents 76464e4 + 56edb1c commit 79443ec

File tree

4 files changed

+140
-75
lines changed

4 files changed

+140
-75
lines changed

bin/tximport.r

+118-69
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,143 @@
11
#!/usr/bin/env Rscript
22

3-
# Written by Lorena Pantano and released under the MIT license.
3+
# Script for importing and processing transcript-level quantifications.
4+
# Written by Lorena Pantano, later modified by Jonathan Manning, and released
5+
# under the MIT license.
46

7+
# Loading required libraries
58
library(SummarizedExperiment)
69
library(tximport)
710

8-
args = commandArgs(trailingOnly=TRUE)
11+
# Parsing command line arguments
12+
args <- commandArgs(trailingOnly=TRUE)
913
if (length(args) < 4) {
10-
stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE)
14+
stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>",
15+
call.=FALSE)
1116
}
1217

13-
coldata = args[1]
14-
path = args[2]
15-
sample_name = args[3]
16-
quant_type = args[4]
17-
tx2gene_path = args[5]
18-
19-
prefix = sample_name
20-
21-
info = file.info(tx2gene_path)
22-
if (info$size == 0) {
23-
tx2gene = NULL
24-
} else {
25-
rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE)
26-
colnames(rowdata) = c("tx", "gene_id", "gene_name")
27-
tx2gene = rowdata[,1:2]
18+
# Assigning command line arguments to variables
19+
coldata_path <- args[1]
20+
path <- args[2]
21+
prefix <- args[3]
22+
quant_type <- args[4]
23+
tx2gene_path <- args[5]
24+
25+
## Functions
26+
27+
# Build a table from a SummarizedExperiment object
28+
build_table <- function(se.obj, slot) {
29+
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
2830
}
2931

30-
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
31-
fns = list.files(path, pattern = pattern, recursive = T, full.names = T)
32-
names = basename(dirname(fns))
33-
names(fns) = names
34-
35-
if (file.exists(coldata)) {
36-
coldata = read.csv(coldata, sep="\t")
37-
coldata = coldata[match(names, coldata[,1]),]
38-
coldata = cbind(files = fns, coldata)
39-
} else {
40-
message("ColData not available: ", coldata)
41-
coldata = data.frame(files = fns, names = names)
32+
# Write a table to a file with given parameters
33+
write_se_table <- function(params) {
34+
file_name <- paste0(prefix, ".", params$suffix)
35+
write.table(build_table(params$obj, params$slot), file_name,
36+
sep="\t", quote=FALSE, row.names = FALSE)
4237
}
4338

44-
dropInfReps = quant_type == "kallisto"
39+
# Read transcript metadata from a given path
40+
read_transcript_info <- function(tinfo_path){
41+
info <- file.info(tinfo_path)
42+
if (info$size == 0) {
43+
stop("tx2gene file is empty")
44+
}
45+
46+
transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE,
47+
col.names = c("tx", "gene_id", "gene_name"))
4548

46-
txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
47-
rownames(coldata) = coldata[["names"]]
48-
extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]]))
49-
if (length(extra) > 0) {
50-
rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra))
49+
extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]]))
50+
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra))
51+
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ]
52+
rownames(transcript_info) <- transcript_info[["tx"]]
53+
54+
list(transcript = transcript_info,
55+
gene = unique(transcript_info[,2:3]),
56+
tx2gene = transcript_info[,1:2])
5157
}
52-
rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),]
53-
rownames(rowdata) = rowdata[["tx"]]
54-
se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]),
55-
colData = DataFrame(coldata),
56-
rowData = rowdata)
57-
if (!is.null(tx2gene)) {
58-
gi = summarizeToGene(txi, tx2gene = tx2gene)
59-
gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
60-
gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
61-
growdata = unique(rowdata[,2:3])
62-
growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
63-
rownames(growdata) = growdata[["tx"]]
64-
gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]),
65-
colData = DataFrame(coldata),
66-
rowData = growdata)
67-
gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]),
68-
colData = DataFrame(coldata),
69-
rowData = growdata)
70-
gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]),
71-
colData = DataFrame(coldata),
72-
rowData = growdata)
58+
59+
# Read and process sample/column data from a given path
60+
read_coldata <- function(coldata_path){
61+
if (file.exists(coldata_path)) {
62+
coldata <- read.csv(coldata_path, sep="\t")
63+
coldata <- coldata[match(names, coldata[,1]),]
64+
coldata <- cbind(files = fns, coldata)
65+
} else {
66+
message("ColData not available: ", coldata_path)
67+
coldata <- data.frame(files = fns, names = names)
68+
}
69+
rownames(coldata) <- coldata[["names"]]
7370
}
7471

75-
build_table = function(se.obj, slot) {
76-
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]])
72+
# Create a SummarizedExperiment object with given data
73+
create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) {
74+
SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length),
75+
colData = col_data,
76+
rowData = row_data)
7777
}
7878

79-
if(exists("gse")){
80-
write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
81-
write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
82-
write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
83-
write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
84-
write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
85-
write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
79+
# Main script starts here
80+
81+
# Define pattern for file names based on quantification type
82+
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf")
83+
fns <- list.files(path, pattern = pattern, recursive = T, full.names = T)
84+
names <- basename(dirname(fns))
85+
names(fns) <- names
86+
dropInfReps <- quant_type == "kallisto"
87+
88+
# Import transcript-level quantifications
89+
txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps)
90+
91+
# Read transcript and sample data
92+
transcript_info <- read_transcript_info(tx2gene_path)
93+
coldata <- read_coldata(coldata_path)
94+
95+
# Create initial SummarizedExperiment object
96+
se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]],
97+
DataFrame(coldata), transcript_info$transcript)
98+
99+
# Setting parameters for writing tables
100+
params <- list(
101+
list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"),
102+
list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"),
103+
list(obj = se, slot = "length", suffix = "transcript_lengths.tsv")
104+
)
105+
106+
# Process gene-level data if tx2gene mapping is available
107+
if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) {
108+
tx2gene <- transcript_info$tx2gene
109+
gi <- summarizeToGene(txi, tx2gene = tx2gene)
110+
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM")
111+
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM")
112+
113+
gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),]
114+
rownames(gene_info) <- gene_info[["tx"]]
115+
116+
col_data_frame <- DataFrame(coldata)
117+
118+
# Create gene-level SummarizedExperiment objects
119+
gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]],
120+
col_data_frame, gene_info)
121+
gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]],
122+
col_data_frame, gene_info)
123+
gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]],
124+
col_data_frame, gene_info)
125+
126+
params <- c(params, list(
127+
list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"),
128+
list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"),
129+
list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"),
130+
list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"),
131+
list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"),
132+
list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"),
133+
list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv")
134+
))
86135
}
87136

88-
write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
89-
write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE)
137+
# Writing tables for each set of parameters
138+
done <- lapply(params, write_se_table)
90139

91-
# Print sessioninfo to standard out
140+
# Output session information and citations
92141
citation("tximeta")
93142
sessionInfo()
94143

modules/local/tximport/main.nf

+2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@ process TXIMPORT {
1616
path "*gene_counts.tsv" , emit: counts_gene
1717
path "*gene_counts_length_scaled.tsv", emit: counts_gene_length_scaled
1818
path "*gene_counts_scaled.tsv" , emit: counts_gene_scaled
19+
path "*gene_lengths.tsv" , emit: lengths_gene
1920
path "*transcript_tpm.tsv" , emit: tpm_transcript
2021
path "*transcript_counts.tsv" , emit: counts_transcript
22+
path "*transcript_lengths.tsv" , emit: lengths_transcript
2123
path "versions.yml" , emit: versions
2224

2325
when:

subworkflows/local/quantify_pseudo/main.nf

+8-6
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,14 @@ workflow QUANTIFY_PSEUDO_ALIGNMENT {
7979
results = ch_pseudo_results // channel: [ val(meta), results_dir ]
8080
multiqc = ch_pseudo_multiqc // channel: [ val(meta), files_for_multiqc ]
8181

82-
tpm_gene = TXIMPORT.out.tpm_gene // channel: [ val(meta), counts ]
83-
counts_gene = TXIMPORT.out.counts_gene // channel: [ val(meta), counts ]
84-
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // channel: [ val(meta), counts ]
85-
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // channel: [ val(meta), counts ]
86-
tpm_transcript = TXIMPORT.out.tpm_transcript // channel: [ val(meta), counts ]
87-
counts_transcript = TXIMPORT.out.counts_transcript // channel: [ val(meta), counts ]
82+
tpm_gene = TXIMPORT.out.tpm_gene // path *gene_tpm.tsv
83+
counts_gene = TXIMPORT.out.counts_gene // path *gene_counts.tsv
84+
lengths_gene = TXIMPORT.out.lengths_gene // path *gene_lengths.tsv
85+
counts_gene_length_scaled = TXIMPORT.out.counts_gene_length_scaled // path *gene_counts_length_scaled.tsv
86+
counts_gene_scaled = TXIMPORT.out.counts_gene_scaled // path *gene_counts_scaled.tsv
87+
tpm_transcript = TXIMPORT.out.tpm_transcript // path *gene_tpm.tsv
88+
counts_transcript = TXIMPORT.out.counts_transcript // path *transcript_counts.tsv
89+
lengths_transcript = TXIMPORT.out.lengths_transcript // path *transcript_lengths.tsv
8890

8991
merged_gene_rds = SE_GENE.out.rds // path: *.rds
9092
merged_gene_rds_length_scaled = SE_GENE_LENGTH_SCALED.out.rds // path: *.rds

tower.yml

+12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,10 @@ reports:
55
display: "All samples STAR Salmon DESeq2 QC PDF plots"
66
"**/star_salmon/salmon.merged.gene_counts.tsv":
77
display: "All samples STAR Salmon merged gene raw counts"
8+
"**/star_salmon/salmon.merged.gene_lengths.tsv":
9+
display: "All samples STAR Salmon mean transcript length per gene"
10+
"**/star_salmon/salmon.merged.transcript_lengths.tsv":
11+
display: "All samples STAR Salmon transcript lengths"
812
"**/star_salmon/salmon.merged.gene_tpm.tsv":
913
display: "All samples STAR Salmon merged gene TPM counts"
1014
"**/star_salmon/salmon.merged.transcript_counts.tsv":
@@ -15,6 +19,10 @@ reports:
1519
display: "All samples Salmon DESeq2 QC PDF plots"
1620
"**/salmon/salmon.merged.gene_counts.tsv":
1721
display: "All samples Salmon merged gene raw counts"
22+
"**/salmon/salmon.merged.gene_lengths.tsv":
23+
display: "All samples Salmon mean transcript length per gene"
24+
"**/salmon/salmon.merged.transcript_lengths.tsv":
25+
display: "All samples Salmon transcript lengths"
1826
"**/salmon/salmon.merged.gene_tpm.tsv":
1927
display: "All samples Salmon merged gene TPM counts"
2028
"**/salmon/salmon.merged.transcript_counts.tsv":
@@ -25,6 +33,10 @@ reports:
2533
display: "All samples Kallisto DESeq2 QC PDF plots"
2634
"**/kallisto/kallisto.merged.gene_counts.tsv":
2735
display: "All samples Kallisto merged gene raw counts"
36+
"**/kallisto/kallisto.merged.gene_lengths.tsv":
37+
display: "All samples Kallisto mean transcript length per gene"
38+
"**/kallisto/kallisto.merged.transcript_lengths.tsv":
39+
display: "All samples Kallisto transcript lengths"
2840
"**/kallisto/kallisto.merged.gene_tpm.tsv":
2941
display: "All samples Kallisto merged gene TPM counts"
3042
"**/kallisto/kallisto.merged.transcript_counts.tsv":

0 commit comments

Comments
 (0)