|
1 | 1 | #!/usr/bin/env Rscript
|
2 | 2 |
|
3 |
| -# Written by Lorena Pantano and released under the MIT license. |
| 3 | +# Script for importing and processing transcript-level quantifications. |
| 4 | +# Written by Lorena Pantano, later modified by Jonathan Manning, and released |
| 5 | +# under the MIT license. |
4 | 6 |
|
| 7 | +# Loading required libraries |
5 | 8 | library(SummarizedExperiment)
|
6 | 9 | library(tximport)
|
7 | 10 |
|
8 |
| -args = commandArgs(trailingOnly=TRUE) |
| 11 | +# Parsing command line arguments |
| 12 | +args <- commandArgs(trailingOnly=TRUE) |
9 | 13 | if (length(args) < 4) {
|
10 |
| - stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE) |
| 14 | + stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>", |
| 15 | + call.=FALSE) |
11 | 16 | }
|
12 | 17 |
|
13 |
| -coldata = args[1] |
14 |
| -path = args[2] |
15 |
| -sample_name = args[3] |
16 |
| -quant_type = args[4] |
17 |
| -tx2gene_path = args[5] |
18 |
| - |
19 |
| -prefix = sample_name |
20 |
| - |
21 |
| -info = file.info(tx2gene_path) |
22 |
| -if (info$size == 0) { |
23 |
| - tx2gene = NULL |
24 |
| -} else { |
25 |
| - rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE) |
26 |
| - colnames(rowdata) = c("tx", "gene_id", "gene_name") |
27 |
| - tx2gene = rowdata[,1:2] |
| 18 | +# Assigning command line arguments to variables |
| 19 | +coldata_path <- args[1] |
| 20 | +path <- args[2] |
| 21 | +prefix <- args[3] |
| 22 | +quant_type <- args[4] |
| 23 | +tx2gene_path <- args[5] |
| 24 | + |
| 25 | +## Functions |
| 26 | + |
| 27 | +# Build a table from a SummarizedExperiment object |
| 28 | +build_table <- function(se.obj, slot) { |
| 29 | + cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) |
28 | 30 | }
|
29 | 31 |
|
30 |
| -pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") |
31 |
| -fns = list.files(path, pattern = pattern, recursive = T, full.names = T) |
32 |
| -names = basename(dirname(fns)) |
33 |
| -names(fns) = names |
34 |
| - |
35 |
| -if (file.exists(coldata)) { |
36 |
| - coldata = read.csv(coldata, sep="\t") |
37 |
| - coldata = coldata[match(names, coldata[,1]),] |
38 |
| - coldata = cbind(files = fns, coldata) |
39 |
| -} else { |
40 |
| - message("ColData not available: ", coldata) |
41 |
| - coldata = data.frame(files = fns, names = names) |
| 32 | +# Write a table to a file with given parameters |
| 33 | +write_se_table <- function(params) { |
| 34 | + file_name <- paste0(prefix, ".", params$suffix) |
| 35 | + write.table(build_table(params$obj, params$slot), file_name, |
| 36 | + sep="\t", quote=FALSE, row.names = FALSE) |
42 | 37 | }
|
43 | 38 |
|
44 |
| -dropInfReps = quant_type == "kallisto" |
| 39 | +# Read transcript metadata from a given path |
| 40 | +read_transcript_info <- function(tinfo_path){ |
| 41 | + info <- file.info(tinfo_path) |
| 42 | + if (info$size == 0) { |
| 43 | + stop("tx2gene file is empty") |
| 44 | + } |
| 45 | + |
| 46 | + transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, |
| 47 | + col.names = c("tx", "gene_id", "gene_name")) |
45 | 48 |
|
46 |
| -txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) |
47 |
| -rownames(coldata) = coldata[["names"]] |
48 |
| -extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]])) |
49 |
| -if (length(extra) > 0) { |
50 |
| - rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra)) |
| 49 | + extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) |
| 50 | + transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) |
| 51 | + transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] |
| 52 | + rownames(transcript_info) <- transcript_info[["tx"]] |
| 53 | + |
| 54 | + list(transcript = transcript_info, |
| 55 | + gene = unique(transcript_info[,2:3]), |
| 56 | + tx2gene = transcript_info[,1:2]) |
51 | 57 | }
|
52 |
| -rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),] |
53 |
| -rownames(rowdata) = rowdata[["tx"]] |
54 |
| -se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]), |
55 |
| - colData = DataFrame(coldata), |
56 |
| - rowData = rowdata) |
57 |
| -if (!is.null(tx2gene)) { |
58 |
| - gi = summarizeToGene(txi, tx2gene = tx2gene) |
59 |
| - gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") |
60 |
| - gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") |
61 |
| - growdata = unique(rowdata[,2:3]) |
62 |
| - growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),] |
63 |
| - rownames(growdata) = growdata[["tx"]] |
64 |
| - gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]), |
65 |
| - colData = DataFrame(coldata), |
66 |
| - rowData = growdata) |
67 |
| - gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]), |
68 |
| - colData = DataFrame(coldata), |
69 |
| - rowData = growdata) |
70 |
| - gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]), |
71 |
| - colData = DataFrame(coldata), |
72 |
| - rowData = growdata) |
| 58 | + |
| 59 | +# Read and process sample/column data from a given path |
| 60 | +read_coldata <- function(coldata_path){ |
| 61 | + if (file.exists(coldata_path)) { |
| 62 | + coldata <- read.csv(coldata_path, sep="\t") |
| 63 | + coldata <- coldata[match(names, coldata[,1]),] |
| 64 | + coldata <- cbind(files = fns, coldata) |
| 65 | + } else { |
| 66 | + message("ColData not available: ", coldata_path) |
| 67 | + coldata <- data.frame(files = fns, names = names) |
| 68 | + } |
| 69 | + rownames(coldata) <- coldata[["names"]] |
73 | 70 | }
|
74 | 71 |
|
75 |
| -build_table = function(se.obj, slot) { |
76 |
| - cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) |
| 72 | +# Create a SummarizedExperiment object with given data |
| 73 | +create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { |
| 74 | + SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), |
| 75 | + colData = col_data, |
| 76 | + rowData = row_data) |
77 | 77 | }
|
78 | 78 |
|
79 |
| -if(exists("gse")){ |
80 |
| - write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
81 |
| - write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
82 |
| - write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
83 |
| - write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
84 |
| - write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
85 |
| - write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
| 79 | +# Main script starts here |
| 80 | + |
| 81 | +# Define pattern for file names based on quantification type |
| 82 | +pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") |
| 83 | +fns <- list.files(path, pattern = pattern, recursive = T, full.names = T) |
| 84 | +names <- basename(dirname(fns)) |
| 85 | +names(fns) <- names |
| 86 | +dropInfReps <- quant_type == "kallisto" |
| 87 | + |
| 88 | +# Import transcript-level quantifications |
| 89 | +txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) |
| 90 | + |
| 91 | +# Read transcript and sample data |
| 92 | +transcript_info <- read_transcript_info(tx2gene_path) |
| 93 | +coldata <- read_coldata(coldata_path) |
| 94 | + |
| 95 | +# Create initial SummarizedExperiment object |
| 96 | +se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], |
| 97 | + DataFrame(coldata), transcript_info$transcript) |
| 98 | + |
| 99 | +# Setting parameters for writing tables |
| 100 | +params <- list( |
| 101 | + list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), |
| 102 | + list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), |
| 103 | + list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") |
| 104 | +) |
| 105 | + |
| 106 | +# Process gene-level data if tx2gene mapping is available |
| 107 | +if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { |
| 108 | + tx2gene <- transcript_info$tx2gene |
| 109 | + gi <- summarizeToGene(txi, tx2gene = tx2gene) |
| 110 | + gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") |
| 111 | + gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") |
| 112 | + |
| 113 | + gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] |
| 114 | + rownames(gene_info) <- gene_info[["tx"]] |
| 115 | + |
| 116 | + col_data_frame <- DataFrame(coldata) |
| 117 | + |
| 118 | + # Create gene-level SummarizedExperiment objects |
| 119 | + gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], |
| 120 | + col_data_frame, gene_info) |
| 121 | + gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], |
| 122 | + col_data_frame, gene_info) |
| 123 | + gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], |
| 124 | + col_data_frame, gene_info) |
| 125 | + |
| 126 | + params <- c(params, list( |
| 127 | + list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), |
| 128 | + list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), |
| 129 | + list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"), |
| 130 | + list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"), |
| 131 | + list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"), |
| 132 | + list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"), |
| 133 | + list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv") |
| 134 | + )) |
86 | 135 | }
|
87 | 136 |
|
88 |
| -write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
89 |
| -write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) |
| 137 | +# Writing tables for each set of parameters |
| 138 | +done <- lapply(params, write_se_table) |
90 | 139 |
|
91 |
| -# Print sessioninfo to standard out |
| 140 | +# Output session information and citations |
92 | 141 | citation("tximeta")
|
93 | 142 | sessionInfo()
|
94 | 143 |
|
0 commit comments