trying to add sctools #139

Closed
wants to merge 18 commits into from
9 changes: 9 additions & 0 deletions tools/Dockerfile
@@ -21,10 +21,19 @@ RUN set -eux && \
curl -sSL https://sdk.cloud.google.com | bash

COPY . /warptools
RUN ls -la /warptools
# build C/C++ code and add the binaries to /usr/local/bin
RUN cd /warptools/fastqpreprocessing && ./fetch_and_make_dep_libs.sh && make && cp /warptools/fastqpreprocessing/bin/* /usr/local/bin/
RUN cd /warptools/TagSort && ./fetch_and_make_dep_libs.sh && make && cp /warptools/TagSort/bin/* /usr/local/bin/

# Set the working directory to the scripts folder and install the package

WORKDIR /warptools/scripts/sctools
RUN pwd
RUN ls -la
RUN pip install .

# Set the default working directory for the container
WORKDIR /warptools

# Set tini as default entry point
258 changes: 258 additions & 0 deletions tools/scripts/create_h5ad_snss2.py
@@ -0,0 +1,258 @@
import argparse
import csv
import os
import numpy as np
import scipy as sc
import scipy.sparse  # explicitly import the sparse submodule so that sc.sparse is always available
import anndata as ad
import pandas as pd

def generate_col_attr(args):
"""Converts the QC of Smart Seq2 gene file pipeline outputs to loom file
Args:
qc_path (str): path to the QCs csv
Retruns:
col_attrs: dict
"""
# read the QC values
qc_path = [p for p in args.qc_files if p.endswith(".csv")][0]
with open(qc_path, 'r') as f:
qc_values = [row for row in csv.reader(f)]
n_metrics_files = len(qc_values)-2
metadata_labels = qc_values[0][1:]
metadata_values = []
for index_val in range(len(metadata_labels)):
arr_metric_val = []
for index_file in range(n_metrics_files):
arr_metric_val.append(qc_values[index_file+2][index_val+1])
metadata_values.append(','.join(s for s in arr_metric_val if s))

cell_id = args.input_id

string_metadata = {}
numeric_metadata = {}

for label, value in zip(metadata_labels, metadata_values):

# See if this is numeric or string
numeric_value = None
try:
numeric_value = float(value)
except ValueError:
try:
numeric_value = float(value.strip("%"))/100
except ValueError:
pass

if numeric_value is not None:
numeric_metadata[label] = numeric_value
else:
string_metadata[label] = value

# Metrics
# Write the string and numeric metadata separately
sorted_string_labels = sorted(string_metadata.keys())
sorted_string_values = [string_metadata[m] for m in sorted_string_labels]
sorted_numeric_labels = sorted(numeric_metadata.keys())
sorted_numeric_values = [numeric_metadata[m] for m in sorted_numeric_labels]

# Column attributes
col_attrs = dict()
col_attrs["cell_names"] = [cell_id]
col_attrs["CellID"] = [cell_id]
col_attrs['input_id'] = [args.input_id]

numeric_field_names = np.array(sorted_numeric_labels[:])

for i in range(0, numeric_field_names.shape[0]):
name = numeric_field_names[i]
data = np.array([sorted_numeric_values])[:,i]
col_attrs[name] = data
string_field_names = np.array(sorted_string_labels)
for i in range(0, string_field_names.shape[0]):
name = string_field_names[i]
data = np.array([sorted_string_values])[:,i]
col_attrs[name] = data

return col_attrs
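
# A hypothetical example of generate_col_attr's output (values are placeholders): for a
# QC csv whose header row is ["", "READS_ALIGNED", "PCT_DUPLICATION"] and whose metric
# values start on row 2, the returned dict would look roughly like
#   {"cell_names": [cell_id], "CellID": [cell_id], "input_id": [cell_id],
#    "PCT_DUPLICATION": array([0.12]), "READS_ALIGNED": array([123456.0])}
# Each metric is stored as a length-1 array; percent-style strings become fractions,
# and non-numeric values are kept as strings.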


def generate_csr_sparse_coo(expr):

nrows, ncols = np.shape(expr)
expr_coo = sc.sparse.coo_matrix(expr[:])
xcoord = []
ycoord = []
value = []

for k in range(0, expr_coo.data.shape[0]):
xcoord.append(expr_coo.row[k])
ycoord.append(expr_coo.col[k])
value.append(expr_coo.data[k])

xcoord = np.asarray(xcoord)
ycoord = np.asarray(ycoord)
value = np.asarray(value)

expr_sp_t = sc.sparse.coo_matrix((value, (ycoord, xcoord)), shape=(ncols, nrows))
return expr_sp_t
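
# Note: the element-wise loop above is equivalent to building the COO matrix directly and
# transposing it, i.e. sc.sparse.coo_matrix(np.asarray(expr)).transpose(); the loop form
# simply makes the coordinate swap explicit.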


def generate_row_attr_and_matrix(count_results_path):
"""Converts the Smart Seq2 intron and exon counts file to
csr_coos and row attribute dictionary
Args:
input_path (str): file where the SS2 pipeline expression counts are

Return:
row_attrs: dict of attributes
intron_expression_csr_coo: crs coo matrix for introns
exon_expression_csr_coo: crs coo matrix for exons
"""
reader = csv.DictReader(open(count_results_path), delimiter="\t")

intron_expression_values = {}
exon_expression_values = {}
intron_lengths = {}
exon_lengths = {}

gene_ids = []
gene_names = []
count_values = {}
for row in reader:
intron_expression_values[row["gene_id"]] = int(row["introns"])
exon_expression_values[row["gene_id"]] = int(row["exons"])
intron_lengths[row["gene_id"]] = int(row["intron_length"])
exon_lengths[row["gene_id"]] = int(row["exon_length"])
gene_ids.append(row['gene_id'])
gene_names.append(row['gene_name'])

intron_counts = [intron_expression_values[g] for g in gene_ids]
exon_counts = [exon_expression_values[g] for g in gene_ids]
intron_lengths = [intron_lengths[g] for g in gene_ids]
exon_lengths = [exon_lengths[g] for g in gene_ids]

intron_expression_csr_coo = generate_csr_sparse_coo([intron_counts])
exon_expression_csr_coo = generate_csr_sparse_coo([exon_counts])

row_attrs = { "ensembl_ids" : np.array(gene_ids),
"gene_names" : np.array(gene_names),
"Gene" : np.array(gene_ids),
"intron_lengths": np.array(intron_lengths),
"exon_lengths" : np.array(exon_lengths)
}

return row_attrs, intron_expression_csr_coo, exon_expression_csr_coo
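
# The count_results file is expected to be a tab-delimited table with at least the columns
# gene_id, gene_name, introns, exons, intron_length and exon_length, e.g. (hypothetical row):
#   gene_id          gene_name  introns  exons  intron_length  exon_length
#   ENSG00000000003  TSPAN6     4        120    2000           3796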


def create_h5ad_files(args):
"""This function creates the loom file or folder structure in output_loom_path in
format file_format, with input_id from the input folder analysis_output_path
Args:
input_id (str): sample or cell id
qc_analysis_output_files_string (str): a string with the file names in the QCGroup of SS2
pipeline output, separated by commas
rsem_genes_results_file (str): the file for the expression count
output_loom_path (str): location of the output loom
"""
# generate a dictionary of column attributes
col_attrs = generate_col_attr(args)

# add the expression count matrix data
# generate a dictionary of row attributes
row_attrs, intron_expr_csr_coo, exon_expr_csr_coo = generate_row_attr_and_matrix(args.count_results_file)

# Convert the matrices to CSR format
exon_counts = exon_expr_csr_coo.transpose().tocsr()
intron_counts = intron_expr_csr_coo.transpose().tocsr()

gene_attrs = row_attrs
cell_attrs = col_attrs
adata = ad.AnnData(exon_counts)
print(adata.X)


attrDict = dict()
attrDict['pipeline_version'] = args.pipeline_version

# importing the cell and gene metrics as dataframes
dc = pd.DataFrame(cell_attrs)
dg = pd.DataFrame(gene_attrs)

# assign the cell and gene attrs to the obs and var field respectively
adata.obs = dc
adata.var = dg

# assign global attrs to unstructured data
adata.uns = attrDict
adata.obs_names = [x for x in adata.obs['CellID']]

# set variable names
adata.var_names = [x for x in adata.var['Gene']]

# set the layers to the intron counts
adata.layers['intron_counts'] = intron_counts

# write the h5ad file
adata.write_h5ad(args.output_h5ad_path + ".h5ad")


#generate loom file
#loompy.create(args.output_loom_path, exon_expr_csr_coo, row_attrs, col_attrs, file_attrs=attrDict)

#ds = loompy.connect(args.output_loom_path)

#ds.layers['intron_counts'] = intron_expr_csr_coo
#ds.close()


def main():
    description = """This script converts some of the SmartSeq2 single-sample pipeline
    outputs into the h5ad format. It can be used as a module or run as a
    command line script."""

parser = argparse.ArgumentParser(description=description)
parser.add_argument('--qc_files',
dest="qc_files",
nargs = "+",
help=('the grouped QC files from the GroupQCOutputs task of SS2 '
'Single Sample workflow'))

parser.add_argument('--count_results',
dest="count_results_file",
                        help='path to the intronic and exonic count (and FPKM) file to be added to the h5ad')

parser.add_argument('--output_h5ad_path',
dest="output_h5ad_path",
help='path where the h5ad file is to be created')

parser.add_argument('--input_id',
dest="input_id",
help='the sample name in the bundle')

parser.add_argument("--input_name",
dest="input_name",
help= "sequencing_input.biomaterial_core.biomaterial_id in HCA metadata, defined by the user",
)

parser.add_argument("--input_id_metadata_field",
dest="input_id_metadata_field",
help= "sequencing_process.provenance.document_id: [UUID] defined by the user",
)

parser.add_argument("--input_name_metadata_field",
dest="input_name_metadata_field",
help= "sequencing_input.biomaterial_core.biomaterial_id defined by the user",
)

parser.add_argument('--pipeline_version',
default="Unknown sample",
help='the version of SS2 used to generate data')

args = parser.parse_args()

create_h5ad_files(args)


if __name__ == '__main__':
main()

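For reference, a hypothetical invocation of the script above, using only the flags it defines (file names, IDs and the version string are placeholders):

    python3 create_h5ad_snss2.py \
        --qc_files group_qc_outputs.csv \
        --count_results sample.introns_exons.counts.tsv \
        --output_h5ad_path sample_out \
        --input_id sample_1 \
        --pipeline_version SmartSeq2SingleSample_vX.Y.Z

This would write sample_out.h5ad with exon counts in X and intron counts in the 'intron_counts' layer.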
1 change: 1 addition & 0 deletions tools/scripts/sctools
Submodule sctools added at 1f28a4