Skip to content

Commit

Permalink
Merge branch 'develop' into lk-PD-2738
Browse files Browse the repository at this point in the history
  • Loading branch information
ekiernan authored Sep 18, 2024
2 parents 8f7abf1 + 5d55087 commit 84dbb14
Show file tree
Hide file tree
Showing 3 changed files with 333 additions and 0 deletions.
5 changes: 5 additions & 0 deletions tools/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,12 @@ COPY . /warptools
RUN cd /warptools/fastqpreprocessing && ./fetch_and_make_dep_libs.sh && make && cp /warptools/fastqpreprocessing/bin/* /usr/local/bin/
RUN cd /warptools/TagSort && ./fetch_and_make_dep_libs.sh && make && cp /warptools/TagSort/bin/* /usr/local/bin/

# Set the working directory to the scripts folder and install the package
RUN python -m pip install git+https://github.com/HumanCellAtlas/sctools.git#egg=sctools;

# Set the default working directory for the container
WORKDIR /warptools

# Set tini as default entry point
ENTRYPOINT ["/usr/bin/tini", "--"]

244 changes: 244 additions & 0 deletions tools/scripts/create_h5ad_snss2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
import argparse
import csv
import os
import numpy as np
import scipy as sc
import anndata as ad
import pandas as pd

def generate_col_attr(args):
"""Converts the QC of Smart Seq2 gene file pipeline outputs to H5AD file
Args:
qc_path (str): path to the QCs csv
Retruns:
col_attrs: dict
"""
# read the QC values
qc_path = [p for p in args.qc_files if p.endswith(".csv")][0]
with open(qc_path, 'r') as f:
qc_values = [row for row in csv.reader(f)]
n_metrics_files = len(qc_values)-2
metadata_labels = qc_values[0][1:]
metadata_values = []
for index_val in range(len(metadata_labels)):
arr_metric_val = []
for index_file in range(n_metrics_files):
arr_metric_val.append(qc_values[index_file+2][index_val+1])
metadata_values.append(','.join(s for s in arr_metric_val if s))

cell_id = args.input_id

string_metadata = {}
numeric_metadata = {}

for label, value in zip(metadata_labels, metadata_values):

# See if this is numeric or string
numeric_value = None
try:
numeric_value = float(value)
except ValueError:
try:
numeric_value = float(value.strip("%"))/100
except ValueError:
pass

if numeric_value is not None:
numeric_metadata[label] = numeric_value
else:
string_metadata[label] = value

# Metrics
# Write the string and numeric metadata separately
sorted_string_labels = sorted(string_metadata.keys())
sorted_string_values = [string_metadata[m] for m in sorted_string_labels]
sorted_numeric_labels = sorted(numeric_metadata.keys())
sorted_numeric_values = [numeric_metadata[m] for m in sorted_numeric_labels]

# Column attributes
col_attrs = dict()
col_attrs["cell_names"] = [cell_id]
col_attrs["CellID"] = [cell_id]
col_attrs['input_id'] = [args.input_id]

numeric_field_names = np.array(sorted_numeric_labels[:])

for i in range(0, numeric_field_names.shape[0]):
name = numeric_field_names[i]
data = np.array([sorted_numeric_values])[:,i]
col_attrs[name] = data
string_field_names = np.array(sorted_string_labels)
for i in range(0, string_field_names.shape[0]):
name = string_field_names[i]
data = np.array([sorted_string_values])[:,i]
col_attrs[name] = data

return col_attrs


def generate_csr_sparse_coo(expr):

nrows, ncols = np.shape(expr)
expr_coo = sc.sparse.coo_matrix(expr[:])
xcoord = []
ycoord = []
value = []

for k in range(0, expr_coo.data.shape[0]):
xcoord.append(expr_coo.row[k])
ycoord.append(expr_coo.col[k])
value.append(expr_coo.data[k])

xcoord = np.asarray(xcoord)
ycoord = np.asarray(ycoord)
value = np.asarray(value)

expr_sp_t = sc.sparse.coo_matrix((value, (ycoord, xcoord)), shape=(ncols, nrows))
return expr_sp_t


def generate_row_attr_and_matrix(count_results_path):
"""Converts the Smart Seq2 intron and exon counts file to
csr_coos and row attribute dictionary
Args:
input_path (str): file where the SS2 pipeline expression counts are
Return:
row_attrs: dict of attributes
intron_expression_csr_coo: crs coo matrix for introns
exon_expression_csr_coo: crs coo matrix for exons
"""
reader = csv.DictReader(open(count_results_path), delimiter="\t")

intron_expression_values = {}
exon_expression_values = {}
intron_lengths = {}
exon_lengths = {}

gene_ids = []
gene_names = []
count_values = {}
for row in reader:
intron_expression_values[row["gene_id"]] = int(row["introns"])
exon_expression_values[row["gene_id"]] = int(row["exons"])
intron_lengths[row["gene_id"]] = int(row["intron_length"])
exon_lengths[row["gene_id"]] = int(row["exon_length"])
gene_ids.append(row['gene_id'])
gene_names.append(row['gene_name'])

intron_counts = [intron_expression_values[g] for g in gene_ids]
exon_counts = [exon_expression_values[g] for g in gene_ids]
intron_lengths = [intron_lengths[g] for g in gene_ids]
exon_lengths = [exon_lengths[g] for g in gene_ids]

intron_expression_csr_coo = generate_csr_sparse_coo([intron_counts])
exon_expression_csr_coo = generate_csr_sparse_coo([exon_counts])

row_attrs = { "ensembl_ids" : np.array(gene_ids),
"gene_names" : np.array(gene_names),
"Gene" : np.array(gene_ids),
"intron_lengths": np.array(intron_lengths),
"exon_lengths" : np.array(exon_lengths)
}

return row_attrs, intron_expression_csr_coo, exon_expression_csr_coo


def create_h5ad_files(args):
"""This function creates the H5AD file or folder structure in output_h5ad_path in
format file_format, with input_id from the input folder analysis_output_path
Args:
input_id (str): sample or cell id
qc_analysis_output_files_string (str): a string with the file names in the QCGroup of SS2
pipeline output, separated by commas
rsem_genes_results_file (str): the file for the expression count
output_h5ad_path (str): location of the output H5AD file
"""
# generate a dictionary of column attributes
col_attrs = generate_col_attr(args)

# add the expression count matrix data
# generate a dictionary of row attributes
row_attrs, intron_expr_csr_coo, exon_expr_csr_coo = generate_row_attr_and_matrix(args.count_results_file)

# Convert the matrices to CSR format
exon_counts = exon_expr_csr_coo.transpose().tocsr()
intron_counts = intron_expr_csr_coo.transpose().tocsr()

gene_attrs = row_attrs
cell_attrs = col_attrs
adata = ad.AnnData(exon_counts)
attrDict = dict()
attrDict['pipeline_version'] = args.pipeline_version

# importing the cell and gene metrics as dataframes
dc = pd.DataFrame(cell_attrs)
dg = pd.DataFrame(gene_attrs)

# assign the cell and gene attrs to the obs and var field respectively
adata.obs = dc
adata.var = dg

# assign global attrs to unstructured data
adata.uns = attrDict
adata.obs_names = [x for x in adata.obs['CellID']]

# set variable names
adata.var_names = [x for x in adata.var['Gene']]

# set the layers to the intron counts
adata.layers['intron_counts'] = intron_counts

# Write the h5ad file
adata.write_h5ad(args.output_h5ad_path + ".h5ad")

def main():
description = """This script converts the some of the SmartSeq2 pipeline outputs in to
H5AD format. This script can be used as a module or run as a command line script."""

parser = argparse.ArgumentParser(description=description)
parser.add_argument('--qc_files',
dest="qc_files",
nargs = "+",
help=('the grouped QC files from the GroupQCOutputs task of SS2 '
'Single Sample workflow'))

parser.add_argument('--count_results',
dest="count_results_file",
help='path to the intronic and exonic count and FPKM to be added to the H5AD')

parser.add_argument('--output_h5ad_path',
dest="output_h5ad_path",
help='path where the h5ad file is to be created')

parser.add_argument('--input_id',
dest="input_id",
help='the sample name in the bundle')

parser.add_argument("--input_name",
dest="input_name",
help= "sequencing_input.biomaterial_core.biomaterial_id in HCA metadata, defined by the user",
)

parser.add_argument("--input_id_metadata_field",
dest="input_id_metadata_field",
help= "sequencing_process.provenance.document_id: [UUID] defined by the user",
)

parser.add_argument("--input_name_metadata_field",
dest="input_name_metadata_field",
help= "sequencing_input.biomaterial_core.biomaterial_id defined by the user",
)

parser.add_argument('--pipeline_version',
default="Unknown sample",
help='the version of SS2 used to generate data')

args = parser.parse_args()

create_h5ad_files(args)


if __name__ == '__main__':
main()

84 changes: 84 additions & 0 deletions tools/scripts/ss2_h5ad_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import argparse
import anndata as ad

def merge_h5ad_files(args):
"""This function merges multiple H5AD files into one, preserving metadata."""
h5ad_files = args.input_h5ad_files
print(f"Input H5AD files: {h5ad_files}")

adatas = [ad.read_h5ad(f) for f in h5ad_files]
print(f"Number of AnnData objects read: {len(adatas)}")

# Concatenate the AnnData objects
merged_adata = ad.concat(adatas, axis=0, merge="same")


# Add global attributes (uns)
attrDict = dict()
attrDict['batch_id'] = args.batch_id
attrDict['pipeline_version'] = args.pipeline_version
if args.batch_name is not None:
attrDict['batch_name'] = args.batch_name
if args.library is not None:
attrDict['library_preparation_protocol.library_construction_approach'] = args.library
if args.species is not None:
attrDict['donor_organism.genus_species'] = args.species
if args.organ is not None:
attrDict['specimen_from_organism.organ'] = args.organ
if args.project_id is not None:
attrDict['project.provenance.document_id'] = args.project_id
if args.project_name is not None:
attrDict['project.project_core.project_short_name'] = args.project_name

# Assign these attributes to the merged AnnData object
merged_adata.uns.update(attrDict)
print(f"Global attributes added to the merged AnnData: {attrDict}")

# Save the merged H5AD file
merged_adata.write_h5ad(args.output_h5ad_file)
print(f"Merged H5AD file saved to: {args.output_h5ad_file}")

def main():
description = """Merge the outputs of multiple SS2 pipeline runs into a single H5AD file"""
parser = argparse.ArgumentParser(description=description)
parser.add_argument('--input-h5ad-files',
dest='input_h5ad_files',
nargs="+",
required=True,
help="Path to input H5AD files")
parser.add_argument('--output-h5ad-file',
dest='output_h5ad_file',
required=True,
help="Path to output H5AD file")
parser.add_argument('--batch_id',
dest='batch_id',
required=True,
help="Batch id for output H5AD")
parser.add_argument('--batch_name',
dest='batch_name',
help='User provided batch name for output H5AD')
parser.add_argument('--project_id',
dest='project_id',
help='User provided project id for output H5AD')
parser.add_argument('--project_name',
dest='project_name',
help='User provided project name for output H5AD')
parser.add_argument('--library',
dest='library',
help='User provided library for output H5AD')
parser.add_argument('--species',
dest='species',
help='User provided species for output H5AD')
parser.add_argument('--organ',
dest='organ',
help='User provided organ for output H5AD')
parser.add_argument('--pipeline_version',
dest='pipeline_version',
required=True,
help='Multisample SS2 version')
args = parser.parse_args()

merge_h5ad_files(args)

if __name__ == '__main__':
main()

0 comments on commit 84dbb14

Please sign in to comment.