From 5d550874a0380f24685ae9c9cbbaba260f8c95ab Mon Sep 17 00:00:00 2001 From: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Date: Wed, 4 Sep 2024 13:13:00 -0400 Subject: [PATCH] Np add create h5ad for snss2 (#141) * add sctools * add dockerfile and script * remove submod * temp * get rid of sctools * please work * update dockerfile * update dockerfile * update dockerfile * remove sctools * update dockerfile * add merge h5ad script * add merge h5ad script * clean up debugging statements in create_h5ad_snss2.py * clean up debugging statements in create_h5ad_snss2.py --- tools/Dockerfile | 5 + tools/scripts/create_h5ad_snss2.py | 244 +++++++++++++++++++++++++++++ tools/scripts/ss2_h5ad_merge.py | 84 ++++++++++ 3 files changed, 333 insertions(+) create mode 100644 tools/scripts/create_h5ad_snss2.py create mode 100644 tools/scripts/ss2_h5ad_merge.py diff --git a/tools/Dockerfile b/tools/Dockerfile index 0cd0f559..0536d45c 100644 --- a/tools/Dockerfile +++ b/tools/Dockerfile @@ -25,7 +25,12 @@ COPY . 
# Reconstructed from a mangled git format-patch (commit 5d55087,
# "Np add create h5ad for snss2 (#141)").  Besides the two scripts below,
# the patch added these lines to tools/Dockerfile:
#
#   # Set the working directory to the scripts folder and install the package
#   RUN python -m pip install git+https://github.com/HumanCellAtlas/sctools.git#egg=sctools;
#   # Set the default working directory for the container
#   WORKDIR /warptools
#
# The patch created two new files, reproduced in patch order:
#   1. tools/scripts/create_h5ad_snss2.py
#   2. tools/scripts/ss2_h5ad_merge.py
# When splitting this reconstruction back out, each section below is one file.

# ==================== tools/scripts/create_h5ad_snss2.py =====================
import argparse
import csv

import numpy as np
# Explicit submodule import: `import scipy as sc` alone does not guarantee
# that `sc.sparse` is loaded (only recent scipy lazily imports submodules).
from scipy import sparse


def _parse_numeric(value):
    """Parse *value* as a float, or return None when it is not numeric.

    Percentage strings such as "50%" are converted to a 0-1 fraction (0.5).
    """
    try:
        return float(value)
    except ValueError:
        pass
    try:
        # "12.5%" -> 0.125
        return float(value.strip("%")) / 100
    except ValueError:
        return None


def generate_col_attr(args):
    """Build the column (cell) attributes from the Smart-seq2 QC CSV.

    Args:
        args: parsed CLI arguments; uses ``args.qc_files`` (the first path
            ending in ``.csv`` is read) and ``args.input_id``.

    Returns:
        dict: per-cell attributes -- ``cell_names``/``CellID``/``input_id``
        plus one 1-element array per QC metric (numeric metrics parsed to
        float, percentages to 0-1 fractions, everything else kept as a
        string).

    Raises:
        ValueError: if no ``.csv`` file is present in ``args.qc_files``.
    """
    csv_files = [p for p in args.qc_files if p.endswith(".csv")]
    if not csv_files:
        raise ValueError("no .csv file found among --qc_files")
    with open(csv_files[0], "r") as f:
        qc_values = list(csv.reader(f))

    # Row 0: metric labels (first column is a row label and is skipped).
    # Rows 2..end: one row of metric values per contributing metrics file.
    n_metrics_files = len(qc_values) - 2
    metadata_labels = qc_values[0][1:]
    metadata_values = []
    for col in range(len(metadata_labels)):
        # Join the non-empty values for this metric across all metric files.
        vals = (qc_values[fidx + 2][col + 1] for fidx in range(n_metrics_files))
        metadata_values.append(",".join(v for v in vals if v))

    cell_id = args.input_id

    # Split the metrics into numeric and string values.
    string_metadata = {}
    numeric_metadata = {}
    for label, value in zip(metadata_labels, metadata_values):
        numeric_value = _parse_numeric(value)
        if numeric_value is not None:
            numeric_metadata[label] = numeric_value
        else:
            string_metadata[label] = value

    # Column attributes; metrics are written in sorted-label order,
    # numeric metrics first, then string metrics.
    col_attrs = {
        "cell_names": [cell_id],
        "CellID": [cell_id],
        "input_id": [args.input_id],
    }
    for label in sorted(numeric_metadata):
        col_attrs[label] = np.array([numeric_metadata[label]])
    for label in sorted(string_metadata):
        col_attrs[label] = np.array([string_metadata[label]])

    return col_attrs


def generate_csr_sparse_coo(expr):
    """Return the transpose of 2-D array-like *expr* as a sparse COO matrix.

    Replaces the original element-by-element Python loop with scipy's
    C-level sparse transpose; the resulting matrix is identical.
    """
    return sparse.coo_matrix(expr).transpose()


def generate_row_attr_and_matrix(count_results_path):
    """Convert the SS2 intron/exon counts file to matrices + row attributes.

    Args:
        count_results_path (str): tab-separated file with columns
            gene_id, gene_name, introns, exons, intron_length, exon_length.

    Returns:
        tuple: ``(row_attrs, intron_coo, exon_coo)`` where ``row_attrs`` is a
        dict of per-gene arrays and the matrices are (genes x 1) COO
        matrices, all in file order.
    """
    gene_ids = []
    gene_names = []
    intron_counts = []
    exon_counts = []
    intron_lengths = []
    exon_lengths = []

    # `with` closes the handle (the original leaked it via
    # csv.DictReader(open(...))); per-row appends also preserve duplicate
    # gene_id rows instead of silently collapsing them through a dict.
    with open(count_results_path) as f:
        for row in csv.DictReader(f, delimiter="\t"):
            gene_ids.append(row["gene_id"])
            gene_names.append(row["gene_name"])
            intron_counts.append(int(row["introns"]))
            exon_counts.append(int(row["exons"]))
            intron_lengths.append(int(row["intron_length"]))
            exon_lengths.append(int(row["exon_length"]))

    intron_expression_csr_coo = generate_csr_sparse_coo([intron_counts])
    exon_expression_csr_coo = generate_csr_sparse_coo([exon_counts])

    row_attrs = {
        "ensembl_ids": np.array(gene_ids),
        "gene_names": np.array(gene_names),
        "Gene": np.array(gene_ids),
        "intron_lengths": np.array(intron_lengths),
        "exon_lengths": np.array(exon_lengths),
    }

    return row_attrs, intron_expression_csr_coo, exon_expression_csr_coo


def create_h5ad_files(args):
    """Write ``<args.output_h5ad_path>.h5ad`` for one cell from SS2 outputs.

    Exon counts become ``AnnData.X``, intron counts the ``intron_counts``
    layer, QC metrics the ``obs`` frame and gene attributes the ``var``
    frame; ``pipeline_version`` is stored in ``uns``.
    """
    # Local imports so the parsing helpers above remain importable without
    # the (heavy) anndata/pandas dependencies installed.
    import anndata as ad
    import pandas as pd

    # Dictionary of per-cell (column) attributes from the QC files.
    col_attrs = generate_col_attr(args)

    # Per-gene (row) attributes plus the expression count matrices.
    row_attrs, intron_expr_csr_coo, exon_expr_csr_coo = (
        generate_row_attr_and_matrix(args.count_results_file))

    # (gene x cell) COO -> (cell x gene) CSR, the layout AnnData expects.
    exon_counts = exon_expr_csr_coo.transpose().tocsr()
    intron_counts = intron_expr_csr_coo.transpose().tocsr()

    adata = ad.AnnData(exon_counts)
    adata.obs = pd.DataFrame(col_attrs)
    adata.var = pd.DataFrame(row_attrs)
    adata.uns = {"pipeline_version": args.pipeline_version}
    adata.obs_names = list(adata.obs["CellID"])
    adata.var_names = list(adata.var["Gene"])
    # Intron counts ride along as a layer parallel to X (the exon counts).
    adata.layers["intron_counts"] = intron_counts

    adata.write_h5ad(args.output_h5ad_path + ".h5ad")


def main():
    description = """This script converts some of the SmartSeq2 pipeline outputs into
    H5AD format. This script can be used as a module or run as a command line script."""

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--qc_files',
                        dest="qc_files",
                        nargs="+",
                        help=('the grouped QC files from the GroupQCOutputs task of SS2 '
                              'Single Sample workflow'))
    parser.add_argument('--count_results',
                        dest="count_results_file",
                        help='path to the intronic and exonic count and FPKM to be added to the H5AD')
    parser.add_argument('--output_h5ad_path',
                        dest="output_h5ad_path",
                        help='path where the h5ad file is to be created')
    parser.add_argument('--input_id',
                        dest="input_id",
                        help='the sample name in the bundle')
    parser.add_argument("--input_name",
                        dest="input_name",
                        help="sequencing_input.biomaterial_core.biomaterial_id in HCA metadata, defined by the user")
    parser.add_argument("--input_id_metadata_field",
                        dest="input_id_metadata_field",
                        help="sequencing_process.provenance.document_id: [UUID] defined by the user")
    parser.add_argument("--input_name_metadata_field",
                        dest="input_name_metadata_field",
                        help="sequencing_input.biomaterial_core.biomaterial_id defined by the user")
    # NOTE(review): the default string "Unknown sample" looks copy-pasted
    # from a sample-name argument; kept as-is because downstream consumers
    # may match on it -- confirm before changing.
    parser.add_argument('--pipeline_version',
                        default="Unknown sample",
                        help='the version of SS2 used to generate data')

    args = parser.parse_args()

    create_h5ad_files(args)


# (The original create_h5ad_snss2.py ended with its own entry-point guard,
# commented out here so the concatenated reconstruction stays runnable:)
# if __name__ == '__main__':
#     main()


# ===================== tools/scripts/ss2_h5ad_merge.py =======================
# In the original patch this is a separate file beginning with:
#   import argparse
#   import anndata as ad
# Its `main` below shadows the one above only in this concatenated form.


def merge_h5ad_files(args):
    """Merge multiple H5AD files into one, attaching batch/project metadata.

    Reads every file in ``args.input_h5ad_files``, concatenates the cells,
    records the batch/project attributes in ``uns`` and writes the result
    to ``args.output_h5ad_file``.
    """
    import anndata as ad  # local import, see note above

    h5ad_files = args.input_h5ad_files
    print(f"Input H5AD files: {h5ad_files}")

    adatas = [ad.read_h5ad(f) for f in h5ad_files]
    print(f"Number of AnnData objects read: {len(adatas)}")

    # Stack cells (axis 0); keep only var columns identical across inputs.
    merged_adata = ad.concat(adatas, axis=0, merge="same")

    # Required global attributes, then the optional user-supplied
    # HCA metadata fields (only when provided).
    attrDict = {
        'batch_id': args.batch_id,
        'pipeline_version': args.pipeline_version,
    }
    optional_attrs = {
        'batch_name': args.batch_name,
        'library_preparation_protocol.library_construction_approach': args.library,
        'donor_organism.genus_species': args.species,
        'specimen_from_organism.organ': args.organ,
        'project.provenance.document_id': args.project_id,
        'project.project_core.project_short_name': args.project_name,
    }
    for key, value in optional_attrs.items():
        if value is not None:
            attrDict[key] = value

    # Assign these attributes to the merged AnnData object.
    merged_adata.uns.update(attrDict)
    print(f"Global attributes added to the merged AnnData: {attrDict}")

    # Save the merged H5AD file.
    merged_adata.write_h5ad(args.output_h5ad_file)
    print(f"Merged H5AD file saved to: {args.output_h5ad_file}")


def main():
    description = """Merge the outputs of multiple SS2 pipeline runs into a single H5AD file"""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--input-h5ad-files',
                        dest='input_h5ad_files',
                        nargs="+",
                        required=True,
                        help="Path to input H5AD files")
    parser.add_argument('--output-h5ad-file',
                        dest='output_h5ad_file',
                        required=True,
                        help="Path to output H5AD file")
    parser.add_argument('--batch_id',
                        dest='batch_id',
                        required=True,
                        help="Batch id for output H5AD")
    parser.add_argument('--batch_name',
                        dest='batch_name',
                        help='User provided batch name for output H5AD')
    parser.add_argument('--project_id',
                        dest='project_id',
                        help='User provided project id for output H5AD')
    parser.add_argument('--project_name',
                        dest='project_name',
                        help='User provided project name for output H5AD')
    parser.add_argument('--library',
                        dest='library',
                        help='User provided library for output H5AD')
    parser.add_argument('--species',
                        dest='species',
                        help='User provided species for output H5AD')
    parser.add_argument('--organ',
                        dest='organ',
                        help='User provided organ for output H5AD')
    parser.add_argument('--pipeline_version',
                        dest='pipeline_version',
                        required=True,
                        help='Multisample SS2 version')
    args = parser.parse_args()

    merge_h5ad_files(args)


if __name__ == '__main__':
    main()