diff --git a/tools/scripts/compare_gex_h5ad.py b/tools/scripts/compare_gex_h5ad.py new file mode 100644 index 00000000..e155c689 --- /dev/null +++ b/tools/scripts/compare_gex_h5ad.py @@ -0,0 +1,94 @@ +import anndata +import sys +def load_anndata(file_path): + # Load an AnnData object from the specified H5AD file + return anndata.read_h5ad(file_path) +def main(): + # Check if the correct number of command-line arguments is provided + if len(sys.argv) != 3: + print("Usage: python script.py ") + sys.exit(1) + # Get the input file paths from command-line arguments + test_h5ad_path = sys.argv[1] + truth_h5ad_path = sys.argv[2] + # Load AnnData objects from the H5AD files + test = load_anndata(test_h5ad_path) + truth = load_anndata(truth_h5ad_path) + # Print some information about the loaded AnnData objects (optional) + print("Loaded test AnnData:") + print(test_h5ad_path) + print(test) + print("\nLoaded truth AnnData:") + print(truth) + print(truth_h5ad_path) + # Now you can use the 'test' and 'truth' AnnData objects as needed in your script + # Do comparisons between test and truth obs + print("Comparing test cell metrics to truth cell metrics using truth as ref") + command1="test.obs." + command2="truth.obs." + for x in truth.obs.columns: + z = eval(command1+x) + y = eval(command2+x) + if z.equals(y)==False: + print("Cell Metric Column does not match:") + print(x) + print("Sum of test: ") + print(z.sum()) + print("Sum of truth: ") + print(y.sum()) + print("Comparing test gene metrics to truth gene metrics using truth as ref") + command3="test.var." + command4="truth.var." + for x in truth.var.columns: + z = eval(command3+x) + y = eval(command4+x) + if z.equals(y)==False: + print("Gene Metric Column does not match:") + print(x) + print("Making gene_names unique") + test.var_names_make_unique() + truth.var_names_make_unique() + for x in truth.var.columns: + z = eval(command3+x) + y = eval(command4+x) + if z.equals(y)==False: + print("Gene metric does not match after making gene names unique") + print(x) + print("Done") + print("If no warning above Done, gene metrics match now that they are unique") + print("Testing for new obs columns in test data set:") + for x in test.obs.columns: + if x not in truth.obs.columns: + print("Column not in truth", x) + print("Done") + print("If no warning above Done, no new obs columns in test matrix") + print("Testing for new var columns in test data set:") + for x in test.var.columns: + if x not in truth.var.columns: + print("Column not in truth", x) + print("Done") + print("If no warning above Done, no new var columns in test matrix") + print("Testing matrix count sums") + if test.X.sum()==truth.X.sum(): + print("Counts match") + else: + print("Counts do not match") + print("Testing that multimapped_reads + unique_reads = n_reads in test") + if test.obs.n_reads.sum()==(test.obs.reads_mapped_multiple.sum()+test.obs.reads_mapped_uniquely.sum()): + print("N_reads = reads_mapped_multiple+unique") + else: + print("Sums of unique, multimapped do not equal n_reads") + print("Sums of unqie + multimapped: ") + print(test.obs.reads_mapped_multiple.sum()+test.obs.reads_mapped_uniquely.sum()) + print("Sums of n_reads: ") + print(test.obs.n_reads.sum()) + print("Sum of unique: ") + print(test.obs.reads_mapped_uniquely.sum()) + print("Sum of multimapped: ") + print(test.obs.reads_mapped_multiple.sum()) + if test.obs.n_reads.sum()==(test.obs.reads_mapped_multiple.sum()+test.obs.reads_mapped_uniquely.sum()+test.obs.reads_mapped_intergenic.sum()): + print("N_reads = reads_mapped_multiple+unique+intergenic") + else: + print("Sums of unique, multimapped, intergenic do not equal n_reads") +if __name__ == "__main__": + main()