From 89c42509a33d55e20f22d8ac8b92059f3460da9e Mon Sep 17 00:00:00 2001
From: npetrill <npetrill@broadinstitute.org>
Date: Fri, 8 Nov 2024 10:29:46 -0500
Subject: [PATCH] add multiome check script

---
 tools/scripts/compare_gex_h5ad.py | 94 +++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 tools/scripts/compare_gex_h5ad.py

diff --git a/tools/scripts/compare_gex_h5ad.py b/tools/scripts/compare_gex_h5ad.py
new file mode 100644
index 00000000..e155c689
--- /dev/null
+++ b/tools/scripts/compare_gex_h5ad.py
@@ -0,0 +1,94 @@
+import anndata
+import sys
+def load_anndata(file_path):
+    # Load an AnnData object from the specified H5AD file
+    return anndata.read_h5ad(file_path)
+def main():
+    # Check if the correct number of command-line arguments is provided
+    if len(sys.argv) != 3:
+        print("Usage: python script.py <test_h5ad_file> <truth_h5ad_file>")
+        sys.exit(1)
+    # Get the input file paths from command-line arguments
+    test_h5ad_path = sys.argv[1]
+    truth_h5ad_path = sys.argv[2]
+    # Load AnnData objects from the H5AD files
+    test = load_anndata(test_h5ad_path)
+    truth = load_anndata(truth_h5ad_path)
+    # Print some information about the loaded AnnData objects (optional)
+    print("Loaded test AnnData:")
+    print(test_h5ad_path)
+    print(test)
+    print("\nLoaded truth AnnData:")
+    print(truth)
+    print(truth_h5ad_path)
+    # Now you can use the 'test' and 'truth' AnnData objects as needed in your script
+    # Do comparisons between test and truth obs
+    print("Comparing test cell metrics to truth cell metrics using truth as ref")
+    command1="test.obs."
+    command2="truth.obs."
+    for x in truth.obs.columns:
+        z = eval(command1+x)
+        y = eval(command2+x)
+        if z.equals(y)==False:
+            print("Cell Metric Column does not match:")
+            print(x)
+            print("Sum of test: ")
+            print(z.sum())
+            print("Sum of truth: ")
+            print(y.sum())
+    print("Comparing test gene metrics to truth gene metrics using truth as ref")
+    command3="test.var."
+    command4="truth.var."
+    for x in truth.var.columns:
+        z = eval(command3+x)
+        y = eval(command4+x)
+        if z.equals(y)==False:
+            print("Gene Metric Column does not match:")
+            print(x)
+    print("Making gene_names unique")
+    test.var_names_make_unique()
+    truth.var_names_make_unique()
+    for x in truth.var.columns:
+        z = eval(command3+x)
+        y = eval(command4+x)
+        if z.equals(y)==False:
+            print("Gene metric does not match after making gene names unique")
+            print(x)
+    print("Done")
+    print("If no warning above Done, gene metrics match now that they are unique")
+    print("Testing for new obs columns in test data set:")
+    for x in test.obs.columns:
+        if x not in truth.obs.columns:
+            print("Column not in truth", x)
+    print("Done")
+    print("If no warning above Done, no new obs columns in test matrix")
+    print("Testing for new var columns in test data set:")
+    for x in test.var.columns:
+        if x not in truth.var.columns:
+            print("Column not in truth", x)
+    print("Done")
+    print("If no warning above Done, no new var columns in test matrix")
+    print("Testing matrix count sums")
+    if test.X.sum()==truth.X.sum():
+        print("Counts match")
+    else:
+        print("Counts do not match")
+    print("Testing that multimapped_reads + unique_reads = n_reads in test")
+    if test.obs.n_reads.sum()==(test.obs.reads_mapped_multiple.sum()+test.obs.reads_mapped_uniquely.sum()):
+        print("N_reads = reads_mapped_multiple+unique")
+    else:
+        print("Sums of unique, multimapped do not equal n_reads")
+        print("Sums of unqie + multimapped: ")
+        print(test.obs.reads_mapped_multiple.sum()+test.obs.reads_mapped_uniquely.sum())
+        print("Sums of n_reads: ")
+        print(test.obs.n_reads.sum())
+        print("Sum of unique: ")
+        print(test.obs.reads_mapped_uniquely.sum())
+        print("Sum of multimapped: ")
+        print(test.obs.reads_mapped_multiple.sum())
+    if test.obs.n_reads.sum()==(test.obs.reads_mapped_multiple.sum()+test.obs.reads_mapped_uniquely.sum()+test.obs.reads_mapped_intergenic.sum()):
+        print("N_reads = reads_mapped_multiple+unique+intergenic")
+    else:
+        print("Sums of unique, multimapped, intergenic do not equal n_reads")
+if __name__ == "__main__":
+    main()