Optional normalization for generic_cell_clustering.ipynb (#1027)
* add normalize arg, default True

* test to check for no normalizing

* update notebook

* move arg init to section 1

* notebook test

* notebook typo

* notebook wording
camisowers authored Aug 2, 2023
1 parent 7823f9d commit 94e6363
Showing 5 changed files with 71 additions and 38 deletions.
6 changes: 4 additions & 2 deletions src/ark/phenotyping/cell_som_clustering.py
@@ -8,7 +8,7 @@
def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
cell_som_input_data, som_weights_name='cell_som_weights.feather',
xdim=10, ydim=10, lr_start=0.05, lr_end=0.01, num_passes=1, seed=42,
overwrite=False):
overwrite=False, normalize=True):
"""Run the SOM training on the expression columns specified in `cell_som_cluster_cols`.
Saves the SOM weights to `base_dir/som_weights_name`.
@@ -40,6 +40,8 @@ def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
The random seed to use for training the SOM
overwrite (bool):
If set, force retrains the SOM and overwrites the weights
normalize (bool):
Whether to normalize the data by the 99.9% value of each column before training. Defaults to True.
Returns:
cluster_helpers.CellSOMCluster:
@@ -62,7 +64,7 @@ def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
cell_pysom = cluster_helpers.CellSOMCluster(
cell_som_input_data, som_weights_path, fovs, cell_som_cluster_cols,
num_passes=num_passes, xdim=xdim, ydim=ydim, lr_start=lr_start, lr_end=lr_end,
seed=seed
seed=seed, normalize=normalize
)

# train the SOM weights
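For context, a minimal usage sketch of the updated `train_cell_som` signature — the right-hand-side variables are placeholders, not values from this repository; only the keyword names come from the diff above:

from ark.phenotyping import cell_som_clustering

# Hypothetical inputs; train the cell SOM without the 99.9% normalization step.
cell_pysom = cell_som_clustering.train_cell_som(
    fovs=fovs,                                    # list of FOV names to subset on
    base_dir=base_dir,                            # analysis base directory
    cell_table_path=cell_table_path,              # path to the cell table
    cell_som_cluster_cols=cell_som_cluster_cols,  # expression columns to train on
    cell_som_input_data=cell_som_input_data,      # per-cell input DataFrame
    som_weights_name='cell_som_weights.feather',
    normalize=False                               # opt out of the default 99.9% normalization
)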
8 changes: 6 additions & 2 deletions src/ark/phenotyping/cluster_helpers.py
@@ -282,7 +282,7 @@ class CellSOMCluster(PixieSOMCluster):
def __init__(self, cell_data: pd.DataFrame, weights_path: pathlib.Path,
fovs: List[str], columns: List[str], num_passes: int = 1,
xdim: int = 10, ydim: int = 10, lr_start: float = 0.05, lr_end: float = 0.01,
seed=42):
seed=42, normalize=True):
"""Creates a cell SOM cluster object derived from the abstract PixieSOMCluster
Args:
@@ -306,6 +306,9 @@ def __init__(self, cell_data: pd.DataFrame, weights_path: pathlib.Path,
The learning rate to decay to.
seed (int):
The random seed to use.
normalize (bool):
Whether to normalize the data by the 99.9% value of each column before training. Defaults to True.
"""
super().__init__(
weights_path, columns, num_passes, xdim, ydim, lr_start, lr_end, seed
@@ -323,7 +326,8 @@ def __init__(self, cell_data: pd.DataFrame, weights_path: pathlib.Path,
].reset_index(drop=True)

# since cell_data is the only dataset, we can just normalize it immediately
self.normalize_data()
if normalize:
self.normalize_data()

def normalize_data(self):
"""Normalizes `cell_data` by the 99.9% value of each pixel cluster count column
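The body of `normalize_data` is not shown in this diff; as a rough sketch based on its docstring (an assumption, not the repository's exact implementation), 99.9% normalization of the cluster-count columns amounts to:

import numpy as np
import pandas as pd

def normalize_by_999(cell_data: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Divide each column by its 99.9% value so most entries land in [0, 1]."""
    norm_data = cell_data.copy()
    for col in columns:
        cap = np.quantile(norm_data[col].values, 0.999)
        if cap > 0:  # guard against all-zero columns
            norm_data[col] = norm_data[col] / cap
    return norm_data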
59 changes: 27 additions & 32 deletions templates/generic_cell_clustering.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"tags": []
@@ -39,7 +38,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"tags": []
@@ -67,7 +65,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -92,7 +89,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"tags": []
@@ -102,7 +98,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -132,7 +127,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -168,7 +162,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -193,7 +186,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -229,27 +221,46 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**99.9% Normalization**\n",
"\n",
"Whether to normalize each of the `cell_som_cluster_cols` by their 99.9% value prior to training FlowSOM. Set to `False` to skip this normalization step."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"normalize_param"
]
},
"outputs": [],
"source": [
"normalize = True"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2: Cell clustering"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.1: train cell SOM"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Train the cell SOM on the expression values provided per cell (the data stored in `cell_som_input_name`). Training is done using the `FlowSOM` algorithm. Note that each of the `cell_som_cluster_cols` are normalized by their 99.9% value prior to training.\n",
"Train the cell SOM on the expression values provided per cell (the data stored in `cell_som_input_name`). Training is done using the `FlowSOM` algorithm.\n",
"\n",
"For a full set of parameters you can customize for `train_cell_som`, please consult: <a href=https://ark-analysis.readthedocs.io/en/latest/_markdown/ark.phenotyping.html#ark.phenotyping.cell_cluster_utils.train_cell_som>cell training docs</a>."
]
@@ -272,20 +283,19 @@
" cell_som_cluster_cols,\n",
" cell_som_input_data,\n",
" som_weights_name=cell_som_weights_name,\n",
" num_passes=1\n",
" num_passes=1,\n",
" normalize=normalize\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.2: assign cell SOM clusters"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -321,15 +331,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3: run cell consensus clustering"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -344,7 +352,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -387,23 +394,20 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3: visualize results"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.1: use the interactive reclustering results to relabel cell meta clusters"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -455,7 +459,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -491,7 +494,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -516,7 +518,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -574,7 +575,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -602,7 +602,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -639,15 +638,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.3: save consensus cluster labels to cell table"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -670,15 +667,13 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4.6: save the full results of Pixie cell clustering"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -702,7 +697,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -750,7 +744,8 @@
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
"pygments_lexer": "ipython3",
"version": "3.11.4"
},
"vscode": {
"interpreter": {
33 changes: 31 additions & 2 deletions tests/phenotyping/cell_som_clustering_test.py
@@ -92,8 +92,9 @@ def test_train_cell_som():
cluster_som_weights_names=cell_weights.columns.values
)

# assert the shape
# assert the shape and normalization
assert cell_weights.shape == (100, 15)
assert np.all(cell_weights < 1)

# remove cell weights and weighted channel average file for next test
os.remove(cell_pysom.weights_path)
@@ -124,8 +125,36 @@ def test_train_cell_som():
cluster_som_weights_names=cell_weights.columns.values
)

# assert the shape
# assert the shape and normalization
assert cell_weights.shape == (100, 2)
assert np.all(cell_weights < 1)

# remove cell weights and weighted channel average file for next test
os.remove(cell_pysom.weights_path)

# TEST 3: check data was not normalized
_, cluster_counts_norm = cell_cluster_utils.create_c2pc_data(
fovs, pixel_data_path, cell_table_path, 'pixel_som_cluster'
)

# train the cell SOM
cell_pysom = cell_som_clustering.train_cell_som(
fovs=fovs,
base_dir=temp_dir,
cell_table_path=cell_table_path,
cell_som_cluster_cols=['pixel_som_cluster_%d' % i for i in np.arange(15)],
cell_som_input_data=cluster_counts_norm, normalize=False
)

# assert cell weights has been created
assert os.path.exists(cell_pysom.weights_path)

# read in the cell weights
cell_weights = feather.read_dataframe(cell_pysom.weights_path)

# assert the shape and lack of normalization
assert cell_weights.shape == (100, 15)
assert not np.all(cell_weights < 1)


# NOTE: overwrite functionality tested in cluster_helpers_test.py
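The new assertions lean on the fact that SOM weights trained on 99.9%-normalized columns stay below 1, while weights trained on raw cluster counts do not. A toy illustration of that check with synthetic data (not the actual test fixtures):

import numpy as np

# Synthetic stand-in for per-cell pixel cluster counts.
rng = np.random.default_rng(42)
raw_counts = rng.integers(0, 50, size=(100, 15)).astype(float)

# 99.9% normalization per column, mirroring what normalize_data describes.
normalized = raw_counts / np.quantile(raw_counts, 0.999, axis=0)

# Trained SOM weights are roughly averages of their training data, so:
assert not np.all(raw_counts.mean(axis=0) < 1)  # normalize=False case
assert np.all(normalized.mean(axis=0) < 1)      # default normalized case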
3 changes: 3 additions & 0 deletions tests/utils/notebooks_test.py
@@ -596,6 +596,9 @@ def test_cluster_prefix(self):
def test_cell_cluster_files(self):
self.tb.execute_cell("cell_cluster_files")

def test_normalization(self):
self.tb.execute_cell("normalize_param")

def test_train_cell_som(self):
self.tb.execute_cell("train_cell_som")

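These notebook tests execute individual cells by tag; a standalone sketch of the underlying testbook pattern, with the notebook path and assertion assumed rather than taken from the test class:

from testbook import testbook

@testbook('templates/generic_cell_clustering.ipynb', execute=False)
def test_normalize_param_cell(tb):
    # Run the cell tagged "normalize_param", which sets `normalize = True`.
    tb.execute_cell('normalize_param')
    # Check the toggle inside the notebook kernel's namespace.
    tb.inject('assert normalize is True')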
