Merge branch 'main' into releases/v0.6.6

angelolab · Aug 25, 2023 · fa3c8aa · fa3c8aa
2 parents c95cef0 + 2ba38f2
commit fa3c8aa
Show file tree

Hide file tree

Showing 16 changed files with 140 additions and 74 deletions.
diff --git a/README.md b/README.md
@@ -58,10 +58,10 @@ The [**segmentation notebook**](./templates/1_Segment_Image_Data.ipynb) will wal
   - *Note:* It is assumed that the cell table uses the default column names as in `ark/settings.py`. Refer to the [docs](docs/_rtd/data_types.md) to get descriptions of the cell table columns, and methods to adjust them if necessary.
 
 #### 2. Pixel clustering with Pixie  
-The first step in the [Pixie](https://www.biorxiv.org/content/10.1101/2022.08.16.504171v1) pipeline is to run the [**pixel clustering notebook**](./templates/2_Pixie_Cluster_Pixels.ipynb). The notebook walks you through the process of generating pixel clusters for your data, and lets you specify what markers to use for the clustering, train a model, use it to classify your entire dataset, and generate pixel cluster overlays. The notebook includes a GUI for manual cluster adjustment and annotation. [Workshop Talk - Session IV - Pixel Level Analysis](https://youtu.be/e7C1NvaPLaY)
+The first step in the [Pixie](https://doi.org/10.1038/s41467-023-40068-5) pipeline is to run the [**pixel clustering notebook**](./templates/2_Pixie_Cluster_Pixels.ipynb). The notebook walks you through the process of generating pixel clusters for your data, and lets you specify what markers to use for the clustering, train a model, use it to classify your entire dataset, and generate pixel cluster overlays. The notebook includes a GUI for manual cluster adjustment and annotation. [Workshop Talk - Session IV - Pixel Level Analysis](https://youtu.be/e7C1NvaPLaY)
 
 #### 3. Cell clustering with Pixie  
-The second step in the [Pixie](https://www.biorxiv.org/content/10.1101/2022.08.16.504171v1) pipeline is to run the [**cell clustering notebook**](./templates/3_Pixie_Cluster_Cells.ipynb). This notebook will use the pixel clusters generated in the first notebook to cluster the cells in your dataset. The notebook walks you through generating cell clusters for your data and generates cell cluster overlays. The notebook includes a GUI for manual cluster adjustment and annotation. [Workshop Talk - Session V - Cell-level Analysis - Part 2: Cell Clustering](https://youtu.be/4_AJxrxPYlk?t=2704)
+The second step in the [Pixie](https://doi.org/10.1038/s41467-023-40068-5) pipeline is to run the [**cell clustering notebook**](./templates/3_Pixie_Cluster_Cells.ipynb). This notebook will use the pixel clusters generated in the first notebook to cluster the cells in your dataset. The notebook walks you through generating cell clusters for your data and generates cell cluster overlays. The notebook includes a GUI for manual cluster adjustment and annotation. [Workshop Talk - Session V - Cell-level Analysis - Part 2: Cell Clustering](https://youtu.be/4_AJxrxPYlk?t=2704)
 
 #### 4. Post Clustering Tasks
 After the Pixie Pipeline, the user can inspect and fine tune their results with the [**post clustering notebook**](./templates/4_Post_Clustering.ipynb). This notebook will go over cleaning up artifacts left from clustering, and working with functional markers.
@@ -106,7 +106,7 @@ Open terminal and navigate to where you want the code stored.
 If you would like to use the latest version of `ark` simply clone the project and create the Conda environment.
 
 ```sh
-git clone -b v0.6.4 https://github.com/angelolab/ark-analysis.git
+git clone -b v0.6.5 https://github.com/angelolab/ark-analysis.git
 cd ark-analysis
 conda env create -f environment.yml
 ```
@@ -379,4 +379,4 @@ If you would like to help make `ark` better, please take a look at our [contribu
 Please directly cite the `ark` repo (https://github.com/angelolab/ark-analysis) if it was a part of your analysis. In addition, please cite the relevant paper(s) below where applicable to your study. 
 
 1. [Greenwald, Miller et al. Whole-cell segmentation of tissue images with human-level performance using large-scale data annotation and deep learning [2021]](https://www.nature.com/articles/s41587-021-01094-0)
-2. [Liu et al. Robust phenotyping of highly multiplexed tissue imaging data using pixel-level clustering [2022]](https://www.biorxiv.org/content/10.1101/2022.08.16.504171v1)
+2. [Liu et al. Robust phenotyping of highly multiplexed tissue imaging data using pixel-level clustering [2023]](https://doi.org/10.1038/s41467-023-40068-5)
diff --git a/src/ark/analysis/cell_neighborhood_stats.py b/src/ark/analysis/cell_neighborhood_stats.py
@@ -49,8 +49,11 @@ def compute_neighborhood_diversity(neighborhood_mat, cell_type_col):
 
     diversity_data = []
     fov_list = np.unique(neighborhood_mat[settings.FOV_ID])
-    with tqdm(total=len(fov_list), desc="Calculate Neighborhood Diversity") as diversity_progress:
+    with tqdm(total=len(fov_list), desc="Calculate Neighborhood Diversity", unit="FOVs") \
+            as diversity_progress:
         for fov in fov_list:
+            diversity_progress.set_postfix(FOV=fov)
+
             fov_neighborhoods = neighborhood_mat[neighborhood_mat[settings.FOV_ID] == fov]
 
             diversity_scores = []
@@ -72,7 +75,6 @@ def compute_neighborhood_diversity(neighborhood_mat, cell_type_col):
             })
             diversity_data.append(fov_data)
 
-            diversity_progress.set_postfix(FOV=fov)
             diversity_progress.update(1)
 
     # dataframe containing all fovs
@@ -216,8 +218,11 @@ def generate_cell_distance_analysis(
     fov_list = np.unique(cell_table[fov_col])
 
     cell_dists = []
-    with tqdm(total=len(fov_list), desc="Calculate Average Distances") as distance_progress:
+    with tqdm(total=len(fov_list), desc="Calculate Average Distances", unit="FOVs") \
+            as distance_progress:
         for fov in fov_list:
+            distance_progress.set_postfix(FOV=fov)
+
             fov_cell_table = cell_table[cell_table[fov_col] == fov]
             fov_dist_xr = xr.load_dataarray(os.path.join(dist_mat_dir, str(fov) + '_dist_mat.xr'))
 
@@ -231,7 +236,6 @@ def generate_cell_distance_analysis(
             fov_cell_dists.insert(2, cell_type_col, fov_cell_table[cell_type_col])
             cell_dists.append(fov_cell_dists)
 
-            distance_progress.set_postfix(FOV=fov)
             distance_progress.update(1)
 
     # combine data for all fovs and save to csv

diff --git a/src/ark/analysis/dimensionality_reduction.py b/src/ark/analysis/dimensionality_reduction.py
@@ -98,20 +98,23 @@ def visualize_dimensionality_reduction(cell_data, columns, category, color_map="
 
         plot_dim_reduced_data(embedding[:, 0], embedding[:, 1], fig_id=1,
                               hue=cell_data[category], cell_data=cell_data, title=graph_title,
-                              dpi=dpi, save_dir=save_dir, save_file="UMAPVisualization.png")
+                              dpi=dpi, save_dir=save_dir, save_file="UMAPVisualization.png",
+                              palette=color_map)
 
     elif algorithm == "PCA":
         pca = PCA()
         pca_result = pca.fit_transform(cell_data[columns].values)
 
         plot_dim_reduced_data(pca_result[:, 0], pca_result[:, 1], fig_id=2,
                               hue=cell_data[category], cell_data=cell_data, title=graph_title,
-                              dpi=dpi, save_dir=save_dir, save_file="PCAVisualization.png")
+                              dpi=dpi, save_dir=save_dir, save_file="PCAVisualization.png",
+                              palette=color_map)
 
     elif algorithm == "tSNE":
         tsne = TSNE()
         tsne_results = tsne.fit_transform(cell_data[columns].values)
 
         plot_dim_reduced_data(tsne_results[:, 0], tsne_results[:, 1], fig_id=3,
                               hue=cell_data[category], cell_data=cell_data, title=graph_title,
-                              dpi=dpi, save_dir=save_dir, save_file="tSNEVisualization.png")
+                              dpi=dpi, save_dir=save_dir, save_file="tSNEVisualization.png",
+                              palette=color_map)
diff --git a/src/ark/analysis/neighborhood_analysis.py b/src/ark/analysis/neighborhood_analysis.py
@@ -76,9 +76,11 @@ def create_neighborhood_matrix(all_data, dist_mat_dir, included_fovs=None, distl
 
     cell_neighbor_freqs = cell_neighbor_counts.copy(deep=True)
 
-    with tqdm(total=len(included_fovs), desc="Neighbors Matrix Generation") \
+    with tqdm(total=len(included_fovs), desc="Neighbors Matrix Generation", unit="FOVs") \
             as neighbor_mat_progress:
         for fov in included_fovs:
+            neighbor_mat_progress.set_postfix(FOV=fov)
+
             # Subsetting expression matrix to only include patients with correct fov label
             current_fov_idx = all_neighborhood_data.loc[:, fov_col] == fov
             current_fov_neighborhood_data = all_neighborhood_data[current_fov_idx]
@@ -100,7 +102,6 @@ def create_neighborhood_matrix(all_data, dist_mat_dir, included_fovs=None, distl
             cell_neighbor_freqs.loc[current_fov_neighborhood_data.index, fov_cluster_names]\
                 = freqs
 
-            neighbor_mat_progress.set_postfix(FOV=fov)
             neighbor_mat_progress.update(1)
 
     # Remove cells that have no neighbors within the distlim

diff --git a/src/ark/analysis/spatial_analysis_utils.py b/src/ark/analysis/spatial_analysis_utils.py
@@ -37,8 +37,11 @@ def calc_dist_matrix(label_dir, save_path, prefix='_whole_cell'):
     fov_files = io_utils.list_files(label_dir, substrs=prefix + '.tiff')
 
     # iterate for each fov
-    with tqdm(total=len(fov_files), desc="Distance Matrix Generation") as dist_mat_progress:
+    with tqdm(total=len(fov_files), desc="Distance Matrix Generation", unit="FOVs") \
+            as dist_mat_progress:
         for fov_file in fov_files:
+            dist_mat_progress.set_postfix(FOV=fov_file)
+
             # retrieve the fov name
             fov_name = fov_file.replace(prefix + '.tiff', '')
 
@@ -66,7 +69,6 @@ def calc_dist_matrix(label_dir, save_path, prefix='_whole_cell'):
                 format='NETCDF3_64BIT'
             )
 
-            dist_mat_progress.set_postfix(FOV=fov_name)
             dist_mat_progress.update(1)
 
 

diff --git a/src/ark/analysis/spatial_enrichment.py b/src/ark/analysis/spatial_enrichment.py
@@ -70,8 +70,11 @@ def generate_channel_spatial_enrichment_stats(label_dir, dist_mat_dir, marker_th
     values = []
     stats_datasets = []
 
-    with tqdm(total=len(all_label_fovs), desc="Channel Spatial Enrichment") as chan_progress:
+    with tqdm(total=len(all_label_fovs), desc="Channel Spatial Enrichment", unit="FOVs") \
+            as chan_progress:
         for fov_name, label_file in zip(all_label_fovs, all_label_names):
+            chan_progress.set_postfix(FOV=fov_name)
+
             label_maps = load_utils.load_imgs_from_dir(label_dir, files=[label_file],
                                                        xr_channel_names=[xr_channel_name],
                                                        trim_suffix=suffix)
@@ -292,8 +295,11 @@ def generate_cluster_spatial_enrichment_stats(label_dir, dist_mat_dir, all_data,
     values = []
     stats_datasets = []
 
-    with tqdm(total=len(all_label_fovs), desc="Cluster Spatial Enrichment") as clust_progress:
+    with tqdm(total=len(all_label_fovs), desc="Cluster Spatial Enrichment", unit="FOVs") \
+            as clust_progress:
         for fov_name, label_file in zip(all_label_fovs, all_label_names):
+            clust_progress.set_postfix(FOV=fov_name)
+
             label_maps = load_utils.load_imgs_from_dir(label_dir, files=[label_file],
                                                        xr_channel_names=[xr_channel_name],
                                                        trim_suffix=suffix)

diff --git a/src/ark/phenotyping/cell_som_clustering.py b/src/ark/phenotyping/cell_som_clustering.py
@@ -8,7 +8,7 @@
 def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
                    cell_som_input_data, som_weights_name='cell_som_weights.feather',
                    xdim=10, ydim=10, lr_start=0.05, lr_end=0.01, num_passes=1, seed=42,
-                   overwrite=False):
+                   overwrite=False, normalize=True):
     """Run the SOM training on the expression columns specified in `cell_som_cluster_cols`.
 
     Saves the SOM weights to `base_dir/som_weights_name`.
@@ -40,6 +40,8 @@ def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
             The random seed to use for training the SOM
         overwrite (bool):
             If set, force retrains the SOM and overwrites the weights
+        normalize (bool):
+            Whether to apply 99.9th percentile normalization. Defaults to True.
 
     Returns:
         cluster_helpers.CellSOMCluster:
@@ -62,7 +64,7 @@ def train_cell_som(fovs, base_dir, cell_table_path, cell_som_cluster_cols,
     cell_pysom = cluster_helpers.CellSOMCluster(
         cell_som_input_data, som_weights_path, fovs, cell_som_cluster_cols,
         num_passes=num_passes, xdim=xdim, ydim=ydim, lr_start=lr_start, lr_end=lr_end,
-        seed=seed
+        seed=seed, normalize=normalize
     )
 
     # train the SOM weights

diff --git a/src/ark/phenotyping/cluster_helpers.py b/src/ark/phenotyping/cluster_helpers.py
@@ -282,7 +282,7 @@ class CellSOMCluster(PixieSOMCluster):
     def __init__(self, cell_data: pd.DataFrame, weights_path: pathlib.Path,
                  fovs: List[str], columns: List[str], num_passes: int = 1,
                  xdim: int = 10, ydim: int = 10, lr_start: float = 0.05, lr_end: float = 0.01,
-                 seed=42):
+                 seed=42, normalize=True):
         """Creates a cell SOM cluster object derived from the abstract PixieSOMCluster
 
         Args:
@@ -306,6 +306,9 @@ def __init__(self, cell_data: pd.DataFrame, weights_path: pathlib.Path,
                 The learning rate to decay to.
             seed (int):
                 The random seed to use.
+            normalize (bool):
+                Whether to apply 99.9th percentile normalization. Defaults to True.
+
         """
         super().__init__(
             weights_path, columns, num_passes, xdim, ydim, lr_start, lr_end, seed
@@ -323,7 +326,8 @@ def __init__(self, cell_data: pd.DataFrame, weights_path: pathlib.Path,
         ].reset_index(drop=True)
 
         # since cell_data is the only dataset, we can just normalize it immediately
-        self.normalize_data()
+        if normalize:
+            self.normalize_data()
 
     def normalize_data(self):
         """Normalizes `cell_data` by the 99.9% value of each pixel cluster count column

diff --git a/src/ark/segmentation/fiber_segmentation.py b/src/ark/segmentation/fiber_segmentation.py
@@ -16,6 +16,7 @@
 from skimage.measure import regionprops_table
 from skimage.morphology import remove_small_objects
 from skimage.segmentation import watershed
+from tqdm.auto import tqdm
 
 from ark import settings
 from ark.utils.plot_utils import set_minimum_color_for_colormap
@@ -176,14 +177,21 @@ def run_fiber_segmentation(data_dir, fiber_channel, out_dir, img_sub_folder=None
 
     fiber_object_table = []
 
-    for fov in fovs:
-        print(f'Processing FOV: {fov}')
-        subset_xr = load_utils.load_imgs_from_tree(
-            data_dir, img_sub_folder, fovs=fov, channels=[fiber_channel]
-        )
-        subtable = segment_fibers(subset_xr, fiber_channel, out_dir, fov, save_csv=False,
-                                  **kwargs)
-        fiber_object_table.append(subtable)
+    with tqdm(total=len(fovs), desc="Fiber Segmentation", unit="FOVs") \
+            as fibseg_progress:
+        for fov in fovs:
+            fibseg_progress.set_postfix(FOV=fov)
+
+            subset_xr = load_utils.load_imgs_from_tree(
+                data_dir, img_sub_folder, fovs=fov, channels=[fiber_channel]
+            )
+            # run fiber segmentation on the FOV
+            subtable = segment_fibers(subset_xr, fiber_channel, out_dir, fov, save_csv=False,
+                                      **kwargs)
+            fiber_object_table.append(subtable)
+
+            # update progress bar
+            fibseg_progress.update(1)
 
     fiber_object_table = pd.concat(fiber_object_table)
 

diff --git a/src/ark/utils/data_utils.py b/src/ark/utils/data_utils.py
@@ -327,10 +327,10 @@ def generate_and_save_cell_cluster_masks(
     )
 
     # create the pixel cluster masks across each fov
-    with tqdm(
-        total=len(fovs), desc="Cell Cluster Mask Generation", unit="FOVs"
-    ) as pbar:
+    with tqdm(total=len(fovs), desc="Cell Cluster Mask Generation", unit="FOVs") as pbar:
         for fov in fovs:
+            pbar.set_postfix(FOV=fov)
+
             # generate the cell mask for the FOV
             cell_mask: np.ndarray = generate_cell_cluster_mask(
                 fov=fov, seg_dir=seg_dir, ccmd=ccmd, seg_suffix=seg_suffix
@@ -461,8 +461,11 @@ def generate_and_save_pixel_cluster_masks(fovs: List[str],
     """
 
     # create the pixel cluster masks across each fov
-    with tqdm(total=len(fovs), desc="Pixel Cluster Mask Generation") as pixel_mask_progress:
+    with tqdm(total=len(fovs), desc="Pixel Cluster Mask Generation", unit="FOVs") \
+            as pixel_mask_progress:
         for fov in fovs:
+            pixel_mask_progress.set_postfix(FOV=fov)
+
             # define the path to provided channel file in the fov dir, used to calculate dimensions
             chan_file_path = os.path.join(fov, chan_file)
 
@@ -532,11 +535,12 @@ def generate_and_save_neighborhood_cluster_masks(
     )
 
     # create the neighborhood cluster masks across each fov
-    with tqdm(
-        total=len(fovs), desc="Neighborhood Cluster Mask Generation"
-    ) as neigh_mask_progress:
+    with tqdm(total=len(fovs), desc="Neighborhood Cluster Mask Generation", unit="FOVs") \
+            as neigh_mask_progress:
         # generate the mask for each FOV
         for fov in fovs:
+            neigh_mask_progress.set_postfix(FOV=fov)
+
             # load in the label map for the FOV
             label_map = load_utils.load_imgs_from_dir(
                 seg_dir,

diff --git a/src/ark/utils/deepcell_service_utils.py b/src/ark/utils/deepcell_service_utils.py
@@ -296,13 +296,17 @@ def run_deepcell_direct(input_dir, output_dir, host='https://deepcell.org',
             }
         ).json()
         if redis_response['value'][0] == 'done':
+            # make sure progress bar shows 100%
+            pbar_next = int(redis_response['value'][1])
+            progress_bar.update(max(pbar_next - pbar_last, 0))
             break
 
         # update progress bar here
         if redis_response['value'][0] == 'waiting':
             pbar_next = int(redis_response['value'][1])
-            progress_bar.update(max(pbar_next - pbar_last, 0))
-            pbar_last = pbar_next
+            if pbar_next > pbar_last:
+                progress_bar.update(max(pbar_next - pbar_last, 0))
+                pbar_last = pbar_next
 
         if redis_response['value'][0] not in ['done', 'waiting', 'new']:
             print(redis_response['value'])

diff --git a/src/ark/utils/plot_utils.py b/src/ark/utils/plot_utils.py
@@ -686,6 +686,8 @@ def save_colored_masks(
 
     with tqdm(total=len(fovs), desc="Saving colored masks", unit="FOVs") as pbar:
         for fov in fovs:
+            pbar.set_postfix(FOV=fov, refresh=False)
+
             mask: xr.DataArray = load_utils.load_imgs_from_dir(
                 data_dir=mask_dir,
                 files=[f"{fov}_{cluster_type}_mask.tiff"],
@@ -705,5 +707,4 @@ def save_colored_masks(
                 fname=save_dir / f"{fov}_{cluster_type}_mask_colored.tiff",
                 data=colored_mask,)
 
-            pbar.set_postfix(FOV=fov, refresh=False)
             pbar.update(1)
diff --git a/start_docker.sh b/start_docker.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 # define the version number, this needs to be updated every new Docker release
-VERSION='v0.6.4'
+VERSION='v0.6.5'
 
 # check for template developer flag
 JUPYTER_DIR='scripts'