Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse CellProfiler features #298

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
138 changes: 138 additions & 0 deletions pycytominer/cyto_utils/parse_cp_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
def parse_cp_features(
feature: str, channels: list = ["DNA", "RNA", "AGP", "Mito", "ER", "mito_tubeness"]
):
"""Parses a CellProfiler feature string into its semantic components.

This function will take a feature string and return a dictionary containing its semantic components,
specifically: the compartment, feature group, feature type, and channel.
If the feature string is not in a recognized format, the function will assign 'Unknown' to the non-comprehensible components.
Channel information will be returned as 'None' where it's not applicable.

Parameters
----------
feature : str
The CellProfiler feature string to parse.

channels : list, optional
A list of channel names to use when parsing the feature string. The default is ['DNA', 'RNA', 'AGP', 'Mito', 'ER', "mito_tubeness"].

Returns
-------
dict
A dictionary with the following keys: 'feature', 'compartment', 'feature_group', 'feature_type', 'channel'.
Each key maps to the respective component of the feature string.

Raises
------
ValueError
Raised if the input is not a string.
"""

if not isinstance(feature, str):
raise ValueError(f"Expected a string, got {type(feature).__name__}")

if not isinstance(channels, list):
raise ValueError(f"Expected a list, got {type(channels).__name__}")

def channel_standardizer(channel):
channel = channel.replace("Orig", "")
return channel

unique_token = "XUNIQUEX"
tokenized_feature = feature
for channel in channels:
tokenized_channel = channel.replace("_", unique_token)
tokenized_feature = tokenized_feature.replace(channel, tokenized_channel)

parts = tokenized_feature.split("_")

feature_group = parts[1]
if parts[0] not in ["Cells", "Cytoplasm", "Nuclei", "Image"]:
compartment = "XUNKNOWN"
feature_group = "XUNKNOWN"
feature_type = "XUNKNOWN"
channel = "XUNKNOWN"
else:
compartment = parts[0]
feature_group = parts[1]
feature_type = "XNONE" # default value
channel = "XNONE" # default value

if feature_group in [
"AreaShape",
"Neighbors",
"Children",
"Parent",
"Number",
"Threshold",
"ObjectSkeleton",
]:
# Examples:
# Cells,AreaShape,Zernike_2_0
# Cells,AreaShape,BoundingBoxArea
# Cells,Neighbors,AngleBetweenNeighbors_Adjacent
# Nuclei,Children,Cytoplasm_Count
# Nuclei,Parent,NucleiIncludingEdges
# Nuclei,Number,ObjectNumber
# Image,Threshold,SumOfEntropies_NucleiIncludingEdges
# Nuclei,ObjectSkeleton,NumberTrunks_mito_skel

feature_type = parts[2]

elif feature_group == "Location":
# Examples:
# Cells,Location_CenterMassIntensity_X_DNA
# Cells,Location_Center_X

feature_type = parts[2]
if feature_type != "Center":
channel = parts[4]

elif feature_group == "Count":
# Examples:
# Cells,Count,Cells
pass

elif feature_group == "Granularity":
# Examples:
# Cells,Granularity,15_ER
channel = parts[3]

elif feature_group in ["Intensity", "ImageQuality"]:
# Examples:
# Cells,Intensity,MeanIntensity_DNA
# Image,ImageQuality,MaxIntensity_OrigAGP
feature_type = parts[2]
channel = parts[3]

elif feature_group == "Correlation":
# Examples:
# Cells,Correlation,Correlation_DNA_ER
feature_type = parts[2]
channel = [parts[3], parts[4]]
channel.sort()
channel = "_".join(channel)

elif feature_group in ["Texture", "RadialDistribution"]:
# Examples:
# Cells,Texture,SumEntropy_ER_3_01_256
# Cells,RadialDistribution,FracAtD_mito_tubeness_2of16
feature_type = parts[2]
channel = parts[3]

else:
feature_group = "XUNKNOWN"
feature_type = "XUNKNOWN"
channel = "XUNKNOWN"

channel = "_".join(list(map(channel_standardizer, channel.split("_"))))

channel = channel.replace(unique_token, "_")

return {
"feature": feature,
"compartment": compartment,
"feature_group": feature_group,
"feature_type": feature_type,
"channel": channel,
}
5 changes: 5 additions & 0 deletions pycytominer/cyto_utils/parse_cp_features_cmd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from pycytominer.cyto_utils.parse_cp_features import parse_cp_features
import fire

if __name__ == "__main__":
fire.Fire(parse_cp_features)
41 changes: 41 additions & 0 deletions tests/test_cyto_utils/test_parse_cp_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""This tests parse_cp_features"""

from pycytominer.cyto_utils.parse_cp_features import parse_cp_features
import pathlib
import pandas as pd


def test_parse_feature():
feature_strings = [
"Cells_RadialDistribution_RadialCV_mito_tubeness_Overflow",
"Cells_Texture_SumVariance_RNA_5",
"Nuclei_Intensity_MaxIntensityEdge_DNA",
"Cytoplasm_Correlation_Correlation_DNA_RNA",
"Image_AreaShape_Compactness",
]

for feature in feature_strings:
result = parse_cp_features(feature)
assert isinstance(result, dict)
assert result["feature"] == feature
assert result["compartment"] is not None
assert result["feature_group"] is not None
assert result["feature_type"] is not None
assert result["channel"] is not None


def test_parse_feature_with_file():
cp_features_file = f"{pathlib.Path(__file__).parent.parent}/test_data/parse_cp_features_example_data/cp_features.txt"
with open(cp_features_file, "r") as file:
features = file.read().splitlines()

parsed_features_df = pd.DataFrame(
[parse_cp_features(feature.strip()) for feature in features]
)

pd.testing.assert_frame_equal(
parsed_features_df,
pd.read_csv(
f"{pathlib.Path(__file__).parent.parent}/test_data/parse_cp_features_example_data/parsed_features.csv"
),
)
Loading