-
Notifications
You must be signed in to change notification settings - Fork 39
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
08f781c
commit c2b84e2
Showing
9 changed files
with
198 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
# This script is designed to help package a new version of a plugin.
# Steps, in order: bump the version, build the image, push the image,
# run the test suite.

# Stop at the first failing step so that, for example, a failed build is
# never followed by a push of a stale image.
set -euo pipefail

# Get the new version ($(<file) is bash-specific, hence the bash shebang)
version=$(<VERSION)

# Bump the version
bump2version --config-file bumpversion.cfg --new-version "${version}" --allow-dirty part

# Build the container
./build-docker.sh

# Push to dockerhub
docker push "polusai/feature-subsetting-plugin:${version}"

# Run pytests
python -m pytest -s tests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash
# Run the plugin container against the local ./data directory, which is
# mounted into the container at the same absolute path.

# Abort if the version or the data path cannot be determined.
set -euo pipefail

version=$(<VERSION)
datapath=$(readlink --canonicalize data)
echo "${datapath}"

# Inputs
inpDir="${datapath}/input"
filePattern=".*.csv"
groupingPattern="\w+$"
labelCol="species"
minClusterSize=3
outDir="${datapath}/output"

# Every expansion below is quoted so that paths containing spaces stay a
# single argument and the regex patterns are not subject to pathname
# expansion by the shell.
# NOTE(review): confirm the image name matches the plugin this script ships with.
docker run -v "${datapath}:${datapath}" \
    "polusai/hdbscan-clustering-plugin:${version}" \
    --inpDir "${inpDir}" \
    --filePattern "${filePattern}" \
    --groupingPattern "${groupingPattern}" \
    --labelCol "${labelCol}" \
    --minClusterSize "${minClusterSize}" \
    --incrementOutlierId \
    --outDir "${outDir}"
2 changes: 2 additions & 0 deletions
2
...ing/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
"""Feature Subsetting Plugin.""" | ||
__version__ = "0.2.0-dev" |
156 changes: 156 additions & 0 deletions
156
...ing/feature-subsetting-plugin/src/polus/plugins/clustering/feature_subsetting/__main__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
"""Hdbscan Clustering Plugin.""" | ||
|
||
import json | ||
import logging | ||
from multiprocessing import cpu_count | ||
from pathlib import Path | ||
from typing import Any | ||
from typing import Optional | ||
|
||
import filepattern as fp | ||
import polus.plugins.clustering.feature_subsetting.feature_subset as fs | ||
import preadator | ||
import typer | ||
from tqdm import tqdm | ||
|
||
# Typer application object; commands are registered on it via @app.command().
app = typer.Typer()

# Set the log record layout and timestamp format, then create this plugin's
# named logger and set its level to INFO.
logging.basicConfig(
    datefmt="%d-%b-%y %H:%M:%S",
    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
)
logger = logging.getLogger("polus.plugins.clustering.feature_subsetting")
logger.setLevel(logging.INFO)
|
||
|
||
@app.command()
def main(  # noqa: PLR0913
    inp_dir: Path = typer.Option(
        ...,
        "--inpDir",
        "-i",
        help="Path to folder with tabular files",
    ),
    file_pattern: Optional[str] = typer.Option(
        ".*",
        "--filePattern",
        "-f",
        help="Pattern use to parse filenames",
    ),
    grouping_pattern: Optional[str] = typer.Option(
        None,
        "--groupingPattern",
        "-g",
        help="Regular expression to group rows to capture groups.",
    ),
    average_groups: Optional[bool] = typer.Option(
        False,
        "--averageGroups",
        "-a",
        help="Whether to average data across groups. Requires capture groups.",
    ),
    label_col: Optional[str] = typer.Option(
        None,
        "--labelCol",
        "-l",
        help="Name of column containing labels. Required only for grouping operations.",
    ),
    min_cluster_size: int = typer.Option(
        ...,
        "--minClusterSize",
        "-m",
        help="Minimum cluster size.",
    ),
    increment_outlier_id: Optional[bool] = typer.Option(
        False,
        "--incrementOutlierId",
        "-io",
        help="Increments outlier ID to 1.",
    ),
    out_dir: Path = typer.Option(
        ...,
        "--outDir",
        "-o",
        help="Output directory",
    ),
    preview: Optional[bool] = typer.Option(
        False,
        "--preview",
        help="Output a JSON preview of files",
    ),
) -> None:
    """Cluster data using HDBSCAN."""
    # The docstring is kept to one line because Typer shows it as the command
    # help text; the remaining contract is documented here instead.
    #
    # Flow: log the options, validate both directories, collect the files
    # matching --filePattern, then either write <outDir>/preview.json
    # (--preview) or submit one clustering job per matched file.
    #
    # Raises:
    #   FileNotFoundError: --inpDir or --outDir does not exist.
    #   ValueError: no file in --inpDir matches --filePattern.
    logger.info(f"--inpDir = {inp_dir}")
    logger.info(f"--filePattern = {file_pattern}")
    # Regular expression for grouping.
    logger.info(f"--groupingPattern = {grouping_pattern}")
    # Whether to average data for each group.
    logger.info(f"--averageGroups = {average_groups}")
    # Name of column to use for grouping.
    logger.info(f"--labelCol = {label_col}")
    # Minimum cluster size for clustering using HDBSCAN.
    logger.info(f"--minClusterSize = {min_cluster_size}")
    # Set outlier cluster id as 1.
    logger.info(f"--incrementOutlierId = {increment_outlier_id}")
    logger.info(f"--outDir = {out_dir}")

    inp_dir = inp_dir.resolve()
    out_dir = out_dir.resolve()

    # Explicit raises rather than `assert`: assertions are removed when Python
    # runs with -O, which would silently skip this validation.
    if not inp_dir.exists():
        msg = f"{inp_dir} does not exist!! Please check input path again"
        raise FileNotFoundError(msg)
    if not out_dir.exists():
        msg = f"{out_dir} does not exist!! Please check output path again"
        raise FileNotFoundError(msg)

    # Use every available core, but never fewer than two worker processes.
    num_workers = max(cpu_count(), 2)

    files = fp.FilePattern(inp_dir, file_pattern)

    # Materialize the matches once: they feed the emptiness check, the
    # progress-bar total and the iteration. (A constructor call never yields
    # None, so the emptiness of the matches is what has to be tested.)
    matched = list(files())
    if not matched:
        msg = f"No tabular files found. Please check {file_pattern} again"
        raise ValueError(msg)

    # NOTE(review): `hd` is not imported anywhere in this module (the plugin
    # module is imported as `fs`), so both `hd.` references below raise
    # NameError at runtime. Import the module that provides POLUS_TAB_EXT and
    # hdbscan_clustering, or port this command to `fs`.
    if preview:
        out_json: dict[str, Any] = {
            "filepattern": file_pattern,
            "outDir": [],
        }
        for file in matched:
            # Each match is indexed as file[1][0] to obtain the input path.
            inp_file = file[1][0]
            out_name = inp_file.name.replace(
                "".join(inp_file.suffixes),
                f"_hdbscan{hd.POLUS_TAB_EXT}",
            )
            out_json["outDir"].append(out_name)
        # Open the file only after the content is built, so a failure above
        # does not leave a truncated preview.json behind.
        with Path(out_dir, "preview.json").open("w") as jfile:
            json.dump(out_json, jfile, indent=2)
    else:
        with preadator.ProcessManager(
            name="Cluster data using HDBSCAN",
            num_processes=num_workers,
            threads_per_process=2,
        ) as pm:
            for file in tqdm(
                matched,
                total=len(matched),
                desc="Clustering data",
                mininterval=5,
                initial=0,
                unit_scale=True,
                colour="cyan",
            ):
                pm.submit_process(
                    hd.hdbscan_clustering,
                    file[1][0],
                    min_cluster_size,
                    out_dir,
                    grouping_pattern,
                    label_col,
                    average_groups,
                    increment_outlier_id,
                )
            pm.join_processes()


if __name__ == "__main__":
    app()
1 change: 1 addition & 0 deletions
1
...ing/feature-subsetting-plugin/src/main.py → ...ring/feature_subsetting/feature_subset.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
Empty file.
Empty file.