diff --git a/.github/workflows/package-filter.yml b/.github/workflows/package-filter.yml index 9ab8f353b..070367408 100644 --- a/.github/workflows/package-filter.yml +++ b/.github/workflows/package-filter.yml @@ -20,6 +20,9 @@ on: list: description: "The list of directories containing the updated packages" value: ${{ jobs.package-filter.outputs.list }} + num_packages: + description: "The number of updated packages" + value: ${{ jobs.package-filter.outputs.num_packages }} permissions: contents: read @@ -31,6 +34,7 @@ jobs: outputs: matrix: ${{ steps.package-filter.outputs.matrix }} list: ${{ steps.package-filter.outputs.list }} + num_packages: ${{ steps.package-filter.outputs.num_packages }} steps: - name: Checkout uses: actions/checkout@v4 @@ -41,6 +45,7 @@ jobs: - name: Find Updated Packages id: package-filter run: | + NUM_PACKAGES=0 PACKAGE_DIRS="" COMPANION_FILES="VERSION .bumpversion.cfg" @@ -63,7 +68,17 @@ jobs: echo "The comparison point is ${comparison_point}" # Get the changed files - changed_files=$(git diff --name-only ${comparison_point}...) + # the `--diff-filter=ACMR` flag filters out deleted files. The filters are as follows: + # A: Added + # C: Copied + # M: Modified + # R: Renamed + # D: Deleted + # T: Type changed (for example, regular file or symlink or submodule) + # U: Unmerged + # X: Unknown + # B: Broken pairing + changed_files=$(git diff --diff-filter=ACMR --name-only ${comparison_point}...) # echo the changed files echo "The changed files are $changed_files" @@ -113,6 +128,7 @@ jobs: fi fi + NUM_PACKAGES=$((NUM_PACKAGES+1)) PACKAGE_DIRS="$PACKAGE_DIRS ${pkg_dir}" fi done @@ -124,29 +140,38 @@ jobs: echo "The updated packages are $PACKAGE_DIRS" if [ -z "$PACKAGE_DIRS" ] then - echo "::error::No updated packages were found" && exit 1 - fi + echo "::warning::No updated packages were found" - # Convert the package directories to JSON for the output matrix - JSON="{\"include\": [" - for package_dir in $PACKAGE_DIRS - do - package_name=$(basename $package_dir) - JSON_LINE="{\"package_dir\": \"${package_dir}\", \"package_name\": \"${package_name}\"}," - # Add the JSON line to the JSON string if it is not already included - if [ ! "$JSON" == *"$JSON_LINE"* ] + echo "matrix={\"include\": []}" >> $GITHUB_OUTPUT + echo "list=" >> $GITHUB_OUTPUT + echo "num_packages=0" >> $GITHUB_OUTPUT + + else + echo "The number of updated packages is $NUM_PACKAGES" + + # Convert the package directories to JSON for the output matrix + JSON="{\"include\": [" + for package_dir in $PACKAGE_DIRS + do + package_name=$(basename $package_dir) + JSON_LINE="{\"package_dir\": \"${package_dir}\", \"package_name\": \"${package_name}\"}," + # Add the JSON line to the JSON string if it is not already included + if [ ! 
"$JSON" == *"$JSON_LINE"* ] + then + JSON="$JSON$JSON_LINE" + fi + done + + # Remove trailing comma and add closing brackets + if [ "$JSON" == *"," ] then - JSON="$JSON$JSON_LINE" + JSON="${JSON%?}" fi - done + JSON="$JSON]}" - # Remove trailing comma and add closing brackets - if [ "$JSON" == *"," ] - then - JSON="${JSON%?}" - fi - JSON="$JSON]}" + # Set the output + echo "matrix=$( echo "$JSON" )" >> $GITHUB_OUTPUT + echo "list=$( echo "$PACKAGE_DIRS" )" >> $GITHUB_OUTPUT + echo "num_packages=$NUM_PACKAGES" >> $GITHUB_OUTPUT - # Set the output - echo "matrix=$( echo "$JSON" )" >> $GITHUB_OUTPUT - echo "list=$( echo "$PACKAGE_DIRS" )" >> $GITHUB_OUTPUT + fi diff --git a/.github/workflows/package-tests.yml b/.github/workflows/package-tests.yml index 44f4a1c65..b6ae538d3 100644 --- a/.github/workflows/package-tests.yml +++ b/.github/workflows/package-tests.yml @@ -27,6 +27,7 @@ jobs: pre-commit: name: Pre-commit | ${{ matrix.package_name }} needs: package-filter + if: ${{ needs.package-filter.outputs.num_packages > 0 }} strategy: fail-fast: false matrix: ${{ fromJson(needs.package-filter.outputs.matrix) }} @@ -56,6 +57,7 @@ jobs: docker: name: Docker | Build ${{ matrix.package_name }} needs: package-filter + if: ${{ needs.package-filter.outputs.num_packages > 0 }} strategy: fail-fast: false matrix: ${{ fromJson(needs.package-filter.outputs.matrix) }} @@ -106,6 +108,7 @@ jobs: tests: name: Test | ${{ matrix.package_name }} needs: package-filter + if: ${{ needs.package-filter.outputs.num_packages > 0 }} strategy: fail-fast: false matrix: ${{ fromJson(needs.package-filter.outputs.matrix) }} diff --git a/clustering/K-NN/Distributed-Memory/Dockerfile-Distributed b/clustering/K-NN/Distributed-Memory/Dockerfile-Distributed deleted file mode 100644 index 11e218a63..000000000 --- a/clustering/K-NN/Distributed-Memory/Dockerfile-Distributed +++ /dev/null @@ -1,30 +0,0 @@ -FROM ubuntu:latest - -RUN apt-get -y update && apt-get -y install g++ make wget -RUN mkdir -p /home/DistributedKNN /home/Inputs /home/Outputs - -COPY . 
/home/DistributedKNN -WORKDIR /home/DistributedKNN - -RUN wget https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz -RUN tar xfz openmpi-4.0.0.tar.gz -RUN rm openmpi-4.0.0.tar.gz -WORKDIR /home/DistributedKNN/openmpi-4.0.0 -RUN ./configure -RUN make all install -ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" -ENV PATH="/usr/local/bin:${PATH}" - -WORKDIR /home/DistributedKNN -RUN wget https://dl.bintray.com/boostorg/release/1.71.0/source/boost_1_71_0.tar.gz -RUN tar xfz boost_1_71_0.tar.gz -RUN rm boost_1_71_0.tar.gz -WORKDIR /home/DistributedKNN/boost_1_71_0 -RUN ./bootstrap.sh -RUN ./b2 -ENV LD_LIBRARY_PATH="/home/DistributedKNN/boost_1_71_0/stage/lib:${LD_LIBRARY_PATH}" - -WORKDIR /home/DistributedKNN -RUN mpicxx -I/home/DistributedKNN/boost_1_71_0 KNN_Distributed_code-OpenMP.cpp -o output.exe -L/home/DistributedKNN/boost_1_71_0/stage/lib -lboost_iostreams -O2 -fopenmp -ENV OMP_NUM_THREADS=2 -ENTRYPOINT ["mpirun","-np","4","./output.exe"] diff --git a/clustering/K-NN/Distributed-Memory/KNN_Distributed_code-OpenMP.cpp b/clustering/K-NN/Distributed-Memory/KNN_Distributed_code-OpenMP.cpp deleted file mode 100644 index 336f9c56a..000000000 --- a/clustering/K-NN/Distributed-Memory/KNN_Distributed_code-OpenMP.cpp +++ /dev/null @@ -1,1411 +0,0 @@ -/** - * @author Mahdi Maghrebi - * October 2019 - * This is the Implementation of K-NN Algorithm in Distributed Systems as developed - * in "PANDA: Extreme Scale Parallel K-Nearest Neighbor on Distributed Architectures", Patwary et a., 2016 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace std; -/** - * Read the output of linux command execution - * @param cmd is the inux command to be executed - * @return the output from the execution of the linux command - */ -std::string exec(const char* cmd) { - std::array buffer; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { - throw std::runtime_error("popen() failed!"); - } - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { - result += buffer.data(); - } - return result; -} -/** - * Defining the criteria for Sorting the data in a pair container from the biggest value to the smallest - */ -bool sortinrev(const pair &a,const pair &b) { - return (a.first > b.first); -} -/** - * Compute the variance of a sampled data over data dimensions and Sort dimensions according to their variability - * @param DataCounts Number of total data from which we take the samples - * @param nodeData0 Dataset containing the data available for sampling - * @param featureCounts Number of features in dataset (equal to number of columns in the input csv file) - * @param world_size Total number of MPI processors - * @param globalKdTreeSamples Number of Samples from dataset for computation here - * @return VectorGlobalSqrtSum A sorted pair containig the index of the dimensions with the highest variability - */ -auto findMaxVarDims(int DataCounts,double **nodeData0, int featureCounts, int world_size, int globalKdTreeSamples) { - double samplingData[globalKdTreeSamples][featureCounts]; - double localSum[featureCounts], globalSum[featureCounts]; - double localSqrtSum[featureCounts], globalSqrtSum[featureCounts]; - - for (int j=0; j> VectorGlobalSqrtSum; - - for (int j=0; j nodeDataIndex0, int globalKdTreeSamplesMedian, double Epsilon, int world_size, int world_rank, double** data) { - int randomIndex; - vector sampledDataValues, leftSampledDataValues, 
rightSampledDataValues; - sampledDataValues.reserve(globalKdTreeSamplesMedian); - leftSampledDataValues.reserve(globalKdTreeSamplesMedian); - rightSampledDataValues.reserve(globalKdTreeSamplesMedian); - - for (int i=0; i< globalKdTreeSamplesMedian; ++i){ - randomIndex=rand()%nodeDataIndex0.size(); - int index=nodeDataIndex0[randomIndex]; - sampledDataValues.push_back(data[index][maxVarDimension]); - } - - int randomRank; - double MedianCandidate; - int totalCountsData=world_size*globalKdTreeSamplesMedian; - int accumulatedLeftCounts=0; - bool whileFlag=true; - int whileCount=0; - - while(whileFlag){ - if (world_rank==0) {randomRank=rand()%world_size;} - MPI_Bcast(&randomRank,1,MPI_INT,0,MPI_COMM_WORLD); - - if (world_rank==randomRank) { - randomIndex=rand()%sampledDataValues.size(); - MedianCandidate=sampledDataValues[randomIndex]; - } - MPI_Bcast(&MedianCandidate,1,MPI_DOUBLE,randomRank,MPI_COMM_WORLD); - - int leftCounts=0; int rightCounts=0;int globalleftCounts=0; - leftSampledDataValues.clear(); - rightSampledDataValues.clear(); - - for (int i=0; i 0.5-Epsilon ) { - whileFlag=false; - return MedianCandidate ;} - else if (ratio < 0.5-Epsilon){ - accumulatedLeftCounts=globalleftCounts; - sampledDataValues.clear(); - sampledDataValues=rightSampledDataValues; - } - - ++whileCount; - // For diagnosis, the following error hints at the difficulty of finding the median - MPI_File logfile; - char line[1024]; - if (whileCount % 10000 == 0) { - printf("Too Many Trials for Global KD Tree Median, Processor = %d \n",world_rank); - sprintf(line,"Too Many Trials for Global KD Tree Median, Processor = %d \n",world_rank); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - } - } -} -/** - * Compute the median of data at a dividing node of the local Kd Tree - * @param localKdTreeSamplesMedian Number of samples used to compute the median - * @paramn sampledDataValues The coordinates of the sampled data - * @param Epsilon The acceptable buffer in estimating the median - * @param world_rank Rank of each MPI processor - * @return MedianCandidate The estimated value of median at the dividing node - */ -double localFindMedian (int localKdTreeSamplesMedian,vector sampledDataValues, double Epsilon, int world_rank) { - vector leftSampledDataValues, rightSampledDataValues; - leftSampledDataValues.reserve(localKdTreeSamplesMedian); - rightSampledDataValues.reserve(localKdTreeSamplesMedian); - - int accumulatedLeftCounts=0; - bool whileFlag=true; - int whileCount=0; - - while(whileFlag){ - int randomIndex=rand()%sampledDataValues.size(); - double MedianCandidate=sampledDataValues[randomIndex]; - int leftCounts=0; - int rightCounts=0; - leftSampledDataValues.clear(); - rightSampledDataValues.clear(); - - for (int i=0; i 0.5-Epsilon ) { - whileFlag=false; - return MedianCandidate; - } - else if (ratio < 0.5-Epsilon){ - accumulatedLeftCounts=leftCounts; - sampledDataValues.clear(); - sampledDataValues=rightSampledDataValues; - whileFlag=true; - } - else if (ratio > 0.5+Epsilon){ - whileFlag=true; - } - ++whileCount; - - if (whileCount % 10000 == 0) { - if (Epsilon<0.25) Epsilon*=2; - else return MedianCandidate; - } - } -} -/** - * Sort the max-heap data structure for a new data inserted at its index i - * @param ID The ID of the point data - * @paramn i Index of the inserted data in the Heap - * @param KNNDistanceinBuckets The values of distances for selected K-NNs - * @param KNNIDsinBuckets The IDs of the selected K-NNs - * @param KNNCounts Desired count of K-NNs to be computed in this 
program - */ -void Max_Heapify(int ID, int i, double ** KNNDistanceinBuckets, int ** KNNIDsinBuckets,int KNNCounts) { - int largest = 0; - int l = 2*i + 1; - int r = 2*i + 2; - - if ((l < KNNCounts) && (KNNDistanceinBuckets[ID][l] > KNNDistanceinBuckets[ID][i])) { - largest = l; - } - else { - largest = i; - } - - if ((r < KNNCounts) && (KNNDistanceinBuckets[ID][r] > KNNDistanceinBuckets[ID][largest])) { - largest = r; - } - - if (largest != i) { - std::swap(KNNDistanceinBuckets[ID][i], KNNDistanceinBuckets[ID][largest]); - std::swap(KNNIDsinBuckets[ID][i], KNNIDsinBuckets[ID][largest]); - Max_Heapify(ID, largest, KNNDistanceinBuckets, KNNIDsinBuckets,KNNCounts); - } -} -/** - * Build Max-Heap datat structure for the first time - * @param ID The ID of the point data - * @param KNNCounts Desired count of K-NNs to be computed in this program - * @param KNNDistanceinBuckets The values of distances for selected K-NNs - * @param KNNIDsinBuckets The IDs of the selected K-NNs - */ -void Build_Max_Heap(int ID,int KNNCounts, double** KNNDistanceinBuckets, int** KNNIDsinBuckets) { - for (int i = floor((KNNCounts - 1) / 2); i >= 0; i--) { - Max_Heapify(ID, i,KNNDistanceinBuckets, KNNIDsinBuckets,KNNCounts); - } -} -/** - * Sort the max-heap data structure for a newly inserted data - * @param k The index of the inserted point data - * @param receivingHeapArrayDistances2DCopy The values of distances for selected K-NNs - * @param receivingHeapArray2DCopy The IDs of the selected K-NNs - * @param KNNCounts Desired count of K-NNs to be computed in this program - */ -void Max_Heapify2 (int k, double * receivingHeapArrayDistances2DCopy, int * receivingHeapArray2DCopy,int KNNCounts) { - int largest = 0; - int l = 2*k + 1; - int r = 2*k + 2; - - if ((l < KNNCounts) && (receivingHeapArrayDistances2DCopy[l] > receivingHeapArrayDistances2DCopy[k])) { - largest = l; - } - else { - largest = k; - } - - if ((r < KNNCounts) && (receivingHeapArrayDistances2DCopy[r] > receivingHeapArrayDistances2DCopy[largest])) { - largest = r; - } - - if (largest != k) { - std::swap(receivingHeapArrayDistances2DCopy[k], receivingHeapArrayDistances2DCopy[largest]); - std::swap(receivingHeapArray2DCopy[k], receivingHeapArray2DCopy[largest]); - Max_Heapify2(largest,receivingHeapArrayDistances2DCopy,receivingHeapArray2DCopy,KNNCounts); - } -} -/** - * Build Max-Heap datat structure for the first time - * @param KNNCounts Desired count of K-NNs to be computed in this program - * @param receivingHeapArrayDistances2DCopy The values of distances for selected K-NNs - * @param receivingHeapArray2DCopy The IDs of the selected K-NNs - */ -void Build_Max_Heap2(int KNNCounts, double* receivingHeapArrayDistances2DCopy, int* receivingHeapArray2DCopy) { - for (int ii = floor((KNNCounts - 1) / 2); ii >= 0; ii--) { - Max_Heapify2(ii,receivingHeapArrayDistances2DCopy,receivingHeapArray2DCopy,KNNCounts); - } -} - - -/** - * Main Function of the Code - */ -int main(int argc, char * const argv[]) { - /** - * MPI Parallel Logfile - */ - MPI_File logfile; - char line[1024]; - /** - * Beginning MPI communications - */ - MPI_Init(NULL, NULL); - /** - * world_size is defined here as total number of MPI processors - */ - int world_size; - MPI_Comm_size(MPI_COMM_WORLD, &world_size); - /** - * world_rank is defined here as the rank of MPI processors - */ - int world_rank; - MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); - /** - * The errors and informational messages are outputted to the log file - */ - MPI_File_open(MPI_COMM_WORLD, "Setting.txt", MPI_MODE_WRONLY | 
MPI_MODE_CREATE,MPI_INFO_NULL, &logfile); - /** - * The following arguments are passed to the code (in order) from the command line: - * fileName is the full path to the input csv dataset - * KNNCounts is the desired number of K-NNs for each data point to be computed in this code - * featureCounts is the number of columns in the input csv datastet (number of data dimensions) - */ - string fileName = argv[1]; - const int KNNCounts = atoi(argv[2]); - - int featureCounts, colIndex1, colIndex2; - if (argc == 3) { - string cmd0="head -n 1 "+ fileName + " |tr '\\,' '\\n' |wc -l "; - featureCounts = stoi(exec(cmd0.c_str())); - } else if (argc == 5) { - string cmd0="head -n 1 "+ fileName + " |tr '\\,' '\\n' |wc -l "; - featureCounts = stoi(exec(cmd0.c_str())); - colIndex1 = atoi(argv[3]); - colIndex2 = atoi(argv[4]); - } else { - printf("Wrong Input Arguments\n"); - sprintf(line,"Wrong Input Arguments\n"); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - return -1; - } - /** - * The following important parameters are used in the design of algorithm. Their values are - * initialized according to the suggested values in the referencing paper. - * globalKdTreeSamples is the number of data sampled by each processor to collaboratively compute dimensions with the highest variability. - * globalKdTreeSamplesMedian is the number of data sampled by each processor to collaboratively compute the median of the chosen dimension for each splitting node within the global Kd Tree. - * localKdTreeSamplesMedian is the number of data sampled by each processor separately to compute the median of the chosen dimension for each splitting node within the local Kd Tree. - * Epsilon is a buffer in accepting the Median value - * Parallel_IO is a flag that defines if the input csv file can be read in parallel by all the processors - * bucketSize is the size of a bucket (or a leaf) in the local Kd Tree - * estimatedExtraLayers: To limit the growing size of the local Kd Trees, the growth of the tree is limited by a number of layers defined here from the initial guess of the required buckets - */ - const int globalKdTreeSamples=256; - const int globalKdTreeSamplesMedian=256; - int localKdTreeSamplesMedian=1024; - double Epsilon=0.01; - const int Parallel_IO = 1; - const int bucketSize=32; - const int estimatedExtraLayers=1; - /** - * Seed for random number generation - */ - srand(17); - /** - * total number of MPI processors should be a power of 2 due to algorithm design for global Kd Tree. 
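- * (each layer of the global Kd Tree doubles the number of splitting nodes until there is exactly one leaf per MPI rank, so world_size must be a power of 2)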
- * Otherwise, output an error and exit the program - */ - bool powerOfTwo = !(world_size == 0) && !(world_size & (world_size - 1)); - if (powerOfTwo!=true) { - if (world_rank==0) { - printf("Number of Processors should be a power of 2\n"); - sprintf(line,"Number of Processors should be a power of 2\n"); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - } - MPI_Finalize(); - return 0; - } - int numericWidth=floor(log10(world_size) + 1); - /** - * The master processor splits the input csv file as each processor could have its own non-overlapping set of input data - */ - if (world_rank==0) { - string cmd=string("split -n l/")+to_string(world_size)+" "+ fileName+" -a "+to_string(numericWidth)+" -d tmpFile --additional-suffix=.csv"; - int returnValue=system(cmd.c_str()); - } - /** - * All procesors neeed to stop here until master processor returns - */ - MPI_Barrier(MPI_COMM_WORLD); - /** - * Each processor reads its own set of data from a unique csv file (localFileName) - */ - int worldRankWidth=floor(log10(world_rank) + 1); - std::stringstream ss; - ss << std::setw(numericWidth-worldRankWidth) << std::setfill('0') << world_rank; - std::string s = ss.str(); - string localFileName="tmpFile"+s+".csv"; - - ifstream infile; - infile.open(localFileName); - /** - * Output error in case the localFileName was not opened for reading - */ - if(infile.fail()) { - printf("error in opening the input file\n"); - sprintf(line,"error in opening the input file\n"); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - return 1; - } - /** - * Each processor finds out about the number of records in its localFileName - */ - string cmd3="wc -l "+localFileName; - string outputCmd3 = exec(cmd3.c_str()); - int tmpFileLineCounts=stoi(outputCmd3.substr(0, outputCmd3.find(" "))); - /** - * The master node needs to subtract 1 record which is for header information - */ - if (world_rank==0) { - string dummyLine; - getline(infile, dummyLine); - --tmpFileLineCounts; - } - /** - * MPI communication between the processors as they all need to know how many data the other processors have - */ - int tmpFileLineCountsArray[world_size], tmpFileLineCountsArrayCum[world_size] ; - int sendBuffer0[0]; - sendBuffer0[0]=tmpFileLineCounts; - MPI_Allgather(sendBuffer0,1,MPI_INT,tmpFileLineCountsArray,1,MPI_INT,MPI_COMM_WORLD); - /** - * All Processors make an array tmpFileLineCountsArrayCum that cummulatively stores the number of data in the other processors - */ - for (int i=0; i= colIndex1-1 && j < colIndex2) inputdata[i][j] = atof(temp2.c_str()); - temp.erase(0, temp.find(",") + 1); - } - } - } - if (argc == 5) featureCounts=colIndex2-colIndex1+1; - /** - * Remove the local input files as their data has been already parsed and read - */ - infile.close(); - string cmd2= string("rm ")+localFileName; - int returnValue=system(cmd2.c_str()); - /** - * Query about the number of available OpenMP processors and set it for OpenMP - */ - int nProcessors = omp_get_num_procs(); - omp_set_num_threads(nProcessors-1); - cout <<"Total Number of OpenMP Processes in the Parallel Region = "<< nProcessors-1 <> VectorGlobalSqrtSum; - vector nodeDataIndex[world_size]; - nodeDataIndex[0].reserve(tmpFileLineCounts); - for (int i=0; i globalMedianValuesforNodes; - vector nextLayerNodeDataIndex[world_size]; - int nodeCounts=1, nodesLayer=0; - double medianNodeData; - - while (nodeCounts!= world_size){ - if (world_rank ==0) { - printf("Constructing Global Kd Tree: Layer = %d \n",nodesLayer); - 
sprintf(line,"Constructing Global Kd Tree: Layer = %d \n",nodesLayer); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - } - int indexMaxVarDim=VectorGlobalSqrtSum[nodesLayer].second; - - for (int i=0; i0) myDATA[j]= nodeDataIndex[i][j]+tmpFileLineCountsArrayCum[world_rank-1]; - else if(world_rank==0) myDATA[j]= nodeDataIndex[i][j]; - } - - int Totalcounts=(int)nodeDataIndex[i].size(); - send_buffer[0]=nodeDataIndex[i].size(); - MPI_Gather(send_buffer,1, MPI_INT,rcount,1, MPI_INT,i,MPI_COMM_WORLD); - - if (world_rank==i){ - cnts=0; - for (int k=0; k is(mmap, std::ios::binary); - string tempString,tempString2; - int m_numLines = 0; - string dummyLine; - getline(is, dummyLine); - - for (int i=0; i is(mmap, std::ios::binary); - string tempString,tempString2; - int m_numLines = 0; - string dummyLine; - getline(is, dummyLine); - - for (int i=0; i > localNodeDataIndex; - vector tmpvector; - vector localMedianNodeData; - vector isBucket; - bool localFlag=true; - int numberofNodeSofar; - int nodeIndexofaPoint[cnts]; - tmpvector.reserve(cnts); - - for (int i=0; i featureCounts){ - printf("Error in Exceeding Dimensions, increase BucketSize\n"); - sprintf(line,"Error in Exceeding Dimensions, increase BucketSize\n"); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - } - - if (localNodeDataIndex[0].size() <= bucketSize+1) {isBucket.push_back(1); localFlag=false;} - else {isBucket.push_back(0);} - - while (localFlag){ - int indexMaxVarDim=VectorGlobalSqrtSum[localNodesLayer+nodesLayer].second; - if (localNodesLayer==0) {numberofNodeSofar=0;} - else {numberofNodeSofar=pow(2,localNodesLayer)-1;} - - for (int i=0; i()); - localNodeDataIndex.push_back(std::vector()); - - if (isBucket[globalID]==1) {isBucket.push_back(0); isBucket.push_back(0); localMedianNodeData.push_back(0); continue;} - if (localNodeDataIndex[globalID].size()==0) {isBucket.push_back(0); isBucket.push_back(0); localMedianNodeData.push_back(0); continue;} - if (localKdTreeSamplesMedian > localNodeDataIndex[globalID].size()/2) localKdTreeSamplesMedian=localNodeDataIndex[globalID].size()/2; - vector sampledDataValues; - for (int i=0; i< localKdTreeSamplesMedian; ++i){ - int randomIndex=rand()%localNodeDataIndex[globalID].size(); - int index=localNodeDataIndex[globalID][randomIndex]; - int index1=indexLookupArray[index]; - sampledDataValues.push_back(mappedData[index1][indexMaxVarDim]); - } - - double temp=localFindMedian(localKdTreeSamplesMedian,sampledDataValues,Epsilon,world_rank); - localMedianNodeData.push_back(temp); - - for (int j=0; j< localNodeDataIndex[globalID].size(); ++j){ - int index0=localNodeDataIndex[globalID][j]; - int index=indexLookupArray[index0]; - if (mappedData[index][indexMaxVarDim] < localMedianNodeData[globalID]){ - localNodeDataIndex[leftNodeGlobalIndex].push_back(index0); - ++countLeft; - } - else{ - localNodeDataIndex[rightNodeGlobalIndex].push_back(index0); - ++countRight; - } - } - - if (countLeft ==1) { - localNodeDataIndex[rightNodeGlobalIndex].push_back(localNodeDataIndex[leftNodeGlobalIndex][0]); - localNodeDataIndex[leftNodeGlobalIndex].pop_back(); - --countLeft; - } - - if (countRight ==1) { - localNodeDataIndex[leftNodeGlobalIndex].push_back(localNodeDataIndex[rightNodeGlobalIndex][0]); - localNodeDataIndex[rightNodeGlobalIndex].pop_back(); - --countRight; - } - - - if ((countLeft <= bucketSize+1 && countLeft >0)|| ((localNodesLayer == maxAllowedLayers-1) && countLeft >0) ) { - isBucket.push_back(1); - - for (int j=0; j< 
localNodeDataIndex[leftNodeGlobalIndex].size(); ++j){ - int index0=localNodeDataIndex[leftNodeGlobalIndex][j]; - int index=indexLookupArray[index0]; - nodeIndexofaPoint[index]=leftNodeGlobalIndex; - } - } - else {isBucket.push_back(0);} - - if ((countRight <= bucketSize+1 && countRight >0) || ((localNodesLayer == maxAllowedLayers-1) && countRight >0) ) { - isBucket.push_back(1); - - for (int j=0; j< localNodeDataIndex[rightNodeGlobalIndex].size(); ++j){ - int index0=localNodeDataIndex[rightNodeGlobalIndex][j]; - int index=indexLookupArray[index0]; - nodeIndexofaPoint[index]=rightNodeGlobalIndex; - } - } - else {isBucket.push_back(0);} - } - - localFlag=false; - for (int i=0; i0 ) {localFlag=true; break; - } - } - layerNodeCounts*=2; - ++localNodesLayer; - } - /** - * For performance, it is better to refer to local Kd tree later - * from the ID of the first dividing node which has been converted to a bucket - */ - int FirstBucket; - for (int i=0; i< localNodeDataIndex.size(); ++i){ - if (isBucket[i] == 1) {FirstBucket=i;break;} - } - /** - * Now, it is the time to start computing K-NNs from the data points within each bucket in the local Kd Tree - * and store them in KNNIDsinBuckets and KNNDistanceinBuckets - * To improve the performance, the data locality was considered for main arrays of localNodeDataIndex2 and mappedData2 - * and the data within the same bucket arranged close to each other in the new arrays - */ - if (world_rank==0) { - printf("Computing K-NNs for the points within the Same Bucket\n"); - sprintf(line,"Computing K-NNs for the points within the Same Bucket\n"); - MPI_File_write(logfile, line, strlen(line), MPI_CHAR, MPI_STATUS_IGNORE); - } - int KNNIDsinBucketsFilledCounts[cnts]; - int localIndexConvertor[cnts]; - int counter=0; - vector> localNodeDataIndex2; - - int **KNNIDsinBuckets = new int*[cnts]; - for (int i=0; i()); - if (isBucket[i] == 0) {continue;} - - for (int j=0; j< localNodeDataIndex[i].size(); ++j){ - localIndexConvertor[counter]=localNodeDataIndex[i][j]; - localNodeDataIndex2[i].push_back(counter); - ++counter; - } - } - - double** mappedData2=new double*[cnts]; - for (int i=0; i ScatterVlocalNodeDataIndex[world_size]; - vector ScatterVKNNIDsinBucketsFilledCounts[world_size]; - - int globalLayerID = int(log2(world_size)); - int lowestNodeID=pow(2,globalLayerID)-1; - int highestNodeID=lowestNodeID+world_size-1; - int NeighboringNodes[cnts][world_size-1]; - - if (world_size != 1){ - for (int i=0; i> globalStack; - globalStack.push(make_pair(0,0)); - /** - * C1NodeID is the closer child, and C2NodeID is the other child - */ - int C1NodeID,C2NodeID; - int jcounts=0; - - while (!globalStack.empty()){ - pair topPairinStack=globalStack.top(); - int nodeID=topPairinStack.first; - double dValue=topPairinStack.second; - globalStack.pop(); - int nodesLayer0=int(log2(nodeID+1)); - int indexMaxVarDim=VectorGlobalSqrtSum[nodesLayer0].second; - - if (dValue < rPrime){ - double dPrime= mappedData2[index1][indexMaxVarDim] - globalMedianValuesforNodes[nodeID]; - if (dPrime < 0) { - C1NodeID=2*nodeID+1; - C2NodeID=2*nodeID+2; - } - else{ - C1NodeID=2*nodeID+2; - C2NodeID=2*nodeID+1; - } - - dPrime=sqrt(dValue*dValue+dPrime*dPrime); - if (dPrime= lowestNodeID && (C2NodeID-lowestNodeID)!=world_rank) { - NeighboringNodes[index1][jcounts]=C2NodeID-lowestNodeID; - ScatterVlocalNodeDataIndex[C2NodeID-lowestNodeID].push_back(index1); - ScatterVKNNIDsinBucketsFilledCounts[C2NodeID-lowestNodeID].push_back(KNNIDsinBucketsFilledCounts[index1]); - ++jcounts; - } - } - } - - if (C1NodeID 
<= highestNodeID) { - globalStack.push(make_pair(C1NodeID,dValue)); - if (C1NodeID >= lowestNodeID && (C1NodeID-lowestNodeID)!=world_rank) { - NeighboringNodes[index1][jcounts]=C1NodeID-lowestNodeID; - ScatterVlocalNodeDataIndex[C1NodeID-lowestNodeID].push_back(index1); - ScatterVKNNIDsinBucketsFilledCounts[C1NodeID-lowestNodeID].push_back(KNNIDsinBucketsFilledCounts[index1]); - ++jcounts; - } - } - } - } - } - } - } - /** - * Now, send the data of the given point to the neighboring processors identified above - * for further computation of possible K-NNs in those processors - */ - int displ[world_size],displ2[world_size],displ3[world_size]; - int bufferCounts[world_size],bufferCounts2[world_size],bufferCounts3[world_size]; - bufferCounts[world_rank]=0; - bufferCounts2[world_rank]=0; - bufferCounts3[world_rank]=0; - - if (world_size != 1){ - for (int i=0; i> globalStack; - globalStack.push(make_pair(0,0)); - - while (!globalStack.empty()){ - pair topPairinStack=globalStack.top(); - int nodeID=topPairinStack.first; - double dValue=topPairinStack.second; - globalStack.pop(); - int nodesLayer0=int(log2(nodeID+1)); - int indexMaxVarDim=VectorGlobalSqrtSum[nodesLayer0+nodesLayer].second; - - if (isBucket[nodeID] == 1) { - for (int kk=0; kk> setContainer; - int pointID=localIndexConvertor[i]; - /** - * Insert into Set container the K-NNs initially computed from the points within the same bucket - */ - for (int j=0; j::iterator it = std::find(ScatterVlocalNodeDataIndex[neighborID].begin(), ScatterVlocalNodeDataIndex[neighborID].end(), i); - int index = std::distance(ScatterVlocalNodeDataIndex[neighborID].begin(), it); - - for (int k=0; k>::iterator pairIt; - pairIt=setContainer.begin(); - int outputCounter=0; - for (int ii=0; ii KNN_Indices.csv"); - int returnValue=system(cmd4.c_str()); - string cmd5= string("cat KNN_Distances_*.csv > KNN_Distances.csv"); - returnValue=system(cmd5.c_str()); - - string cmd6= string("rm KNN_Indices_*"); - returnValue=system(cmd6.c_str()); - string cmd7= string("rm KNN_Distances_*"); - returnValue=system(cmd7.c_str()); - } - - MPI_File_close(&logfile); - MPI_Finalize(); - return 0; -} - diff --git a/clustering/K-NN/README.rst b/clustering/K-NN/README.rst deleted file mode 100644 index 085fa3688..000000000 --- a/clustering/K-NN/README.rst +++ /dev/null @@ -1,196 +0,0 @@ -=================================== -K-NN Code for Shared-Memory Systems -=================================== - -The K-NN code for the Shared-Memory systems was implemented according to the algorithm developed by Dong et al., 2012, titled "Efficient K-Nearest Neighbor Graph Construction for Generic Similarity Measures". The full description of the algorithm is available -`Here `_. - ------------------------- -Installing Boost Library ------------------------- - -Both K-NN codes for Shared-Memory and Distributed-Memory use Boost library for mapping data into memory and reading inputs from the command line. The steps for installing Boost library in Linux are displayed below. - -.. code:: bash - - wget https://dl.bintray.com/boostorg/release/1.71.0/source/boost_1_71_0.tar.gz - tar xfz boost_1_71_0.tar.gz - cd boost_1_71_0/ - ./bootstrap.sh - ./b2 - export LD_LIBRARY_PATH=currentpath/stage/lib:$LD_LIBRARY_PATH - -It is recommended to include the last line into .bashrc file at home directory. - ------------------ -Runtime Arguments ------------------ - -The code requires the following parameters as the input. - -1- ``filePath``: The full path to the input csv file containig the dataset. 
Please ensure there are no other csv files in this path. Please note that the code assumes that the first line in the input csv file is the header and ignores it. - -2- ``K``: The desired number of Nearest Neighbours to be computed. - -3- ``sampleRate``: The rate at which we do sampling. This parameter plays a key role in the performance. This parameter is a trades-off between the performance and the accuracy of the results. The values closer to 1 provides more accurate results but the execution time instead takes longer. - -4- ``convThreshold``: An integer that defines the threshold for the convergence of the model. A fixed integer is used here instead of the expression delta*N*K which is given in the reference paper. - -5- ``outputPath``: The full path to the output csv files. - -6,7- ``colIndex1`` and ``colIndex2`` (Optional): The indices of columns from the input csv file where raw data exists continuously in between. If these two arguments were left blank, the code assumes that the entire input csv file is raw data and automatically computes the number of columns in the input csv file. The numbering for these 2 indices begin from 1 (and not 0). - ------------- -Code Outputs ------------- - -The code produces the following output files: - -1- ``KNN_Indices.csv``: The indices of K-NNs for the entire dataset. The order of data here is the same as the order of data at the input csv file. - -2- ``KNN_Distances.csv``: The corresponding distances of K-NNs which was saved at KNN_Indices.csv. - -3- ``Setting.txt``: The logging file containing the errors and messages. - --------------------------------- -An Example of Executing the Code --------------------------------- - -.. code:: bash - - ulimit -s unlimited - g++ -I/Path_To_Boost_Library/boost_1_71_0 KNN_Serial_Code.cpp -o output.exe -L/Path_To_Boost_Library/boost_1_71_0/stage/lib -lboost_iostreams -lboost_system -lboost_filesystem -O2 - time ./output.exe --inputPath . --K 10 --sampleRate 0.99 --convThreshold 5 --outputPath . - time ./output.exe --inputPath . --K 10 --sampleRate 0.99 --convThreshold 5 --outputPath . --colIndex1 3 --colIndex2 26 - -Please note that the multi-threaded version of Shared-Memory K-NN can be compiled and run as follows. The number of threads in the OpenMP parallelized region of the code is automatically set equal to the number of threads in the machine minus 1. - -.. code:: bash - - ulimit -s unlimited - g++ -I/Path_To_Boost_Library/boost_1_71_0 KNN_Serial_Code.cpp -o output.exe -L/Path_To_Boost_Library/boost_1_71_0/stage/lib -lboost_iostreams -lboost_system -lboost_filesystem -O2 -fopenmp - time ./output.exe --inputPath . --K 10 --sampleRate 0.99 --convThreshold 5 --outputPath . - time ./output.exe --inputPath . --K 10 --sampleRate 0.99 --convThreshold 5 --outputPath . --colIndex1 3 --colIndex2 26 - ---------------------------- -An Advise About Performance ---------------------------- - -The parameter ``sampleRate`` has a significant impact on the performance. It is advised that its optimal value to be determined for each specific project. - -------------------- -Install WIPP Plugin -------------------- -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of plugin.json into the pop-up window and submit. - ------------------------------------------- -An Example of Running the Docker Container ------------------------------------------- - -.. 
code:: bash - - docker run -v /path/to/data:/home/Inputs -v /path/to/outputs:/home/Outputs \ - containername --inputPath /home/Inputs --K 10 --sampleRate 0.9 \ - --convThreshold 5 --outputPath /home/Outputs - - -================================================== -GPU-Enabled K-NN Code for Shared-Memory Systems -================================================== - -Alternatively, the performance of K-NN code for Shared-Memory Systems was improved by adding CUDA directives. The computation loads are then automatically switched between GPU and CPU. - -.. code:: bash - - ulimit -s unlimited - nvcc -I/Path_To_Boost_Library/boost_1_71_0 KNN_GPU_Code.cu -o output.exe -L/Path_To_Boost_Library/boost_1_71_0/stage/lib -lboost_iostreams -lboost_system -lboost_filesystem -O2 -arch=sm_75 - time ./output.exe --inputPath . --K 10 --sampleRate 0.99 --convThreshold 5 --outputPath . - -The following parameters are GPU-specific parameters. Their values might need to be adjusted for any given device. - -1- ``MAXTPB``: The Max number of Threads per Block. It is by deafult 1024. - -2- ``MinimumThreads``: The Minimum number of computations that is needed to switch the computation to GPU device (Otherwise stay in host). This parameter might have considerable impact on the performance. - -3- ``arch=sm_75``: This compilation flag should represent the GPU specificiation of the given machine. - - -The docker for GPU-Enabled K-NN code can also be run using the following command. - -.. code:: bash - - docker run --gpus all -v /path/to/data:/home/Inputs -v /path/to/outputs:/home/Outputs \ - containername --inputPath /home/Inputs --K 10 --sampleRate 0.9 \ - --convThreshold 5 --outputPath /home/Outputs - -======================================== -K-NN Code for Distributed-Memory Systems -======================================== - -The K-NN code for the Distributed-Memory systems was implemented according to the algorithm developed by Patwary et a., 2016, titled "PANDA: Extreme Scale Parallel K-Nearest Neighbor on Distributed Architectures". The full description of the algorithm is available -`Here `_. - ------------------ -Runtime Arguments ------------------ - -The code requires the following input parameters. - -1- ``Number of Processors``: Due to the special design for global Kd Tree, the number of processors should be a power of 2 (e.g., 1,2,4,8,16,...). - -2- ``filePath``: The full path to the input csv file containig the raw dataset. Please note that the code assumes that the first line in the input csv file is the header and ignores it. - -3- ``KNNCounts``: The desired number of Nearest Neighbours to be computed. - -4- ``colIndex1`` and ``colIndex2`` (Optional): The index of columns from the input csv file where raw data exists continuously in between. If these two arguments were left blank, the code assumes that the entire input csv file is raw data and automatically computes the number of columns in the input csv file. The numbering for these 2 indices begin from 1 (and not 0). - -Please note that the performance has been improved by adding OpenMP directives (multi-threading) in addition to the current MPI directives (multi-node). The number of threads in the OpenMP parallelized region of the code is set using an environment variable as shown below. - -.. code:: bash - - export OMP_NUM_THREADS=2 - --------------------------------- -An Example of Executing the code --------------------------------- - -.. 
code:: bash - - ulimit -s unlimited - export OMP_NUM_THREADS=2 - mpicxx -I/Path_To_Boost_Library/boost_1_71_0 KNN_Distributed_code-OpenMP.cpp -o output.exe -L/Path_To_Boost_Library/boost_1_71_0/stage/lib -lboost_iostreams -O2 -fopenmp - time mpirun -np 4 ./output.exe /fullPath/inputfile.csv 15 - time mpirun -np 4 ./output.exe /fullPath/inputfile.csv 15 3 26 - ------------- -Code Outputs ------------- - -Similar to the Shared-Memory code, the Distributed-Memory code produces the following output files: - -1- ``KNN_Indices.csv``: The indices of K-NNs for the entire dataset. The first entry in each row contains the index of that point according to the index from the input csv file. - -2- ``KNN_Distances.csv``: The corresponding distances of K-NNs which were saved at KNN_Indices.csv. Similarly, the first entry in each row contains the index of that point according to the index from the input csv file. - -3- ``Setting.txt``: The logging file containing the error and messages. - -------------------------------------- -Description of Some Design Parameters -------------------------------------- - -The code also has a few other parameters that are a part of the Kd Tree design. These parameters were initialized in the code to the values suggested in the reference paper (Patwary et al., 2016). For the complicated cases, these values might need to be adjusted for the optimal performance. - -1- ``globalKdTreeSamples``: The number of data sampled by each processor to collaboratively compute the dimensions with the highest variability. - -2- ``globalKdTreeSamplesMedian``: The number of data sampled by each processor to collaboratively compute the median of the chosen dimension at each splitting node of the global Kd Tree. - -3- ``Parallel_IO``: A flag that defines if the input csv file can be read in parallel by all the processors. - -4-``Epsilon``: The error in estimating the Median value. - -5- ``localKdTreeSamplesMedian``: The number of data sampled by each processor separately to compute the median of the chosen dimension at each splitting node of the local Kd Tree. - -6- ``bucketSize``: The size of a bucket (or a leaf) in the local Kd Tree. - -7- ``estimatedExtraLayers``: To limit the growing size of the local Kd Trees, the growth of the tree is limited by a cerain number of layers using this parameter. - diff --git a/clustering/K-NN/Shared-Memory-GPU/Dockerfile b/clustering/K-NN/Shared-Memory-GPU/Dockerfile deleted file mode 100644 index cb9c83c72..000000000 --- a/clustering/K-NN/Shared-Memory-GPU/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM ktaletsk/gpu:bionic-20190612-cuda-10-2 - -# Update apps on the base image -RUN apt-get -y update && apt-get install -y g++ wget make - -#Create new directory -RUN mkdir -p /home/GPU_KNN /home/Inputs /home/Outputs - -# Specify the working directory -WORKDIR /home/GPU_KNN - -# Install Boost Library -RUN wget https://dl.bintray.com/boostorg/release/1.71.0/source/boost_1_71_0.tar.gz -RUN tar xfz boost_1_71_0.tar.gz -RUN rm boost_1_71_0.tar.gz -WORKDIR /home/GPU_KNN/boost_1_71_0 -RUN ./bootstrap.sh -RUN ./b2 -ENV LD_LIBRARY_PATH="/home/GPU_KNN/boost_1_71_0/stage/lib:${LD_LIBRARY_PATH}" - -# Copy the current folder to the docker image -COPY . 
/home/GPU_KNN - -# Compile the source file -WORKDIR /home/GPU_KNN -RUN nvcc -I/home/GPU_KNN/boost_1_71_0 KNN_GPU_Code.cu -o Out.exe -L/home/GPU_KNN/boost_1_71_0/stage/lib -lboost_iostreams -lboost_system -lboost_filesystem -arch=sm_75 -O2 - -# Run the output program from the previous step -ENTRYPOINT ["./Out.exe"] - - - diff --git a/clustering/K-NN/Shared-Memory-GPU/KNN_GPU_Code.cu b/clustering/K-NN/Shared-Memory-GPU/KNN_GPU_Code.cu deleted file mode 100644 index cbba53145..000000000 --- a/clustering/K-NN/Shared-Memory-GPU/KNN_GPU_Code.cu +++ /dev/null @@ -1,727 +0,0 @@ -/** - * This code is an implementation of the algorithm presented by Dong et al., 2012, - *"Efficient K-Nearest Neighbor Graph Construction for Generic Similarity Measures" - * and the performance has been improved by CUDA (GPU) directives. - * @author: Mahdi Maghrebi - * March 2020 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using boost::iostreams::mapped_file_source; -using boost::iostreams::stream; -using namespace std; - -/** - * The Max number of Threads per Block. This is one of GPU hardware characteristics. - */ -#define MAXTPB 1024 - -/** - * The Minimum number of computations that is needed to switch to GPU device (Otherwise stay in host) - */ -#define MinimumThreads 10 - -/** - * Error handling for GPU Code - */ -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) -{ - if (code != cudaSuccess) - { - fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -/** - * GPU Kernel definition - */ -__global__ void ComputeDistancesKernel(int * device_New_Final_List_1D, int * device_New_Final_List_Index, int Dim, double * device_New_Final_List_Dist_1D, double * device_dataPointsGPU, int * device_New_Final_List_Dist_Index){ - - int localDim=Dim; - double localvalue=0; - int Cnts=device_New_Final_List_Index[blockIdx.x+1]-device_New_Final_List_Index[blockIdx.x]; - int Cnts_Dist=device_New_Final_List_Dist_Index[blockIdx.x+1]-device_New_Final_List_Dist_Index[blockIdx.x]; - int par1, par2; - int cnt=0; - int flag=0; - - if (threadIdx.x < Cnts_Dist){ - for (int i=0; i < Cnts; ++i){ - if (flag ==1) break; - for (int j=i+1; j < Cnts; ++j){ - if (threadIdx.x == cnt) { - par1 = device_New_Final_List_1D[i + device_New_Final_List_Index[blockIdx.x]]; - par2 = device_New_Final_List_1D[j + device_New_Final_List_Index[blockIdx.x]]; - flag=1; - break; - } - ++cnt; - } - } - - for (int i=0; i buffer; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { - throw std::runtime_error("popen() failed!"); - } - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { - result += buffer.data(); - } - return result; -} - -/** - * Replace the farthest point in B_Index (for u1) with u2 if u2 is closer - *
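- * If all K entries of B_Index[u1] are already filled, the candidate u2 replaces the entry with the largest value in B_Dist[u1], so the list always keeps the K closest points seen so far.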

- * This method corresponds to UPDATENN(B[u1], <u2, l, true>) in the paper - *

- * @param Dist represents B_Dist - * @param Index represents B_Index - * @param IsNew represents B_IsNew - * @param u1 the indice of point that we want to potentially update its K-NN with the point u2 - * @param u2 the indice of potential K-NN fpr point u1 - * @param distance the spatial distance between u1 and u2 - * @param flag updates B_IsNew - * @return 1 if B_Index[u1][.] is updated, 0 otherwise - */ -int UpdateNN (int** B_Index, double ** B_Dist, short** B_IsNew, short* allEntriesFilled, int K, int u1, int u2, double distance, int flag = 1) { - - if(allEntriesFilled[u1]==0){ - for (int j = 0; j < K; j++) { - if (B_Dist[u1][j] < 0) { - - for (int jj = 0; jj < j; jj++) {if (B_Index[u1][jj] == u2) return 0;} - - B_Dist[u1][j] = distance; - B_Index[u1][j] = u2; - B_IsNew[u1][j] = flag; - if (j==K-1) allEntriesFilled[u1]=1; - return 1;} - } - } - - else{ - for (int j = 0; j < K; j++) { - if (B_Index[u1][j] == u2) return 0; - } - - double max = DBL_MIN; - int index = -1; - for (int j = 0; j < K; j++) { - if (B_Dist[u1][j] > max) { - max = B_Dist[u1][j]; - index = j; - } - } - if (index == -1) { cout << "Error"<path().extension() == ext){ - fileFound = true; - filePath = it->path().string(); - break; - } - ++it; - } - if (!fileFound){ - logFile << "CSV file is not found in the input path"; - cout << "CSV file is not found in the input path"; - return 1; - } - } - else if (string(argv[i])=="--K") K=atoi(argv[i+1]); - else if (string(argv[i])=="--sampleRate") sampleRate=stof(argv[i+1]); - else if (string(argv[i])=="--convThreshold") convThreshold=stof(argv[i+1]); - else if (string(argv[i])=="--outputPath"){ - boost::filesystem::path p(argv[i+1]); - - if(!boost::filesystem::exists(p) || !boost::filesystem::is_directory(p)) - { - logFile << "Incorrect output path"; - cout << "Incorrect output path"; - return 1; - } - - LogoutputPath=argv[i+1]; - boost::filesystem::path joinedPath = p / boost::filesystem::path("KNN_Indices.csv"); - outputPath = joinedPath.string(); - boost::filesystem::path joinedPath2 = p / boost::filesystem::path("KNN_Distances.csv"); - outputPath2 = joinedPath2.string(); - } - else if (string(argv[i])=="--colIndex1") { - colIndex1=stof(argv[i+1]); - if (colIndex1<1) { - logFile << "colIndex1 should be greater than 1"; - cout << "colIndex1 should be greater than 1"; - return 1; - } - } - else if (string(argv[i])=="--colIndex2") { - colIndex2=stof(argv[i+1]); - if (colIndex2<1) { - logFile << "colIndex2 should be greater than 1"; - cout << "colIndex2 should be greater than 1"; - return 1; - } - } - } - - logFile<<"------------The following Input Arguments were read------------"< *New_Index = new std::vector[N]; - /** - * Data structure for REVERSE(new[v]) or new' - */ - vector *Reverse_New_Index = new vector[N]; - /** - * Data Structure for SAMPLE(new'[v],pk) - */ - vector *Sampled_Reverse_New_Index = new vector[N]; - /** - * Data Structure for new[v] U SAMPLE(new'[v],pk) - */ - vector *New_Final_List = new vector[N]; - /** - * An approximation of zero in computing distances. Two points with the distance - * smaller than epsilon are considered as one point. 
- */ - double epsilon = 1e-10; // - short* allEntriesFilled = new short[N]; - /** - * At first, let's Read Dataset from Input File Using Memory Mapping - */ - mapped_file_source mmap(filePath); - stream is(mmap, std::ios::binary); - if (is.fail()) - { - logFile << "error in Opening Input File" << endl; - cout << "error in Opening Input File" << endl; - return ; - } - /** - * Remove the header info - */ - string dummyLine; - getline(is, dummyLine); - /** - * Reading the Entire Dataset - */ - if (argc==11){ - for (int i = 0; i < N; ++i) { - string temp, temp2; - getline(is, temp); - for (int j = 0; j < Dim; ++j) { - temp2 = temp.substr(0, temp.find(",")); - double tempV=atof(temp2.c_str()); - dataPoints[i][j] = tempV; - dataPointsGPU[i*Dim+j] = tempV; - temp.erase(0, temp.find(",") + 1); - } - } - } else { - for (int i = 0; i < N; ++i) { - string temp, temp2; - getline(is, temp); - for (int j = 0; j < Dim; ++j) { - temp2 = temp.substr(0, temp.find(",")); - if (j >= colIndex1-1 && j < colIndex2) { - double tempV=atof(temp2.c_str()); - dataPoints[i][j] = tempV; - dataPointsGPU[i*Dim+j] = tempV; - } - temp.erase(0, temp.find(",") + 1); - } - } - } - mmap.close(); - - if (colIndex1 != -1) Dim=colIndex2-colIndex1+1; - if (Dim <1) { - logFile << "Error in Computing the Dimension of input csv file" << endl; - cout << "Error in Computing the Dimension of input csv file" << endl; - return 1; - } - //Convert Pagged Memory to the Pinned Memory for better performance - cudaHostRegister(dataPointsGPU,N*Dim*sizeof(double),0); - /** - * Copy the GPU version of input data (dataPointsGPU) to GPU memory (device_dataPointsGPU) - */ - cudaStream_t stream; - cudaStreamCreate(&stream); - - double * device_dataPointsGPU; - cudaMalloc ((void **) &device_dataPointsGPU, N*Dim*sizeof(double)); - cudaMemcpyAsync (device_dataPointsGPU, dataPointsGPU, N*Dim*sizeof(double),cudaMemcpyHostToDevice, stream); - gpuErrchk(cudaPeekAtLastError()); - - /** - * define a seed for random generator. Using a constant value produces - * the same set of random numbers and is good for debugging. 
Alternatively, - * we can select the seed number randomly as srand(time(NULL)) - */ - srand(17); - /** - * Initialization of Arrays B_IsNew and B_Dist - */ - for (int i = 0; i < N; ++i) { - allEntriesFilled[i]=0; - for (int j = 0; j < K; ++j) { - B_IsNew[i][j] = 1; - B_Dist[i][j] = -1.0; - } - } - /** - * Random Initialization of B_Index - */ - int randomIndex, iter; - for (int i = 0; i < N; ++i) { - for (int j = 0; j < K; ++j) { - iter = 1; - while (iter) { - randomIndex = rand() % N; - if (randomIndex != i) { - B_Index[i][j] = randomIndex; - iter = 0; - } - } - } - } - - /** - * Main Loop of the Algorithm - */ - bool iterate = true; - while (iterate) { - int c_criteria = 0; - int abort=0; - /** - * Create "New" for each Datapoint - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < K; ++j) { - if (float(rand() % 100) < sampleRate*100) { - if (B_IsNew[i][j] == 1) { - New_Index[i].push_back(B_Index[i][j]); - B_IsNew[i][j] = 0; - } - } - } - } - /** - * Create "New'"(or REVERSE("New")) for each Datapoint - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Index[i].size(); ++j) { - Reverse_New_Index[New_Index[i][j]].push_back(i); - } - } - /** - * Random Sampling from "New'" - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < Reverse_New_Index[i].size(); ++j) { - if (float(rand() % 100) < sampleRate*100) { - Sampled_Reverse_New_Index[i].push_back(Reverse_New_Index[i][j]); - } - } - } - /** - * "New"= "New" U SAMPLE("New'", pK) - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Index[i].size(); ++j) { - New_Final_List[i].push_back(New_Index[i][j]); - } - for (int j = 0; j < Sampled_Reverse_New_Index[i].size(); ++j) { - New_Final_List[i].push_back(Sampled_Reverse_New_Index[i][j]); - } - } - /** - * Remove duplicates from New_Final_List - */ - for (int i = 0; i < N; ++i) { - sort(New_Final_List[i].begin(), New_Final_List[i].end()); - auto last = std::unique(New_Final_List[i].begin(), New_Final_List[i].end()); - New_Final_List[i].erase(last, New_Final_List[i].end()); - } - - /** - * Max_New_Final_List_Length is the maximum length of New_Final_List array - */ - int Max_New_Final_List_Length=0; - - for (int i = 0; i < N; ++i) { - if (New_Final_List[i].size()> Max_New_Final_List_Length) Max_New_Final_List_Length=New_Final_List[i].size(); - } - /** - * ThreadsPerBlockNeeded is the required number of threads per block to compute the longest array of New_Final_List - */ - int ThreadsPerBlockNeeded=0; - for (int i = 0; i < Max_New_Final_List_Length; ++i) { - for (int j = i+1; j < Max_New_Final_List_Length; ++j) { - ++ThreadsPerBlockNeeded; - } - } - - /** - * Switch to GPU computations if the following conditions met. Otherwise proceed to CPU computations. 
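- * The GPU path is taken only when ThreadsPerBlockNeeded is below MAXTPB (one block can then cover the longest candidate list) and above MinimumThreads (the work is large enough to justify a kernel launch); otherwise the same pairwise distances are computed on the host.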
- */ - if (ThreadsPerBlockNeeded < MAXTPB && ThreadsPerBlockNeeded > MinimumThreads) { - /** - * TotalCounts is the total number of elements in New_Final_List - */ - int TotalCounts=0; - for (int i = 0; i < N; ++i) { - TotalCounts += New_Final_List[i].size(); - } - /** - * New_Final_List_1D is the 1D representation of New_Final_List for transferring to GPU - */ - int * New_Final_List_1D = new int [TotalCounts]; - int cnt=0; - - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Final_List[i].size(); ++j) { - New_Final_List_1D[cnt] = New_Final_List[i][j]; - ++cnt; - } - } - /** - * device_New_Final_List_1D is on the GPU memory and contains New_Final_List_1D - */ - int *device_New_Final_List_1D; - cudaMalloc ((void **) &device_New_Final_List_1D, TotalCounts*sizeof(int)); - gpuErrchk(cudaMemcpy (device_New_Final_List_1D, New_Final_List_1D, TotalCounts* sizeof(int),cudaMemcpyHostToDevice)); - /** - * New_Final_List_Index is the index of New_Final_List[i] data. It is needed as New_Final_List has variable size in each row of data. - */ - int * New_Final_List_Index = new int [N+1]; - New_Final_List_Index[0] = 0; - for (int i = 1; i < N+1; ++i) { - New_Final_List_Index[i] = New_Final_List[i-1].size()+New_Final_List_Index[i-1]; - } - /** - * device_New_Final_List_Index is on the GPU memory and contains New_Final_List_Index - */ - int *device_New_Final_List_Index; - cudaMalloc ((void **) &device_New_Final_List_Index, (N+1)*sizeof(int)); - gpuErrchk(cudaMemcpy (device_New_Final_List_Index, New_Final_List_Index, (N+1)* sizeof(int),cudaMemcpyHostToDevice)); - /** - * New_Final_List_Dist_Index is the index of pairs of distances computed in GPU. - */ - int * New_Final_List_Dist_Index = new int [N+1]; - int TotalCounts_Dist=0; - - for (int i = 0; i < N; ++i) { - New_Final_List_Dist_Index[i]=TotalCounts_Dist; - for (int j = 0; j < New_Final_List[i].size(); ++j) { - for (int k = j+1; k < New_Final_List[i].size(); ++k) { - ++TotalCounts_Dist; - } - } - } - New_Final_List_Dist_Index[N]=TotalCounts_Dist; - /** - * device_New_Final_List_Dist_Index is on the GPU memory and contains New_Final_List_Dist_Index - */ - int * device_New_Final_List_Dist_Index; - cudaMalloc ((void **) &device_New_Final_List_Dist_Index, (N+1)*sizeof(int)); - gpuErrchk(cudaMemcpy (device_New_Final_List_Dist_Index, New_Final_List_Dist_Index, (N+1) * sizeof(int),cudaMemcpyHostToDevice)); - /** - * device_New_Final_List_Dist_1D is on the GPU memory and contains 1D array of pairs of distances computed in GPU. - */ - double *device_New_Final_List_Dist_1D; - cudaMalloc ((void **) &device_New_Final_List_Dist_1D, TotalCounts_Dist*sizeof(double)); - /** - * Launch the Kernel to compute the distance computations for all pairs of the points. - * cudaDeviceSynchronize is required to ensure data transfer to GPU memory is already finished. 
- */ - gpuErrchk(cudaDeviceSynchronize()); - - logFile<< "Number of Blocks = "<> aggregateResults; - for (int j=0; j - * August 2019 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using boost::iostreams::mapped_file_source; -using boost::iostreams::stream; - -using namespace std; - -/** - * Read the output of linux command execution - * @param cmd is the linux command to be executed - * @return the output from the execution of the linux command - */ -std::string exec(const char* cmd) { - std::array buffer; - std::string result; - std::unique_ptr pipe(popen(cmd, "r"), pclose); - if (!pipe) { - throw std::runtime_error("popen() failed!"); - } - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { - result += buffer.data(); - } - return result; -} - -int main(int argc, char * const argv[]) { - /** - * The errors and informational messages are outputted to the log file - */ - ofstream logFile; - string logFileName="Setting.txt"; - logFile.open(logFileName); - - /** - * The input parameters are read from command line which are as follow. - * inputPath: The full path to the input file containig the dataset. - * outputPath: The full path to the output csv files. - * K: K in K-NN that means the desired number of Nearest Neighbours to be computed. - * sampleRate: The rate at which we do sampling. This parameter plays a key role in the performance. - * This parameter is a trades-off between the performance and the accuracy of the results. - * Values closer to 1 provides more accurate results but the execution takes longer. - * convThreshold: Convergance Threshold. A fixed integer is used here instead of delta*N*K. - * colIndex1 and colIndex2 (optional): The indices of columns from the input csv file where raw data exists continuously in between. - * If these two arguments were left blank, the code assumes that the entire input csv file is raw data - * and automatically computes the number of columns in the input csv file. 
- */ - string filePath, outputPath, outputPath2, inputPath,LogoutputPath; - int K,convThreshold, colIndex1=-1, colIndex2=-1; - float sampleRate; - - for (int i=1; ipath().extension() == ext){ - fileFound = true; - filePath = it->path().string(); - break; - } - ++it; - } - if (!fileFound){ - logFile << "CSV file is not found in the input path"; - cout << "CSV file is not found in the input path"; - return 1; - } - } - else if (string(argv[i])=="--K") K=atoi(argv[i+1]); - else if (string(argv[i])=="--sampleRate") sampleRate=stof(argv[i+1]); - else if (string(argv[i])=="--convThreshold") convThreshold=stof(argv[i+1]); - else if (string(argv[i])=="--outputPath"){ - boost::filesystem::path p(argv[i+1]); - - if(!boost::filesystem::exists(p) || !boost::filesystem::is_directory(p)) - { - logFile << "Incorrect output path"; - cout << "Incorrect output path"; - return 1; - } - - LogoutputPath=argv[i+1]; - boost::filesystem::path joinedPath = p / boost::filesystem::path("KNN_Indices.csv"); - outputPath = joinedPath.string(); - boost::filesystem::path joinedPath2 = p / boost::filesystem::path("KNN_Distances.csv"); - outputPath2 = joinedPath2.string(); - } - else if (string(argv[i])=="--colIndex1") { - colIndex1=stof(argv[i+1]); - if (colIndex1<1) { - logFile << "colIndex1 should be greater than 1"; - cout << "colIndex1 should be greater than 1"; - return 1; - } - } - else if (string(argv[i])=="--colIndex2") { - colIndex2=stof(argv[i+1]); - if (colIndex2<1) { - logFile << "colIndex2 should be greater than 1"; - cout << "colIndex2 should be greater than 1"; - return 1; - } - } - } - - logFile<<"------------The following Input Arguments were read------------"<= colIndex1-1 && j < colIndex2) dataPoints[i][j] = atof(temp2.c_str()); - temp.erase(0, temp.find(",") + 1); - } - } - } - mmap.close(); - - if (colIndex1 != -1) Dim=colIndex2-colIndex1+1; - if (Dim <1) { - logFile << "Error in Computing the Dimension of input csv file" << endl; - cout << "Error in Computing the Dimension of input csv file" << endl; - return 1; - } - /** - * define a seed for random generator. Using a constant value produces - * the same set of random numbers and is good for debugging. Alternatively, - * we can select the seed number randomly as srand(time(NULL)) - */ - srand(17); - /** - * Initialization of Arrays B_IsNew and B_Dist - */ - for (int i = 0; i < N; ++i) { - allEntriesFilled[i]=0; - for (int j = 0; j < K; ++j) { - B_IsNew[i][j] = 1; - B_Dist[i][j] = -1.0; - } - } - /** - * Random Initialization of B_Index - */ - int randomIndex, iter; - for (int i = 0; i < N; ++i) { - for (int j = 0; j < K; ++j) { - iter = 1; - while (iter) { - randomIndex = rand() % N; - if (randomIndex != i) { - B_Index[i][j] = randomIndex; - iter = 0; - } - } - } - } - /** - * Replace the farthest point in B_Index (for u1) with u2 if u2 is closer - *

- * This method corresponds to UPDATENN(B[u1], <u2, l(u1,u2)>) in the paper

- * @param Dist represents B_Dist - * @param Index represents B_Index - * @param IsNew represents B_IsNew - * @param u1 the indice of point that we want to potentially update its K-NN with the point u2 - * @param u2 the indice of potential K-NN fpr point u1 - * @param distance the spatial distance between u1 and u2 - * @param flag updates B_IsNew - * @return 1 if B_Index[u1][.] is updated, 0 otherwise - */ - auto UpdateNN = [&](int u1, int u2, double distance, int flag = 1) { - - if(allEntriesFilled[u1]==0){ - for (int j = 0; j < K; j++) { - if (B_Dist[u1][j] < 0) { - - for (int jj = 0; jj < j; jj++) {if (B_Index[u1][jj] == u2) return 0;} - - B_Dist[u1][j] = distance; - B_Index[u1][j] = u2; - B_IsNew[u1][j] = flag; - if (j==K-1) allEntriesFilled[u1]=1; - return 1;} - } - } - - else{ - for (int j = 0; j < K; j++) { - if (B_Index[u1][j] == u2) return 0; - } - - double max = DBL_MIN; - int index = -1; - for (int j = 0; j < K; j++) { - if (B_Dist[u1][j] > max) { - max = B_Dist[u1][j]; - index = j; - } - } - if (index == -1) { cout << "Error"; } - if (distance < max) { - B_Dist[u1][index] = distance; - B_Index[u1][index] = u2; - B_IsNew[u1][index] = flag; - return 1; - } - else { return 0; } - } - }; - /** - * Main Loop of the Algorithm - */ - bool iterate = true; - while (iterate) { - /** - * Create "New" for each Datapoint - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < K; ++j) { - if (float(rand() % 100) < sampleRate*100) { - if (B_IsNew[i][j] == 1) { - New_Index[i].push_back(B_Index[i][j]); - B_IsNew[i][j] = 0; - } - } - } - } - /** - * Create "New'"(or REVERSE("New")) for each Datapoint - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Index[i].size(); ++j) { - Reverse_New_Index[New_Index[i][j]].push_back(i); - } - } - /** - * Random Sampling from "New'" - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < Reverse_New_Index[i].size(); ++j) { - if (float(rand() % 100) < sampleRate*100) { - Sampled_Reverse_New_Index[i].push_back(Reverse_New_Index[i][j]); - } - } - } - /** - * "New"= "New" U SAMPLE("New'", pK) - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Index[i].size(); ++j) { - New_Final_List[i].push_back(New_Index[i][j]); - } - for (int j = 0; j < Sampled_Reverse_New_Index[i].size(); ++j) { - New_Final_List[i].push_back(Sampled_Reverse_New_Index[i][j]); - } - } - /** - * c=c+UPDATENN(B[u1],) - */ - int c_criteria = 0; - int abort=0; - - for (int i = 0; i < N; ++i) { - if (abort != 0) break; - - #pragma omp parallel for schedule(dynamic) - for (int it = 0; it < New_Final_List[i].size(); ++it) { - int par1= New_Final_List[i][it]; - - for (int it2 = it+1; it2 < New_Final_List[i].size(); ++it2) { - int par2= New_Final_List[i][it2]; - if (par1 != par2 && abort ==0) { - - double dist = 0; - for (int j = 0; j < Dim; ++j) { - dist += pow((dataPoints[par1][j] - dataPoints[par2][j]), 2); - } - double dista = sqrt(dist); - if (dista < epsilon) { - logFile << "Found Duplicate Data for Points "<< par1 << " and " << par2 <> aggregateResults; - for (int j=0; j - * August 2019 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using boost::iostreams::mapped_file_source; -using boost::iostreams::stream; -using namespace std; - -/** - * Read the output of linux command execution - * @param cmd is the linux command to be executed - * @return the output from the execution of the linux command - */ -std::string exec(const char* cmd) { - std::array buffer; - std::string result; - std::unique_ptr 
pipe(popen(cmd, "r"), pclose); - if (!pipe) { - throw std::runtime_error("popen() failed!"); - } - while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { - result += buffer.data(); - } - return result; -} - -int main(int argc, char * const argv[]) { - /** - * The errors and informational messages are outputted to the log file - */ - ofstream logFile; - string logFileName="Setting.txt"; - logFile.open(logFileName); - - /** - * The input parameters are read from command line which are as follow. - * inputPath: The full path to the input file containig the dataset. - * outputPath: The full path to the output csv files. - * K: K in K-NN that means the desired number of Nearest Neighbours to be computed. - * sampleRate: The rate at which we do sampling. This parameter plays a key role in the performance. - * This parameter is a trades-off between the performance and the accuracy of the results. - * Values closer to 1 provides more accurate results but the execution takes longer. - * convThreshold: Convergance Threshold. A fixed integer is used here instead of delta*N*K. - * colIndex1 and colIndex2 (optional): The indices of columns from the input csv file where raw data exists continuously in between. - * If these two arguments were left blank, the code assumes that the entire input csv file is raw data - * and automatically computes the number of columns in the input csv file. - */ - string filePath, outputPath, outputPath2, inputPath,LogoutputPath; - int K,convThreshold, colIndex1=-1, colIndex2=-1; - float sampleRate; - - for (int i=1; ipath().extension() == ext){ - fileFound = true; - filePath = it->path().string(); - break; - } - ++it; - } - if (!fileFound){ - logFile << "CSV file is not found in the input path"; - cout << "CSV file is not found in the input path"; - return 1; - } - } - else if (string(argv[i])=="--K") K=atoi(argv[i+1]); - else if (string(argv[i])=="--sampleRate") sampleRate=stof(argv[i+1]); - else if (string(argv[i])=="--convThreshold") convThreshold=stof(argv[i+1]); - else if (string(argv[i])=="--outputPath"){ - boost::filesystem::path p(argv[i+1]); - - if(!boost::filesystem::exists(p) || !boost::filesystem::is_directory(p)) - { - logFile << "Incorrect output path"; - cout << "Incorrect output path"; - return 1; - } - - LogoutputPath=argv[i+1]; - boost::filesystem::path joinedPath = p / boost::filesystem::path("KNN_Indices.csv"); - outputPath = joinedPath.string(); - boost::filesystem::path joinedPath2 = p / boost::filesystem::path("KNN_Distances.csv"); - outputPath2 = joinedPath2.string(); - } - else if (string(argv[i])=="--colIndex1") { - colIndex1=stof(argv[i+1]); - if (colIndex1<1) { - logFile << "colIndex1 should be greater than 1"; - cout << "colIndex1 should be greater than 1"; - return 1; - } - } - else if (string(argv[i])=="--colIndex2") { - colIndex2=stof(argv[i+1]); - if (colIndex2<1) { - logFile << "colIndex2 should be greater than 1"; - cout << "colIndex2 should be greater than 1"; - return 1; - } - } - } - - logFile<<"------------The following Input Arguments were read------------"< *New_Index = new std::vector[N]; - /** - * Data structure for REVERSE(new[v]) or new' - */ - vector *Reverse_New_Index = new vector[N]; - /** - * Data Structure for SAMPLE(new'[v],pk) - */ - vector *Sampled_Reverse_New_Index = new vector[N]; - /** - * Data Structure for new[v] U SAMPLE(new'[v],pk) - */ - list *New_Final_List = new list[N]; - /** - * Iterators to access data stored in the list - */ - list::iterator it, it2, it_temp; - /** - * An approximation of zero 
in computing distances. Two points with the distance - * smaller than epsilon are considered as one point. - */ - double epsilon = 1e-10; // - short* allEntriesFilled = new short[N]; - /** - * At first, let's Read Dataset from Input File Using Memory Mapping - */ - mapped_file_source mmap(filePath); - stream is(mmap, std::ios::binary); - if (is.fail()) - { - logFile << "error in Opening Input File" << endl; - cout << "error in Opening Input File" << endl; - return 1; - } - /** - * Remove the header info - */ - string dummyLine; - getline(is, dummyLine); - /** - * Reading the Entire Dataset - */ - if (argc==11){ - for (int i = 0; i < N; ++i) { - string temp, temp2; - getline(is, temp); - for (int j = 0; j < Dim; ++j) { - temp2 = temp.substr(0, temp.find(",")); - dataPoints[i][j] = atof(temp2.c_str()); - temp.erase(0, temp.find(",") + 1); - } - } - } else { - for (int i = 0; i < N; ++i) { - string temp, temp2; - getline(is, temp); - for (int j = 0; j < Dim; ++j) { - temp2 = temp.substr(0, temp.find(",")); - if (j >= colIndex1-1 && j < colIndex2) dataPoints[i][j] = atof(temp2.c_str()); - temp.erase(0, temp.find(",") + 1); - } - } - } - mmap.close(); - - if (colIndex1 != -1) Dim=colIndex2-colIndex1+1; - if (Dim <1) { - logFile << "Error in Computing the Dimension of input csv file" << endl; - cout << "Error in Computing the Dimension of input csv file" << endl; - return 1; - } - /** - * define a seed for random generator. Using a constant value produces - * the same set of random numbers and is good for debugging. Alternatively, - * we can select the seed number randomly as srand(time(NULL)) - */ - srand(17); - /** - * Initialization of Arrays B_IsNew and B_Dist - */ - for (int i = 0; i < N; ++i) { - allEntriesFilled[i]=0; - for (int j = 0; j < K; ++j) { - B_IsNew[i][j] = 1; - B_Dist[i][j] = -1.0; - } - } - /** - * Random Initialization of B_Index - */ - int randomIndex, iter; - for (int i = 0; i < N; ++i) { - for (int j = 0; j < K; ++j) { - iter = 1; - while (iter) { - randomIndex = rand() % N; - if (randomIndex != i) { - B_Index[i][j] = randomIndex; - iter = 0; - } - } - } - } - /** - * Replace the farthest point in B_Index (for u1) with u2 if u2 is closer - *

- * This method corresponds to UPDATENN(B[u1], <u2, l(u1,u2)>) in the paper

- * @param Dist represents B_Dist - * @param Index represents B_Index - * @param IsNew represents B_IsNew - * @param u1 the indice of point that we want to potentially update its K-NN with the point u2 - * @param u2 the indice of potential K-NN fpr point u1 - * @param distance the spatial distance between u1 and u2 - * @param flag updates B_IsNew - * @return 1 if B_Index[u1][.] is updated, 0 otherwise - */ - auto UpdateNN = [&](int u1, int u2, double distance, int flag = 1) { - - if(allEntriesFilled[u1]==0){ - for (int j = 0; j < K; j++) { - if (B_Dist[u1][j] < 0) { - - for (int jj = 0; jj < j; jj++) {if (B_Index[u1][jj] == u2) return 0;} - - B_Dist[u1][j] = distance; - B_Index[u1][j] = u2; - B_IsNew[u1][j] = flag; - if (j==K-1) allEntriesFilled[u1]=1; - return 1;} - } - } - - else{ - for (int j = 0; j < K; j++) { - if (B_Index[u1][j] == u2) return 0; - } - - double max = DBL_MIN; - int index = -1; - for (int j = 0; j < K; j++) { - if (B_Dist[u1][j] > max) { - max = B_Dist[u1][j]; - index = j; - } - } - if (index == -1) { cout << "Error"; } - if (distance < max) { - B_Dist[u1][index] = distance; - B_Index[u1][index] = u2; - B_IsNew[u1][index] = flag; - return 1; - } - else { return 0; } - } - return 0; - }; - /** - * Main Loop of the Algorithm - */ - bool iterate = true; - while (iterate) { - /** - * Create "New" for each Datapoint - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < K; ++j) { - if (float(rand() % 100) < sampleRate*100) { - if (B_IsNew[i][j] == 1) { - New_Index[i].push_back(B_Index[i][j]); - B_IsNew[i][j] = 0; - } - } - } - } - /** - * Create "New'"(or REVERSE("New")) for each Datapoint - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Index[i].size(); ++j) { - Reverse_New_Index[New_Index[i][j]].push_back(i); - } - } - /** - * Random Sampling from "New'" - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < Reverse_New_Index[i].size(); ++j) { - if (float(rand() % 100) < sampleRate*100) { - Sampled_Reverse_New_Index[i].push_back(Reverse_New_Index[i][j]); - } - } - } - /** - * "New"= "New" U SAMPLE("New'", pK) - */ - for (int i = 0; i < N; ++i) { - for (int j = 0; j < New_Index[i].size(); ++j) { - New_Final_List[i].push_back(New_Index[i][j]); - } - for (int j = 0; j < Sampled_Reverse_New_Index[i].size(); ++j) { - New_Final_List[i].push_back(Sampled_Reverse_New_Index[i][j]); - } - } - /** - * c=c+UPDATENN(B[u1],) - */ - int c_criteria = 0; - for (int i = 0; i < N; ++i) { - for (it = New_Final_List[i].begin(); it != New_Final_List[i].end(); it++) { - it_temp = it; - advance(it_temp, 1); - for (it2 = it_temp; it2 != New_Final_List[i].end(); it2++) { - if (*it != *it2) { - double dist = 0; - for (int i = 0; i < Dim; ++i) { - dist += pow((dataPoints[*it][i] - dataPoints[*it2][i]), 2); - } - double dista = sqrt(dist); - if (dista < epsilon) { - logFile << "Found Duplicate Data for Points "<< *it << " and " << *it2<> aggregateResults; - for (int j=0; j\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
-serialize = - {major}.{minor}.{patch}-{release}{dev} - {major}.{minor}.{patch} - -[bumpversion:part:release] -optional_value = _ -first_value = dev -values = - dev - _ - -[bumpversion:part:dev] - -[bumpversion:file:pyproject.toml] -search = version = "{current_version}" -replace = version = "{new_version}" - -[bumpversion:file:plugin.json] - -[bumpversion:file:VERSION] - -[bumpversion:file:README.md] - -[bumpversion:file:CHANGELOG.md] - -[bumpversion:file:src/polus/images/clustering/k_means/__init__.py] diff --git a/clustering/k-means-clustering-tool/CHANGELOG.md b/clustering/k-means-clustering-tool/CHANGELOG.md deleted file mode 100644 index 5689120f4..000000000 --- a/clustering/k-means-clustering-tool/CHANGELOG.md +++ /dev/null @@ -1,13 +0,0 @@ -# K-Means Clustering(0.3.5) - -1. This plugin is updated only to the new plugin standards -2. Before plugin support only `.csv` as an input files supported `.csv` and `.feather` file formats. Now this plugin support other vaex supported file formats both as inputs and outputs. -3. Some additional input arguments added `filePattern`, `fileExtension` -4. Implemented latest updated filepattern package -5. This plugin is now installable with pip. -6. Argparse package is replaced with Typer package for command line arguments. -7. `baseCommand` added in a plugin manifiest. -8. `--preview` flag is added which shows outputs to be generated by this plugin. -9. Use `python -m python -m polus.plugins.clustering.k_means` to run plugin from command line. -10. No unnitests before and new pytests added for testing. -11. Implemented parallel processing diff --git a/clustering/k-means-clustering-tool/Dockerfile b/clustering/k-means-clustering-tool/Dockerfile deleted file mode 100644 index 1d0d53a9e..000000000 --- a/clustering/k-means-clustering-tool/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM polusai/bfio:2.1.9 - -# environment variables defined in polusai/bfio -ENV EXEC_DIR="/opt/executables" -ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".arrow" -ENV POLUS_LOG="INFO" - -# Work directory defined in the base container -WORKDIR ${EXEC_DIR} - -# TODO: Change the tool_dir to the tool directory -ENV TOOL_DIR="clustering/k-means-clustering-tool" - -# Copy the repository into the container -RUN mkdir image-tools -COPY . ${EXEC_DIR}/image-tools - -# Install the tool -RUN pip3 install "${EXEC_DIR}/image-tools/${TOOL_DIR}" --no-cache-dir - -# Set the entrypoint -# TODO: Change the entrypoint to the tool entrypoint -ENTRYPOINT ["python3", "-m", "polus.images.clustering.k_means_clustering"] -CMD ["--help"] diff --git a/clustering/k-means-clustering-tool/README.md b/clustering/k-means-clustering-tool/README.md deleted file mode 100644 index 5dd8ec300..000000000 --- a/clustering/k-means-clustering-tool/README.md +++ /dev/null @@ -1,65 +0,0 @@ -# K-Means Clustering(v0.3.5) - -The K-Means Clustering plugin clusters the data using Scikit-learn K-Means clustering algorithm and outputs csv file. Each instance(row) in the input csv file is assigned to one of the clusters. The output csv file contains the column 'Cluster' that shows which cluster the instance belongs to. - -## Inputs: - -### Input data: -The input tabular data that need to be clustered. This plugin supports `.csv` and `.arrow` file formats - -### Methods: -Choose any one of the method mentioned to determine the k-value and cluster the data. 
- -#### Elbow method -The elbow method runs k-means clustering for a range of values of k and for each k value it calculates the within cluster sum of squared errors (WSS). The idea behind this method is that SSE tends to decrease towards 0 as k-value increases. The goal here is to choose a k-value that has low WSS and the elbow represents where there is diminishing returns by increasing k. - -#### Calinski-Harabasz index -The Calinski-Harabasz index is defined as the ratio of the sum of between-cluster dispersion to the sum of within-cluster dispersion. To choose k, pick maximum number of clusters to be considered and then choose the value of k with the highest score. - -#### Davies-Bouldin index -The Davies-Bouldin index is defined as the average similarity measure of each cluster with its most similar one, where similarity is a ratio of within-cluster distances to between-cluster distances. To choose k value, pick maximum number of clusters to be considered and choose the value of k with lowest value for DB_index. - -### Manual -Select manual method only when you know the number of clusters required to cluster the data. - -### Minimum range: -Enter starting number of sequence in range function to determine k-value. This parameter is required only when elbow or Calinski Harabasz or Davies Bouldin methods are selected. - -### Maximum range: -Enter ending number of sequence in range function to determine k-value. This parameter is required only when elbow or Calinski Harabasz or Davies Bouldin methods are selected. - -### Number of clusters: -Enter k-value if you already know how many clusters are required. This parameter is required only when manual method is selected. - -## Note: -1. If 'Manual' method is selected, enter number of clusters required. -2. If 'Elbow' or 'CalinskiHarabasz' or 'DaviesBouldin' methods are selected, then you should enter values for both 'maximumrange' and 'minimumrange'. -3. The 'minimumrange'value should be >1. - -## Output: -The output is a tabular file containing the cluster data to which each instance in the data belongs to. - -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). 
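
The k-selection strategies described above (elbow, Calinski-Harabasz, Davies-Bouldin) can be illustrated with a short, self-contained scikit-learn sketch. This is illustrative only and is not the plugin's exact implementation; the synthetic data, the k range, and the line-distance elbow heuristic used here are assumptions.

```python
# Illustrative sketch of the three automatic k-selection methods described above.
# Not the plugin's exact code: the synthetic data and the elbow heuristic are assumptions.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

rng = np.random.default_rng(0)
data = rng.random((500, 4))           # hypothetical numeric-only dataset
k_range = range(2, 8)                 # corresponds to minimumrange..maximumrange

wss, ch_scores, db_scores = [], [], []
for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(data)
    labels = kmeans.labels_
    wss.append(kmeans.inertia_)       # within-cluster sum of squared errors
    ch_scores.append(calinski_harabasz_score(data, labels))
    db_scores.append(davies_bouldin_score(data, labels))

# Elbow: pick the k whose WSS point lies farthest from the straight line joining
# the first and last points of the curve (one common heuristic).
pts = np.column_stack([np.arange(len(wss), dtype=float), wss])
line = (pts[-1] - pts[0]) / np.linalg.norm(pts[-1] - pts[0])
vecs = pts - pts[0]
dists = np.linalg.norm(vecs - np.outer(vecs @ line, line), axis=1)
k_elbow = k_range[int(np.argmax(dists))]

k_ch = k_range[int(np.argmax(ch_scores))]   # Calinski-Harabasz: highest score wins
k_db = k_range[int(np.argmin(db_scores))]   # Davies-Bouldin: lowest index wins
print(k_elbow, k_ch, k_db)
```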
- -## Options - -This plugin takes seven input arguments and one output argument: - -| Name | Description | I/O | Type | -| ---------------- | --------------------------------------------------------------------------- | ------ | ------------- | -| `--inpdir` | Input tabular data | Input | genericData | -| `--filePattern` | Pattern to parse tabular files | Input | string | -| `--methods` | Select either Elbow or Calinski Harabasz or Davies Bouldin or Manual method | Input | enum | -| `--minimumrange` | Enter minimum k-value | Input | integer | -| `--maximumrange` | Enter maximum k-value | Input | integer | -| `--numofclus` | Enter number of clusters | Input | integer | -| `--outdir` | Output collection | Output | genericData | -| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/clustering/k-means-clustering-tool/VERSION b/clustering/k-means-clustering-tool/VERSION deleted file mode 100644 index c2c0004f0..000000000 --- a/clustering/k-means-clustering-tool/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.3.5 diff --git a/clustering/k-means-clustering-tool/build-docker.sh b/clustering/k-means-clustering-tool/build-docker.sh deleted file mode 100644 index 4dedab2f6..000000000 --- a/clustering/k-means-clustering-tool/build-docker.sh +++ /dev/null @@ -1,23 +0,0 @@ - -#!/bin/bash - -# Change the name of the tool here -tool_dir="clustering" -tool_name="k-means-clustering-tool" - -# The version is read from the VERSION file -version=$(", -"Kelechi Nina Mezu ", -"hamshkhawar " -] -readme = "README.md" -packages = [{include = "polus", from = "src"}] - -[tool.poetry.dependencies] -python = "^3.9" -filepattern = "^2.0.0" -typer = "^0.7.0" -nyxus = "^0.5.0" -vaex = "^4.7.0" -scikit_learn="^1.0.2" -numpy = "<2.0.0" - -[tool.poetry.group.dev.dependencies] -bump2version = "^1.0.1" -pre-commit = "^3.0.4" -black = "^23.1.0" -flake8 = "^6.0.0" -mypy = "^1.0.0" -pytest = "^7.2.1" -ipykernel = "^6.21.2" -requests = "^2.28.2" -pandas = "^2.0.1" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" diff --git a/clustering/k-means-clustering-tool/run-plugin.sh b/clustering/k-means-clustering-tool/run-plugin.sh deleted file mode 100644 index 18c8bea46..000000000 --- a/clustering/k-means-clustering-tool/run-plugin.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -version=$( None: - """K-means clustering plugin.""" - logger.info(f"inpDir = {inp_dir}") - logger.info(f"filePattern = {file_pattern}") - logger.info(f"minimumRange = {minimum_range}") - logger.info(f"maximumRange = {maximum_range}") - logger.info(f"numOfClus = {num_of_clus}") - logger.info(f"outDir = {out_dir}") - - assert inp_dir.exists(), f"{inp_dir} doesnot exist!! Please check input path again" - assert out_dir.exists(), f"{out_dir} doesnot exist!! Please check output path again" - assert file_pattern in [ - ".csv", - ".arrow", - ], f"{file_pattern} tabular files are not supported by this plugin" - - num_threads = max([cpu_count(), 2]) - - pattern = ".*" + file_pattern - fps = fp.FilePattern(inp_dir, pattern) - print(pattern) - - if not fps: - msg = f"No {file_pattern} files found." 
- raise ValueError(msg) - - if preview: - with open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: - out_json: dict[str, Any] = { - "filepattern": pattern, - "outDir": [], - } - for file in fps(): - out_name = str(file[1][0].stem) + POLUS_TAB_EXT - out_json["outDir"].append(out_name) - json.dump(out_json, jfile, indent=2) - - flist = [f[1][0] for f in fps()] - - with multiprocessing.Pool(processes=num_threads) as executor: - executor.map( - partial( - km.clustering, - file_pattern=pattern, - methods=methods, - minimum_range=minimum_range, - maximum_range=maximum_range, - num_of_clus=num_of_clus, - out_dir=out_dir, - ), - flist, - ) - executor.close() - executor.join() - - -if __name__ == "__main__": - app() diff --git a/clustering/k-means-clustering-tool/src/polus/images/clustering/k_means/k_means.py b/clustering/k-means-clustering-tool/src/polus/images/clustering/k_means/k_means.py deleted file mode 100644 index 3e58f8a33..000000000 --- a/clustering/k-means-clustering-tool/src/polus/images/clustering/k_means/k_means.py +++ /dev/null @@ -1,216 +0,0 @@ -"""K_means clustering.""" -import logging -import os -import pathlib -from typing import Optional - -import numpy -import numpy as np -import numpy.matlib -import vaex -from sklearn.cluster import KMeans -from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score - -from .utils import Methods - -# Initialize the logger -logging.basicConfig( - format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", - datefmt="%d-%b-%y %H:%M:%S", -) -logger = logging.getLogger("main") -logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) -POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") - - -def elbow(data_array: np.array, minimum_range: int, maximum_range: int) -> np.array: - """Determine k value and cluster data using elbow method. - - Args: - data_array : Input data. - minimum_range : Starting number of sequence in range function to determine k-value. - maximum_range : Ending number of sequence in range function to determine k-value. - - Returns: - Labeled data. 
- """ - sse = [] - label_value = [] - logger.info("Starting Elbow Method...") - K = range(minimum_range, maximum_range + 1) - for k in K: - kmeans = KMeans(n_clusters=k, random_state=9).fit(data_array) - centroids = kmeans.cluster_centers_ - pred_clusters = kmeans.predict(data_array) - curr_sse = 0 - - # calculate square of Euclidean distance of each point from its cluster center and add to current WSS - logger.info("Calculating Euclidean distance...") - for i in range(len(data_array)): - curr_center = centroids[pred_clusters[i]] - curr_sse += np.linalg.norm(data_array[i] - np.array(curr_center)) ** 2 - sse.append(curr_sse) - labels = kmeans.labels_ - label_value.append(labels) - - logger.info("Finding elbow point in curve...") - # Find the elbow point in the curve - points = len(sse) - # Get coordinates of all points - coord = np.vstack((range(points), sse)).T - # First point - f_point = coord[0] - # Vector between first and last point - linevec = coord[-1] - f_point - # Normalize the line vector - linevecn = linevec / np.sqrt(np.sum(linevec**2)) - # Vector between all point and first point - vecf = coord - f_point - # Parallel vector - prod = np.sum(vecf * numpy.matlib.repmat(linevecn, points, 1), axis=1) - vecfpara = np.outer(prod, linevecn) - # Perpendicular vector - vecline = vecf - vecfpara - # Distance from curve to line - dist = np.sqrt(np.sum(vecline**2, axis=1)) - # Maximum distance point - k_cluster = np.argmax(dist) + minimum_range - logger.info("k cluster: %s", k_cluster) - logger.info("label value: %s", label_value) - logger.info("Setting label_data") - label_data = label_value[k_cluster] - return label_data - - -def calinski_davies( - data_array: np.array, methods: Methods, minimum_range: int, maximum_range: int -) -> np.array: - """Determine k value and cluster data using Calinski Harabasz Index method or Davies Bouldin based on method selection. - - Args: - data: Input data. - methods: Select either Calinski Harabasz or Davies Bouldin method. - minimum_range: Starting number of sequence in range function to determine k-value. - maximum_range:Ending number of sequence in range function to determine k-value. - - Returns: - Labeled data. - """ - K = range(minimum_range, maximum_range + 1) - chdb = [] - label_value = [] - for k in K: - kmeans = KMeans(n_clusters=k, random_state=9).fit(data_array) - labels = kmeans.labels_ - label_value.append(labels) - if f"{methods}" == "CalinskiHarabasz": - ch_db = calinski_harabasz_score(data_array, labels) - else: - ch_db = davies_bouldin_score(data_array, labels) - chdb.append(ch_db) - if f"{methods}" == "CalinskiHarabasz": - score = max(chdb) - else: - score = min(chdb) - k_cluster = chdb.index(score) - label_data = label_value[k_cluster] - return label_data - - -def clustering( - file: pathlib.Path, - file_pattern: str, - methods: Methods, - minimum_range: int, - maximum_range: int, - num_of_clus: int, - out_dir: pathlib.Path, -): - """K-means clustering methods to find clusters of similar or more related objects. - - Args: - file: Input path. - file_pattern: Pattern to parse tabular files. - methods: Select either Calinski Harabasz or Davies Bouldin method or Manual. - minimum_range: Starting number of sequence in range function to determine k-value. - maximum_range:Ending number of sequence in range function to determine k-value. 
- """ - # Get file name - filename = file.stem - logger.info("Started reading the file " + file.name) - with open(file, encoding="utf-8", errors="ignore") as fr: - ncols = len(fr.readline().split(",")) - chunk_size = max([2**24 // ncols, 1]) - - if f"{file_pattern}" == ".csv": - df = vaex.read_csv(file, convert=True, chunk_size=chunk_size) - else: - df = vaex.open(file) - # Get list of column names - cols = df.get_column_names() - - # Separate data by categorical and numerical data types - numerical = [] - categorical = [] - for col in cols: - if df[col].dtype == str: - categorical.append(col) - else: - numerical.append(col) - # Remove label field - if "label" in numerical: - numerical.remove("label") - - if numerical is None: - raise ValueError("There are no numerical features in the data.") - else: - data = df[numerical] - - if categorical: - cat_array = df[categorical] - else: - logger.info("No categorical features found in the data") - - if f"{methods}" != "Manual": - # Check whether minimum range and maximum range value is entered - if methods and not (minimum_range or maximum_range): - raise ValueError( - "Enter both minimumrange and maximumrange to determine k-value." - ) - if minimum_range <= 1: - raise ValueError("Minimumrange should be greater than 1.") - logger.info( - "Determining k-value using " + methods + " and clustering the data." - ) - if f"{methods}" == "CalinskiHarabasz": - label_data = calinski_davies(data, methods, minimum_range, maximum_range) - if f"{methods}" == "DaviesBouldin": - label_data = calinski_davies(data, methods, minimum_range, maximum_range) - if f"{methods}" == "Elbow": - label_data = elbow(data, minimum_range, maximum_range) - else: - # Check whether numofclus is entered - if not num_of_clus: - raise ValueError("Enter number of clusters") - kvalue = num_of_clus - kmeans = KMeans(n_clusters=kvalue).fit(data) - label_data = kmeans.labels_ - - # Cluster data using K-Means clustering - logger.info("Adding Cluster Data") - data["Cluster"] = label_data - - # Add Categorical Data back to data processed - if categorical: - logger.info("Adding categorical data") - for col in categorical: - data[col] = cat_array[col].values - - # Save dataframe to feather file or to csv file - out_file = pathlib.Path(out_dir, (filename + POLUS_TAB_EXT)) - - if f"{POLUS_TAB_EXT}" in [".feather", ".arrow"]: - data.export_feather(out_file) - else: - logger.info("Saving csv file") - data.export_csv(out_file, chunk_size=chunk_size) diff --git a/clustering/k-means-clustering-tool/src/polus/images/clustering/k_means/utils.py b/clustering/k-means-clustering-tool/src/polus/images/clustering/k_means/utils.py deleted file mode 100644 index 91bb81bf8..000000000 --- a/clustering/k-means-clustering-tool/src/polus/images/clustering/k_means/utils.py +++ /dev/null @@ -1,12 +0,0 @@ -"""K_means clustering.""" -import enum - - -class Methods(str, enum.Enum): - """Clustering methods to determine k-value.""" - - ELBOW = "Elbow" - CALINSKIHARABASZ = "CalinskiHarabasz" - DAVIESBOULDIN = "DaviesBouldin" - MANUAL = "Manual" - Default = "Elbow" diff --git a/clustering/k-means-clustering-tool/tests/__init__.py b/clustering/k-means-clustering-tool/tests/__init__.py deleted file mode 100644 index 36f89f937..000000000 --- a/clustering/k-means-clustering-tool/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""K_means clustering.""" diff --git a/clustering/k-means-clustering-tool/tests/conftest.py b/clustering/k-means-clustering-tool/tests/conftest.py deleted file mode 100644 index 58dce0fef..000000000 --- 
a/clustering/k-means-clustering-tool/tests/conftest.py +++ /dev/null @@ -1,91 +0,0 @@ -"""Test Fixtures.""" - -import pathlib -import shutil -import tempfile - -import numpy as np -import pandas as pd -import pytest - - -class Generatedata: - """Generate tabular data with several different file format.""" - - def __init__(self, file_pattern: str, size: int, outname: str) -> None: - """Define instance attributes.""" - self.dirpath = pathlib.Path.cwd() - self.inp_dir = tempfile.mkdtemp(dir=self.dirpath) - self.out_dir = tempfile.mkdtemp(dir=self.dirpath) - self.file_pattern = file_pattern - self.size = size - self.outname = outname - self.x = self.create_dataframe() - - def get_inp_dir(self) -> pathlib.Path: - """Get input directory.""" - return pathlib.Path(self.inp_dir) - - def get_out_dir(self) -> pathlib.Path: - """Get output directory.""" - return pathlib.Path(self.out_dir) - - def create_dataframe(self) -> pd.core.frame.DataFrame: - """Create Pandas dataframe.""" - rng = np.random.default_rng() - diction_1 = { - "A": np.linspace(0.0, 4.0, self.size, dtype="float32", endpoint=False), - "B": np.linspace(0.0, 6.0, self.size, dtype="float32", endpoint=False), - "C": np.linspace(0.0, 8.0, self.size, dtype="float32", endpoint=False), - "D": np.linspace(0.0, 10.0, self.size, dtype="float32", endpoint=False), - "label": rng.integers(low=1, high=4, size=self.size), - } - - return pd.DataFrame(diction_1) - - def csv_func(self) -> None: - """Convert pandas dataframe to csv file format.""" - self.x.to_csv(pathlib.Path(self.inp_dir, self.outname), index=False) - - def arrow_func(self) -> None: - """Convert pandas dataframe to Arrow file format.""" - self.x.to_feather(pathlib.Path(self.inp_dir, self.outname)) - - def __call__(self) -> None: - """To make a class callable.""" - data_ext = { - ".csv": self.csv_func, - ".arrow": self.arrow_func, - } - - return data_ext[self.file_pattern]() - - def clean_directories(self) -> None: - """Remove files.""" - for d in self.dirpath.iterdir(): - if d.is_dir() and d.name.startswith("tmp"): - shutil.rmtree(d) - - -def pytest_addoption(parser: pytest.Parser) -> None: - """Add options to pytest.""" - parser.addoption( - "--slow", - action="store_true", - dest="slow", - default=False, - help="run slow tests", - ) - - -@pytest.fixture( - params=[ - ("CalinskiHarabasz", 500, ".csv", 2, 5), - ("DaviesBouldin", 250, ".arrow", 2, 7), - ("Elbow", 500, ".arrow", 2, 10), - ("Manual", 200, ".arrow", 2, 5), - ], -) -def get_params(request: pytest.FixtureRequest) -> pytest.FixtureRequest: - """To get the parameter of the fixture.""" - return request.param diff --git a/clustering/k-means-clustering-tool/tests/test_main.py b/clustering/k-means-clustering-tool/tests/test_main.py deleted file mode 100644 index 7c516c759..000000000 --- a/clustering/k-means-clustering-tool/tests/test_main.py +++ /dev/null @@ -1,142 +0,0 @@ -"""K_means clustering.""" - -import shutil - -import filepattern as fp -import pytest -import vaex -from polus.images.clustering.k_means import k_means as km -from polus.images.clustering.k_means.__main__ import app -from typer.testing import CliRunner - -from .conftest import Generatedata - -runner = CliRunner() - - -@pytest.mark.parametrize( - ("ext", "minrange", "maxrange"), - [(".arrow", 2, 5), (".csv", 2, 7)], -) -@pytest.mark.skipif("not config.getoption('slow')") -def test_elbow(ext: str, minrange: int, maxrange: int) -> None: - """Testing elbow function.""" - d = Generatedata(ext, outname=f"data_1{ext}", size=10000) - d() - pattern = f".*{ext}" - fps = 
fp.FilePattern(d.get_inp_dir(), pattern) - - for file in fps(): - if f"{pattern}" == ".csv": - df = vaex.read_csv(file[1][0], convert=True) - else: - df = vaex.open(file[1][0]) - - label_data = km.elbow( - data_array=df[:, :4].values, - minimum_range=minrange, - maximum_range=maxrange, - ) - - assert label_data is not None - - d.clean_directories() - - -@pytest.mark.parametrize( - ("method", "datasize", "ext", "minrange", "maxrange"), - [ - ("CalinskiHarabasz", 500, ".arrow", 2, 5), - ("DaviesBouldin", 600, ".csv", 2, 7), - ], -) -@pytest.mark.skipif("not config.getoption('slow')") -def test_calinski_davies( - method: str, - datasize: int, - ext: str, - minrange: int, - maxrange: int, -) -> None: - """Testing calinski_davies and davies_bouldin methods.""" - d = Generatedata(ext, outname=f"data_1{ext}", size=datasize) - d() - pattern = f".*{ext}" - fps = fp.FilePattern(d.get_inp_dir(), pattern) - - for file in fps(): - if f"{pattern}" == ".csv": - df = vaex.read_csv(file[1][0], convert=True) - else: - df = vaex.open(file[1][0]) - - label_data = km.calinski_davies( - data_array=df[:, :4].values, - methods=method, - minimum_range=minrange, - maximum_range=maxrange, - ) - - assert label_data is not None - - d.clean_directories() - - -@pytest.mark.skipif("not config.getoption('slow')") -def test_clustering(get_params: pytest.FixtureRequest) -> None: - """Test clustering function.""" - method, datasize, ext, minrange, maxrange = get_params - d = Generatedata(ext, outname=f"data_1{ext}", size=datasize) - d() - pattern = f".*{ext}" - numclusters = 3 - fps = fp.FilePattern(d.get_inp_dir(), pattern) - for file in fps(): - km.clustering( - file=file[1][0], - file_pattern=ext, - methods=method, - minimum_range=minrange, - maximum_range=maxrange, - num_of_clus=numclusters, - out_dir=d.get_out_dir(), - ) - assert d.get_out_dir().joinpath("data_1.arrow") - df = vaex.open(d.get_out_dir().joinpath("data_1.arrow")) - assert "Cluster" in df.columns - d.clean_directories() - - -def test_cli(get_params: pytest.FixtureRequest) -> None: - """Test Cli.""" - method, data_size, inpext, minrange, maxrange = get_params - d = Generatedata(inpext, outname=f"data_1{inpext}", size=data_size) - d() - shutil.copy( - d.get_inp_dir().joinpath(f"data_1{inpext}"), - d.get_inp_dir().joinpath(f"data_2{inpext}"), - ) - numclusters = 3 - - result = runner.invoke( - app, - [ - "--inpDir", - d.get_inp_dir(), - "--filePattern", - inpext, - "--methods", - method, - "--minimumRange", - minrange, - "--maximumRange", - maxrange, - "--numOfClus", - numclusters, - "--outDir", - d.get_out_dir(), - ], - ) - assert result.exit_code == 0 - - d.clean_directories() diff --git a/clustering/outlier-removal-tool/.bumpversion.cfg b/clustering/outlier-removal-tool/.bumpversion.cfg deleted file mode 100644 index 106859eb1..000000000 --- a/clustering/outlier-removal-tool/.bumpversion.cfg +++ /dev/null @@ -1,29 +0,0 @@ -[bumpversion] -current_version = 0.2.7 -commit = True -tag = False -parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
-serialize = - {major}.{minor}.{patch}-{release}{dev} - {major}.{minor}.{patch} - -[bumpversion:part:release] -optional_value = _ -first_value = dev -values = - dev - _ - -[bumpversion:part:dev] - -[bumpversion:file:pyproject.toml] -search = version = "{current_version}" -replace = version = "{new_version}" - -[bumpversion:file:VERSION] - -[bumpversion:file:README.md] - -[bumpversion:file:plugin.json] - -[bumpversion:file:src/polus/images/clustering/outlier_removal/__init__.py] diff --git a/clustering/outlier-removal-tool/.dockerignore b/clustering/outlier-removal-tool/.dockerignore deleted file mode 100644 index 7c603f814..000000000 --- a/clustering/outlier-removal-tool/.dockerignore +++ /dev/null @@ -1,4 +0,0 @@ -.venv -out -tests -__pycache__ diff --git a/clustering/outlier-removal-tool/.gitignore b/clustering/outlier-removal-tool/.gitignore deleted file mode 100644 index 9ed1c3775..000000000 --- a/clustering/outlier-removal-tool/.gitignore +++ /dev/null @@ -1,23 +0,0 @@ -# Jupyter Notebook -.ipynb_checkpoints -poetry.lock -../../poetry.lock -# Environments -.env -.myenv -.venv -env/ -venv/ -# test data directory -data -# yaml file -.pre-commit-config.yaml -# hidden files -.DS_Store -.ds_store -# flake8 -.flake8 -../../.flake8 -__pycache__ -.mypy_cache -requirements.txt diff --git a/clustering/outlier-removal-tool/CHANGELOG.md b/clustering/outlier-removal-tool/CHANGELOG.md deleted file mode 100644 index 59c463e1e..000000000 --- a/clustering/outlier-removal-tool/CHANGELOG.md +++ /dev/null @@ -1,15 +0,0 @@ -# [0.2.6-dev0] - 2024-01-12 - -## Added - -- Pytests to test this plugin -- This plugin is now installable with pip. -- Added support for arrow file format in addition to csv - -## Changed - -- Updated dependencies (bfio, filepattern, preadator) to latest -- Argparse package is replaced with Typer package for command line arguments -- Replaced docker base image with latest container image with pre-installed bfio -- Replaced pandas with vaex -- Seperating descriptive from numerical features for outlier detection if present in the tabular data diff --git a/clustering/outlier-removal-tool/Dockerfile b/clustering/outlier-removal-tool/Dockerfile deleted file mode 100644 index 3aacb378b..000000000 --- a/clustering/outlier-removal-tool/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM polusai/bfio:2.3.3 - -# environment variables defined in polusai/bfio -ENV EXEC_DIR="/opt/executables" -ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".arrow" -ENV POLUS_LOG="INFO" - -# Work directory defined in the base container -WORKDIR ${EXEC_DIR} - -# TODO: Change the tool_dir to the tool directory -ENV TOOL_DIR="clustering/outlier-removal-tool" - -# Copy the repository into the container -RUN mkdir image-tools -COPY . ${EXEC_DIR}/image-tools - -# Install the tool -RUN pip3 install "${EXEC_DIR}/image-tools/${TOOL_DIR}" --no-cache-dir - -# Set the entrypoint -# TODO: Change the entrypoint to the tool entrypoint -ENTRYPOINT ["python3", "-m", "polus.images.clustering.outlier_removal"] -CMD ["--help"] diff --git a/clustering/outlier-removal-tool/README.md b/clustering/outlier-removal-tool/README.md deleted file mode 100644 index 453ca1d54..000000000 --- a/clustering/outlier-removal-tool/README.md +++ /dev/null @@ -1,52 +0,0 @@ -# Outlier removal (v0.2.7) - -The outlier removal plugin removes the outliers from the data based on the method selected and outputs csv file. The output will have separate csv files for inliers and outliers. The input file should be in csv format. 
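
As a quick orientation before the method details below, the following is a condensed, illustrative sketch of the inlier/outlier split this plugin produces. It is not the plugin's implementation (see the deleted `outlier_removal.py` further down in this diff); the synthetic DataFrame and parameter values are assumptions.

```python
# Condensed sketch of the inlier/outlier split described in this README.
# Illustrative only; the synthetic DataFrame and parameters are assumptions.
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(1000, 3)), columns=["a", "b", "c"])

features = StandardScaler().fit_transform(df)     # standardize numeric columns
labels = IsolationForest(n_estimators=200, random_state=19).fit_predict(features)

df["anomaly"] = labels                             # +1 = inlier, -1 = outlier
inliers = df[df["anomaly"] == 1].drop(columns="anomaly")
outliers = df[df["anomaly"] == -1].drop(columns="anomaly")
print(f"{len(inliers)} inliers, {len(outliers)} outliers")
```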
- -The plugin support vaex supported input csv file that need outliers to be removed. The file should be in csv format. This is a required parameter for the plugin. - -## Methods - -Choose any one of the methods mentioned to remove outliers from the data. - -### Isolation Forest - -Ensemble-based unsupervised method for outlier detection. The algorithm isolates outliers instead of normal instances. It works based on the principle that outliers are few and different and hence, the outliers can be identified easier than the normal points. The score is calculated as the path length to isolate the observation. These two methods can be selected to detect outliers> - -1. `IsolationForest` Detect outliers globally that deviates significantly from the rest of the datapoints -2. `IForest` Detect local outliers that are distinct when compared to those of its neighbors. - - -### Global - - - -### Local - - - -## Outputs: - -Select the output file by passing value to `outputType`. User can select from following options `inlier`, `oulier` or `combined`. The combined file contains `anomaly` column which score each datapoint if it is inlier or outlier. - -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh` - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Options - -This plugin takes three input arguments and one output argument: - -| Name | Description | I/O | Type | -| ----------- | ------------------------------------- | ------ | ------------- | -| `--inpDir` | Input directory containing tabular files | Input | genericData | -| `--filePattern` | Pattern to parse tabular file names | Input | string | -| `--methods` | Select methods for outlier removal | Input | enum | -| `--outputType` | Select type of output file | Input | enum | -| `--outdir` | Output collection | Output | genericData | -| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/clustering/outlier-removal-tool/VERSION b/clustering/outlier-removal-tool/VERSION deleted file mode 100644 index b0032849c..000000000 --- a/clustering/outlier-removal-tool/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.7 diff --git a/clustering/outlier-removal-tool/build-docker.sh b/clustering/outlier-removal-tool/build-docker.sh deleted file mode 100644 index 3d1dfe86e..000000000 --- a/clustering/outlier-removal-tool/build-docker.sh +++ /dev/null @@ -1,23 +0,0 @@ - -#!/bin/bash - -# Change the name of the tool here -tool_dir="clustering" -tool_name="outlier-removal-tool" - -# The version is read from the VERSION file -version=$(\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? 
-serialize = - {major}.{minor}.{patch}-{release}{dev} - {major}.{minor}.{patch} - -[bumpversion:part:release] -optional_value = _ -first_value = dev -values = - dev - _ - -[bumpversion:part:dev] - -[bumpversion:file:pyproject.toml] -search = version = "{current_version}" -replace = version = "{new_version}" - -[bumpversion:file:plugin.json] - -[bumpversion:file:VERSION] - -[bumpversion:file:src/polus/plugins/clustering/outlier_removal/__init__.py] diff --git a/clustering/outlier-removal-tool/ict.yaml b/clustering/outlier-removal-tool/ict.yaml deleted file mode 100644 index c2627dd35..000000000 --- a/clustering/outlier-removal-tool/ict.yaml +++ /dev/null @@ -1,77 +0,0 @@ -author: -- Jayapriya Nagarajan -contact: hamdahshafqat.abbasi@nih.gov -container: polusai/outlier-removal-tool:0.2.7-dev0 -description: Remove outliers from the data. -entrypoint: python3 -m polus.images.clustering.outlier_removal -inputs: -- description: Input tabular data. - format: - - genericData - name: inpDir - required: true - type: path -- description: Filename pattern used to separate data. - format: - - string - name: filePattern - required: false - type: string -- description: Select methods for outlier removal - format: - - enum - name: method - required: false - type: string -- description: Select type of output file - format: - - enum - name: outputType - required: false - type: string -- description: Generate an output preview. - format: - - boolean - name: preview - required: false - type: boolean -name: polusai/OutlierRemoval -outputs: -- description: Output collection. - format: - - genericData - name: outDir - required: true - type: path -repository: https://github.com/PolusAI/polus-plugins -specVersion: 1.0.0 -title: Outlier Removal -ui: -- description: Input tabular data to be processed by this plugin. - key: inputs.inpDir - title: Input tabular data - type: path -- description: Filename pattern used to separate data. - key: inputs.filePattern - title: Filename pattern - type: text -- description: Select method for outlier removal. - fields: - - IsolationForest - - IForest - key: inputs.method - title: method - type: select -- description: Select output type. - fields: - - inlier - - outlier - - combined - key: inputs.outputType - title: outputType - type: select -- description: Generate an output preview. - key: inputs.preview - title: Preview - type: checkbox -version: 0.2.7-dev0 diff --git a/clustering/outlier-removal-tool/images/Global.PNG b/clustering/outlier-removal-tool/images/Global.PNG deleted file mode 100644 index c4be3b484..000000000 Binary files a/clustering/outlier-removal-tool/images/Global.PNG and /dev/null differ diff --git a/clustering/outlier-removal-tool/images/Local.PNG b/clustering/outlier-removal-tool/images/Local.PNG deleted file mode 100644 index 4a1580ca0..000000000 Binary files a/clustering/outlier-removal-tool/images/Local.PNG and /dev/null differ diff --git a/clustering/outlier-removal-tool/outlierremoval.cwl b/clustering/outlier-removal-tool/outlierremoval.cwl deleted file mode 100644 index 718c486d3..000000000 --- a/clustering/outlier-removal-tool/outlierremoval.cwl +++ /dev/null @@ -1,40 +0,0 @@ -class: CommandLineTool -cwlVersion: v1.2 -inputs: - filePattern: - inputBinding: - prefix: --filePattern - type: string? - inpDir: - inputBinding: - prefix: --inpDir - type: Directory - method: - inputBinding: - prefix: --method - type: string? - outDir: - inputBinding: - prefix: --outDir - type: Directory - outputType: - inputBinding: - prefix: --outputType - type: string? 
- preview: - inputBinding: - prefix: --preview - type: boolean? -outputs: - outDir: - outputBinding: - glob: $(inputs.outDir.basename) - type: Directory -requirements: - DockerRequirement: - dockerPull: polusai/outlier-removal-tool:0.2.7-dev0 - InitialWorkDirRequirement: - listing: - - entry: $(inputs.outDir) - writable: true - InlineJavascriptRequirement: {} diff --git a/clustering/outlier-removal-tool/package-release.sh b/clustering/outlier-removal-tool/package-release.sh deleted file mode 100644 index f833f6557..000000000 --- a/clustering/outlier-removal-tool/package-release.sh +++ /dev/null @@ -1,16 +0,0 @@ -# This script is designed to help package a new version of a plugin - -# Get the new version -version=$(", - "Hamdah Shafqat abbasi " - ] -readme = "README.md" -packages = [{include = "polus", from = "src"}] - -[tool.poetry.dependencies] -python = ">=3.9,<3.12" -filepattern = "^2.0.4" -typer = "^0.7.0" -tqdm = "^4.64.1" -preadator="0.4.0.dev2" -vaex = "^4.17.0" -scikit-learn = "^1.3.2" -pyod = "^1.1.2" - -[tool.poetry.group.dev.dependencies] -pre-commit = "^3.3.3" -bump2version = "^1.0.1" -pytest = "^7.3.2" -pytest-xdist = "^3.3.1" -pytest-sugar = "^0.9.7" -ipykernel = "^6.28.0" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" diff --git a/clustering/outlier-removal-tool/run-docker.sh b/clustering/outlier-removal-tool/run-docker.sh deleted file mode 100644 index f2c347263..000000000 --- a/clustering/outlier-removal-tool/run-docker.sh +++ /dev/null @@ -1,19 +0,0 @@ -version=$( None: - """Remove outliers from the data.""" - logger.info(f"--inpDir = {inp_dir}") - logger.info(f"--filePattern = {file_pattern}") - logger.info(f"--method = {method}") - logger.info(f"--outputType = {output_type}") - logger.info(f"--outDir = {out_dir}") - - inp_dir = inp_dir.resolve() - out_dir = out_dir.resolve() - - assert inp_dir.exists(), f"{inp_dir} does not exist!! Please check input path again" - assert ( - out_dir.exists() - ), f"{out_dir} does not exist!! 
Please check output path again" - - files = fp.FilePattern(inp_dir, file_pattern) - - if preview: - with Path.open(Path(out_dir, "preview.json"), "w") as jfile: - out_json: dict[str, Any] = { - "filepattern": file_pattern, - "outDir": [], - } - for file in files(): - outname = file[1][0].name.replace( - "".join(file[1][0].suffixes), - f"_{output_type}{rm.POLUS_TAB_EXT}", - ) - - out_json["outDir"].append(outname) - json.dump(out_json, jfile, indent=2) - - else: - with preadator.ProcessManager( - name="Cluster data using HDBSCAN", - num_processes=num_workers, - threads_per_process=2, - ) as pm: - for file in files(): - pm.submit_process( - rm.outlier_detection, - file[1][0], - method, - output_type, - out_dir, - ) - pm.join_processes() - - -if __name__ == "__main__": - app() diff --git a/clustering/outlier-removal-tool/src/polus/images/clustering/outlier_removal/outlier_removal.py b/clustering/outlier-removal-tool/src/polus/images/clustering/outlier_removal/outlier_removal.py deleted file mode 100644 index cb7364b3f..000000000 --- a/clustering/outlier-removal-tool/src/polus/images/clustering/outlier_removal/outlier_removal.py +++ /dev/null @@ -1,135 +0,0 @@ -"""Outlier Removal Plugin.""" -import enum -import logging -import os -from pathlib import Path - -import numpy as np -import vaex -from pyod.models.iforest import IForest -from sklearn.ensemble import IsolationForest -from sklearn.preprocessing import StandardScaler - -logger = logging.getLogger(__name__) -logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) -POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".arrow") - -CHUNK_SIZE = 10000 - - -class Methods(str, enum.Enum): - """Available outlier detection methods.""" - - ISOLATIONFOREST = "IsolationForest" - IFOREST = "IForest" - DEFAULT = "IsolationForest" - - -class Outputs(str, enum.Enum): - """Output Files.""" - - INLIER = "inlier" - OUTLIER = "outlier" - COMBINED = "combined" - DEFAULT = "inlier" - - -def write_outputs(data: vaex.DataFrame, outname: Path) -> None: - """Write outputs in either arrow or csv file formats. - - Args: - data: vaex dataframe. - outname: Name of output file. - """ - if POLUS_TAB_EXT == ".arrow": - data.export_feather(outname) - logger.info(f"Saving outputs: {outname}") - if POLUS_TAB_EXT == ".csv": - data.export_csv(outname, chunk_size=CHUNK_SIZE) - logger.info(f"Saving outputs: {outname}") - - -def isolationforest(data_set: np.ndarray, method: Methods) -> np.ndarray: - """Isolation Forest algorithm. - - Args: - data_set: Input data. - method: Type of method to remove outliers. - - Returns: - ndarray whether or not the data point should be considered as an inlier. - - """ - if method == Methods.ISOLATIONFOREST: - clf = IsolationForest(random_state=19, n_estimators=200) - - if method == Methods.IFOREST: - clf = IForest(random_state=10, n_estimators=200) - - if method == Methods.DEFAULT: - clf = IsolationForest(random_state=19, n_estimators=200) - - clf.fit(data_set) - return clf.predict(data_set) - - -def outlier_detection( - file: Path, - method: Methods, - output_type: Outputs, - out_dir: Path, -) -> None: - """Detects outliers using Isolation Forest algorithm. - - Args: - file: Input tabular data. - method: Select a method to remove outliers. - output_type: Select type of output file. - out_dir: Path to output directory. 
- """ - if Path(file.name).suffix == ".csv": - data = vaex.from_csv(file, convert=True, chunk_size=CHUNK_SIZE) - else: - data = vaex.open(file) - - int_columns = [ - feature - for feature in data.get_column_names() - if data.data_type(feature) == int or data.data_type(feature) == float - ] - - if len(int_columns) == 0: - msg = "Features with integer datatype do not exist" - raise ValueError(msg) - - # Standardize the data - df = StandardScaler().fit_transform(data[int_columns]) - - # Detect outliers - logger.info("Detecting outliers using " + method) - rem_out = isolationforest(df, method) - - data["anomaly"] = rem_out - - if method == Methods.ISOLATIONFOREST or method == Methods.DEFAULT: - inliers = data[data["anomaly"] == 1] - outliers = data[data["anomaly"] == -1] - - if method == Methods.IFOREST: - inliers = data[data["anomaly"] == 0] - outliers = data[data["anomaly"] == 1] - - # Drop 'anomaly' column - inliers = inliers.drop("anomaly", inplace=True) - outliers = outliers.drop("anomaly", inplace=True) - - outname = Path(out_dir, f"{Path(file.name).stem}_{output_type}{POLUS_TAB_EXT}") - - if output_type == Outputs.INLIER: - write_outputs(inliers, outname) - if output_type == Outputs.OUTLIER: - write_outputs(outliers, outname) - if output_type == Outputs.COMBINED: - write_outputs(data, outname) - if output_type == Outputs.DEFAULT: - write_outputs(inliers, outname) diff --git a/clustering/outlier-removal-tool/tests/__init__.py b/clustering/outlier-removal-tool/tests/__init__.py deleted file mode 100644 index 727cdca8d..000000000 --- a/clustering/outlier-removal-tool/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Test Outlier Removal Plugin.""" diff --git a/clustering/outlier-removal-tool/tests/conftest.py b/clustering/outlier-removal-tool/tests/conftest.py deleted file mode 100644 index 1829c1afa..000000000 --- a/clustering/outlier-removal-tool/tests/conftest.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Test fixtures. - -Set up all data used in tests. 
-""" -import tempfile -from pathlib import Path - -import numpy as np -import pandas as pd -import pytest - - -@pytest.fixture( - params=[ - (5000, ".csv", "IsolationForest", "combined"), - (100000, ".arrow", "IForest", "inlier"), - (500000, ".csv", "IsolationForest", "outlier"), - ], -) -def get_params(request: pytest.FixtureRequest) -> tuple[int, str]: - """To get the parameter of the fixture.""" - return request.param - - -@pytest.fixture() -def generate_synthetic_data( - get_params: tuple[int, str, str, str], -) -> tuple[Path, Path, str, str, str]: - """Generate tabular data.""" - nrows, file_extension, method, output_type = get_params - - input_directory = Path(tempfile.mkdtemp(prefix="inputs_")) - output_directory = Path(tempfile.mkdtemp(prefix="out_")) - rng = np.random.default_rng() - tabular_data = { - "sepal_length": rng.random(nrows).tolist(), - "sepal_width": rng.random(nrows).tolist(), - "petal_length": rng.random(nrows).tolist(), - "petal_width": rng.random(nrows).tolist(), - "species": rng.choice( - ["Iris-setosa", "Iris-versicolor", "Iris-virginica"], - nrows, - ).tolist(), - } - - df = pd.DataFrame(tabular_data) - if file_extension == ".csv": - outpath = Path(input_directory, "data.csv") - df.to_csv(outpath, index=False) - if file_extension == ".arrow": - outpath = Path(input_directory, "data.arrow") - df.to_feather(outpath) - - return input_directory, output_directory, file_extension, method, output_type diff --git a/clustering/outlier-removal-tool/tests/test_cli.py b/clustering/outlier-removal-tool/tests/test_cli.py deleted file mode 100644 index c303a5d54..000000000 --- a/clustering/outlier-removal-tool/tests/test_cli.py +++ /dev/null @@ -1,59 +0,0 @@ -"""Test Command line Tool.""" -from typer.testing import CliRunner -from polus.images.clustering.outlier_removal.__main__ import app -import shutil -from pathlib import Path - - -def test_cli(generate_synthetic_data: tuple[Path, Path, str, str, str]) -> None: - """Test the command line.""" - inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data - file_pattern = f".*{file_extension}" - - runner = CliRunner() - result = runner.invoke( - app, - [ - "--inpDir", - inp_dir, - "--filePattern", - file_pattern, - "--method", - method, - "--outputType", - output_type, - "--outDir", - out_dir, - ], - ) - - assert result.exit_code == 0 - shutil.rmtree(inp_dir) - shutil.rmtree(out_dir) - - -def test_short_cli(generate_synthetic_data: tuple[Path, Path, str, str, str]) -> None: - """Test short command line.""" - inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data - file_pattern = f".*{file_extension}" - - runner = CliRunner() - result = runner.invoke( - app, - [ - "-i", - inp_dir, - "-f", - file_pattern, - "-m", - method, - "-ot", - output_type, - "-o", - out_dir, - ], - ) - - assert result.exit_code == 0 - shutil.rmtree(inp_dir) - shutil.rmtree(out_dir) diff --git a/clustering/outlier-removal-tool/tests/test_outlier_removal.py b/clustering/outlier-removal-tool/tests/test_outlier_removal.py deleted file mode 100644 index 5f90fb01c..000000000 --- a/clustering/outlier-removal-tool/tests/test_outlier_removal.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Test Outlier Removal Plugin.""" -import shutil -from pathlib import Path - -import filepattern as fp -import numpy as np -import polus.images.clustering.outlier_removal.outlier_removal as rm -import vaex - - -def test_outlier_detection( - generate_synthetic_data: tuple[Path, Path, str, str, str], -) -> None: - """Test outlier detection of 
tabular data.""" - inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data - - file_pattern = f".*{file_extension}" - files = fp.FilePattern(inp_dir, file_pattern) - for file in files(): - rm.outlier_detection( - file=file[1][0], - method=method, - output_type=output_type, - out_dir=out_dir, - ) - out_ext = [Path(f.name).suffix for f in out_dir.iterdir()] - assert all(out_ext) is True - shutil.rmtree(inp_dir) - shutil.rmtree(out_dir) - - -def test_isolationforest( - generate_synthetic_data: tuple[Path, Path, str, str, str], -) -> None: - """Test isolationforest method.""" - inp_dir, out_dir, file_extension, method, output_type = generate_synthetic_data - file_pattern = f".*{file_extension}" - files = fp.FilePattern(inp_dir, file_pattern) - for file in files(): - df = vaex.open(file[1][0]) - data = df[df.column_names[:-1]].values - prediction = rm.isolationforest(data, method) - assert len(prediction) != 0 - assert type(prediction) == np.ndarray - shutil.rmtree(inp_dir) - shutil.rmtree(out_dir) diff --git a/clustering/polus-feature-subsetting-plugin/Dockerfile b/clustering/polus-feature-subsetting-plugin/Dockerfile deleted file mode 100644 index babcd2385..000000000 --- a/clustering/polus-feature-subsetting-plugin/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ - -FROM polusai/bfio:2.1.9 - -# from bfio container -# ENV POLUS_EXT=".ome.tif" -# ENV POLUS_LOG="INFO" -# ENV EXEC_DIR="/opt/executables" -# ENV DATA_DIR="/data" - -COPY VERSION / - -ARG EXEC_DIR="/opt/executables" -ARG DATA_DIR="/data" - -RUN mkdir -p ${EXEC_DIR} \ - && mkdir -p ${DATA_DIR}/inputs \ - && mkdir ${DATA_DIR}/outputs - -COPY src ${EXEC_DIR}/ -WORKDIR ${EXEC_DIR} - -RUN pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir - -ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/clustering/polus-feature-subsetting-plugin/README.md b/clustering/polus-feature-subsetting-plugin/README.md deleted file mode 100644 index 24ccba663..000000000 --- a/clustering/polus-feature-subsetting-plugin/README.md +++ /dev/null @@ -1,56 +0,0 @@ -# Feature Data Subset - -This WIPP plugin subsets data based on a given feature. It works in conjunction with the `polus-feature-extraction-plugin`, where the feature extraction plugin can be used to extract the features such as the mean intensity of every image in the input image collection. - -# Usage -The details and usage of the plugin inputs is provided in the section below. In addition to the subsetted data, the output directory also consists of a `summary.txt` file which has information as to what images were kept and their new filename if they were renamed. - -### Explanation of inputs -Some of the inputs are pretty straighforward and are used commonly across most WIPP plugins. This section is used to provide some details and examples of the inputs that may be a little complicated. The image collection with the following pattern will be used as an example : `r{r+}_t{t+}_p{p+}_z{z+}_c{c+}.ome.tif`, where r,t,p,z,c stand for replicate, timepoint, positon,z-positon, and channel respectively. Consider we have 5 replicates, 3 timepoints, 50 positions, 10 z-planes and 4 channels. - -1. `inpDir` - This contains the path to the input image collection to subset data from. -2. `filePattern` - Filepattern of the input images -3. `groupVar` - This is a mandatory input across which to subset data. 
This can take either 1 or 2 variables as input and if 2 variables are provided then the second variable will be treated as the minor grouping variable. In our example, if `z` is provided as input, then within a subcollection, the mean of the feature value will be taken for all images with the same z. Then the z positions will be filtered out based on the input of the `percentile` and `removeDirection` variables. Now if `z,c` are provided as input, then `c` will be treated as the minor grouping variable, which means that the mean will be taken for all images with the same z for each channel. Also, the plugin will ensure that the same values of z positions are filtered out across c. -4. `csvDir` - This contains the path to the csv collection containing the feature values for each image. This can be the output of the feature extraction plugin. -5. `feature` - The column name from the csv file that will be used to filter images. -6. `percentile` and `removeDirection` - These two variables denote the criteria with which images are filtered. For example, if percentile is `0.1` and removeDirection is set to `Below`, then images with a feature value below the 10th percentile will be removed. On the other hand, if removeDirection is set to `Above`, then all images with a feature value greater than the 10th percentile will be removed. This enables data subsetting from both `brightfield` and `darkfield` microscopy images (a minimal sketch of this filtering rule is shown after the install instructions below). - - **Optional Arguments** - -7. `sectionVar` - This is an optional input to segregate the input image collection into sub-collections. The analysis will be done separately for each sub-collection. In our example, if the user enters `r,t` as the sectionVar, then we will have 15 sub-collections (5*3), 1 for each combination of timepoint and replicate. If the user enters `r` as sectionVar, then we will have 5 sub-collections, 1 for each replicate. If the user wants to consider the whole image collection as a single section, then no input is required. NOTE: As a post-processing step, the same number of images will be subsetted across different sections. -8. `padding` - This is an optional variable with a default value of 0. A padding of 3 means that 3 additional planes will be captured on either side of the subsetted data. This can be used as a sanity check to ensure that the subsetted data captures the images we want. For example, if the following z values were filtered out initially - 5,6,7 - then a padding of 3 means that the output dataset will have z positions 2,3,4,5,6,7,8,9,10 if all of them exist. -9. `writeOutput` - This is an optional argument with default value `True`. If it is set to true, then both the output image collection and the `summary.txt` file will be created. If it is set to false, then the output directory will only consist of `summary.txt`. This option enables the user to tune hyperparameters such as percentile, removeDirection, and feature without actually creating the output image collection. - - - -Contact [Gauhar Bains](mailto:gauhar.bains@labshare.org) for more information. - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit.
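Below is a minimal, hypothetical sketch of the `percentile`/`removeDirection` rule described in the inputs section above. It is not the plugin's source code (the deleted `main.py` implements this inside `filter_planes`); the `keep_planes` helper, its inputs, and the example values are illustrative assumptions only.

```python
# Sketch of the percentile cutoff: keep or drop z-planes based on the mean
# feature value of each plane and the removeDirection setting.
import numpy as np

def keep_planes(mean_feature_by_z: dict, percentile: float, remove_direction: str) -> set:
    """Return the set of z-planes that survive the percentile cutoff."""
    values = np.array(list(mean_feature_by_z.values()))
    threshold = np.quantile(values, percentile)  # e.g. 0.1 -> 10th percentile
    if remove_direction == "Below":
        # discard planes whose mean feature value falls below the cutoff
        return {z for z, v in mean_feature_by_z.items() if v >= threshold}
    # otherwise discard planes whose mean feature value lies above the cutoff
    return {z for z, v in mean_feature_by_z.items() if v <= threshold}

# Planes 5-7 are much brighter than the rest, so only they survive a
# "remove everything below the 70th percentile" run.
means = {z: (10.0 if z in (5, 6, 7) else 1.0) for z in range(1, 11)}
print(keep_planes(means, percentile=0.7, remove_direction="Below"))  # {5, 6, 7}
```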
- -## Options - -This plugin takes one input argument and one output argument: - -| Name | Description | I/O | Type | -| ------------------- | ----------------------------------------------------- | ------ | ------------- | -| `--csvDir` | CSV collection containing features | Input | csvCollection | -| `--padding` | Number of images to capture outside the cutoff | Input | int | -| `--feature` | Feature to use to subset data | Input | string | -| `--filePattern` | Filename pattern used to separate data | Input | string | -| `--groupVar` | variables to group by in a section | Input | string | -| `--inpDir` | Input image collection to be processed by this plugin | Input | collection | -| `--percentile` | Percentile to remove | Input | int | -| `--removeDirection` | remove direction above or below percentile | Input | string | -| `--sectionVar` | variables to divide larger sections | Input | string | -| `--writeOutput` | write output image collection or not | Input | boolean | -| `--outDir` | Output collection | Output | collection | - diff --git a/clustering/polus-feature-subsetting-plugin/VERSION b/clustering/polus-feature-subsetting-plugin/VERSION deleted file mode 100644 index a34eaa5d0..000000000 --- a/clustering/polus-feature-subsetting-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.1.11 \ No newline at end of file diff --git a/clustering/polus-feature-subsetting-plugin/build-docker.sh b/clustering/polus-feature-subsetting-plugin/build-docker.sh deleted file mode 100644 index d9ad13705..000000000 --- a/clustering/polus-feature-subsetting-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$(= thresh] - else: - keep_planes = [z for z in planes if feature_dict[z] <= thresh] - - return set(keep_planes) - -def make_uniform(planes_dict, uniques, padding): - """ Ensure each section has the same number of images - - This function makes the output collection uniform in - the sense that it preserves same number of planes across - sections. It also captures additional planes based - on the value of the padding variable - - Args: - planes_dict (dict): planes to keep in different sections - uniques (list): unique values for the major grouping variable - padding (int): additional images to capture outside cutoff - - Returns: - dictionary: dictionary containing planes to keep - """ - - # max no. 
of planes - max_len = max([len(i) for i in planes_dict.values()]) - - # max planes that can be added on each side - min_ind = min([min(planes_dict[k]) for k in planes_dict]) - max_ind = max([max(planes_dict[k]) for k in planes_dict]) - max_add_left = uniques.index(min_ind) - max_add_right = len(uniques) - (uniques.index(max_ind)+1) - - # add planes in each section based on padding and max number of planes - for section_id, planes in planes_dict.items(): - len_to_add = max_len - len(planes) - len_add_left = min(int(len_to_add)/2+padding, max_add_left) - len_add_right = min(len_to_add - len_add_left+padding, max_add_right) - left_ind = int(uniques.index(min(planes)) - len_add_left) - right_ind = int(uniques.index(max(planes)) + len_add_right)+1 - planes_dict[section_id] = uniques[left_ind:right_ind] - return planes_dict - -def main(inpDir,csvDir,outDir,filePattern,groupVar,percentile, - removeDirection,sectionVar,feature,padding,writeOutput): - """Function containing the main login to subset data - - Args: - inpDir (string): path to input image collection - csvDir (string): path to csv file containing features - outDir (string): path to output collection - filePattern (string): input image filepattern - groupVar (string): grouping variables - percentile (float): cutoff feature percentile - removeDirection (string): subset above or below percentile - sectionVar (string): sectioning variable - feature (string): feature to subset using - padding (int): capture additional images outside of cutoff - writeOutput (boolean): write output image collection or not - """ - - # Get all file names in csvDir image collection - csvDir_files = [f.name for f in Path(csvDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.csv'] - - # Get all file names in inpDir image collection - inpDir_files = [f.name for f in Path(inpDir).iterdir() if f.is_file() and "".join(f.suffixes)=='.ome.tif'] - - # read and concat all csv files - for ind, file in enumerate(csvDir_files): - if ind == 0: - feature_df = pd.read_csv(os.path.join(csvDir, file), header=0) - else: - feature_df = pd.concat([feature_df, pd.read_csv(os.path.join(csvDir, file), header=0)]) - - # store image name and its feature value - feature_dict = {k:v for k,v in zip(feature_df['Image'], feature_df[feature])} - - # seperate filepattern variables into different categories - _,var = filepattern.get_regex(filePattern) - grouping_variables = groupVar.split(',') - section_variables = sectionVar.split(',') - sub_section_variables = [v for v in var if v not in grouping_variables+section_variables] - - # initialize filepattern object - fp = filepattern.FilePattern(inpDir, pattern=filePattern) - uniques = fp.uniques - - [maj_grouping_var, min_grouping_var] = grouping_variables if len(grouping_variables)>1 else grouping_variables+[None] - keep_planes = {} - - logger.info('Iterating over sections...') - # single iteration of this loop gives all images in one section - for file in fp(group_by=sub_section_variables+grouping_variables): - - section_feat_dict = {} - section_keep_planes = [] - section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 - - # iterate over files in one section - for f in file: - if min_grouping_var == None: - f[min_grouping_var] = None - - # stote feature values for images - if f[min_grouping_var] not in section_feat_dict: - section_feat_dict[f[min_grouping_var]] = {} - - if f[maj_grouping_var] not in section_feat_dict[f[min_grouping_var]]: - section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]] = [] - 
- section_feat_dict[f[min_grouping_var]][f[maj_grouping_var]].append(feature_dict[f['file'].name]) - - # average feature value by grouping variable - for key1 in section_feat_dict: - for key2 in section_feat_dict[key1]: - section_feat_dict[key1][key2] = sum(section_feat_dict[key1][key2])/len(section_feat_dict[key1][key2]) - - # find planes to keep based on specified criteria - section_keep_planes.append(filter_planes(section_feat_dict[key1],removeDirection, percentile)) - - # keep same planes within a section, across the minor grouping variable - section_keep_planes = list(section_keep_planes[0].union(*section_keep_planes)) - section_keep_planes = [i for i in range(min(section_keep_planes), max(section_keep_planes)+1) if i in uniques[maj_grouping_var]] - keep_planes[section_id] = section_keep_planes - - # keep same number of planes across different sections - keep_planes = make_uniform(keep_planes, uniques[maj_grouping_var], padding) - - # start writing summary.txt - summary = open(os.path.join(outDir, 'metadata_files', 'summary.txt'), 'w') - - logger.info('renaming subsetted data') - - # reinitialize filepattern object - fp = filepattern.FilePattern(inpDir, pattern=filePattern) - - # rename subsetted data - for file in fp(group_by=sub_section_variables+grouping_variables): - section_id = tuple([file[0][i] for i in section_variables]) if section_variables[0] else 1 - section_keep_planes = keep_planes[section_id] - rename_map = {k:v for k,v in zip(keep_planes[section_id], uniques[maj_grouping_var])} - - # update summary.txt with section renaming info - summary.write('------------------------------------------------ \n') - if sectionVar.strip(): - summary.write('Section : {} \n'.format({k:file[0][k] for k in section_variables})) - logger.info('Renaming files from section : {} \n'.format({k:file[0][k] for k in section_variables})) - summary.write('\nThe following values of "{}" variable have been renamed: \n'.format(maj_grouping_var)) - for k,v in rename_map.items(): - summary.write('{} ---> {} \n'.format(k,v)) - summary.write('\n Files : \n \n') - - # rename and write output - for f in file: - if f[maj_grouping_var] not in keep_planes[section_id]: - continue - - # old and new file name - old_file_name = f['file'].name - file_name_dict = {k.upper():v for k,v in f.items() if k!='file'} - file_name_dict[maj_grouping_var.upper()] = rename_map[f[maj_grouping_var]] - new_file_name = fp.get_matching(**file_name_dict)[0]['file'].name - - # if write output collection - if writeOutput: - shutil.copy2(os.path.join(inpDir, old_file_name),os.path.join(outDir, 'images', new_file_name)) - - summary.write('{} -----> {} \n'.format(old_file_name, new_file_name)) - summary.close() - -if __name__=="__main__": - # Initialize the logger - logging.basicConfig(format='%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s', - datefmt='%d-%b-%y %H:%M:%S') - logger = logging.getLogger("main") - logger.setLevel(logging.INFO) - - ''' Argument parsing ''' - logger.info("Parsing arguments...") - parser = argparse.ArgumentParser(prog='main', description='Subset data using a given feature') - - # Input arguments - parser.add_argument('--csvDir', dest='csvDir', type=str, - help='CSV collection containing features', required=True) - parser.add_argument('--padding', dest='padding', type=str, - help='Number of images to capture outside the cutoff', required=False) - parser.add_argument('--feature', dest='feature', type=str, - help='Feature to use to subset data', required=True) - parser.add_argument('--filePattern', 
dest='filePattern', type=str, - help='Filename pattern used to separate data', required=True) - parser.add_argument('--groupVar', dest='groupVar', type=str, - help='variables to group by in a section', required=True) - parser.add_argument('--inpDir', dest='inpDir', type=str, - help='Input image collection to be processed by this plugin', required=True) - parser.add_argument('--percentile', dest='percentile', type=str, - help='Percentile to remove', required=True) - parser.add_argument('--removeDirection', dest='removeDirection', type=str, - help='remove direction above or below percentile', required=True) - parser.add_argument('--sectionVar', dest='sectionVar', type=str, - help='variables to divide larger sections', required=False) - parser.add_argument('--writeOutput', dest='writeOutput', type=str, - help='write output image collection or not', required=False) - # Output arguments - parser.add_argument('--outDir', dest='outDir', type=str, - help='Output collection', required=True) - - # Parse the arguments - args = parser.parse_args() - csvDir = args.csvDir - logger.info('csvDir = {}'.format(csvDir)) - padding = args.padding - padding = 0 if padding==None else int(padding) - logger.info('padding = {}'.format(padding)) - feature = args.feature - logger.info('feature = {}'.format(feature)) - filePattern = args.filePattern - logger.info('filePattern = {}'.format(filePattern)) - groupVar = args.groupVar - logger.info('groupVar = {}'.format(groupVar)) - inpDir = args.inpDir - if (Path.is_dir(Path(args.inpDir).joinpath('images'))): - # switch to images folder if present - fpath = str(Path(args.inpDir).joinpath('images').absolute()) - logger.info('inpDir = {}'.format(inpDir)) - percentile = float(args.percentile) - logger.info('percentile = {}'.format(percentile)) - removeDirection = args.removeDirection - logger.info('removeDirection = {}'.format(removeDirection)) - sectionVar = args.sectionVar - sectionVar = '' if sectionVar is None else sectionVar - logger.info('sectionVar = {}'.format(sectionVar)) - writeOutput = True if args.writeOutput==None else args.writeOutput == 'true' - logger.info('writeOutput = {}'.format(writeOutput)) - outDir = args.outDir - logger.info('outDir = {}'.format(outDir)) - - # create metadata and images folder in outDir - if not os.path.isdir(os.path.join(outDir, 'images')): - os.mkdir(os.path.join(outDir, 'images')) - if not os.path.isdir(os.path.join(outDir, 'metadata_files')): - os.mkdir(os.path.join(outDir, 'metadata_files')) - - # Surround with try/finally for proper error catching - try: - main(inpDir=inpDir, - csvDir=csvDir, - outDir=outDir, - filePattern=filePattern, - groupVar=groupVar, - percentile=percentile, - removeDirection=removeDirection, - sectionVar=sectionVar, - feature=feature, - padding=padding, - writeOutput=writeOutput) - - except Exception: - traceback.print_exc() - - finally: - logger.info('exiting workflow..') - # Exit the program - sys.exit() \ No newline at end of file diff --git a/clustering/polus-feature-subsetting-plugin/src/requirements.txt b/clustering/polus-feature-subsetting-plugin/src/requirements.txt deleted file mode 100644 index b7e965ece..000000000 --- a/clustering/polus-feature-subsetting-plugin/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -filepattern>=1.4.5 -pandas>=1.1.3 \ No newline at end of file diff --git a/clustering/polus-hdbscan-clustering-plugin/Dockerfile b/clustering/polus-hdbscan-clustering-plugin/Dockerfile deleted file mode 100644 index 37129b3fd..000000000 --- 
a/clustering/polus-hdbscan-clustering-plugin/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM polusai/bfio:2.1.9 - -COPY VERSION / -COPY src ${EXEC_DIR}/. - -RUN apt --no-install-recommends -y autoremove --purge python3.9-minimal python3.9\ - && apt-get update && apt-get install --no-install-recommends -y build-essential python3.9-dev\ - && pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir - -ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/clustering/polus-hdbscan-clustering-plugin/README.md b/clustering/polus-hdbscan-clustering-plugin/README.md deleted file mode 100644 index 2169be013..000000000 --- a/clustering/polus-hdbscan-clustering-plugin/README.md +++ /dev/null @@ -1,48 +0,0 @@ -# Hierarchical Density-Based Spatial Clustering of Applications with Noise(HDBSCAN) Clustering - -The HDBSCAN Clustering plugin clusters the data using [HDBSCAN clustering](https://pypi.org/project/hdbscan/) library. The input and output for this plugin is a CSV file. Each observation (row) in the input CSV file is assigned to one of the clusters. The output CSV file contains the column `cluster` that identifies the cluster to which each observation belongs. A user can supply a regular expression with capture groups if they wish to cluster each group independently, or if they wish to average the numerical features across each group and treat them as a single observation. - -## Inputs: - -### Input CSV collection: -The input file(s) that need to be clustered. The file should be in CSV format. This is a required parameter for the plugin. - -### Grouping pattern: -The input for this parameter is a regular expression with capture group. This input splits the data into groups based on the matched pattern. A new column `group` is created in the output CSV file that has the group based on the given pattern. Unless `averageGroups` is set to `true`, providing a grouping pattern will cluster each group independently. - -### Average groups: -Setting this equal to `true` will use the supplied `groupingPattern` to average the numerical features and produce a single row per group which is then clustered. The resulting cluster is assigned to all observations belonging in that group. - -### Label column: -This is the name of the column containing the labels to be used with `groupingPattern`. - -### Minimum cluster size: -This parameter defines the smallest number of points that should be considered as cluster. This is a required parameter. The input should be an integer and the value should be greater than 1. - -### Increment outlier ID: -This parameter sets the ID of the outlier cluster to `1`, otherwise it will be 0. This is useful for visualization purposes if the resulting cluster IDs are turned into image annotations. - -## Output: -The output is a CSV file containing the clustered data. - -## Building -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Options - -This plugin takes four input arguments and one output argument: - -| Name | Description | I/O | Type | -| ---------------------- | ---------------------------------------------------------------------------------------------- | ------ | ------------- | -| `--inpDir` | Input csv collection. 
| Input | csvCollection | -| `--groupingPattern` | Regular expression to group rows. Clustering will be applied across capture groups by default. | Input | string | -| `--averageGroups` | If set to `true`, will average data across groups. Requires capture groups | Input | string | -| `--labelCol` | Name of the column containing labels for grouping pattern. | Input | string | -| `--minClusterSize` | Minimum cluster size. | Input | integer | -| `--incrementOutlierId` | Increments outlier ID to 1. | Input | string | -| `--outDir` | Output collection | Output | csvCollection | diff --git a/clustering/polus-hdbscan-clustering-plugin/VERSION b/clustering/polus-hdbscan-clustering-plugin/VERSION deleted file mode 100644 index 5546bd2c5..000000000 --- a/clustering/polus-hdbscan-clustering-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.4.7 \ No newline at end of file diff --git a/clustering/polus-hdbscan-clustering-plugin/build-docker.sh b/clustering/polus-hdbscan-clustering-plugin/build-docker.sh deleted file mode 100755 index 7a7f44fe8..000000000 --- a/clustering/polus-hdbscan-clustering-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$( typing.List[str]: - """List all the .csv files in the directory. - - Args: - csv_directory (str): Path to the directory containing the csv files. - - Returns: - The path to directory, list of names of the subdirectories in dirpath (if any) and the filenames of .csv files. - - """ - list_of_files = [os.path.join(dirpath, file_name) - for dirpath, dirnames, files in os.walk(csv_directory) - for file_name in fnmatch.filter(files, '*.csv')] - return list_of_files - - -def clustering(data: np.ndarray, min_cluster_size: int, increment_outlier_id: bool) -> np.ndarray: - """Cluster data using HDBSCAN. - - Args: - data (array): Data that need to be clustered. - min_cluster_size (int): Smallest size grouping that should be considered as a cluster. - increment_outlier_id (bool) : Increment outlier ID to unity. - - Returns: - Cluster labels for each row of data. - """ - clusters = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(data) - labels = clusters.labels_.flatten().astype(np.uint16) + 1 - labels = labels + 1 if increment_outlier_id else labels - - return labels - - -# Setup the argument parsing -def main(inpDir, grouping_pattern, avg_groups, label_col, min_cluster_size, increment_outlier_id, outDir): - # Get list of .csv files in the directory including sub folders for clustering - input_csvs = list_files(inpDir) - if input_csvs is None: - raise ValueError('No .csv files found.') - - for csv in input_csvs: - # Get the full path and split to get only the filename. - split_file = os.path.normpath(csv) - file_name = os.path.split(split_file)[-1] - file_prefix, _ = file_name.split('.', 1) - - logger.info('Reading the file ' + file_name) - - # Read csv file - df = pd.read_csv(csv) - - # If user provided a regular expression. - if grouping_pattern is not None: - df = df[df[label_col].str.match(grouping_pattern)].copy() - if df.empty: - logger.warning(f"Could not find any files matching the pattern {grouping_pattern} in file {csv}. Skipping...") - continue - - #Create a column group with matching string - df['group'] = df[label_col].str.extract(grouping_pattern, expand=True).apply(','.join, axis=1) - - # Get column(s) containing data. - df_data = df.select_dtypes(exclude='object').copy() - df_data['group'] = df['group'] - - # If we want to average features for each group. 
- if avg_groups: - df_grouped = df_data.groupby('group').apply(lambda x: x.sort_values('group').mean(numeric_only=True)) - - # Cluster data using HDBSCAN clustering. - logger.info('Clustering the data') - cluster_ids = clustering(df_grouped.values, min_cluster_size, increment_outlier_id) - - df_grouped['cluster'] = cluster_ids - df = df.merge(df_grouped['cluster'], left_on='group', right_index=True) - else: # We want separate clustering results for each group. - dfs = [] - for group, df_ss in df_data.groupby('group'): - # Cluster data using HDBSCAN clustering. - logger.info(f'Clustering data in group {group}') - - cluster_ids = clustering(df_ss.values, min_cluster_size, increment_outlier_id) - df_ss['cluster'] = cluster_ids - dfs.append(df_ss) - - df_grouped = pd.concat(dfs) - df = df.merge(df_grouped['cluster'], left_index=True, right_index=True) - - # No grouping. Vanilla clustering. - else: - # Get column(s) containing data. - df_data = df.select_dtypes(exclude='object').copy() - - #Cluster data using HDBSCAN clustering - logger.info('Clustering the data') - cluster_ids = clustering(df_data.values, min_cluster_size, increment_outlier_id) - df['cluster'] = cluster_ids - - df.to_csv(os.path.join(outDir, f'{file_prefix}.csv'), index=None, header=True, encoding='utf-8-sig') - logger.info("Finished all processes!") - -if __name__ == "__main__": - logger.info("Parsing arguments...") - parser = argparse.ArgumentParser(prog='main', description='HDBSCAN clustering plugin') - parser.add_argument('--inpDir', dest='inpDir', type=str, - help='Input collection-Data need to be clustered', required=True) - parser.add_argument('--groupingPattern', dest='groupingPattern', type=str, - help='Regular expression to group rows. Clustering will be applied across capture groups.', required=False) - parser.add_argument('--averageGroups', dest='averageGroups', type=str, - help='Whether to average data across groups. Requires capture groups.', default='false', required=False) - parser.add_argument('--labelCol', dest='labelCol', type=str, - help='Name of column containing labels. Required only for grouping operations.', required=False) - parser.add_argument('--minClusterSize', dest='minClusterSize', type=int, - help='Minimum cluster size', required=True) - parser.add_argument('--incrementOutlierId', dest='incrementOutlierId', type=str, - help='Increments outlier ID to 1.', default='false', required=False) - parser.add_argument('--outDir', dest='outDir', type=str, - help='Output collection', required=True) - - # Parse the arguments. - args = parser.parse_args() - - # Path to csvfile directory. - inpDir = args.inpDir - logger.info('inpDir = {}'.format(inpDir)) - - # Regular expression for grouping. - grouping_pattern = args.groupingPattern - logger.info('grouping_pattern = {}'.format(grouping_pattern)) - - # Whether to average data for each group. - avg_groups = args.averageGroups.lower() != 'false' - logger.info('avg_groups = {}'.format(avg_groups)) - - # Name of column to use for grouping. - label_col = args.labelCol - logger.info('label_col = {}'.format(label_col)) - - # Minimum cluster size for clustering using HDBSCAN. - min_cluster_size = args.minClusterSize - logger.info('min_cluster_size = {}'.format(min_cluster_size)) - - # Set outlier cluster id as 1. - increment_outlier_id = args.incrementOutlierId.lower() != 'false' - logger.info('increment_outlier_id = {}'.format(increment_outlier_id)) - - # Path to save output csvfiles. 
- outDir = args.outDir - logger.info('outDir = {}'.format(outDir)) - - main( - inpDir, - grouping_pattern, - avg_groups, - label_col, - min_cluster_size, - increment_outlier_id, - outDir - ) \ No newline at end of file diff --git a/clustering/polus-hdbscan-clustering-plugin/src/requirements.txt b/clustering/polus-hdbscan-clustering-plugin/src/requirements.txt deleted file mode 100644 index ffd72e039..000000000 --- a/clustering/polus-hdbscan-clustering-plugin/src/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -hdbscan==0.8.27 -pandas>=1.2.4 diff --git a/features/polus-csv-statistics-plugin/Dockerfile b/features/polus-csv-statistics-plugin/Dockerfile deleted file mode 100644 index d6b8f9f20..000000000 --- a/features/polus-csv-statistics-plugin/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM polusai/bfio:2.1.9 - -COPY VERSION / - -ARG EXEC_DIR="/opt/executables" -ARG DATA_DIR="/data" - -RUN mkdir -p ${EXEC_DIR} \ - && mkdir -p ${DATA_DIR}/inputs \ - && mkdir ${DATA_DIR}/outputs - -COPY src ${EXEC_DIR}/ -WORKDIR ${EXEC_DIR} - -RUN pip3 install -r ${EXEC_DIR}/requirements.txt - -ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/README.md b/features/polus-csv-statistics-plugin/README.md deleted file mode 100644 index 51ac1c4a9..000000000 --- a/features/polus-csv-statistics-plugin/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# CSV Statistics - -This WIPP plugin performs statistics on values in each column of a csv file if the data is numeric. Rows of data are grouped together by rows that have a matching value in a column with header named `file`. If no columns have the `file` header, then this plugin throws and error. - -Available statistics are: - -1. [mean (arithmetic mean)](https://en.wikipedia.org/wiki/Mean#Arithmetic_mean_(AM)) -2. [median](https://en.wikipedia.org/wiki/Median#The_sample_median) -3. [std (standard deviation)](https://en.wikipedia.org/wiki/Standard_deviation) -4. [var (variance)](https://en.wikipedia.org/wiki/Variance) -5. [skew (Fisher-Pearson skewness)](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm) -6. [kurt (excess kurtosis)](https://www.itl.nist.gov/div898/handbook/eda/section3/eda35b.htm) -7. count (number of rows sampled) -8. [iqr (Interquartile_range)](https://en.wikipedia.org/wiki/Interquartile_range) - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. 
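As a rough illustration of the statistics listed above (not the plugin's own implementation, which relies on `filepattern` and is mostly elided from this diff), the same per-`file` grouping and measures could be computed with pandas as follows; the column names and values here are made up:

```python
# Group rows by the required `file` column and compute the listed statistics.
import pandas as pd

df = pd.DataFrame({
    "file": ["img1.ome.tif"] * 4 + ["img2.ome.tif"] * 4,
    "intensity": [1.0, 2.0, 3.0, 8.0, 2.0, 4.0, 6.0, 20.0],
})

grouped = df.groupby("file")["intensity"]
stats = pd.DataFrame({
    "mean": grouped.mean(),
    "median": grouped.median(),
    "std": grouped.std(),
    "var": grouped.var(),
    "skew": grouped.skew(),                 # Fisher-Pearson skewness
    "kurt": grouped.apply(pd.Series.kurt),  # excess kurtosis
    "count": grouped.count(),
    "iqr": grouped.quantile(0.75) - grouped.quantile(0.25),
})
print(stats)
```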
- -## Options - -This plugin takes two input argument and one output argument: - -| Name | Description | I/O | Type | -| --------------- | --------------------------------------------------- | ------ | ------------- | -| `--statistics` | Types of statistics to calculate | Input | array | -| `--inpDir` | Input csv collection to be processed by this plugin | Input | csvCollection | -| `--filePattern` | The filePattern of the images in represented in csv | Input | string | -| `--groupBy` | The variable(s) of how the images should be grouped | Input | string | -| `--outDir` | Output collection | Output | csvCollection | diff --git a/features/polus-csv-statistics-plugin/VERSION b/features/polus-csv-statistics-plugin/VERSION deleted file mode 100644 index 7dff5b892..000000000 --- a/features/polus-csv-statistics-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.1 \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/build-docker.sh b/features/polus-csv-statistics-plugin/build-docker.sh deleted file mode 100755 index ff8f13c78..000000000 --- a/features/polus-csv-statistics-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$( fcheck: - fcheck += 1 - logger.info('Unique Files parsed: {}'.format(fnum)) \ No newline at end of file diff --git a/features/polus-csv-statistics-plugin/src/requirements.txt b/features/polus-csv-statistics-plugin/src/requirements.txt deleted file mode 100644 index 6dd96c62d..000000000 --- a/features/polus-csv-statistics-plugin/src/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -filepattern==1.4.7 \ No newline at end of file diff --git a/formats/arrow-to-tabular-tool/.bumpversion.cfg b/formats/arrow-to-tabular-tool/.bumpversion.cfg deleted file mode 100644 index 7a2f0851d..000000000 --- a/formats/arrow-to-tabular-tool/.bumpversion.cfg +++ /dev/null @@ -1,27 +0,0 @@ -[bumpversion] -current_version = 0.2.3 -commit = True -tag = False -parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? -serialize = - {major}.{minor}.{patch}-{release}{dev} - {major}.{minor}.{patch} - -[bumpversion:part:release] -optional_value = _ -first_value = dev -values = - dev - _ - -[bumpversion:part:dev] - -[bumpversion:file:pyproject.toml] -search = version = "{current_version}" -replace = version = "{new_version}" - -[bumpversion:file:plugin.json] - -[bumpversion:file:VERSION] - -[bumpversion:file:src/polus/images/formats/arrow_to_tabular/__init__.py] diff --git a/formats/arrow-to-tabular-tool/.gitignore b/formats/arrow-to-tabular-tool/.gitignore deleted file mode 100644 index c4aa6d8e4..000000000 --- a/formats/arrow-to-tabular-tool/.gitignore +++ /dev/null @@ -1,175 +0,0 @@ - #Byte-compiled / optimized / DLL files - __pycache__/ - *.py[cod] - *$py.class - - # C extensions - *.so - - # Distribution / packaging - .Python - build/ - develop-eggs/ - dist/ - downloads/ - eggs/ - .eggs/ - lib/ - lib64/ - parts/ - sdist/ - var/ - wheels/ - share/python-wheels/ - *.egg-info/ - .installed.cfg - *.egg - MANIFEST - - # PyInstaller - # Usually these files are written by a python script from a template - # before PyInstaller builds the exe, so as to inject date/other infos into it. 
- *.manifest - *.spec - - # Installer logs - pip-log.txt - pip-delete-this-directory.txt - - # Unit test / coverage reports - htmlcov/ - .tox/ - .nox/ - .coverage - .coverage.* - .cache - nosetests.xml - coverage.xml - *.cover - *.py,cover - .hypothesis/ - .pytest_cache/ - cover/ - - # Translations - *.mo - *.pot - - # Django stuff: - *.log - local_settings.py - db.sqlite3 - db.sqlite3-journal - - # Flask stuff: - instance/ - .webassets-cache - - # Scrapy stuff: - .scrapy - - # Sphinx documentation - docs/_build/ - - # PyBuilder - .pybuilder/ - target/ - - # Jupyter Notebook - .ipynb_checkpoints - - # IPython - profile_default/ - ipython_config.py - - # pyenv - # For a library or package, you might want to ignore these files since the code is - # intended to run in multiple environments; otherwise, check them in: - # .python-version - - # pipenv - # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. - # However, in case of collaboration, if having platform-specific dependencies or dependencies - # having no cross-platform support, pipenv may install dependencies that don't work, or not - # install all needed dependencies. - #Pipfile.lock - - # poetry - # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. - # This is especially recommended for binary packages to ensure reproducibility, and is more - # commonly ignored for libraries. - # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control - poetry.lock - ../../poetry.lock - - # pdm - # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. - #pdm.lock - # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it - # in version control. - # https://pdm.fming.dev/#use-with-ide - .pdm.toml - - # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm - __pypackages__/ - - # Celery stuff - celerybeat-schedule - celerybeat.pid - - # SageMath parsed files - *.sage.py - - # Environments - .env - .venv - env/ - venv/ - ENV/ - env.bak/ - venv.bak/ - - # Spyder project settings - .spyderproject - .spyproject - - # Rope project settings - .ropeproject - - # mkdocs documentation - /site - - # mypy - .mypy_cache/ - .dmypy.json - dmypy.json - - # Pyre type checker - .pyre/ - - # pytype static type analyzer - .pytype/ - - # Cython debug symbols - cython_debug/ - - # PyCharm - # JetBrains specific template is maintained in a separate JetBrains.gitignore that can - # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore - # and can be added to the global gitignore or merged into this file. For a more nuclear - # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
- #.idea/ - - # vscode - .vscode - - # test data directory - data - # yaml file - .pre-commit-config.yaml - - # hidden files - .DS_Store - .ds_store - # flake8 - .flake8 diff --git a/formats/arrow-to-tabular-tool/Dockerfile b/formats/arrow-to-tabular-tool/Dockerfile deleted file mode 100644 index bba2b8535..000000000 --- a/formats/arrow-to-tabular-tool/Dockerfile +++ /dev/null @@ -1,25 +0,0 @@ -FROM polusai/bfio:2.1.9 - -# environment variables defined in polusai/bfio -ENV EXEC_DIR="/opt/executables" -ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".csv" -ENV POLUS_LOG="INFO" - -# Work directory defined in the base container -WORKDIR ${EXEC_DIR} - -# TODO: Change the tool_dir to the tool directory -ENV TOOL_DIR="formats/arrow-to-tabular-tool" - -# Copy the repository into the container -RUN mkdir image-tools -COPY . ${EXEC_DIR}/image-tools - -# Install the tool -RUN pip3 install "${EXEC_DIR}/image-tools/${TOOL_DIR}" --no-cache-dir - -# Set the entrypoint -# TODO: Change the entrypoint to the tool entrypoint -ENTRYPOINT ["python3", "-m", "polus.images.formats.arrow_to_tabular"] -CMD ["--help"] diff --git a/formats/arrow-to-tabular-tool/README.md b/formats/arrow-to-tabular-tool/README.md deleted file mode 100644 index 5b9d36e25..000000000 --- a/formats/arrow-to-tabular-tool/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Arrow to Tabular (v0.2.0) -This WIPP plugin allows analysts to convert Arrow Feather File Format (V2) into the following file formats for researchers: \ - - `.parquet` \ - - `.csv` - -Contact [Kelechi Nina Mezu](mailto:nina.mezu@nih.gov), [Hamdah Shafqat Abbasi](mailto:hamdahshafqat.abbasi@nih.gov) for more information. - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Building - -To build the Docker image for the conversion plugin, run -`bash build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the -contents of `plugin.json` into the pop-up window and submit. 
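For context, the conversion this tool performs boils down to a few calls to vaex, the library it depends on; the snippet below is an illustrative sketch with assumed file names, not the tool's CLI:

```python
# Open a Feather V2 / Arrow file with vaex and export it to csv or parquet.
import vaex

df = vaex.open("data.arrow")                  # memory-maps the Arrow file
df.export_csv("data.csv", chunk_size=10_000)  # stream out as CSV, or ...
df.export_parquet("data.parquet")             # ... write a Parquet file
```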
- -## Options - -This plugin takes two input arguments and one output argument: - -| Name | Description | I/O | Type | -| --------------- | ------------------------------------------------------------ | ------ | ---------- | -| `--inpDir` | Input generic data collection to be processed by this plugin | Input | collection | -| `--fileFormat` | Filename pattern to convert | Input | string | -| `--outDir` | Output collection | Output | collection | -| `--preview` | Generate a JSON file with outputs | Output | JSON | diff --git a/formats/arrow-to-tabular-tool/VERSION b/formats/arrow-to-tabular-tool/VERSION deleted file mode 100644 index 717903969..000000000 --- a/formats/arrow-to-tabular-tool/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.3 diff --git a/formats/arrow-to-tabular-tool/arrowtotabular.cwl b/formats/arrow-to-tabular-tool/arrowtotabular.cwl deleted file mode 100644 index df3754cbc..000000000 --- a/formats/arrow-to-tabular-tool/arrowtotabular.cwl +++ /dev/null @@ -1,28 +0,0 @@ -class: CommandLineTool -cwlVersion: v1.2 -inputs: - fileFormat: - inputBinding: - prefix: --fileFormat - type: string - inpDir: - inputBinding: - prefix: --inpDir - type: Directory - outDir: - inputBinding: - prefix: --outDir - type: Directory -outputs: - outDir: - outputBinding: - glob: $(inputs.outDir.basename) - type: Directory -requirements: - DockerRequirement: - dockerPull: polusai/arrow-to-tabular-tool:0.2.3-dev0 - InitialWorkDirRequirement: - listing: - - entry: $(inputs.outDir) - writable: true - InlineJavascriptRequirement: {} diff --git a/formats/arrow-to-tabular-tool/build-docker.sh b/formats/arrow-to-tabular-tool/build-docker.sh deleted file mode 100755 index f75e537e8..000000000 --- a/formats/arrow-to-tabular-tool/build-docker.sh +++ /dev/null @@ -1,23 +0,0 @@ - -#!/bin/bash - -# Change the name of the tool here -tool_dir="formats" -tool_name="arrow-to-tabular-tool" - -# The version is read from the VERSION file -version=$(","Hamdah Shafqat abbasi "] -readme = "README.md" -packages = [{include = "polus", from = "src"}] - -[tool.poetry.dependencies] -python = ">=3.9,<3.12" -filepattern = "^2.0.4" -typer = "^0.7.0" -tqdm = "^4.64.1" -blake3 = "^0.3.3" -fcsparser = "^0.2.4" -llvmlite = "^0.39.1" -fastapi = "^0.92.0" -vaex = "^4.7.0" - - -[tool.poetry.group.dev.dependencies] -bump2version = "^1.0.1" -pre-commit = "^3.1.0" -black = "^23.1.0" -flake8 = "^6.0.0" -mypy = "^1.0.1" -pytest = "^7.2.1" -pandas = "^1.5.3" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" diff --git a/formats/arrow-to-tabular-tool/run-plugin.sh b/formats/arrow-to-tabular-tool/run-plugin.sh deleted file mode 100755 index 22f347eb2..000000000 --- a/formats/arrow-to-tabular-tool/run-plugin.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -#!/bin/bash -version=$( None: - """Execute Main function.""" - logger.info(f"inpDir = {inp_dir}") - logger.info(f"outDir = {out_dir}") - logger.info(f"fileFormat = {file_format}") - - inp_dir = inp_dir.resolve() - out_dir = out_dir.resolve() - - assert inp_dir.exists(), f"{inp_dir} doesnot exists!! Please check input path again" - assert ( - out_dir.exists() - ), f"{out_dir} doesnot exists!! 
Please check output path again" - FILE_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") - - if file_format == Format.Default: - file_format = FILE_EXT - elif file_format == Format.CSV: - file_format = ".csv" - elif file_format == Format.PARQUET: - file_format = ".parquet" - elif file_format is None: - file_format = FILE_EXT - - assert file_format in [ - ".csv", - ".parquet", - ], f"This tabular file format: {file_format} is not supported by this plugin! Choose either the CSV or Parquet file format" - - pattern_list = [".feather", ".arrow"] - pattern = [f.suffix for f in inp_dir.iterdir() if f.suffix in pattern_list][0] - assert ( - pattern in pattern_list - ), f"This input file extension {pattern} is not supported by this plugin! The input should be either a .feather or an .arrow file" - filepattern = {".feather": ".*.feather", ".arrow": ".*.arrow"} - - featherPattern = filepattern[pattern] - - fps = fp.FilePattern(inp_dir, featherPattern) - - if preview: - with open(pathlib.Path(out_dir, "preview.json"), "w") as jfile: - out_json: dict[str, Any] = { - "filepattern": featherPattern, - "outDir": [], - } - for file in fps(): - out_name = str(file[1][0].stem) + file_format - out_json["outDir"].append(out_name) - json.dump(out_json, jfile, indent=2) - - with ProcessPoolExecutor(max_workers) as executor: - processes = [] - for files in fps: - file = files[1][0] - processes.append(executor.submit(arrow_tabular, file, file_format, out_dir)) - - for process in tqdm( - as_completed(processes), desc="Arrow --> Tabular", total=len(processes) - ): - process.result() - - logger.info("Finished all processes!") - - -if __name__ == "__main__": - typer.run(main) diff --git a/formats/arrow-to-tabular-tool/src/polus/images/formats/arrow_to_tabular/arrow_to_tabular.py b/formats/arrow-to-tabular-tool/src/polus/images/formats/arrow_to_tabular/arrow_to_tabular.py deleted file mode 100644 index 719f324bb..000000000 --- a/formats/arrow-to-tabular-tool/src/polus/images/formats/arrow_to_tabular/arrow_to_tabular.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Arrow to Tabular.""" -import logging -import pathlib - -from enum import Enum -import vaex - -logger = logging.getLogger(__name__) - - - -class Format(str, Enum): - """Extension types to be converted.""" - CSV = ".csv" - PARQUET = ".parquet" - Default = "default" - - -def arrow_tabular(file: pathlib.Path, file_format: str, out_dir: pathlib.Path) -> None: - """Convert Arrow file into tabular file. - This plugin uses vaex to open an arrow file and convert it into csv or parquet tabular data. - - Args: - file : Path to input file. - file_format : Desired file extension of the tabular output file. - out_dir: Path to output directory.
- """ - file_name = pathlib.Path(file).stem - logger.info("Arrow Conversion: Copy ${file_name} into outDir for processing...") - - output_file = pathlib.Path(out_dir, (file_name + file_format)) - - logger.info("Arrow Conversion: Converting file into PyArrow Table") - - data = vaex.open(file) - logger.info("Arrow Conversion: table converted") - ncols = len(data) - chunk_size = max([2**24 // ncols, 1]) - - logger.info("Arrow Conversion: checking for file format") - - if file_format == ".csv": - logger.info("Arrow Conversion: Converting PyArrow Table into .csv file") - # Streaming contents of Arrow Table into csv - return data.export_csv(output_file, chunksize=chunk_size) - - elif file_format == ".parquet": - logger.info("Arrow Conversion: Converting PyArrow Table into .parquet file") - return data.export_parquet(output_file) - else: - logger.error( - "Arrow Conversion Error: This format is not supported in this plugin" - ) diff --git a/formats/arrow-to-tabular-tool/tests/__init__.py b/formats/arrow-to-tabular-tool/tests/__init__.py deleted file mode 100644 index d7bcf679b..000000000 --- a/formats/arrow-to-tabular-tool/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Arrow to Tabular.""" diff --git a/formats/arrow-to-tabular-tool/tests/test_main.py b/formats/arrow-to-tabular-tool/tests/test_main.py deleted file mode 100644 index 9dd214714..000000000 --- a/formats/arrow-to-tabular-tool/tests/test_main.py +++ /dev/null @@ -1,69 +0,0 @@ -"""Testing of Arrow to Tabular plugin.""" -import os -import pathlib -import random -import string - -import filepattern as fp -import numpy as np -import pandas as pd -import pytest -from polus.images.formats.arrow_to_tabular.arrow_to_tabular import arrow_tabular - - -@pytest.fixture() -def generate_arrow(): - """Create pandas dataframe and convert into to arrow file format.""" - dirpath = os.path.abspath(os.path.join(__file__, "../..")) - inpDir = pathlib.Path(dirpath, "data/input") - outDir = pathlib.Path(dirpath, "data/output") - if not inpDir.exists(): - inpDir.mkdir(parents=True, exist_ok=True) - if not outDir.exists(): - outDir.mkdir(exist_ok=True, parents=True) - - df = pd.DataFrame( - { - "A": [random.choice(string.ascii_letters) for i in range(100)], - "B": np.random.randint(low=1, high=100, size=100), - "C": np.random.normal(0.0, 1.0, size=100), - }, - ) - df.to_feather(pathlib.Path(inpDir, "data.arrow")) - df.to_feather(pathlib.Path(inpDir, "data1.arrow")) - - return inpDir, outDir - - -def test_arrow_tabular(generate_arrow): - """Test of Arrow to Parquet file format.""" - pattern = ".parquet" - filePattern = {".csv": ".*.csv", ".parquet": ".*.parquet"} - out_pattern = filePattern[pattern] - in_pattern = ".*.arrow" - fps = fp.FilePattern(generate_arrow[0], in_pattern) - for file in fps(): - arrow_tabular(file[1][0], pattern, generate_arrow[1]) - - assert ( - all( - file[1][0].suffix - for file in fp.FilePattern(generate_arrow[1], out_pattern)() - ) - is True - ) - [os.remove(f) for f in generate_arrow[1].iterdir() if f.name.endswith(pattern)] - - pattern = ".csv" - out_pattern = filePattern[pattern] - fps = fp.FilePattern(generate_arrow[0], in_pattern) - for file in fps(): - arrow_tabular(file[1][0], pattern, generate_arrow[1]) - - assert ( - all( - file[1][0].suffix - for file in fp.FilePattern(generate_arrow[1], out_pattern)() - ) - is True - ) diff --git a/formats/polus-fcs-to-csv-converter-plugin/Dockerfile b/formats/polus-fcs-to-csv-converter-plugin/Dockerfile deleted file mode 100644 index 78be1a4e0..000000000 --- 
a/formats/polus-fcs-to-csv-converter-plugin/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -FROM polusai/bfio:2.1.9 - -COPY VERSION ${EXEC_DIR} -COPY src ${EXEC_DIR}/ - -RUN pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir - -ENTRYPOINT ["python3", "/opt/executables/main.py"] \ No newline at end of file diff --git a/formats/polus-fcs-to-csv-converter-plugin/README.md b/formats/polus-fcs-to-csv-converter-plugin/README.md deleted file mode 100644 index fd4dc62a3..000000000 --- a/formats/polus-fcs-to-csv-converter-plugin/README.md +++ /dev/null @@ -1,31 +0,0 @@ -# Fcs to Csv file converter - -The fcs to csv file converter plugin converts fcs file to csv file.The input file should be in .fcs file format and output will be .csv file format. - -## Input: -The input should be a file in fcs format. - -## Output: -The output is a csv file. - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. - -## Options - -This plugin takes eight input argument and one output argument: - -| Name | Description | I/O | Type | -| ---------- | ------------------------- | ------ | ------------- | -| `--inpDir` | Input fcs file collection | Input | collection | -| `--outDir` | Output collection | Output | csvCollection | - - diff --git a/formats/polus-fcs-to-csv-converter-plugin/VERSION b/formats/polus-fcs-to-csv-converter-plugin/VERSION deleted file mode 100644 index 28af839c0..000000000 --- a/formats/polus-fcs-to-csv-converter-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.5 \ No newline at end of file diff --git a/formats/polus-fcs-to-csv-converter-plugin/build-docker.sh b/formats/polus-fcs-to-csv-converter-plugin/build-docker.sh deleted file mode 100644 index 9a33106b5..000000000 --- a/formats/polus-fcs-to-csv-converter-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$( 0) { - for (i in 1:length(excludes)) { - if(!excludes[i] %in% colnames(dataset)) { - logwarn('column to exclude from %s is not found',file_name) - } - } - datasub <-dataset[ , !(names(dataset) %in% excludes)] - } - else if(length(excludes) == 0) { - datasub <-dataset - } - # Remove columns with all values as zero - datasub <- datasub[colSums(datasub) > 0] - - #Check whether predict column is present in dataframe - if(!(predictcolumn %in% colnames(datasub))) { - logwarn('predict column name is not found in %s',file_name) - next - } - - #Get column names without predict variable - drop_dep <- datasub[ , !(names(datasub) %in% predictcolumn)] - resp_var <- colnames(drop_dep) - - #Number of cores - num_of_cores = detectCores() - loginfo('Cores = %s', num_of_cores) - - #Chunk Size - chunk <- floor((nrow(datasub)/ncol(datasub))*num_of_cores) - - #Function to determine chunks - make.data<-function(formula,data,chunksize,...){ - n<-nrow(data) - cursor<-0 - datafun<-function(reset=FALSE){ - if (reset){ - cursor<<-0 - return(NULL) - } - if (cursor>=n) - return(NULL) - start<-cursor+1 - cursor<<-cursor+min(chunksize, n-cursor) - data[start:cursor,] - } - } - - #Convert to ffdf object - datasub_ff = as.ffdf(datasub) - - #Chunk data - chunk_data <-make.data(formula(paste(predictcolumn,paste(resp_var,collapse= "+"),sep="~")), datasub_ff, chunksize=chunk) - - if((modeltype == 'Gaussian') || (modeltype == 
'Poisson') || (modeltype == 'Binomial') || (modeltype == 'Quasibinomial') || (modeltype == 'Quasipoisson') || (modeltype == 'Quasi')) { - modeltype <- tolower(modeltype) - } - - if (modeltype == 'NegativeBinomial') { - fit <- glm.nb(as.formula(paste(predictcolumn,1,sep="~")), data = datasub) - mu <- exp(coef(fit)) - val_pred<-eval(parse(text=paste('datasub',predictcolumn, sep = "$"))) - theta_val = theta.ml(val_pred, mu,nrow(datasub), limit = 22, eps = .Machine$double.eps^0.25, trace = FALSE) - } - - model_list <- c('gaussian','Gamma', 'binomial', 'poisson', 'quasi', 'quasibinomial', 'quasipoisson' ) - - model_data <- function(pred_var, data_model) { - if((modeltype %in% model_list)) { - reg_model <- bigglm(formula(paste(predictcolumn,paste(pred_var,collapse= "+"),sep="~")), data = data_model, family = eval(parse(text=paste(modeltype,"()", sep = ""))), chunksize = chunk) - } - else if(modeltype == 'NegativeBinomial') { - reg_model <- bigglm(formula(paste(predictcolumn,paste(pred_var,collapse= "+"),sep="~")), data = data_model, family = negative.binomial(theta= theta_val), chunksize=chunk) - } - else if(modeltype == 'Multinomial') { - reg_model <- multinom(formula(paste(paste("as.factor(",predictcolumn,")"),paste(pred_var,collapse= "+"),sep="~")), data = data_model, maxit=10, MaxNWts = 10000) - } - return(reg_model) - } - - #Model data based on the options selected - #Get only main effects of the variables - if (glmmethod == 'PrimaryFactors') { - if (modeltype != 'Multinomial') { - test_glm<- model_data(resp_var,chunk_data) - } - else if (modeltype == 'Multinomial') { - test_glm<- model_data(resp_var,datasub_ff) - } - } - #Get interaction values - else if (glmmethod == 'Interaction') { - datasub_pred <- datasub[ , !(names(datasub) %in% predictcolumn)] - #Get correlation between variables - tmp <- cor(datasub_pred) - tmp[upper.tri(tmp)] <- 0 - diag(tmp) <- 0 - - #Remove variables with no interaction - data_no_int <- which(tmp >= 0.1 | tmp < -0.1, arr.ind = TRUE) - data_frame<-data.frame(row = rownames(data_no_int), col = colnames(tmp)[data_no_int[, "col"]], - value = tmp[tmp >= 0.1 | tmp < -0.1]) - colnames(data_frame)<- c("variable1","variable2","coef") - - #Interaction variables - data_frame$variableint <- paste(data_frame$variable1, data_frame$variable2, sep="*") - data_list <- as.character(data_frame$variableint) - if (modeltype != 'Multinomial') { - test_glm<- model_data(data_list,chunk_data) - } - else if (modeltype == 'Multinomial') { - test_glm<- model_data(data_list, datasub_ff) - } - } - #Get second order polynomial values - else if (glmmethod == 'SecondOrder') { - var_resp <- paste('poly(',resp_var,',2)') - if (modeltype != 'Multinomial') { - test_glm<- model_data(var_resp,chunk_data) - } - else if (modeltype == 'Multinomial') { - test_glm<- model_data(var_resp,datasub_ff) - } - } - - #Set output directory - setwd(csvfile) - file_save <- paste0(file_name,".csv") - - #Convert summary of the analysis to a dataframe - tidy_summary <- tidy(test_glm) - - #Reorder the columns - tidy_final <- tidy_summary[c("term", "p.value", "estimate","std.error")] - colnames(tidy_final) <- c("Factors","P-Value","Estimate","Std.Error") - - #Write the dataframe to csv file - write.csv(tidy_final, file_save) - } -} \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/Dockerfile b/utils/polus-csv-collection-merger/Dockerfile deleted file mode 100644 index 9137b8539..000000000 --- a/utils/polus-csv-collection-merger/Dockerfile +++ /dev/null @@ -1,4 +0,0 @@ -FROM alpine -COPY VERSION / 
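When `glmmethod` is `Interaction`, the R script above keeps only predictor pairs whose absolute correlation is at least 0.1 and passes them to `bigglm` as `var1*var2` terms. A rough Python sketch of just that selection step, with hypothetical column names:

```python
# Sketch of the interaction-term selection used by the deleted R script:
# keep a pair of predictors only if |corr(var1, var2)| >= 0.1.
import numpy as np
import pandas as pd


def interaction_terms(df: pd.DataFrame, predict_column: str, threshold: float = 0.1) -> list:
    predictors = df.drop(columns=[predict_column])
    corr = predictors.corr().to_numpy()
    corr = np.tril(corr, k=-1)  # keep one triangle and drop the diagonal, as the R code does
    rows, cols = np.nonzero(np.abs(corr) >= threshold)
    names = predictors.columns
    return [f"{names[r]}*{names[c]}" for r, c in zip(rows, cols)]


# e.g. interaction_terms(features, "Intensity") might return ["Area*Perimeter", ...]
# for a hypothetical feature table.
```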
-COPY script.sh script.sh -ENTRYPOINT ["sh", "script.sh"] \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/README.md b/utils/polus-csv-collection-merger/README.md deleted file mode 100644 index ea885e526..000000000 --- a/utils/polus-csv-collection-merger/README.md +++ /dev/null @@ -1,45 +0,0 @@ -# Polus CSV Collection Merger Plugin - -This plugin helps to merge multiple CSV Collections in WIPP into one collection for later analysis. - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). - -**This plugin is in development and is subject for change** - -## Options - -This plugin takes four input parameters and one output parameter: - -| Name | Description | I/O | WIPP Type | -|----------------------|------------------------------------------------|--------|---------------| -| `input-collection-a` | Input CSV collection A | Input | csvCollection | -| `input-collection-b` | Input CSV collection B | Input | csvCollection | -| `append-a` | Option to append collection ID to files from A | Input | boolean | -| `append-b` | Option to append collection ID to files from B | Input | boolean | -| `output` | Output CSV collection | Output | csvCollection | - -## Build the plugin - -```bash -docker build . -t labshare/polus-csv-collection-merger:0.1.1 -``` - - -## Run the plugin - -### Manually - -To test, create 3 folders: `` and `` should contain csv collections you would like to merge. `` is the target folder which will contain the merged files. - -Run the docker container -```bash -docker run -v :/a \ - -v :/b \ - -v :/c \ - labshare/polus-csv-collection-merger:0.1.1 \ - --input-collection-a /a \ - --input-collection-b /b \ - --append-a 'true' \ - --append-b 'true' \ - --output /c -``` \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/VERSION b/utils/polus-csv-collection-merger/VERSION deleted file mode 100644 index 6da28dde7..000000000 --- a/utils/polus-csv-collection-merger/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.1.1 \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/csvcollectionsmerger.cwl b/utils/polus-csv-collection-merger/csvcollectionsmerger.cwl deleted file mode 100644 index fb0684d11..000000000 --- a/utils/polus-csv-collection-merger/csvcollectionsmerger.cwl +++ /dev/null @@ -1,28 +0,0 @@ -class: CommandLineTool -cwlVersion: v1.2 -inputs: - append-a: - inputBinding: - prefix: --append-a - type: boolean? - append-b: - inputBinding: - prefix: --append-b - type: boolean? - input-collection-a: - inputBinding: - prefix: --input-collection-a - type: Directory - input-collection-b: - inputBinding: - prefix: --input-collection-b - type: Directory - output: - inputBinding: - prefix: --output - type: Directory -outputs: - output: !!python/name:builtins.NotImplementedError '' -requirements: - DockerRequirement: - dockerPull: polusai/csv-collection-merger:0.1.2 diff --git a/utils/polus-csv-collection-merger/ict.yaml b/utils/polus-csv-collection-merger/ict.yaml deleted file mode 100644 index 308334ce3..000000000 --- a/utils/polus-csv-collection-merger/ict.yaml +++ /dev/null @@ -1,61 +0,0 @@ -author: -- Konstantin taletskiy -contact: konstantin.taletskiy@labshare.org -container: polusai/csv-collection-merger:0.1.2 -description: Merge two csv collections. You have an option to prepend collection name - to avoid name conflicts. -entrypoint: '[python3, main.py]' -inputs: -- description: Input csv collection A. 
- format: - - csvCollection - name: input-collection-a - required: true - type: path -- description: Append collection name to collection A. - format: - - boolean - name: append-a - required: false - type: boolean -- description: Input csv collection B. - format: - - csvCollection - name: input-collection-b - required: true - type: path -- description: Append collection name to collection B. - format: - - boolean - name: append-b - required: false - type: boolean -name: polusai/CSVcollectionsmerger -outputs: -- description: Output csv collection for the plugin - format: - - csvCollection - name: output - required: true - type: path -repository: https://github.com/polusai/image-tools -specVersion: 1.0.0 -title: CSV collections merger -ui: -- description: Pick a collection... - key: inputs.input-collection-a - title: 'CSV Collection A: ' - type: path -- description: Pick an option... - key: inputs.append-a - title: 'Append collection name to filenames in A: ' - type: checkbox -- description: Pick a collection... - key: inputs.input-collection-b - title: 'CSV Collection B: ' - type: path -- description: Pick an option... - key: inputs.append-b - title: 'Append collection name to filenames in B: ' - type: checkbox -version: 0.1.2 diff --git a/utils/polus-csv-collection-merger/plugin.json b/utils/polus-csv-collection-merger/plugin.json deleted file mode 100644 index d777c0c06..000000000 --- a/utils/polus-csv-collection-merger/plugin.json +++ /dev/null @@ -1,61 +0,0 @@ -{ - "name": "CSV collections merger", - "version": "0.1.2", - "title": "CSV collections merger", - "description": "Merge two csv collections. You have an option to prepend collection name to avoid name conflicts.", - "author": "Konstantin taletskiy (konstantin.taletskiy@labshare.org)", - "containerId": "polusai/csv-collection-merger:0.1.2", - "inputs": [ - { - "name": "input-collection-a", - "type": "csvCollection", - "description": "Input csv collection A." - }, - { - "name": "append-a", - "type": "boolean", - "required": "false", - "description": "Append collection name to collection A." - }, - { - "name": "input-collection-b", - "type": "csvCollection", - "description": "Input csv collection B." - }, - { - "name": "append-b", - "type": "boolean", - "required": "false", - "description": "Append collection name to collection B." - } - ], - "outputs": [ - { - "name": "output", - "type": "csvCollection", - "description": "Output csv collection for the plugin" - } - ], - "ui": [ - { - "key": "inputs.input-collection-a", - "title": "CSV Collection A: ", - "description": "Pick a collection..." - }, - { - "key": "inputs.append-a", - "title": "Append collection name to filenames in A: ", - "description": "Pick an option..." - }, - { - "key": "inputs.input-collection-b", - "title": "CSV Collection B: ", - "description": "Pick a collection..." - }, - { - "key": "inputs.append-b", - "title": "Append collection name to filenames in B: ", - "description": "Pick an option..." 
- } - ] -} \ No newline at end of file diff --git a/utils/polus-csv-collection-merger/script.sh b/utils/polus-csv-collection-merger/script.sh deleted file mode 100644 index 646306ddd..000000000 --- a/utils/polus-csv-collection-merger/script.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/bin/sh - -while [[ $# -gt 0 ]] -do -key="$1" - -case $key in - --input-collection-a) - INPUT_A="$2" - shift # past argument - shift # past value - ;; - --input-collection-b) - INPUT_B="$2" - shift # past argument - shift # past value - ;; - --append-a) - APPEND_A="$2" - shift # past argument - shift # past value - ;; - --append-b) - APPEND_B="$2" - shift # past argument - shift # past value - ;; - --output) - OUTPUT="$2" - shift # past argument - shift # past value - ;; -esac -done - -echo "INPUT COLLECTION A = ${INPUT_A}" -echo "INPUT COLLECTION B = ${INPUT_B}" -echo "APPEND A = ${APPEND_A}" -echo "APPEND B = ${APPEND_B}" -echo "OUTPUT = ${OUTPUT}" - -COLLECTION_A="$(basename $INPUT_A)" -COLLECTION_B="$(basename $INPUT_B)" -echo " " - -echo "Copying files from collection A ($COLLECTION_A):" -for f in $INPUT_A/*; do echo "$(basename $f)"; done -if [ "$APPEND_A" = "true" ]; then - for f in $INPUT_A/*; do cp "$f" "$OUTPUT"/"$COLLECTION_A"_"$(basename $f)"; done -else - for f in $INPUT_A/*; do cp "$f" "$OUTPUT"/"$(basename $f)"; done -fi -echo " " - -echo "Copying files from collection B ($COLLECTION_B):" -for f in $INPUT_B/*; do echo "$(basename $f)"; done -if [ "$APPEND_B" = "true" ]; then - for f in $INPUT_B/*; do cp "$f" "$OUTPUT"/"$COLLECTION_B"_"$(basename $f)"; done -else - for f in $INPUT_B/*; do cp "$f" "$OUTPUT"/"$(basename $f)"; done -fi \ No newline at end of file diff --git a/visualization/polus-graph-pyramid-builder-plugin/Dockerfile b/visualization/polus-graph-pyramid-builder-plugin/Dockerfile deleted file mode 100644 index d303a4f86..000000000 --- a/visualization/polus-graph-pyramid-builder-plugin/Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -# Get image containing bfio -FROM polusai/bfio:2.1.9 - -COPY VERSION / - -ARG EXEC_DIR="/opt/executables" -ARG DATA_DIR="/data" - -#Create folders -RUN mkdir -p ${EXEC_DIR} \ - && mkdir -p ${DATA_DIR}/inputs \ - && mkdir ${DATA_DIR}/outputs - -#Copy executable -COPY src ${EXEC_DIR}/ - -RUN pip3 install -r ${EXEC_DIR}/requirements.txt --no-cache-dir - -RUN python3 ${EXEC_DIR}/dl_fi.py - -WORKDIR ${EXEC_DIR} - -# Default command. Additional arguments are provided through the command line -ENTRYPOINT ["python3", "/opt/executables/main.py"] diff --git a/visualization/polus-graph-pyramid-builder-plugin/README.md b/visualization/polus-graph-pyramid-builder-plugin/README.md deleted file mode 100644 index 4b9a1f17d..000000000 --- a/visualization/polus-graph-pyramid-builder-plugin/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# Polus CZI Extraction Plugin - -This WIPP plugin will import a csv collection and build a DeepZoom pyramid of graphs, where each graph contains a heatmap of each column plotted against another column. All n-columns are plotted against each other, excluding tranposed graphs and graphs where each axis has the same column. This leads to a total of (n^2-n)/2 graphs. - -Two types of graphs will be produced: -1) Linear sclaed graphs -2) Log scaled graphs - - The output will contain dzi and csv files for both linear and log scaled outputs. - There were will be two different directories that contain the pyramid images for the linear and log scaled outputs - -For more information on WIPP, visit the [official WIPP page](https://isg.nist.gov/deepzoomweb/software/wipp). 
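The deleted `script.sh` above copies every file from both input collections into the output directory, optionally prefixing each filename with its source collection ID to avoid name clashes. A minimal Python sketch of the same behaviour, with hypothetical paths:

```python
# Sketch of the copy-with-optional-prefix behaviour of the deleted script.sh.
# Paths are hypothetical; the plugin received them as --input-collection-a/-b and --output.
import pathlib
import shutil


def merge_collection(collection_dir: pathlib.Path, out_dir: pathlib.Path, append_id: bool) -> None:
    collection_id = collection_dir.name
    for src in collection_dir.iterdir():
        name = f"{collection_id}_{src.name}" if append_id else src.name
        shutil.copy(src, out_dir / name)


# merge_collection(pathlib.Path("/a"), pathlib.Path("/c"), append_id=True)
# merge_collection(pathlib.Path("/b"), pathlib.Path("/c"), append_id=True)
```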
- -## Building - -To build the Docker image for the conversion plugin, run -`./build-docker.sh`. - -## Install WIPP Plugin - -If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. - -## Options - -This plugin takes one input argument and one output argument: - -| Name | Description | I/O | Type | -| -------- | ---------------------- | ------ | ---------------- | -| `inpDir` | Input CSV collection | Input | CSV Collection | -| `outDir` | Output pyramid | Output | Pyramid | - -## Run the plugin - -### Run the Docker Container - -```bash -docker run -v /path/to/data:/data graph-pyramid-builder \ - --inpDir /data/input \ - --outDir /data/output -``` diff --git a/visualization/polus-graph-pyramid-builder-plugin/VERSION b/visualization/polus-graph-pyramid-builder-plugin/VERSION deleted file mode 100644 index e05cb3329..000000000 --- a/visualization/polus-graph-pyramid-builder-plugin/VERSION +++ /dev/null @@ -1 +0,0 @@ -1.3.8 diff --git a/visualization/polus-graph-pyramid-builder-plugin/build-docker.sh b/visualization/polus-graph-pyramid-builder-plugin/build-docker.sh deleted file mode 100755 index e96c75517..000000000 --- a/visualization/polus-graph-pyramid-builder-plugin/build-docker.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -version=$(' - -# Initialize the logger -logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', - datefmt='%d-%b-%y %H:%M:%S') -logger = logging.getLogger("main") -logger.setLevel(logging.INFO) - -def is_number(value): - """ This function checks to see if the value can be converted to a number """ - try: - float(value) - return True - except: - return False - -def load_csv(fpath): - """ Load a csv and select data - - Data is loaded from a csv, and data columns containing numeric values are - returned in a pandas Dataframe. The second row of the csv may contain - column classifiers, so the second row is first loaded and checked to - determine if the classifiers are present. - Inputs: - fpath - Path to csv file - Outputs: - data - A pandas Dataframe - cnames - Names of columns - """ - - # Check if the first row is column coding, and if it is then find valid columns - data = pandas.read_csv(fpath,nrows=1) - is_coded = True - cnames = [] - for ind,fname in zip(range(len(data.columns)),data.columns): - if data[fname][0] != 'F' and data[fname][0] != 'C': - is_coded = False - if is_number(data[fname][0]): - cnames.append([fname,ind]) - else: - logging.info('Column {} does not appear to contain numeric values. 
Not building graphs for this column.'.format(fname)) - elif data[fname][0] == 'F': - cnames.append([fname,ind]) - else: - logging.info('Skipping column {} for reason: one hot encodings'.format(fname)) - - # Load the data - if is_coded: - data = pandas.read_csv(fpath,skiprows=[1],usecols=[c[0] for c in cnames]) - - else: - data = pandas.read_csv(fpath,usecols=[c[0] for c in cnames]) - - return data, cnames - -def bin_data(data, bin_stats): - """ This function bins the data - Inputs: - data - pandas dataframe of data - bin_stats - stats of the data - Outputs: - bins - binned data ranging from (0, bincount) - graph_index - Numeric value of column index from original csv - graph_dict - a dictionary containing the indexes of graphs - """ - - column_names = data.columns - nfeats = data.shape[1] - nrows = data.shape[0] - - # Handle NaN values - data_ind = pandas.notnull(data) - data[~data_ind] = 255 - - data = data.astype(np.uint16) # cast to save memory - data[data>=bincount] = bincount - 1 # in case of numerical precision issues - - - if nrows < 2**8: - dtype = np.uint8 - elif nrows < 2**16: - dtype = np.uint16 - elif nrows < 2**32: - dtype = np.uint32 - else: - dtype = np.uint64 - - totalgraphs = int((nfeats**2 - nfeats)/2) - bins = np.zeros((totalgraphs, bincount, bincount), dtype=dtype) - graph_index = [] - graph_dict = {} - - # Create a linear index for feature bins - i = 0 - for feat1 in range(nfeats): - name1 = column_names[feat1] - feat1_tf = data[name1] * bincount - - for feat2 in range(feat1 + 1, nfeats): - graph_dict[(feat1, feat2)] = i - name2 = column_names[feat2] - - feat2_tf = data[name2] - feat2_tf = feat2_tf[data_ind[name1] & data_ind[name2]] - - if feat2_tf.size<=1: - continue - - # sort linear matrix indices - SortedFeats = np.sort(feat1_tf[data_ind[name1] & data_ind[name2]] + feat2_tf) - - # Do math to get the indices - ind2 = np.nonzero(np.diff(SortedFeats))[0] # nonzeros are cumulative sum of all bin values - ind2 = np.append(ind2,SortedFeats.size-1) - rows = (SortedFeats[ind2]/bincount).astype(np.uint8) # calculate row from linear index - cols = np.mod(SortedFeats[ind2],bincount) # calculate column from linear index - counts = np.diff(ind2) # calculate the number of values in each bin - - bins[i,rows[0],cols[0]] = ind2[0] + 1 - bins[i,rows[1:],cols[1:]] = counts - graph_index.append([feat1,feat2]) - i = i + 1 - - return bins, graph_index, graph_dict - -def transform_data(data,column_names, typegraph): - """ Bin the data - - Data from a pandas Dataframe is binned in two dimensions. Binning is performed by - binning data in one column along one axis and another column is binned along the - other axis. All combinations of columns are binned without repeats or transposition. - There are only bincount number of bins in each dimension, and each bin is 1/bincount the size of the - difference between the maximum and minimum of each column. 
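For a single column pair, the net effect of `bin_data` above is a `bincount x bincount` matrix of counts built with a sorted linear-index trick. The same result can be sketched more simply (and more slowly) with `numpy.histogram2d`:

```python
# The effect of bin_data for one column pair is a 2-D histogram of counts.
# This sketch uses numpy.histogram2d instead of the linear-index trick above;
# `bincount` corresponds to the plugin's --bincount option.
import numpy as np


def pair_histogram(x: np.ndarray, y: np.ndarray, bincount: int) -> np.ndarray:
    counts, _, _ = np.histogram2d(
        x, y,
        bins=bincount,
        range=[[x.min(), x.max()], [y.min(), y.max()]],
    )
    return counts.astype(np.uint32)  # shape (bincount, bincount)
```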
- If the data needs to be logarithmically scaled, then the data is transformed by the algorithm presented - in this paper: https://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001 - Inputs: - data - A pandas Dataframe, with nfeats number of columns - column_names - Names of Dataframe columns - typegraph - Defines whether logarithmic scale or linear scalef - Outputs: - bins - A numpy matrix that has shape (int((nfeats**2 - nfeats)/2),bincount,bincount) - bin_feats - A list containing the minimum and maximum values of each column - index - Numeric value of column index from original csv - diction - a dictionary containing the indexes of graphs - """ - - nfeats = len(column_names) - - # If logarithmic, need to transform the data - # https://iopscience.iop.org/article/10.1088/0957-0233/24/2/027001 - # Adjusts for behavior near zero - - if typegraph == "log": - C = 1/np.log(10)# Derivative of Natural Log e, d(ln(x))/dx = 1/x - data = data.astype(np.float64) - data = np.sign(data) * np.log10(1 + (abs(data/C))) - - bin_stats = {'min': data.min(), - 'max': data.max(), - 'binwidth': (data.max()-data.min()+10**-6)/bincount} - - - # Transform data into bin positions for fast binning - data = ((data - bin_stats['min'])/bin_stats['binwidth']).apply(np.floor) - - bins, index, diction = bin_data(data, bin_stats) - return bins, bin_stats, index, diction - -""" 2. Plot Generation """ -def format_ticks(out): - """ Generate tick labels - Polus Plots uses D3 to generate the plots. This function tries to mimic the - formatting of tick labels. In place of using scientific notation a scale - prefix is appended to the end of the number. See _prefix comments to see the - suffixes that are used. Numbers that are larger or smaller than 10**24 or - 10**-24 respectively are not handled and may throw an error. Values outside - of this range do not currently have an agreed upon prefix in the measurement - science community. 
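The log-scaled pyramids rely on the signed log10 transform applied in `transform_data` above (the IOP Measurement Science reference in the docstring), which stays roughly linear near zero and becomes logarithmic for large magnitudes. Isolated as a small helper:

```python
# The symmetric log transform used by transform_data for the "log" pyramids:
# sign(x) * log10(1 + |x| / C) with C = 1 / ln(10), which is ~linear near zero.
import numpy as np


def symmetric_log10(x: np.ndarray) -> np.ndarray:
    c = 1.0 / np.log(10.0)
    x = np.asarray(x, dtype=np.float64)
    return np.sign(x) * np.log10(1.0 + np.abs(x / c))


# symmetric_log10(np.array([-100.0, 0.0, 100.0])) -> approximately [-2.36, 0.0, 2.36]
```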
- - Inputs: - out - the values of the ticks used in graph - Outputs: - fticks - a list of strings containing formatted tick labels - """ - _prefix = { - -24: 'y', # yocto - -21: 'z', # zepto - -18: 'a', # atto - -15: 'f', # femto - -12: 'p', # pico - -9: 'n', # nano - -6: 'u', # micro - -3: 'm', # mili - 0: ' ', - 3: 'k', # kilo - 6: 'M', # mega - 9: 'G', # giga - 12: 'T', # tera - 15: 'P', # peta - 18: 'E', # exa - 21: 'Z', # zetta - 24: 'Y', # yotta - } - - fticks = [] - convertprefix = [] - for i in out: - formtick = "%#.3f" % i - decformtick = '%.2e' % Decimal(formtick) - convertexponent = float(decformtick[-3:]) - numbers = float(decformtick[:-4]) - if convertexponent > 0: - if convertexponent % 3 == 2: - movednum = round(numbers/10,2) - newprefix = _prefix[int(convertexponent + 1)] - formtick = str(movednum) + newprefix - elif convertexponent % 3 == 1: - movednum = round(numbers*10,1) - newprefix = _prefix[int(convertexponent - 1)] - formtick = str(movednum) + newprefix - else: - newprefix = _prefix[int(convertexponent)] - if i < 0: - formtick = str(decformtick[:5]) + newprefix - else: - formtick = str(decformtick[:4]) + newprefix - elif convertexponent < 0: - if convertexponent % -3 == -2: - movednum = round(numbers*10,1) - newprefix = _prefix[int(convertexponent - 1)] - formtick = str(movednum) + newprefix - elif convertexponent % -3 == -1: - movednum = round(numbers/10,2) - newprefix = _prefix[int(convertexponent + 1)] - formtick = str(movednum) + newprefix - else: - newprefix = _prefix[convertexponent] - if i < 0: - formtick = str(decformtick[:5]) + newprefix - else: - formtick = str(decformtick[:4]) + newprefix - else: - if i < 0: - formtick = str(decformtick[:5]) + _prefix[int(convertexponent)] - else: - formtick = str(decformtick[:4]) + _prefix[int(convertexponent)] - convertprefix.append(int(convertexponent)) - fticks.append(formtick) - - return fticks - -# Create a custom colormap to mimick Polus Plots -def get_cmap(): - - cmap_values = [[1.0,1.0,1.0,1.0]] - cmap_values.extend([[r/255,g/255,b/255,1] for r,g,b in zip(np.arange(0,255,2), - np.arange(153,255+1/128,102/126), - np.arange(34+1/128,0,-34/126))]) - cmap_values.extend([[r/255,g/255,b/255,1] for r,g,b in zip(np.arange(255,136-1/128,-119/127), - np.arange(255,0,-2), - np.arange(0,68+1/128,68/127))]) - cmap = ListedColormap(cmap_values) - - return cmap - -def gen_plot(col1, - col2, - indexdict, - column_names, - bin_stats, - fig, - ax, - data, - typegraph): - """ Generate a heatmap - Generate a heatmap of data for column 1 against column 2. 
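`format_ticks` above mimics D3-style tick labels by swapping scientific notation for the metric prefixes in `_prefix`. A compact sketch of the same idea using the engineering exponent (the nearest lower multiple of 3):

```python
# Simplified metric-prefix tick formatting in the spirit of format_ticks above:
# scale the value by 10**(3k) and append the matching prefix from the table.
import math

_PREFIX = {
    -24: "y", -21: "z", -18: "a", -15: "f", -12: "p", -9: "n", -6: "u", -3: "m",
    0: " ", 3: "k", 6: "M", 9: "G", 12: "T", 15: "P", 18: "E", 21: "Z", 24: "Y",
}


def format_tick(value: float) -> str:
    if value == 0:
        return "0.00 "
    exponent = int(math.floor(math.log10(abs(value)) / 3.0)) * 3
    exponent = max(-24, min(24, exponent))  # outside this range there is no agreed prefix
    return f"{value / 10 ** exponent:.3g}{_PREFIX[exponent]}"


# format_tick(0.00042) -> '420u'; format_tick(1_500_000) -> '1.5M'
```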
- Inputs: - col1 - the column plotted on the y-axis - col2 - column plotted on the x-axis - indexdict - a dictionary containing the indexes of graphs - column_names - list of column names - bin_stats - a list containing the min,max values of each column - fig - pregenerated figure - ax - pregenerated axis - data - p regenerated heatmap bbox artist - typegraph - specifies whether the data is log scaled or linearly scaled - Outputs: - hmap - A numpy array containing pixels of the heatmap - """ - def keepdecreasing(labeltexts0, decreasefont, bbxtext): - """ This function decreases the size of the labels if its too big """ - labeltexts0.set_fontsize(decreasefont) - bbxtext = labeltexts0.get_window_extent(renderer = fig.canvas.renderer) - decreasefont = decreasefont - 1 - return bbxtext, decreasefont - - def calculateticks(ticks, bin_width, fmin, typegraph): - """ This functio n calculates the tick values for the graphs """ - - if typegraph == "linear": - tick_vals = [t for t in ticks*bin_width+fmin] - if typegraph == "log": - C = 1/np.log(10) - tick_vals = [np.sign(t)*C*(-1+(10**abs(t))) for t in ticks*bin_width+fmin] - return tick_vals - - if col2>col1: - d = np.squeeze(bins[indexdict[col1, col2],:,:]) - r = col1 - c = col2 - elif col2 CHUNK_SIZE) or (bbxtext.y0 < 0 or bbxtext.y1 > (CHUNK_SIZE*.075)): - bbxtext, decreasefont = keepdecreasing(axlabel.texts[0], decreasefont, bbxtext) - - # This is to decrease the size of the title labels if the name is too large (Y AXIS LABEL) - if len(aylabel.texts) == 0: - aylabel.text(0.5, 0.5, "\n".join(wrap(cname_r, 60)), va = 'center', ha = 'center', fontsize = sizefont, rotation = 90, wrap = True) - else: - aylabeltext0 = aylabel.texts[0] - aylabeltext0.set_text("\n".join(wrap(cname_r, 60))) - aylabeltext0.set_fontsize(sizefont) - - bbytext = (aylabel.texts[0]).get_window_extent(renderer = fig.canvas.renderer) - decreasefont = sizefont - 1 - while (bbytext.y0 < 0 or bbytext.y1 > CHUNK_SIZE) or (bbytext.x0 < 0 or bbytext.x1 > (CHUNK_SIZE*.075)): - bbytext, decreasefont = keepdecreasing(aylabel.texts[0], decreasefont, bbytext) - - while len(ax.lines) > 0: - ax.lines[-1].remove() - - # Calculating the value of each tick in the graph (fixed width) - fmin_c = bin_stats['min'][cname_c] - fmax_c = bin_stats['max'][cname_c] - binwidth_c = bin_stats['binwidth'][cname_c] - tick_vals_c= calculateticks(ax.get_xticks(), binwidth_c, fmin_c, typegraph) - if fmin_c < 0: # draw x=0 - ax.axvline(x=abs(fmin_c)/binwidth_c) - ax.set_xticklabels(format_ticks(tick_vals_c), rotation=45, fontsize = 5, ha='right') - - # Calculating the value of each tick in the graph (fixed width) - fmin_r = bin_stats['min'][cname_r] - fmax_r = bin_stats['max'][cname_r] - binwidth_r = bin_stats['binwidth'][cname_r] - tick_vals_r = calculateticks(ax.get_yticks(), binwidth_r, fmin_r, typegraph) - if fmin_r < 0: # draw y=0 - ax.axhline(y=abs(fmin_r)/binwidth_r) - ax.set_yticklabels(format_ticks(tick_vals_r), fontsize=5, ha='right') - - fig.canvas.draw() - hmap = np.array(fig.canvas.renderer.buffer_rgba()) - - return hmap - -def get_default_fig(cmap): - """ Generate a default figure, axis, and heatmap artist - Generate a figure and draw an empty graph with useful settings for repeated - drawing of new figures. By passing the existing figure, axis, and heatmap - artist to the plot generator, many things do not need to be drawn from - scratch. This decreases the plot drawing time by a factor of 2-3 times. 
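`gen_plot` above redraws a pre-built figure with the binned counts and reads the rendered canvas back as pixels. The core of that round trip, reduced to its essentials and assuming the offscreen Agg backend used by the deleted plugin:

```python
# Reduced sketch of how gen_plot turns a bin matrix into an RGBA tile:
# draw the counts with pcolorfast, render the canvas, and read back the pixels.
import matplotlib

matplotlib.use("Agg")  # assumption: offscreen rendering, as in the deleted plugin
import matplotlib.pyplot as plt
import numpy as np


def render_heatmap_tile(counts: np.ndarray, dpi: int = 256) -> np.ndarray:
    fig, ax = plt.subplots(dpi=dpi, figsize=(4, 4), tight_layout=True)
    ax.pcolorfast(counts, cmap="viridis")  # the deleted plugin used a custom Polus colormap
    fig.canvas.draw()
    tile = np.asarray(fig.canvas.buffer_rgba()).copy()
    plt.close(fig)
    return tile  # shape (4*dpi, 4*dpi, 4), dtype uint8
```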
- Inputs: - cmap - the heatmap colormap - Outputs: - fig - A reference to the figure object - ax - A reference to the axis object - data - A reference to the heatmap artist - """ - fig, ax = plt.subplots(dpi=int(CHUNK_SIZE/4),figsize=(4,4),tight_layout={'h_pad':1,'w_pad':1}) - datacolor = ax.pcolorfast(np.zeros((bincount, bincount),np.uint64),cmap=cmap) - ticks = [t for t in range(0, bincount+1, int(bincount/(10)))] - - ax.set_xlim(0,bincount) - ax.set_ylim(0,bincount) - ax.set_xticks(ticks) - ax.set_yticks(ticks) - ax.set_xlabel(" ") - ax.set_ylabel(" ") - - ax.set_xticklabels(ticks, rotation = 45) - ax.set_yticklabels(ticks) - - fig.canvas.draw() - - axlabel = fig.add_axes([.075, 0, 1, .075], frameon = False, alpha = .5, facecolor = 'b') - axlabel.set_xticks([]) - axlabel.set_yticks([]) - axlabel.set_clip_on(True) - aylabel = fig.add_axes([0, .075, .075, 1], frameon = False, alpha = .5, facecolor = 'b') - aylabel.set_xticks([]) - aylabel.set_yticks([]) - aylabel.set_clip_on(True) - - return fig, ax, datacolor - -""" 3. Pyramid generation functions """ - -def _avg2(image): - """ Average pixels with optical field of 2x2 and stride 2 """ - - # Convert 32-bit pixels to prevent overflow during averaging - image = image.astype(np.uint32) - imageshape0 = image.shape[0] - imageshape1 = image.shape[1] - # Get the height and width of each image to the nearest even number - y_max = imageshape0 - imageshape0 % 2 - x_max = imageshape1 - imageshape1 % 2 - - # Perform averaging - avg_img = np.zeros(np.ceil([image.shape[0]/2,image.shape[1]/2,image.shape[2]]).astype(np.uint32)) - for z in range(4): - avg_img[0:int(y_max/2),0:int(x_max/2),z]= (image[0:y_max-1:2,0:x_max-1:2,z] + \ - image[1:y_max:2,0:x_max-1:2,z] + \ - image[0:y_max-1:2,1:x_max:2,z] + \ - image[1:y_max:2,1:x_max:2,z]) / 4 - - # The next if statements handle edge cases if the height or width of the - # image has an odd number of pixels - if y_max != imageshape0: - for z in range(3): - avg_img[-1,:int(x_max/2),z] = (image[-1,0:x_max-1:2,z] + \ - image[-1,1:x_max:2,z]) / 2 - if x_max != imageshape1: - for z in range(4): - avg_img[:int(y_max/2),-1,z] = (image[0:y_max-1:2,-1,z] + \ - image[1:y_max:2,-1,z]) / 2 - if y_max != imageshape0 and x_max != imageshape1: - for z in range(4): - avg_img[-1,-1,z] = image[-1,-1,z] - return avg_img - -def metadata_to_graph_info(outPath,outFile, ngraphs): - - # Create an output path object for the info file - op = Path(outPath).joinpath("{}.dzi".format(outFile)) - - # create an output path for the images - of = Path(outPath).joinpath('{}_files'.format(outFile)) - of.mkdir(exist_ok=True) - - # Get metadata info from the bfio reader - rows = np.ceil(np.sqrt(ngraphs)) - cols = np.round(np.sqrt(ngraphs)) - sizes = [cols*CHUNK_SIZE,rows*CHUNK_SIZE] - - # Calculate the number of pyramid levels - num_scales = np.ceil(np.log2(rows*CHUNK_SIZE)).astype(np.uint8) - - # create a scales template, use the full resolution - scales = { - "size":sizes, - "key": num_scales - } - - # initialize the json dictionary - info = { - "scales": [scales], # Will build scales belows - "rows": rows, - "cols": cols - } - - # create the information for each scale - for i in range(1,num_scales+1): - previous_scale = info['scales'][-1] - current_scale = copy.deepcopy(previous_scale) - current_scale['key'] = str(num_scales - i) - current_scale['size'] = [int(np.ceil(previous_scale['size'][0]/2)),int(np.ceil(previous_scale['size'][1]/2))] - info['scales'].append(current_scale) - - # write the dzi file - with open(op,'w') as writer: - 
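`_avg2` above shrinks a tile by averaging each 2x2 block of pixels, with extra branches for odd edges. For even-sized RGBA tiles the same operation is a reshape and a mean; the odd-edge handling is omitted in this sketch:

```python
# Even-size-only sketch of the 2x2 average pooling done by _avg2 above.
# The deleted code also handles odd heights/widths; that case is skipped here.
import numpy as np


def avg2(tile: np.ndarray) -> np.ndarray:
    h, w, c = tile.shape
    assert h % 2 == 0 and w % 2 == 0, "sketch assumes even tile dimensions"
    blocks = tile.astype(np.uint32).reshape(h // 2, 2, w // 2, 2, c)
    return blocks.mean(axis=(1, 3)).astype(np.uint8)
```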
writer.write(DZI.format(int(info['cols']*CHUNK_SIZE),int(info['rows']*CHUNK_SIZE))) - - return info - - -def _get_higher_res(S,info,cnames, outpath,out_file,indexscale,indexdict,binstats, typegraph, X=None,Y=None): - """ - The following function builds the image pyramid at scale S by building up only - the necessary information at high resolution layers of the pyramid. So, if 0 is - the original resolution of the image, getting a tile at scale 2 will generate - only the necessary information at layers 0 and 1 to create the desired tile at - layer 2. This function is recursive and can be parallelized. - Inputs: - S - current scale - info - dictionary of scale information - outpath - directory for all outputs - out_file - directory for current dataset - indexscale - index of the graph - binstats - stats for the binned data - typegraph - specifies whether the data is linear or logarithmically scaled - Outputs: - DeepZoom format of images. - """ - - # Get the scale info - num_scales = len(info['scales']) - scale_info = info['scales'][num_scales-S-1] - - if scale_info==None: - raise ValueError("No scale information for resolution {}.".format(S)) - if X == None: - X = [0,scale_info['size'][0]] - if Y == None: - Y = [0,scale_info['size'][1]] - - # Modify upper bound to stay within resolution dimensions - if X[1] > scale_info['size'][0]: - X[1] = scale_info['size'][0] - if Y[1] > scale_info['size'][1]: - Y[1] = scale_info['size'][1] - - # Initialize the output - image = np.zeros((int(Y[1]-Y[0]),int(X[1]-X[0]),4),dtype=np.uint8) - - # If requesting from the lowest scale, then just generate the graph - if S==num_scales-1: - index = int((int(Y[0]/CHUNK_SIZE) + int(X[0]/CHUNK_SIZE) * info['rows'])) - if index>=len(indexscale): - image = np.ones((CHUNK_SIZE,CHUNK_SIZE,4),dtype=np.uint8) * (bincount + 55) - else: - image = gen_plot(col1=indexscale[index][0], - col2=indexscale[index][1], - indexdict=indexdict, - column_names=cnames, - bin_stats=binstats, - fig=fig, - ax=ax, - data=datacolor, - typegraph=typegraph) - - else: - # Set the subgrid dimensions - subgrid_dimX = list(np.arange(2*X[0], 2*X[1], CHUNK_SIZE).astype('int')) - subgrid_dimX.append(2*X[1]) - subgrid_dimY = list(np.arange(2*Y[0], 2*Y[1], CHUNK_SIZE).astype('int')) - subgrid_dimY.append(2*Y[1]) - - - for y in range(0,len(subgrid_dimY)-1): - subgrid_Y_ind0 = np.ceil((subgrid_dimY[y] - subgrid_dimY[0])/2).astype('int') - subgrid_Y_ind1 = np.ceil((subgrid_dimY[y+1] - subgrid_dimY[0])/2).astype('int') - for x in range(0,len(subgrid_dimX)-1): - subgrid_X_ind0 = np.ceil((subgrid_dimX[x] - subgrid_dimX[0])/2).astype('int') - subgrid_X_ind1 = np.ceil((subgrid_dimX[x+1] - subgrid_dimX[0])/2).astype('int') - if S==(num_scales - 6): #to use multiple processors to compute faster. 
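`metadata_to_graph_info` above lays the graphs out on a near-square grid of `CHUNK_SIZE` tiles and records one pyramid level per halving of the full-resolution size. The level bookkeeping on its own, as a sketch:

```python
# Sketch of the pyramid-level bookkeeping done by metadata_to_graph_info above:
# arrange `ngraphs` tiles on a near-square grid and halve the size per level.
import math


def pyramid_levels(ngraphs: int, chunk_size: int) -> list:
    rows = math.ceil(math.sqrt(ngraphs))
    cols = round(math.sqrt(ngraphs))
    size = [cols * chunk_size, rows * chunk_size]
    num_scales = math.ceil(math.log2(rows * chunk_size))
    levels = [{"key": num_scales, "size": size}]
    for key in range(num_scales - 1, -1, -1):
        prev = levels[-1]["size"]
        levels.append({"key": key, "size": [math.ceil(prev[0] / 2), math.ceil(prev[1] / 2)]})
    return levels  # full resolution first, smallest thumbnail last


# With example values: pyramid_levels(ngraphs=6, chunk_size=1024)[-1] -> {'key': 0, 'size': [1, 1]}
```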
- sub_image = _get_higher_res_par(S=S+1, - info=info, - cnames=cnames, - outpath=outpath, - out_file=out_file, - indexscale=indexscale, - indexdict=indexdict, - binstats=binstats, - typegraph=typegraph, - X=subgrid_dimX[x:x+2], - Y=subgrid_dimY[y:y+2]) - else: - sub_image = _get_higher_res(S=S+1, - info=info, - cnames=cnames, - outpath=outpath, - out_file=out_file, - indexscale=indexscale, - indexdict=indexdict, - binstats=binstats, - typegraph=typegraph, - X=subgrid_dimX[x:x+2], - Y=subgrid_dimY[y:y+2]) - - image[subgrid_Y_ind0:subgrid_Y_ind1, subgrid_X_ind0:subgrid_X_ind1,:] = _avg2(sub_image) - del sub_image - - # Write the chunk - outpath = Path(outpath).joinpath('{}_files'.format(out_file),str(S)) - outpath.mkdir(exist_ok=True) - imageio.imwrite(outpath.joinpath('{}_{}.png'.format(int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))),image,format='PNG-FI',compression=1) - logger.info('Finished building tile (scale,X,Y): ({},{},{})'.format(S,int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))) - return image - -# This function performs the same operation as _get_highe_res, except it uses multiprocessing to grab higher -# resolution layers at a specific layer. -def _get_higher_res_par(S,info, cnames, outpath,out_file,indexscale, indexdict, binstats, typegraph, X=None,Y=None): - # Get the scale info - num_scales = len(info['scales']) - scale_info = info['scales'][num_scales-S-1] - - if scale_info==None: - ValueError("No scale information for resolution {}.".format(S)) - - if X == None: - X = [0,scale_info['size'][0]] - if Y == None: - Y = [0,scale_info['size'][1]] - - # Modify upper bound to stay within resolution dimensions - if X[1] > scale_info['size'][0]: - X[1] = scale_info['size'][0] - if Y[1] > scale_info['size'][1]: - Y[1] = scale_info['size'][1] - - # Initialize the output - image = np.zeros((Y[1]-Y[0],X[1]-X[0],4),dtype=np.uint8) - # If requesting from the lowest scale, then just generate the graph - if S==int(info['scales'][0]['key']): - index = (int(Y[0]/CHUNK_SIZE) + int(X[0]/CHUNK_SIZE) * info['rows']) - if index>=len(indexscale): - image = np.ones((CHUNK_SIZE,CHUNK_SIZE,4),dtype=np.uint8) * (bincount + 55) - else: - image = gen_plot(col1=indexscale[index][0], - col2=indexscale[index][1], - indexdict=indexdict, - column_names=cnames, - bin_stats=binstats, - fig=fig, - ax=ax, - data=datacolor, - typegraph=typegraph) - - else: - # Set the subgrid dimensions - subgrid_dimX = list(np.arange(2*X[0], 2*X[1], CHUNK_SIZE).astype('int')) - subgrid_dimX.append(2*X[1]) - subgrid_dimY = list(np.arange(2*Y[0], 2*Y[1], CHUNK_SIZE).astype('int')) - subgrid_dimY.append(2*Y[1]) - - subgrid_images = [] - - with Pool(processes=np.min(4,initial=multiprocessing.cpu_count())) as pool: - for y in range(0,len(subgrid_dimY)-1): - subgrid_Y_ind0 = np.ceil((subgrid_dimY[y] - subgrid_dimY[0])/2).astype('int') - subgrid_Y_ind1 = np.ceil((subgrid_dimY[y+1] - subgrid_dimY[0])/2).astype('int') - for x in range(0,len(subgrid_dimX)-1): - subgrid_X_ind0 = np.ceil((subgrid_dimX[x] - subgrid_dimX[0])/2).astype('int') - subgrid_X_ind1 = np.ceil((subgrid_dimX[x+1] - subgrid_dimX[0])/2).astype('int') - subgrid_images.append(pool.apply_async(_get_higher_res,(S+1, - info, - cnames, - outpath, - out_file, - indexscale, - indexdict, - binstats, - typegraph, - subgrid_dimX[x:x+2], - subgrid_dimY[y:y+2]))) - image[subgrid_Y_ind0:subgrid_Y_ind1,subgrid_X_ind0:subgrid_X_ind1,:] = _avg2((subgrid_images[y*(len(subgrid_dimX)-1) + x]).get()) - - del subgrid_images - - # Write the chunk - outpath = 
Path(outpath).joinpath('{}_files'.format(out_file),str(S)) - outpath.mkdir(exist_ok=True) - imageio.imwrite(outpath.joinpath('{}_{}.png'.format(int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))),image,format='PNG-FI',compression=1) - logger.info('Finished building tile (scale,X,Y): ({},{},{})'.format(S,int(X[0]/CHUNK_SIZE),int(Y[0]/CHUNK_SIZE))) - return image - -def write_csv(cnames,index,f_info,out_path,out_file): - """ This function writes the csv file necessary for the Deep Zoom format """ - - header = 'dataset_id, x_axis_id, y_axis_id, x_axis_name, y_axis_name, title, length, width, global_row, global_col\n' - line = '{:d}, {:d}, {:d}, {:s}, {:s}, default title, {:d}, {:d}, {:d}, {:d}\n' - l_ind = 0 - with open(str(Path(out_path).joinpath(out_file+'.csv').absolute()),'w') as writer: - writer.write(header) - for ind in index: - ind1 = ind[1] - ind0 = ind[0] - writer.write(line.format(1, - cnames[ind1][1], - cnames[ind0][1], - cnames[ind1][0], - cnames[ind0][0], - CHUNK_SIZE, - CHUNK_SIZE, - int(np.mod(l_ind,f_info['rows'])), - int(l_ind/f_info['rows']))) - l_ind += 1 - -if __name__=="__main__": - - - """ Initialize argument parser """ - logger.info("Parsing arguments...") - parser = argparse.ArgumentParser(prog='main', description='Build an image pyramid from data in a csv file.') - - """ Define the arguments """ - parser.add_argument('--inpDir', dest='inpDir', type=str, - help='Path to input images.', required=True) - - parser.add_argument('--outDir', dest='outDir', type=str, - help='Path to output images.', required=True) - - parser.add_argument('--bincount', dest='bin_count', type=int, - help='Number of bins', required=True) - - parser.add_argument('--scale', dest='scale', type=str, - help='Linear, Log, or Both', required=False) - - """ Get the input arguments """ - args = parser.parse_args() - - input_path = args.inpDir - output_path = Path(args.outDir) - bincount = args.bin_count - scales = [args.scale.lower()] - all_scales = ['linear','log'] - if scales[0] not in all_scales: - scales = all_scales - - logger.info('inpDir = {}'.format(input_path)) - logger.info('outDir = {}'.format(output_path)) - - # Set up the logger for each scale - loggers = {} - for scale in scales: - loggers[scale] = logging.getLogger("main.{}".format(scale.upper())) - loggers[scale].setLevel(logging.INFO) - - # Get the path to each csv file in the collection - input_files = [str(f.absolute()) for f in Path(input_path).iterdir() if ''.join(f.suffixes)=='.csv'] - - # Generate the default figure components - logger.info('Generating colormap and default figure...') - cmap = get_cmap() - fig, ax, datacolor = get_default_fig(cmap) - logger.info('Done!') - - for f in input_files: - - logger.info('Loading csv: {}'.format(f)) - data, cnames = load_csv(f) - column_names = [c[0] for c in cnames] - - for scale in scales: - - # Set the file path folder - folder_name = Path(f).name.replace('.csv','_{}'.format(scale)) - - # Process for current scale - loggers[scale].info('Processing: {}'.format(folder_name)) - - # Bin the data - loggers[scale].info('Binning data for {} {} features...'.format(len(column_names),scale.upper())) - bins, bin_stats, data_index, data_dict = transform_data(data,column_names, scale) - - # Generate the dzi file - loggers[scale].info('Generating pyramid {} metadata...'.format(scale.upper())) - ngraphs = len(data_index) - info_data = metadata_to_graph_info(output_path,folder_name, ngraphs) - loggers[scale].info('Done!') - - loggers[scale].info('Writing {} layout file...!'.format(scale.upper())) - 
write_csv(cnames,data_index,info_data,output_path,folder_name) - loggers[scale].info('Done!') - - # Create the pyramid - loggers[scale].info('Building {} pyramids...'.format(scale.upper())) - image_data = _get_higher_res(0, info_data,column_names, output_path,folder_name,data_index, data_dict, bin_stats, scale) - loggers[scale].info('Done!') diff --git a/visualization/polus-graph-pyramid-builder-plugin/src/requirements.txt b/visualization/polus-graph-pyramid-builder-plugin/src/requirements.txt deleted file mode 100644 index da4cf76dc..000000000 --- a/visualization/polus-graph-pyramid-builder-plugin/src/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -pandas>=0.25.1 -matplotlib>=3.1.1 -numpy>=1.21.0 -imageio==2.5.0