Skip to content

Commit

Permalink
Merge branch 'oap-project:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
minmingzhu authored Sep 10, 2024
2 parents 5d08ac0 + 4dc4f0a commit 49bdc75
Show file tree
Hide file tree
Showing 30 changed files with 119 additions and 157 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/dev_cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ on:
- edited
- synchronize

permissions: read-all

permissions:
issues: write
contents: read

jobs:
process:
name: Process
Expand Down
19 changes: 8 additions & 11 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -197,19 +197,19 @@ static void doCorrelationOneAPICompute(

JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numCols,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows,
jlong numCols, jint executorNum, jint executorCores,
jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());

ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
case ComputeDevice::host:
case ComputeDevice::cpu: {
ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
// Set number of threads for oneDAL to use for each rank
services::Environment::getInstance()->setNumberOfThreads(executorCores);
Expand All @@ -229,19 +229,16 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rankId);
rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
queue, executorNum, rank, kvs);
doCorrelationOneAPICompute(env, pNumTabData, numRows, numCols, comm,
resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
Expand Down
18 changes: 7 additions & 11 deletions mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,39 +300,35 @@ static jobject doRFClassifierOneAPICompute(
*/
JNIEXPORT jobject JNICALL
Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint classCount, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode,
JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature,
jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols,
jint executorNum, jint computeDeviceOrdinal, jint classCount,
jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode,
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap, jintArray gpuIdxArray,
jobject resultObj) {
logger::println(logger::INFO, "oneDAL (native): use DPC++ kernels");

ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rankId);
rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
queue, executorNum, rank, kvs);
jobject hashmapObj = doRFClassifierOneAPICompute(
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, classCount, treeCount,
Expand Down
22 changes: 9 additions & 13 deletions mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,37 +292,33 @@ static jobject doRFRegressorOneAPICompute(

JNIEXPORT jobject JNICALL
Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode,
jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins,
jboolean bootstrap, jintArray gpuIdxArray, jobject resultObj) {
JNIEnv *env, jobject obj, jint rank, jlong pNumTabFeature,
jlong featureRows, jlong featureCols, jlong pNumTabLabel, jlong labelCols,
jint executorNum, jint computeDeviceOrdinal, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth,
jlong seed, jint maxbins, jboolean bootstrap, jintArray gpuIdxArray,
jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());

ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
logger::println(
logger::INFO,
"OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rankId);
rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
queue, executorNum, rank, kvs);
jobject hashmapObj = doRFRegressorOneAPICompute(
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, treeCount,
Expand Down
16 changes: 2 additions & 14 deletions mllib-dal/src/main/native/GPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ static sycl::queue getSyclQueue(const sycl::device device) {
}
}

sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm,
int size, int rankId, jint *gpu_indices, int n_gpu) {
sycl::queue getAssignedGPU(const ComputeDevice device, int *gpu_indices) {
switch (device) {
case ComputeDevice::host:
case ComputeDevice::cpu: {
Expand All @@ -78,19 +77,8 @@ sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm,
}
case ComputeDevice::gpu: {
logger::println(logger::INFO, "selector GPU");
auto local_rank = getLocalRank(comm, size, rankId);
auto gpus = get_gpus();

logger::println(logger::INFO,
"rank: %d size: %d local_rank: %d n_gpu: %d", rankId,
size, local_rank, n_gpu);

auto gpu_selected = gpu_indices[local_rank % n_gpu];
logger::println(logger::INFO, "GPU selected for current rank: %d",
gpu_selected);

// In case gpu_selected index is larger than number of GPU SYCL devices
auto rank_gpu = gpus[gpu_selected % gpus.size()];
auto rank_gpu = gpus[0];
sycl::queue q{rank_gpu};
return q;
}
Expand Down
3 changes: 1 addition & 2 deletions mllib-dal/src/main/native/GPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <jni.h>
#include <oneapi/ccl.hpp>

sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm,
int size, int rankId, jint *gpu_indices, int n_gpu);
sycl::queue getAssignedGPU(const ComputeDevice device, jint *gpu_indices);

sycl::queue getQueue(const ComputeDevice device);
22 changes: 10 additions & 12 deletions mllib-dal/src/main/native/KMeansImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,21 +305,22 @@ static jlong doKMeansOneAPICompute(
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters(
JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numCols,
jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
JNIEnv *env, jobject obj, jint rank, jlong pNumTabData, jlong numRows,
jlong numCols, jlong pNumTabCenters, jint clusterNum, jdouble tolerance,
jint iterationNum, jint executorNum, jint executorCores,
jint computeDeviceOrdinal, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());

jlong ret = 0L;
ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();

ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
case ComputeDevice::host:
case ComputeDevice::cpu: {
ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
NumericTablePtr centroids = *((NumericTablePtr *)pNumTabCenters);
// Set number of threads for OneDAL to use for each rank
Expand All @@ -341,19 +342,16 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
logger::println(
logger::INFO,
"OneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rankId);
rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
auto queue = getAssignedGPU(device, gpuIndices);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
queue, executorNum, rank, kvs);
ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numCols,
pNumTabCenters, clusterNum, tolerance,
iterationNum, comm, resultObj);
Expand Down
52 changes: 22 additions & 30 deletions mllib-dal/src/main/native/LinearRegressionImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,7 @@ ridge_regression_compute(size_t rankId, ccl::communicator &comm,
}

#ifdef CPU_GPU_PROFILE
static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::communicator &cclComm, sycl::queue &queue,
static jlong doLROneAPICompute(JNIEnv *env, size_t rankId, sycl::queue &queue,
jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel,
jlong labelCols, jboolean jfitIntercept,
Expand All @@ -225,10 +224,9 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
const bool isRoot = (rankId == ccl_root);
bool fitIntercept = bool(jfitIntercept);

int size = cclComm.size();
ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
queue, executorNum, rankId, kvs);
homogen_table xtrain = *reinterpret_cast<homogen_table *>(
createHomogenTableWithArrayPtr(pNumTabFeature, featureRows, featureCols,
comm.get_queue())
Expand Down Expand Up @@ -262,7 +260,7 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL(
JNIEnv *env, jobject obj, jlong feature, jlong featureRows,
JNIEnv *env, jobject obj, jint rank, jlong feature, jlong featureRows,
jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept,
jdouble regParam, jdouble elasticNetParam, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
Expand All @@ -272,9 +270,6 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());

ccl::communicator &cclComm = getComm();
size_t rankId = cclComm.rank();

ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
bool useGPU = false;
if (device == ComputeDevice::gpu && regParam == 0) {
Expand All @@ -288,19 +283,20 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
logger::println(
logger::INFO,
"oneDAL (native): use GPU kernels with %d GPU(s) rankid %d", nGpu,
rankId);
rank);

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
int size = cclComm.size();
auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
auto queue = getAssignedGPU(device, gpuIndices);

resultptr = doLROneAPICompute(
env, rankId, cclComm, queue, feature, featureRows, featureCols,
label, labelCols, fitIntercept, executorNum, resultObj);
resultptr = doLROneAPICompute(env, rank, queue, feature, featureRows,
featureCols, label, labelCols,
fitIntercept, executorNum, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
#endif
} else {
ccl::communicator &cclComm = getComm();
size_t rankId = cclComm.rank();

NumericTablePtr pLabel = *((NumericTablePtr *)label);
NumericTablePtr pData = *((NumericTablePtr *)feature);

Expand All @@ -323,22 +319,18 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra

NumericTablePtr *coeffvectors = new NumericTablePtr(resultTable);
resultptr = (jlong)coeffvectors;
}
if (rankId == ccl_root) {
// Get the class of the result object
jclass clazz = env->GetObjectClass(resultObj);
// Get Field references
jfieldID coeffNumericTableField =
env->GetFieldID(clazz, "coeffNumericTable", "J");

jlong ret = 0L;
if (rankId == ccl_root) {
// Get the class of the result object
jclass clazz = env->GetObjectClass(resultObj);
// Get Field references
jfieldID coeffNumericTableField =
env->GetFieldID(clazz, "coeffNumericTable", "J");
env->SetLongField(resultObj, coeffNumericTableField, resultptr);

env->SetLongField(resultObj, coeffNumericTableField, resultptr);

// intercept is already in first column of coeffvectors
ret = resultptr;
} else {
ret = (jlong)0;
// intercept is already in first column of coeffvectors
resultptr = (jlong)coeffvectors;
}
}
return ret;
return resultptr;
}
20 changes: 4 additions & 16 deletions mllib-dal/src/main/native/OneCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include "Logger.h"
#include "OneCCL.h"
#include "com_intel_oap_mllib_OneCCL__.h"
#include "service.h"

extern const size_t ccl_root = 0;

Expand Down Expand Up @@ -61,6 +62,8 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init(
auto &singletonCCLInit = CCLInitSingleton::get(size, rank, ccl_ip_port);

g_kvs.push_back(singletonCCLInit.kvs);

#ifdef CPU_ONLY_PROFILE
g_comms.push_back(
ccl::create_communicator(size, rank, singletonCCLInit.kvs));

Expand All @@ -70,9 +73,7 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init(
.count();
logger::println(logger::INFO, "OneCCL (native): init took %f secs",
duration / 1000);

rank_id = getComm().rank();
comm_size = getComm().size();
#endif

jclass cls = env->GetObjectClass(param);
jfieldID fid_comm_size = env->GetFieldID(cls, "commSize", "J");
Expand All @@ -85,19 +86,6 @@ JNIEXPORT jint JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1init(
return 1;
}

/*
 * Class:     com_intel_oap_mllib_OneCCL__
 * Method:    c_initDpcpp   (JNI name: Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp)
 * Signature: ()I
 *
 * Initializes the oneCCL runtime for DPC++ use via ccl::init(); performs no
 * communicator/KVS setup here. Always returns 1 — callers appear to treat the
 * return as a success flag (TODO confirm against the Scala caller).
 */
JNIEXPORT jint JNICALL
Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {
// Log to stderr so init messages are not interleaved with stdout results.
logger::printerrln(logger::INFO, "OneCCL (native): init dpcpp");
ccl::init();

return 1;
}

JNIEXPORT void JNICALL
Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {
logger::printerrln(logger::INFO, "OneCCL (native): cleanup");
Expand Down
Loading

0 comments on commit 49bdc75

Please sign in to comment.