Commit aff2fd0
remove float data type; the only data type Spark supports here is double.
Signed-off-by: minmingzhu <[email protected]>
minmingzhu committed Aug 23, 2023
1 parent 0162e14 commit aff2fd0
Showing 15 changed files with 121 additions and 497 deletions.
HomogenTableImpl.java
@@ -73,16 +73,12 @@ public HomogenTableImpl(long rowCount,
                         Common.ComputeDevice computeDevice) {
         this.device = computeDevice;
         switch (dataType) {
-        case FLOAT32:
-            this.cObject = fPtrInit(rowCount, colCount, dataPtr, dataLayout.ordinal(),
-                    this.device.ordinal());
-            break;
         case FLOAT64:
             this.cObject = dPtrInit(rowCount, colCount, dataPtr, dataLayout.ordinal(),
                     this.device.ordinal());
             break;
         default:
-            System.err.println("oneapi algorithm only support float/double");
+            System.err.println("spark algorithm only support double");
             System.exit(-1);
         }
     }
@@ -194,11 +190,6 @@ private native long dPtrInit(long rowCount,
                                  int dataLayoutIndex,
                                  int computeDeviceIndex);
 
-    private native long fPtrInit(long rowCount,
-                                 long colCount,
-                                 long dataPtr,
-                                 int dataLayoutIndex,
-                                 int computeDeviceIndex);
     private native long cGetColumnCount(long cObject);
     private native long cGetRowCount(long cObject);
     private native long cGetKind(long cObject);
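For context, here is a minimal sketch of what a double-only native entry point behind dPtrInit can look like, built on oneDAL's public homogen_table::wrap. The function name, signature, layout mapping, and ownership convention are illustrative assumptions, not the repository's actual JNI code:

    #include <cstdint>
    #include "oneapi/dal/table/homogen.hpp"

    using oneapi::dal::data_layout;
    using oneapi::dal::homogen_table;

    // Hypothetical double-only analogue of dPtrInit: wrap a caller-owned
    // buffer of doubles as a oneDAL homogen_table and hand back a handle.
    static std::int64_t d_ptr_init(std::int64_t rowCount, std::int64_t colCount,
                                   std::int64_t dataPtr, int dataLayoutIndex) {
        const auto *data = reinterpret_cast<const double *>(dataPtr);
        // Assumes ordinal 0 maps to row-major, matching the Java enum order.
        const auto layout = (dataLayoutIndex == 0) ? data_layout::row_major
                                                   : data_layout::column_major;
        // wrap() does not copy; the Java side keeps the returned pointer
        // as its cObject handle and owns the underlying buffer.
        auto *table = new homogen_table(
            homogen_table::wrap(data, rowCount, colCount, layout));
        return reinterpret_cast<std::int64_t>(table);
    }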
mllib-dal/src/main/native/ALSDALImpl.cpp (2 changes: 0 additions & 2 deletions)
@@ -29,8 +29,6 @@ using namespace daal;
 using namespace daal::algorithms;
 using namespace daal::algorithms::implicit_als;
 
-typedef float algorithmFPType; /* Algorithm floating-point type */
-
 NumericTablePtr userOffset;
 NumericTablePtr itemOffset;
 
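With the file-local float typedef gone, ALSDALImpl.cpp presumably picks up its floating-point alias from a shared header; the definition below is an assumption shown only to make the removal intelligible (the actual header and spelling are not visible in this diff):

    // Assumed shared definition; the header name and location are guesses.
    typedef double algorithmFPType; /* Algorithm floating-point type */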
mllib-dal/src/main/native/CorrelationImpl.cpp (54 changes: 14 additions & 40 deletions)
@@ -33,8 +33,6 @@ using namespace daal;
 using namespace daal::services;
 namespace covariance_cpu = daal::algorithms::covariance;
 
-typedef double algorithmFPType; /* Algorithm floating-point type */
-
 static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, size_t rankId,
                                      ccl::communicator &comm,
                                      const NumericTablePtr &pData,
@@ -153,49 +151,25 @@ static void doCorrelationOneAPICompute(
     const bool isRoot = (comm.get_rank() == ccl_root);
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
-    const auto &dtype = htable.get_metadata().get_data_type(0);
-    covariance_gpu::compute_result result_train;
-
+    const auto cor_desc =
+        covariance_gpu::descriptor<algorithmFPType>{}.set_result_options(
+            covariance_gpu::result_options::cor_matrix |
+            covariance_gpu::result_options::means);
     auto t1 = std::chrono::high_resolution_clock::now();
-    switch (dtype) {
-    case data_type::float32: {
-        const auto cor_desc =
-            covariance_gpu::descriptor<float>{}.set_result_options(
-                covariance_gpu::result_options::cor_matrix |
-                covariance_gpu::result_options::means);
-        t1 = std::chrono::high_resolution_clock::now();
-        result_train = preview::compute(comm, cor_desc, htable);
-        break;
-    }
-    case data_type::float64: {
-        const auto cor_desc =
-            covariance_gpu::descriptor<double>{}.set_result_options(
-                covariance_gpu::result_options::cor_matrix |
-                covariance_gpu::result_options::means);
-        t1 = std::chrono::high_resolution_clock::now();
-        result_train = preview::compute(comm, cor_desc, htable);
-        break;
-    }
-    default: {
-        std::cout << "no supported data type :" << &dtype << std::endl;
-        exit(-1);
-    }
-    }
-    auto t2 = std::chrono::high_resolution_clock::now();
-    auto duration =
-        (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
-            .count();
-    std::cout << "Correlation batch(native): computing step took "
-              << duration / 1000 << " secs." << std::endl;
+    const auto result_train = preview::compute(comm, cor_desc, htable);
     if (isRoot) {
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto duration =
+            (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 -
+                                                                         t1)
+                .count();
+        std::cout << "Correlation batch(native): computing step took "
+                  << duration / 1000 << " secs." << std::endl;
         std::cout << "Mean:\n" << result_train.get_means() << std::endl;
         std::cout << "Correlation:\n"
                   << result_train.get_cor_matrix() << std::endl;
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
-        std::cout << "Correlation batch(native): computing step took "
-                  << duration / 1000 << " secs." << std::endl;
 
         // Return all covariance & mean
         jclass clazz = env->GetObjectClass(resultObj);
 
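The simplified correlation path no longer branches on the table's element type: one descriptor templated on algorithmFPType drives the whole computation. Below is a minimal single-process sketch of the same pattern using oneDAL's batch API (dal::compute in place of the distributed preview::compute, so no communicator is needed); algorithmFPType is assumed to be a shared typedef for double:

    #include "oneapi/dal/algo/covariance.hpp"
    #include "oneapi/dal/table/common.hpp"

    namespace dal = oneapi::dal;
    namespace covariance_gpu = oneapi::dal::covariance;

    typedef double algorithmFPType; // assumed to come from a shared header

    // Correlation matrix of the input table; means stay available via
    // result.get_means() because both result options are requested.
    dal::table correlation_matrix(const dal::table &htable) {
        const auto cor_desc =
            covariance_gpu::descriptor<algorithmFPType>{}.set_result_options(
                covariance_gpu::result_options::cor_matrix |
                covariance_gpu::result_options::means);
        const auto result = dal::compute(cor_desc, htable);
        return result.get_cor_matrix();
    }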
mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp (112 changes: 31 additions & 81 deletions)
@@ -225,97 +225,47 @@ static jobject doRFClassifierOneAPICompute(
               << hFeaturetable.get_column_count() << std::endl;
     std::cout << "doRFClassifierOneAPICompute classCount = " << classCount
               << std::endl;
-    const auto &dtype = hFeaturetable.get_metadata().get_data_type(0);
-    df::train_result result_train;
-    df::infer_result result_infer;
+    const auto df_desc =
+        df::descriptor<algorithmFPType, df::method::hist,
+                       df::task::classification>{}
+            .set_class_count(classCount)
+            .set_tree_count(treeCount)
+            .set_features_per_node(numFeaturesPerNode)
+            .set_min_observations_in_leaf_node(minObservationsLeafNode)
+            .set_min_observations_in_split_node(minObservationsSplitNode)
+            .set_min_weight_fraction_in_leaf_node(minWeightFractionLeafNode)
+            .set_min_impurity_decrease_in_split_node(
+                minImpurityDecreaseSplitNode)
+            .set_error_metric_mode(df::error_metric_mode::out_of_bag_error)
+            .set_variable_importance_mode(df::variable_importance_mode::mdi)
+            .set_infer_mode(df::infer_mode::class_responses |
+                            df::infer_mode::class_probabilities)
+            .set_voting_mode(df::voting_mode::weighted)
+            .set_max_tree_depth(maxTreeDepth)
+            .set_max_bins(maxBins);
 
     auto t1 = std::chrono::high_resolution_clock::now();
-    auto t2 = std::chrono::high_resolution_clock::now();
-    auto duration =
-        (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
-            .count();
-    switch (dtype) {
-    case data_type::float32: {
-        const auto df_desc =
-            df::descriptor<float, df::method::hist, df::task::classification>{}
-                .set_class_count(classCount)
-                .set_tree_count(treeCount)
-                .set_features_per_node(numFeaturesPerNode)
-                .set_min_observations_in_leaf_node(minObservationsLeafNode)
-                .set_min_observations_in_split_node(minObservationsSplitNode)
-                .set_min_weight_fraction_in_leaf_node(minWeightFractionLeafNode)
-                .set_min_impurity_decrease_in_split_node(
-                    minImpurityDecreaseSplitNode)
-                .set_error_metric_mode(df::error_metric_mode::out_of_bag_error)
-                .set_variable_importance_mode(df::variable_importance_mode::mdi)
-                .set_infer_mode(df::infer_mode::class_responses |
-                                df::infer_mode::class_probabilities)
-                .set_voting_mode(df::voting_mode::weighted)
-                .set_max_tree_depth(maxTreeDepth)
-                .set_max_bins(maxBins);
-        t1 = std::chrono::high_resolution_clock::now();
-        result_train =
-            preview::train(comm, df_desc, hFeaturetable, hLabeltable);
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
-        std::cout << "DF Classifier (native): training step took "
-                  << duration / 1000 << " secs." << std::endl;
-        result_infer = preview::infer(comm, df_desc, result_train.get_model(),
-                                      hFeaturetable);
-        break;
-    }
-    case data_type::float64: {
-        const auto df_desc =
-            df::descriptor<double, df::method::hist, df::task::classification>{}
-                .set_class_count(classCount)
-                .set_tree_count(treeCount)
-                .set_features_per_node(numFeaturesPerNode)
-                .set_min_observations_in_leaf_node(minObservationsLeafNode)
-                .set_min_observations_in_split_node(minObservationsSplitNode)
-                .set_min_weight_fraction_in_leaf_node(minWeightFractionLeafNode)
-                .set_min_impurity_decrease_in_split_node(
-                    minImpurityDecreaseSplitNode)
-                .set_error_metric_mode(df::error_metric_mode::out_of_bag_error)
-                .set_variable_importance_mode(df::variable_importance_mode::mdi)
-                .set_infer_mode(df::infer_mode::class_responses |
-                                df::infer_mode::class_probabilities)
-                .set_voting_mode(df::voting_mode::weighted)
-                .set_max_tree_depth(maxTreeDepth)
-                .set_max_bins(maxBins);
-        t1 = std::chrono::high_resolution_clock::now();
-        result_train =
-            preview::train(comm, df_desc, hFeaturetable, hLabeltable);
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
-        std::cout << "DF Classifier (native): training step took "
-                  << duration / 1000 << " secs." << std::endl;
-        result_infer = preview::infer(comm, df_desc, result_train.get_model(),
-                                      hFeaturetable);
-        break;
-    }
-    default: {
-        std::cout << "no supported data type :" << &dtype << std::endl;
-        exit(-1);
-    }
-    }
+    const auto result_train =
+        preview::train(comm, df_desc, hFeaturetable, hLabeltable);
+    const auto result_infer =
+        preview::infer(comm, df_desc, result_train.get_model(), hFeaturetable);
     jobject trees = nullptr;
     if (isRoot) {
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto duration =
+            (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 -
+                                                                         t1)
+                .count();
+        std::cout << "DF Classifier (native): training step took "
+                  << duration / 1000 << " secs." << std::endl;
         std::cout << "Variable importance results:\n"
                   << result_train.get_var_importance() << std::endl;
         std::cout << "OOB error: " << result_train.get_oob_err() << std::endl;
         std::cout << "Prediction results:\n"
                   << result_infer.get_responses() << std::endl;
         std::cout << "Probabilities results:\n"
                   << result_infer.get_probabilities() << std::endl;
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
-        std::cout << "DF Classifier (native): training step took "
-                  << duration / 1000 << " secs." << std::endl;
 
         // convert to java hashmap
         trees = collect_model(env, result_train.get_model(), classCount);
 
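The classifier now builds a single double-precision descriptor instead of one per data_type branch. A compile-oriented sketch of that pattern follows, with a hypothetical helper name, a trimmed parameter list, and the usual assumption that algorithmFPType is a shared typedef for double:

    #include <cstdint>
    #include "oneapi/dal/algo/decision_forest.hpp"

    namespace df = oneapi::dal::decision_forest;

    typedef double algorithmFPType; // assumed shared typedef

    // One descriptor, built once; no dtype switch is needed because the
    // template type is fixed at double rather than taken from the table.
    auto make_classifier_desc(std::int64_t classCount, std::int64_t treeCount) {
        return df::descriptor<algorithmFPType, df::method::hist,
                              df::task::classification>{}
            .set_class_count(classCount)
            .set_tree_count(treeCount)
            .set_error_metric_mode(df::error_metric_mode::out_of_bag_error)
            .set_variable_importance_mode(df::variable_importance_mode::mdi)
            .set_infer_mode(df::infer_mode::class_responses |
                            df::infer_mode::class_probabilities)
            .set_voting_mode(df::voting_mode::weighted);
    }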
mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp (91 changes: 22 additions & 69 deletions)
@@ -220,83 +220,36 @@ static jobject doRFRegressorOneAPICompute(
         *reinterpret_cast<const homogen_table *>(pNumTabLabel);
     std::cout << "doRFRegressorOneAPICompute get_column_count = "
               << hFeaturetable.get_column_count() << std::endl;
-    const auto &dtype = hFeaturetable.get_metadata().get_data_type(0);
-    df::train_result<df::task::regression> result_train;
-    df::infer_result<df::task::regression> result_infer;
+    const auto df_desc =
+        df::descriptor<algorithmFPType, df::method::hist,
+                       df::task::regression>{}
+            .set_tree_count(treeCount)
+            .set_features_per_node(numFeaturesPerNode)
+            .set_min_observations_in_leaf_node(minObservationsLeafNode)
+            .set_max_tree_depth(maxTreeDepth)
+            .set_max_bins(maxbins)
+            .set_error_metric_mode(
+                df::error_metric_mode::out_of_bag_error |
+                df::error_metric_mode::out_of_bag_error_per_observation)
+            .set_variable_importance_mode(df::variable_importance_mode::mdi);
 
     auto t1 = std::chrono::high_resolution_clock::now();
-    auto t2 = std::chrono::high_resolution_clock::now();
-    auto duration =
-        (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
-            .count();
-    switch (dtype) {
-    case data_type::float32: {
-        const auto df_desc =
-            df::descriptor<float, df::method::hist, df::task::regression>{}
-                .set_tree_count(treeCount)
-                .set_features_per_node(numFeaturesPerNode)
-                .set_min_observations_in_leaf_node(minObservationsLeafNode)
-                .set_max_tree_depth(maxTreeDepth)
-                .set_max_bins(maxbins)
-                .set_error_metric_mode(
-                    df::error_metric_mode::out_of_bag_error |
-                    df::error_metric_mode::out_of_bag_error_per_observation)
-                .set_variable_importance_mode(
-                    df::variable_importance_mode::mdi);
-        t1 = std::chrono::high_resolution_clock::now();
-        result_train =
-            preview::train(comm, df_desc, hFeaturetable, hLabeltable);
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
-        std::cout << "DF Classifier (native): training step took "
-                  << duration / 1000 << " secs." << std::endl;
-        result_infer = preview::infer(comm, df_desc, result_train.get_model(),
-                                      hFeaturetable);
-        break;
-    }
-    case data_type::float64: {
-        const auto df_desc =
-            df::descriptor<double, df::method::hist, df::task::regression>{}
-                .set_tree_count(treeCount)
-                .set_features_per_node(numFeaturesPerNode)
-                .set_min_observations_in_leaf_node(minObservationsLeafNode)
-                .set_max_tree_depth(maxTreeDepth)
-                .set_max_bins(maxbins)
-                .set_error_metric_mode(
-                    df::error_metric_mode::out_of_bag_error |
-                    df::error_metric_mode::out_of_bag_error_per_observation)
-                .set_variable_importance_mode(
-                    df::variable_importance_mode::mdi);
-        t1 = std::chrono::high_resolution_clock::now();
-        result_train =
-            preview::train(comm, df_desc, hFeaturetable, hLabeltable);
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
-        std::cout << "DF Regression (native): training step took "
-                  << duration / 1000 << " secs." << std::endl;
-        result_infer = preview::infer(comm, df_desc, result_train.get_model(),
-                                      hFeaturetable);
-        break;
-    }
-    default: {
-        std::cout << "no supported data type :" << &dtype << std::endl;
-        exit(-1);
-    }
-    }
+    const auto result_train =
+        preview::train(comm, df_desc, hFeaturetable, hLabeltable);
+    const auto result_infer =
+        preview::infer(comm, df_desc, result_train.get_model(), hFeaturetable);
     jobject trees = nullptr;
     if (isRoot) {
         std::cout << "Variable importance results:\n"
                   << result_train.get_var_importance() << std::endl;
         std::cout << "OOB error: " << result_train.get_oob_err() << std::endl;
         std::cout << "Prediction results:\n"
                   << result_infer.get_responses() << std::endl;
-        t2 = std::chrono::high_resolution_clock::now();
-        duration = (float)std::chrono::duration_cast<std::chrono::milliseconds>(
-            t2 - t1)
-                       .count();
+        auto t2 = std::chrono::high_resolution_clock::now();
+        auto duration =
+            (float)std::chrono::duration_cast<std::chrono::milliseconds>(t2 -
+                                                                         t1)
+                .count();
         std::cout << "DF Regression (native): training step took "
                   << duration / 1000 << " secs." << std::endl;
         // convert c++ map to java hashmap
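The regression path follows the same shape as the classifier: one double-only descriptor, with the timing now computed only on the root rank inside if (isRoot). A companion sketch with a hypothetical helper name, trimmed parameters, and algorithmFPType again assumed to be double:

    #include <cstdint>
    #include "oneapi/dal/algo/decision_forest.hpp"

    namespace df = oneapi::dal::decision_forest;

    typedef double algorithmFPType; // assumed shared typedef

    // Regression descriptor with both out-of-bag error modes enabled,
    // mirroring the options kept by the diff.
    auto make_regressor_desc(std::int64_t treeCount, std::int64_t maxBins) {
        return df::descriptor<algorithmFPType, df::method::hist,
                              df::task::regression>{}
            .set_tree_count(treeCount)
            .set_max_bins(maxBins)
            .set_error_metric_mode(
                df::error_metric_mode::out_of_bag_error |
                df::error_metric_mode::out_of_bag_error_per_observation)
            .set_variable_importance_mode(df::variable_importance_mode::mdi);
    }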