[Bug] Nightly test_approximate_nearest_neighbors.py::test_return_fewer_k FAILED #803

NvTimLiu · 2024-12-10T05:25:21Z

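Summary: in the nightly run, `test_return_fewer_k[float32-ivfpq-array]` failed at the `avg_dist_gap` assertion in `ANNEvaluator.compare_with_cuml_or_cuvs_sg` (tolerance 0.0): the Spark result padded the missing neighbors with `inf` distances, giving an average distance gap of `inf`, while the cuVS single-GPU reference gave 1.4142135623730951.

Details:
#spark-rapids-ml_nightly/570/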

[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] =================================== FAILURES ===================================
[2024-12-10T04:59:03.289Z] ___________________ test_return_fewer_k[float32-ivfpq-array] ___________________
[2024-12-10T04:59:03.289Z] [gw1] linux -- Python 3.10.16 /root/miniconda3/bin/python3.10
[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] algorithm = 'ivfpq', feature_type = 'array', data_type = <class 'numpy.float32'>
[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] @pytest.mark.parametrize(
[2024-12-10T04:59:03.289Z] "algorithm,feature_type",
[2024-12-10T04:59:03.289Z] [
[2024-12-10T04:59:03.289Z] (
[2024-12-10T04:59:03.289Z] "ivfpq",
[2024-12-10T04:59:03.289Z] "array",
[2024-12-10T04:59:03.289Z] ),
[2024-12-10T04:59:03.289Z] (
[2024-12-10T04:59:03.289Z] "ivfflat",
[2024-12-10T04:59:03.289Z] "vector",
[2024-12-10T04:59:03.289Z] ),
[2024-12-10T04:59:03.289Z] ],
[2024-12-10T04:59:03.289Z] )
[2024-12-10T04:59:03.289Z] @pytest.mark.parametrize("data_type", [np.float32])
[2024-12-10T04:59:03.289Z] def test_return_fewer_k(
[2024-12-10T04:59:03.289Z] algorithm: str,
[2024-12-10T04:59:03.289Z] feature_type: str,
[2024-12-10T04:59:03.289Z] data_type: np.dtype,
[2024-12-10T04:59:03.289Z] ) -> None:
[2024-12-10T04:59:03.289Z] """
[2024-12-10T04:59:03.289Z] This tests the corner case where there are less than k neighbors found due to nprobe too small.
[2024-12-10T04:59:03.289Z] More details can be found at the docstring of class ApproximateNearestNeighbors.
[2024-12-10T04:59:03.289Z] """
[2024-12-10T04:59:03.289Z] assert algorithm in {"ivfpq", "ivfflat"}
[2024-12-10T04:59:03.289Z] metric = "euclidean"
[2024-12-10T04:59:03.289Z] gpu_number = 1
[2024-12-10T04:59:03.289Z] k = 4
[2024-12-10T04:59:03.289Z] algo_params = {
[2024-12-10T04:59:03.289Z] "nlist": k,
[2024-12-10T04:59:03.289Z] "nprobe": 1,
[2024-12-10T04:59:03.289Z] }
[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] if algorithm == "ivfpq":
[2024-12-10T04:59:03.289Z] algo_params.update({"M": 2, "n_bits": 4})
[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] X = np.array(
[2024-12-10T04:59:03.289Z] [
[2024-12-10T04:59:03.289Z] (
[2024-12-10T04:59:03.289Z] 0.0,
[2024-12-10T04:59:03.289Z] 0.0,
[2024-12-10T04:59:03.289Z] ),
[2024-12-10T04:59:03.289Z] (
[2024-12-10T04:59:03.289Z] 0.0,
[2024-12-10T04:59:03.289Z] 0.0,
[2024-12-10T04:59:03.289Z] ),
[2024-12-10T04:59:03.289Z] (
[2024-12-10T04:59:03.289Z] 2.0,
[2024-12-10T04:59:03.289Z] 2.0,
[2024-12-10T04:59:03.289Z] ),
[2024-12-10T04:59:03.289Z] (
[2024-12-10T04:59:03.289Z] 2.0,
[2024-12-10T04:59:03.289Z] 2.0,
[2024-12-10T04:59:03.289Z] ),
[2024-12-10T04:59:03.289Z] ]
[2024-12-10T04:59:03.289Z] )
[2024-12-10T04:59:03.289Z] y = np.arange(len(X)) # use label column as id column
[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] with CleanSparkSession() as spark:
[2024-12-10T04:59:03.289Z] df, features_col, label_col = create_pyspark_dataframe(
[2024-12-10T04:59:03.289Z] spark, feature_type, data_type, X, y, label_dtype=np.dtype(np.int64)
[2024-12-10T04:59:03.289Z] )
[2024-12-10T04:59:03.289Z]
[2024-12-10T04:59:03.289Z] est = ApproximateNearestNeighbors(
[2024-12-10T04:59:03.289Z] num_workers=gpu_number,
[2024-12-10T04:59:03.289Z] algorithm=algorithm,
[2024-12-10T04:59:03.289Z] algoParams=algo_params,
[2024-12-10T04:59:03.290Z] metric=metric,
[2024-12-10T04:59:03.290Z] k=k,
[2024-12-10T04:59:03.290Z] inputCol="features",
[2024-12-10T04:59:03.290Z] idCol=label_col,
[2024-12-10T04:59:03.290Z] )
[2024-12-10T04:59:03.290Z] model = est.fit(df)
[2024-12-10T04:59:03.290Z] _, _, knn_df = model.kneighbors(df)
[2024-12-10T04:59:03.290Z] knn_df_collect = knn_df.collect()
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] int64_max = np.iinfo("int64").max
[2024-12-10T04:59:03.290Z] float_inf = float("inf")
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] # ensure consistency with cuvs for ivfflat, and ivfpq > 24.10
[2024-12-10T04:59:03.290Z] import cuvs
[2024-12-10T04:59:03.290Z] from packaging import version
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] if algorithm == "ivfflat" or version.parse(cuvs.version) > version.parse(
[2024-12-10T04:59:03.290Z] "24.10.00"
[2024-12-10T04:59:03.290Z] ):
[2024-12-10T04:59:03.290Z] ann_evaluator = ANNEvaluator(X, k, metric)
[2024-12-10T04:59:03.290Z] spark_indices = np.array([row["indices"] for row in knn_df_collect])
[2024-12-10T04:59:03.290Z] spark_distances = np.array([row["distances"] for row in knn_df_collect])
[2024-12-10T04:59:03.290Z] > ann_evaluator.compare_with_cuml_or_cuvs_sg(
[2024-12-10T04:59:03.290Z] algorithm, algo_params, spark_indices, spark_distances, tolerance=0.0
[2024-12-10T04:59:03.290Z] )
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] tests/test_approximate_nearest_neighbors.py:1059:
[2024-12-10T04:59:03.290Z] _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] self = <tests.test_approximate_nearest_neighbors.ANNEvaluator object at 0x7f4061d42dd0>
[2024-12-10T04:59:03.290Z] algorithm = 'ivfpq', algoParams = {'M': 2, 'n_bits': 4, 'nlist': 4, 'nprobe': 1}
[2024-12-10T04:59:03.290Z] given_indices = array([[0, 1, 0, 0],
[2024-12-10T04:59:03.290Z] [0, 1, 0, 0],
[2024-12-10T04:59:03.290Z] [2, 3, 2, 2],
[2024-12-10T04:59:03.290Z] [2, 3, 2, 2]])
[2024-12-10T04:59:03.290Z] given_distances = array([[ 0., 0., inf, inf],
[2024-12-10T04:59:03.290Z] [ 0., 0., inf, inf],
[2024-12-10T04:59:03.290Z] [ 0., 0., inf, inf],
[2024-12-10T04:59:03.290Z] [ 0., 0., inf, inf]])
[2024-12-10T04:59:03.290Z] tolerance = 0.0
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] def compare_with_cuml_or_cuvs_sg(
[2024-12-10T04:59:03.290Z] self,
[2024-12-10T04:59:03.290Z] algorithm: str,
[2024-12-10T04:59:03.290Z] algoParams: Optional[Dict[str, Any]],
[2024-12-10T04:59:03.290Z] given_indices: np.ndarray,
[2024-12-10T04:59:03.290Z] given_distances: np.ndarray,
[2024-12-10T04:59:03.290Z] tolerance: float,
[2024-12-10T04:59:03.290Z] ) -> None:
[2024-12-10T04:59:03.290Z] # compare with cuml sg ANN on avg_recall and avg_dist_gap
[2024-12-10T04:59:03.290Z] cuvssg_distances, cuvssg_indices = self.get_cuvs_sg_results(
[2024-12-10T04:59:03.290Z] algorithm=algorithm, algoParams=algoParams
[2024-12-10T04:59:03.290Z] )
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] # compare cuml sg with given results
[2024-12-10T04:59:03.290Z] avg_recall_cumlann = self.cal_avg_recall(cuvssg_indices)
[2024-12-10T04:59:03.290Z] avg_recall = self.cal_avg_recall(given_indices)
[2024-12-10T04:59:03.290Z] assert (avg_recall > avg_recall_cumlann) or abs(
[2024-12-10T04:59:03.290Z] avg_recall - avg_recall_cumlann
[2024-12-10T04:59:03.290Z] ) <= tolerance
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] avg_dist_gap_cumlann = self.cal_avg_dist_gap(cuvssg_distances)
[2024-12-10T04:59:03.290Z] avg_dist_gap = self.cal_avg_dist_gap(given_distances)
[2024-12-10T04:59:03.290Z] > assert (avg_dist_gap <= avg_dist_gap_cumlann) or abs(
[2024-12-10T04:59:03.290Z] avg_dist_gap - avg_dist_gap_cumlann
[2024-12-10T04:59:03.290Z] ) <= tolerance
[2024-12-10T04:59:03.290Z] E assert (inf <= 1.4142135623730951 or inf <= 0.0)
[2024-12-10T04:59:03.290Z] E + where inf = abs((inf - 1.4142135623730951))
[2024-12-10T04:59:03.290Z]
[2024-12-10T04:59:03.290Z] tests/test_approximate_nearest_neighbors.py:296: AssertionError
[2024-12-10T04:59:03.290Z] ----------------------------- Captured stdout call -----------------------------
[2024-12-10T04:59:03.290Z] using ivf_pq::index_params nrows 4, dim 2, n_lits 4, pq_dim 2
[2024-12-10T04:59:03.290Z] ----------------------------- Captured stderr call -----------------------------
[2024-12-10T04:59:03.290Z]
[Stage 274:> (0 + 1) / 1]
2024-12-10 04:04:17,877 - spark_rapids_ml.knn.ApproximateNearestNeighborsModel - INFO - partition 0 starts with 4 item vectors
[2024-12-10T04:59:03.290Z] using ivf_pq::index_params nrows 4, dim 2, n_lits 4, pq_dim 2
[2024-12-10T04:59:03.290Z] 2024-12-10 04:04:18,198 - spark_rapids_ml.knn.ApproximateNearestNeighborsModel - INFO - partition 0 indexing finished in 0.321272611618042 seconds.
[2024-12-10T04:59:03.290Z] 2024-12-10 04:04:18,640 - spark_rapids_ml.knn.ApproximateNearestNeighborsModel - INFO - partition 0 search finished in 0.4417121410369873 seconds.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Bug] Nightly test_approximate_nearest_neighbors.py::test_return_fewer_k FAILED #803

[Bug] Nightly test_approximate_nearest_neighbors.py::test_return_fewer_k FAILED #803

NvTimLiu commented Dec 10, 2024

[Bug] Nightly test_approximate_nearest_neighbors.py::test_return_fewer_k FAILED #803

[Bug] Nightly test_approximate_nearest_neighbors.py::test_return_fewer_k FAILED #803

Comments

NvTimLiu commented Dec 10, 2024