Use ExecuTorch prebuilt library in pip package to build custom kernels (#1059)

* Use ExecuTorch prebuilt library in pip package to build custom kernels

As titled. The biggest change is this line:

```cmake
find_package(ExecuTorch REQUIRED HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
```
This exposes `EXECUTORCH_INCLUDE_DIRS` for the ExecuTorch headers and `EXECUTORCH_LIBRARIES` for the prebuilt libraries that custom kernels can depend on, as sketched below.

* Address comments
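
For illustration, a downstream CMake project could consume these variables roughly as follows. This is a minimal sketch, assuming `CMAKE_PREFIX_PATH` points at the Python site-packages directory where the ExecuTorch wheel is installed; the `my_custom_kernels` target and `my_kernel.cpp` source are hypothetical and not part of this change:

```cmake
cmake_minimum_required(VERSION 3.19)
project(my_custom_kernels)

# The pip-installed ExecuTorch package ships its CMake config under
# <site-packages>/executorch/share/cmake, so point find_package at it.
# CMAKE_PREFIX_PATH is assumed to be the site-packages directory.
find_package(ExecuTorch REQUIRED HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)

# Hypothetical custom-kernel library; target and source names are illustrative.
add_library(my_custom_kernels STATIC my_kernel.cpp)
target_include_directories(my_custom_kernels PRIVATE ${EXECUTORCH_INCLUDE_DIRS})
target_link_libraries(my_custom_kernels PRIVATE ${EXECUTORCH_LIBRARIES})
```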
larryliu0820 authored Oct 11, 2024
1 parent d4b2f33 commit 9cd0da6
Showing 7 changed files with 117 additions and 81 deletions.
65 changes: 25 additions & 40 deletions torchao/experimental/CMakeLists.txt
@@ -3,70 +3,55 @@
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
cmake_minimum_required(VERSION 3.19)

project(torchao)

cmake_minimum_required(VERSION 3.19)

set(CMAKE_CXX_STANDARD 17)

if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

option(TORCHAO_OP_EXECUTORCH_BUILD "Building torchao ops for ExecuTorch." OFF)

# Source root directory for torchao/experimental
if(NOT TORCHAO_ROOT)
set(TORCHAO_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
endif()

if(NOT TORCHAO_INCLUDE_DIRS)
set(TORCHAO_INCLUDE_DIRS ${TORCHAO_ROOT}/../..)
set(TORCHAO_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

if (NOT TORCHAO_OP_TARGET)
message(FATAL_ERROR "TORCHAO_OP_TARGET is not set. Set it to aten or executorch.")
endif()

if (NOT TORCHAO_PARALLEL_BACKEND)
if (TORCHAO_OP_TARGET STREQUAL "aten")
set(TORCHAO_PARALLEL_BACKEND "aten_openmp")
elseif(TORCHAO_OP_TARGET STREQUAL "executorch")
set(TORCHAO_PARALLEL_BACKEND "executorch")
else()
message(FATAL_ERROR "TORCHAO_PARALLEL_BACKEND is not set. Please set it directly or set TORCHAO_OP_TARGET to get a default.")
endif()
endif()

include(CMakePrintHelpers)

add_compile_options("-Wall" "-Werror")
add_compile_options("-Wall" "-Werror" "-Wno-deprecated")

include(CMakePrintHelpers)
message("TORCHAO_INCLUDE_DIRS: ${TORCHAO_INCLUDE_DIRS}")
include_directories(${TORCHAO_INCLUDE_DIRS})

if(TORCHAO_OP_TARGET STREQUAL "aten")
add_library(torchao_ops_${TORCHAO_OP_TARGET} SHARED)
elseif(TORCHAO_OP_TARGET STREQUAL "executorch")
add_library(torchao_ops_${TORCHAO_OP_TARGET} STATIC)
add_compile_options("-Wno-error=deprecated")
else()
message(FATAL_ERROR "Unknown TORCHAO_OP_TARGET: ${TORCHAO_OP_TARGET}. Please choose one of: aten, executorch.")
endif()

if (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
# Defines target torchao_kernels_aarch64
add_subdirectory(${TORCHAO_ROOT}/kernels/cpu/aarch64)
add_subdirectory(${TORCHAO_ROOT}/ops/linear_8bit_act_xbit_weight)
add_subdirectory(kernels/cpu/aarch64)
add_subdirectory(ops/linear_8bit_act_xbit_weight)

add_library(torchao_ops_aten SHARED)
target_link_libraries(
torchao_ops_${TORCHAO_OP_TARGET} PRIVATE
torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET}
torchao_ops_aten PRIVATE
torchao_ops_linear_8bit_act_xbit_weight_aten
)
install(
TARGETS torchao_ops_aten
DESTINATION lib
)
if(TORCHAO_OP_EXECUTORCH_BUILD)
add_library(torchao_ops_executorch STATIC)
target_link_libraries(torchao_ops_executorch PRIVATE torchao_ops_linear_8bit_act_xbit_weight_executorch)

install(
TARGETS torchao_ops_executorch
DESTINATION lib
)
endif()
else()
message(FATAL_ERROR "Torchao experimental ops can only be built on arm64 CPUs.")
endif()

install(
TARGETS torchao_ops_${TORCHAO_OP_TARGET}
DESTINATION lib
)
14 changes: 7 additions & 7 deletions torchao/experimental/Utils.cmake
@@ -24,12 +24,12 @@ function(target_link_torchao_parallel_backend target_name torchao_parallel_backe
target_link_libraries(${target_name} PRIVATE ${TORCH_INSTALL_PREFIX}/lib/libomp${CMAKE_SHARED_LIBRARY_SUFFIX})

elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "EXECUTORCH")
message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=TORCHAO_PARALLEL_EXECUTORCH")
message(STATUS "EXECUTORCH_INCLUDE_DIRS: ${EXECUTORCH_INCLUDE_DIRS}")
message(STATUS "EXECUTORCH_LIBRARIES: ${EXECUTORCH_LIBRARIES}")
target_include_directories(${target_name} PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")
target_link_libraries(${target_name} PRIVATE "${EXECUTORCH_LIBRARIES}")
target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_EXECUTORCH=1)
message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=TORCHAO_PARALLEL_EXECUTORCH")
message(STATUS "EXECUTORCH_INCLUDE_DIRS: ${EXECUTORCH_INCLUDE_DIRS}")
message(STATUS "EXECUTORCH_LIBRARIES: ${EXECUTORCH_LIBRARIES}")
target_include_directories(${target_name} PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")
target_link_libraries(${target_name} PRIVATE "${EXECUTORCH_LIBRARIES}")
target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_EXECUTORCH=1)

elseif(TORCHAO_PARALLEL_BACKEND_TOUPPER STREQUAL "OPENMP")
message(STATUS "Building with TORCHAO_PARALLEL_BACKEND=OPENMP. You must set the CMake variable OpenMP_ROOT to the OMP library location before compiling. Do not use this option if Torch was built with OPENMP; use ATEN_OPENMP instead.")
@@ -59,6 +59,6 @@ function(target_link_torchao_parallel_backend target_name torchao_parallel_backe
target_compile_definitions(${target_name} PRIVATE TORCHAO_PARALLEL_TEST_DUMMY=1)

else()
message(FATAL_ERROR "Unknown TORCHAO_PARALLEL_BACKEND: ${TORCHAO_PARALLEL_BACKEND}. Please choose one of: aten_openmp, openmp, pthreadpool, single_threaded.")
message(FATAL_ERROR "Unknown TORCHAO_PARALLEL_BACKEND: ${TORCHAO_PARALLEL_BACKEND}. Please choose one of: aten_openmp, executorch, openmp, pthreadpool, single_threaded.")
endif()
endfunction()
13 changes: 8 additions & 5 deletions torchao/experimental/build_torchao_ops.sh
@@ -10,14 +10,17 @@ if [[ $# -ne 1 ]]; then
exit 1;
fi
TARGET="${1}"
export CMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')"
export CMAKE_PREFIX_PATH=$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')
echo "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}"
export CMAKE_OUT=/tmp/cmake-out/torchao
if [[ $TARGET == "executorch" ]]; then
TORCHAO_OP_EXECUTORCH_BUILD=ON
else
TORCHAO_OP_EXECUTORCH_BUILD=OFF
fi
export CMAKE_OUT=cmake-out/torchao
cmake -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
-DCMAKE_INSTALL_PREFIX=${CMAKE_OUT} \
-DTORCHAO_OP_TARGET="${TARGET}" \
-DEXECUTORCH_LIBRARIES="${EXECUTORCH_LIBRARIES}" \
-DEXECUTORCH_INCLUDE_DIRS="${EXECUTORCH_INCLUDE_DIRS}" \
-DTORCHAO_OP_EXECUTORCH_BUILD="${TORCHAO_OP_EXECUTORCH_BUILD}" \
-S . \
-B ${CMAKE_OUT}
cmake --build ${CMAKE_OUT} --target install --config Release
15 changes: 15 additions & 0 deletions torchao/experimental/install_requirements.sh
@@ -0,0 +1,15 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Install requirements for experimental torchao ops.
if [[ -z $PIP ]];
then
PIP=pip
fi

NIGHTLY_VERSION="dev20241011"
$PIP install "executorch==0.5.0.$NIGHTLY_VERSION" --extra-index-url https://download.pytorch.org/whl/nightly/cpu
8 changes: 4 additions & 4 deletions torchao/experimental/kernels/cpu/aarch64/CMakeLists.txt
@@ -23,10 +23,10 @@ CMAKE_DEPENDENT_OPTION(BUILD_KLEIDI "Download, build, and link against Arm Kleid
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
add_library(
torchao_kernels_aarch64
${TORCHAO_INCLUDE_DIRS}/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp
${TORCHAO_INCLUDE_DIRS}/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp
${TORCHAO_INCLUDE_DIRS}/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp
${TORCHAO_INCLUDE_DIRS}/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction/find_min_and_max.cpp
${CMAKE_CURRENT_SOURCE_DIR}/reduction/compute_sum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/quantization/quantize.cpp
${CMAKE_CURRENT_SOURCE_DIR}/valpacking/interleave.cpp
)
if (BUILD_KLEIDI)
# Temporarily exposing this to the parent scope until we wire
44 changes: 21 additions & 23 deletions torchao/experimental/ops/linear_8bit_act_xbit_weight/CMakeLists.txt
@@ -6,24 +6,22 @@

cmake_minimum_required(VERSION 3.19)

include(${TORCHAO_ROOT}/Utils.cmake)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../Utils.cmake)

find_package(Torch REQUIRED)
add_library(torchao_ops_linear_8bit_act_xbit_weight_aten OBJECT
linear_8bit_act_xbit_weight.cpp
op_linear_8bit_act_xbit_weight_aten.cpp
)
target_link_torchao_parallel_backend(torchao_ops_linear_8bit_act_xbit_weight_aten aten_openmp)
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_aten PRIVATE torchao_kernels_aarch64)
target_include_directories(torchao_ops_linear_8bit_act_xbit_weight_aten PRIVATE "${TORCH_INCLUDE_DIRS}")
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_aten PRIVATE "${TORCH_LIBRARIES}")
target_compile_definitions(torchao_ops_linear_8bit_act_xbit_weight_aten PRIVATE USE_ATEN=1)

if(TORCHAO_OP_TARGET STREQUAL "aten")
message(STATUS "Building with TORCHAO_OP_TARGET=aten")
find_package(Torch REQUIRED)
add_library(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} OBJECT
linear_8bit_act_xbit_weight.cpp
op_linear_8bit_act_xbit_weight_aten.cpp
)
target_link_torchao_parallel_backend(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} "${TORCHAO_PARALLEL_BACKEND}")
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE torchao_kernels_aarch64)
target_include_directories(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE "${TORCH_INCLUDE_DIRS}")
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE "${TORCH_LIBRARIES}")
target_compile_definitions(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE USE_ATEN=1)
elseif(TORCHAO_OP_TARGET STREQUAL "executorch")
message(STATUS "Building with TORCHAO_OP_TARGET=executorch")
add_library(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} OBJECT
if(TORCHAO_OP_EXECUTORCH_BUILD)
find_package(ExecuTorch REQUIRED HINTS ${CMAKE_PREFIX_PATH}/executorch/share/cmake)
add_library(torchao_ops_linear_8bit_act_xbit_weight_executorch OBJECT
linear_8bit_act_xbit_weight.cpp
op_linear_8bit_act_xbit_weight_executorch/w2s.cpp
op_linear_8bit_act_xbit_weight_executorch/w2sz.cpp
@@ -33,12 +31,12 @@ elseif(TORCHAO_OP_TARGET STREQUAL "executorch")
op_linear_8bit_act_xbit_weight_executorch/w4sz.cpp
op_linear_8bit_act_xbit_weight_executorch/w5s.cpp
op_linear_8bit_act_xbit_weight_executorch/w5sz.cpp
op_linear_8bit_act_xbit_weight_executorch/w6s.cpp
op_linear_8bit_act_xbit_weight_executorch/w6sz.cpp
)
target_link_torchao_parallel_backend(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} "${TORCHAO_PARALLEL_BACKEND}")
target_include_directories(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")
target_compile_definitions(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE USE_EXECUTORCH=1)
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE "${EXECUTORCH_LIBRARIES}")
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_${TORCHAO_OP_TARGET} PRIVATE torchao_kernels_aarch64)
else()
message(FATAL_ERROR "Unknown TORCHAO_OP_TARGET: ${TORCHAO_OP_TARGET}. Please choose one of: aten, executorch.")
target_link_torchao_parallel_backend(torchao_ops_linear_8bit_act_xbit_weight_executorch executorch)
target_include_directories(torchao_ops_linear_8bit_act_xbit_weight_executorch PRIVATE "${EXECUTORCH_INCLUDE_DIRS}")
target_compile_definitions(torchao_ops_linear_8bit_act_xbit_weight_executorch PRIVATE USE_EXECUTORCH=1)
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_executorch PRIVATE "${EXECUTORCH_LIBRARIES}")
target_link_libraries(torchao_ops_linear_8bit_act_xbit_weight_executorch PRIVATE torchao_kernels_aarch64)
endif()
@@ -8,6 +8,7 @@

import glob
import os
import subprocess

import sys
import tempfile
@@ -21,7 +22,36 @@
Int8DynActIntxWeightQuantizer,
)

libs = glob.glob("/tmp/cmake-out/torchao/lib/libtorchao_ops_aten.*")

def cmake_build_torchao_ops(temp_build_dir):
from distutils.sysconfig import get_python_lib

print("Building torchao ops for ATen target")
cmake_prefix_path = get_python_lib()
dir_path = os.path.dirname(os.path.realpath(__file__))
subprocess.run(
[
"cmake",
"-DCMAKE_PREFIX_PATH=" + cmake_prefix_path,
"-DCMAKE_INSTALL_PREFIX=" + temp_build_dir.name,
"-S " + dir_path + "/../",
"-B " + temp_build_dir.name,
]
)
subprocess.run(
[
"cmake",
"--build",
temp_build_dir.name,
"--target install",
"--config Release",
]
)


temp_build_dir = tempfile.TemporaryDirectory()
cmake_build_torchao_ops(temp_build_dir)
libs = glob.glob(f"{temp_build_dir.name}/lib/libtorchao_ops_aten.*")
libs = list(filter(lambda l: (l.endswith("so") or l.endswith("dylib")), libs))
if len(libs) == 0:
print(
@@ -30,6 +60,7 @@
else:
torch.ops.load_library(libs[0])


class TestInt8DynActIntxWeightQuantizer(unittest.TestCase):
def test_accuracy(self):
group_size = 128
@@ -81,7 +112,11 @@ def test_export_compile_aoti(self):
k3 = 1024
nbit = 4
has_weight_zeros = False
layers = [torch.nn.Linear(k0, k1, bias=False), torch.nn.Linear(k1, k2, bias=False), torch.nn.Linear(k2, k3, bias=False)]
layers = [
torch.nn.Linear(k0, k1, bias=False),
torch.nn.Linear(k1, k2, bias=False),
torch.nn.Linear(k2, k3, bias=False),
]
model = torch.nn.Sequential(*layers)

activations = torch.randn(m, k0, dtype=torch.float32)
