<!-- index.html -->

<!DOCTYPE html>
<html lang="en">
<head>
  <!-- Document metadata. The charset declaration stays first so it is seen early by the parser. -->
  <meta charset="utf-8">
  <meta name="description"
        content="SPOT is an object-centric imitation learning framework that combines diffusion policy with SE(3) object pose trajectories.">
  <meta name="keywords" content="SPOT, SE(3), object pose trajectory, diffusion policy, imitation learning, object-centric manipulation">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>SPOT: SE(3) Pose Trajectory Diffusion for Object-Centric Manipulation</title>

  <!-- Google tag (gtag.js) -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-7GF0RHBSDK"></script>
  <script>
    window.dataLayer = window.dataLayer || [];
    function gtag(){dataLayer.push(arguments);}
    gtag('js', new Date());

    gtag('config', 'G-7GF0RHBSDK');
  </script>

  <!-- Web fonts. The family separator "|" is percent-encoded (%7C) so the URL is valid. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans%7CNoto+Sans%7CCastoro"
        rel="stylesheet">

  <!-- Stylesheets: Bulma and its plugins, icon fonts, then the site-specific rules last. -->
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <!-- <link rel="icon" href="./static/images/favicon.svg"> -->

  <!-- Scripts. index.js is listed after jQuery and the Bulma plugins; it presumably
       depends on them, so confirm before changing the order or adding defer/async. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>


<!-- Title block: paper title, author list, affiliations, and the paper/code buttons. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">SPOT: SE(3) Pose Trajectory Diffusion for Object-Centric Manipulation</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block">
              <a href="https://chengchunhsu.github.io/">Cheng-Chun Hsu</a><sup>* 1,2</sup>,
              <!-- The envelope-icon link to mailto:chengchun@utexas.edu is disabled. Its opening
                   tag used to sit outside the comment that held its closing tag, which left an
                   unclosed anchor in the author list. -->
            </span>
            <span class="author-block">
              <a href="https://research.nvidia.com/person/bowen-wen">Bowen Wen</a><sup>1</sup>,
            </span>
            <span class="author-block">
              <a href="https://research.nvidia.com/person/jie-xu">Jie Xu</a><sup>1</sup>,
            </span>
            <span class="author-block">
              <a href="https://research.nvidia.com/person/yashraj-narang">Yashraj Narang</a><sup>1</sup>,
            </span>
            <span class="author-block">
              <a href="https://research.nvidia.com/labs/lpr/author/xiaolong-wang/">Xiaolong Wang</a><sup>1,3</sup>,
            </span>
            <span class="author-block">
              <a href="https://research.nvidia.com/person/yuke-zhu">Yuke Zhu</a><sup>1,2</sup>,
            </span>
            <span class="author-block">
              <a href="https://www.joydeepb.com/">Joydeep Biswas</a><sup>1,2</sup>,
            </span>
            <span class="author-block">
              <a href="https://research.nvidia.com/person/stan-birchfield">Stan Birchfield</a><sup>1</sup>
            </span>
          </div>
          <br>

          <!-- Affiliations, keyed by the superscript numbers used in the author list. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>NVIDIA,</span>
            <span class="author-block"><sup>2</sup>UT Austin,</span>
            <span class="author-block"><sup>3</sup>UCSD</span>
          </div>
          <br>

          <!-- Footnote for the asterisk in the author list. -->
          <div class="is-size-5 publication-authors">
            * Work done during internship at NVIDIA
          </div>

          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- NOTE(review): href is empty, so this link resolves to the current page; add the paper URL. -->
              <span class="link-block">
                <a href=""
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span>
              <!-- Code Link. -->
              <!-- NOTE(review): href is empty, so this link resolves to the current page; add the repository URL. -->
              <span class="link-block">
                <a href=""
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                      <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
            </div>

          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Main written content: abstract with video, pose representation, and framework overview. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            We introduce SPOT, an object-centric imitation learning framework. At the core, it leverages the synergy between diffusion policy and object-centric representation, specifically SE(3) object pose trajectory. This approach decouples embodiment actions from sensory inputs, facilitating learning from various demonstration types, including both action-based and action-less human hand demonstrations. Additionally, object pose trajectories inherently capture planning constraints from demonstrations without the need for manually crafted rules. In real-world evaluation, using only eight demonstrations shot on an iPhone, our approach completed all tasks while fully complying with task constraints.
          </p>
        </div>
        <div class="publication-video">
          <!-- The title is the accessible name of the embedded player. -->
          <iframe width="560" height="315" src="https://www.youtube.com/embed/yktAPZ1ERnQ?si=i82GNX6BOibw-ZO7" title="SPOT project video on YouTube" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->

    <hr>
    <br>

    <!-- Pose representation. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">6D Object Pose as Intermediate Representation</h2>
        <div class="content has-text-justified">
          <p>Given the observation, our framework estimates the object’s pose, predicts its future path in SE(3), and derives an action plan accordingly. Our diffusion model is trained on
            demonstration trajectories extracted from videos without needing action data from the same embodiment.
          </p>
        </div>
        <!-- alt text summarises the paragraph above for readers who cannot see the figure. -->
        <img src="static/images/teaser.png" width="100%"
             alt="Teaser figure: the object's pose is estimated from the observation, its future trajectory in SE(3) is predicted, and an action plan is derived from that trajectory.">
      </div>
    </div>

    <hr>
    <br>

    <!-- Framework overview. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Framework Overview</h2>
        <div class="content has-text-justified">
          <p>During training, we extract object pose trajectories from demonstration RGBD videos (e.g., collected with an iPhone), which are independent of the embodiment. Using these extracted trajectories, we train a diffusion model to generate future object trajectories and determine task completion based on current and past poses. During task execution, the task-relevant object is constantly tracked, and its pose is forwarded to the trajectory diffusion model to predict the object's future trajectory in SE(3), which leads to task accomplishment. Finally, we convert the generated trajectories into embodiment-agnostic action plans for closed-loop manipulation.
          </p>
        </div>
        <!-- alt text summarises the paragraph above for readers who cannot see the figure. -->
        <img src="static/images/framework.png" width="100%"
             alt="Framework overview figure: object pose trajectories extracted from demonstration videos train a trajectory diffusion model, and at execution time the tracked object pose is used to generate future trajectories that are converted into action plans.">
      </div>
    </div>

    <hr>
    <br>

</section>


<!-- Real-world evaluation: a carousel with one robot execution video per task. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
      <h2 class="title is-3">Real-world Evaluation</h2>
      </div>
      <div class="content has-text-justified">
        <p>We evaluated our method on 4 real-world manipulation tasks. All models use single camera views and 8 human demonstrations per task.</p>
      </div>
      <!-- Each .item is one slide. The empty poster attribute was dropped: poster must be a
           non-empty URL when present, and an empty value means "no poster" anyway. -->
      <div id="results-carousel" class="carousel results-carousel">
        <div class="item item-steve">
          <video id="steve" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/mug_on_coaster_4x.mp4" type="video/mp4">
          </video>
          <p>Task: mug-on-coaster</p>
          <p><br></p>
        </div>
        <div class="item item-chair-tp">
          <video id="chair-tp" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/place_plant_4x.mp4" type="video/mp4">
          </video>
          <p>Task: plant-in-vase</p>
          <p><br></p>
        </div>
        <div class="item item-shiba">
          <video id="shiba" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/pour_water_v2_4x.mp4" type="video/mp4">
          </video>
          <p>Task: pour-water</p>
          <p><br></p>
        </div>
        <div class="item item-toby">
          <video id="toby" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/oven_4x.mp4" type="video/mp4">
          </video>
          <p>Task: put-plate-into-oven</p>
          <p><br></p>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Human demonstrations: a carousel with one demonstration video per task. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
      <h2 class="title is-3">Human Demonstration Video</h2>
      </div>
      <!-- The ids in this carousel carry a "demo-" prefix so they are unique in the document;
           the class names are unchanged. NOTE(review): confirm that no script or stylesheet
           selects these elements by their previous ids. The empty poster attribute was
           dropped because poster must be a non-empty URL when present. -->
      <div id="demo-carousel" class="carousel results-carousel">
        <div class="item item-steve">
          <video id="demo-steve" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/demo_mug_on_coaster_4x.mp4" type="video/mp4">
          </video>
          <p>Task: mug-on-coaster</p>
          <p><br></p>
        </div>
        <div class="item item-chair-tp">
          <video id="demo-chair-tp" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/demo_place_plant_4x.mp4" type="video/mp4">
          </video>
          <p>Task: plant-in-vase</p>
          <p><br></p>
        </div>
        <div class="item item-shiba">
          <video id="demo-shiba" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/demo_pour_water_v2_4x.mp4" type="video/mp4">
          </video>
          <p>Task: pour-water</p>
          <p><br></p>
        </div>
        <div class="item item-toby">
          <video id="demo-toby" autoplay controls muted loop playsinline height="100%">
            <source src="./static/videos/demo_oven_4x.mp4" type="video/mp4">
          </video>
          <p>Task: put-plate-into-oven</p>
          <p><br></p>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Generalization tests: three rows of videos grouped by object configuration,
     lighting condition, and scene clutter. -->
<section class="hero is-small">

  <hr>
  <br>

  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
        <h2 class="title is-3">Generalization Tests</h2>
      </div>
      <div class="content has-text-justified">
        <p>We test our method across various scenarios to evaluate its generalization capabilities.</p>
      </div>

      <!-- The sub-headings below are h3 elements: they previously opened with <h1> and
           closed with </h2>. The "title is-4" classes that set their size are unchanged.
           Each video has its own id in place of the repeated "v3".
           NOTE(review): the "videoal" id is still repeated on the three wrappers; it is
           kept because the stylesheet may select on it. Switch to a class once confirmed. -->

      <div class="columns is-centered has-text-centered">
        <h3 class="title is-4">Object Configurations</h3>
      </div>
      <div id="videoal">
        <table>
        <tr>
          <td>
            <video style="text-align:center; width:100%" id="gen-config-high-2" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/config_high_2_4x.mp4" type="video/mp4">
            </video>
          </td>
          <td>
            <video style="text-align:center; width:100%" id="gen-config-high" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/config_high_4x.mp4" type="video/mp4">
            </video>
          </td>
          <td>
            <video style="text-align:center; width:100%" id="gen-config-far" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/config_far_4x.mp4" type="video/mp4">
            </video>
          </td>
        </tr>
        </table>
      </div>

      <br>

      <div class="columns is-centered has-text-centered">
        <h3 class="title is-4">Lighting Conditions</h3>
      </div>
      <div id="videoal">
        <table>
        <tr>
          <td>
            <video style="text-align:center; width:100%" id="gen-low-light" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/low_light_4x.mp4" type="video/mp4">
            </video>
          </td>
          <td>
            <video style="text-align:center; width:100%" id="gen-night" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/night_4x.mp4" type="video/mp4">
            </video>
          </td>
          <td>
            <video style="text-align:center; width:100%" id="gen-night-2" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/night_2_4x.mp4" type="video/mp4">
            </video>
          </td>
        </tr>
        </table>
      </div>

      <br>

      <div class="columns is-centered has-text-centered">
        <h3 class="title is-4">Clutter Scenes</h3>
      </div>
      <div id="videoal">
        <table>
        <tr>
          <td>
            <video style="text-align:center; width:100%" id="gen-clutter" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/clutter_4x.mp4" type="video/mp4">
            </video>
          </td>
          <td>
            <video style="text-align:center; width:100%" id="gen-clutter-2" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/clutter_2_4x.mp4" type="video/mp4">
            </video>
          </td>
          <td>
            <video style="text-align:center; width:100%" id="gen-config-standard" autoplay muted loop playsinline controls height="5%">
              <source src="static/videos/generalization/config_standard_4x.mp4" type="video/mp4">
            </video>
          </td>
        </tr>
        </table>
      </div>


    </div>
  </div>
</section>


<!-- <section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <pre><code>@article{park2021nerfies,
  author    = {Park, Keunhong and Sinha, Utkarsh and Barron, Jonathan T. and Bouaziz, Sofien and Goldman, Dan B and Seitz, Steven M. and Martin-Brualla, Ricardo},
  title     = {Nerfies: Deformable Neural Radiance Fields},
  journal   = {ICCV},
  year      = {2021},
}</code></pre>
  </div>
</section> -->


<!-- Page footer: credit for the website template and its licence. -->
<footer class="footer">
  <div class="container">
    <!-- <div class="content has-text-centered">
      <a class="icon-link"
         href="./static/videos/nerfies_paper.pdf">
        <i class="fas fa-file-pdf"></i>
      </a>
      <a class="icon-link" href="https://github.com/keunhong" class="external-link" disabled>
        <i class="fab fa-github"></i>
      </a>
    </div> -->
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            The website template was borrowed from <a href="https://github.com/nerfies/nerfies.github.io">Nerfies</a>
            under a <a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/">Creative Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>