
Merge pull request #93 from IRVLUTD/NIDSNet
add bop leaderboard
YoungSean authored May 30, 2024
2 parents fbc5008 + 56e84d6 commit 727994b
Showing 2 changed files with 70 additions and 31 deletions.
Binary file added NIDSNet/assets/images/leaderboard.png
101 changes: 70 additions & 31 deletions NIDSNet/index.html
@@ -134,19 +134,35 @@ <h1 class="title is-2 publication-title"> <span class="small-caps">Adapting Pre-
</div>
</section>

<section class="section">
<div class="container is-max-desktop">

<!-- Paper video (disabled).
<div class="columns is-centered has-text-centered">
<div class="column">
<div class="publication-video">
<iframe src="https://www.youtube.com/embed/CRAVObWhxLw?si=IvSbS4sgEqou8dH6"
frameborder="0" allow="autoplay; encrypted-media" allowfullscreen></iframe>
</div>
</div>
</div>
Alternate embeds: https://www.youtube.com/embed/Hf1dJ-FqGFQ and https://www.youtube.com/embed/aFTrF7NIihI
-->

<!-- Abstract. -->
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">Abstract</h2>
<div class="content has-text-justified">
<p>
Novel Instance Detection and Segmentation (NIDS) aims at detecting and segmenting novel object instances given a few examples of each instance. We propose a unified framework (NIDS-Net) comprising object proposal generation, embedding creation for both instance templates and proposal regions, and embedding matching for instance label assignment. Leveraging recent advancements in large vision methods, we utilize the Grounding DINO and Segment Anything Model (SAM) to obtain object proposals with accurate bounding boxes and masks. Central to our approach is the generation of high-quality instance embeddings. We utilize foreground feature averages of patch embeddings from the DINOv2 ViT backbone, followed by refinement through a weight adapter mechanism that we introduce. We show experimentally that our weight adapter can adjust the embeddings locally within their feature space and effectively limit overfitting. This methodology enables a straightforward matching strategy, resulting in significant performance gains. Our framework surpasses current state-of-the-art methods, demonstrating notable improvements of 22.3, 46.2, 10.3, and 24.0 in average precision (AP) across four detection datasets. In instance segmentation tasks on seven core datasets of the BOP challenge, our method outperforms the top RGB methods by 3.6 AP and remains competitive with the best RGB-D method.
</p>
</div>
</div>
</div>
<!--/ Abstract. -->

</div>
</section>
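The "straightforward matching strategy" from the abstract reduces to cosine-similarity matching between proposal embeddings and instance-template embeddings. A minimal NumPy sketch under the assumption that all embeddings are already L2-normalised (function and variable names are ours, not from the NIDS-Net code):

```python
import numpy as np

def match_proposals(proposal_embs: np.ndarray, template_embs: np.ndarray):
    """Assign each proposal the instance label of its most similar template.

    proposal_embs: (P, D) L2-normalised proposal embeddings.
    template_embs: (K, D) L2-normalised template embeddings, one per instance.
    Returns (labels, scores): per-proposal argmax instance index and its
    cosine similarity, which can be thresholded to reject false proposals.
    """
    sims = proposal_embs @ template_embs.T   # (P, K) cosine-similarity matrix
    return sims.argmax(axis=1), sims.max(axis=1)
```

Because the embeddings are unit vectors, the dot product is exactly the cosine similarity, so label assignment is a single matrix multiply followed by an argmax.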

@@ -166,6 +182,24 @@ <h2 class="title is-3">NIDS-Net</h2>
</div>
</section>
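The abstract describes a weight adapter that refines the initial embeddings while keeping them close to their original location in feature space, which is how it limits overfitting. The adapter's exact architecture is not given on this page; one common pattern consistent with that description is a small residual MLP whose update is blended with the input. Everything below (names, shapes, the blend factor `alpha`) is an illustrative assumption, not the paper's implementation:

```python
import numpy as np

def adapter_refine(emb: np.ndarray, W1: np.ndarray, W2: np.ndarray,
                   alpha: float = 0.6) -> np.ndarray:
    """Illustrative residual adapter: a two-layer MLP proposes an update
    that is blended with the original embedding, so the refined embedding
    stays near the initial one (a local adjustment in feature space).

    emb: (D,) initial instance embedding.
    W1:  (D, H) and W2: (H, D) learned adapter weights.
    """
    update = np.maximum(emb @ W1, 0.0) @ W2       # MLP with ReLU hidden layer
    refined = alpha * emb + (1.0 - alpha) * update
    return refined / np.linalg.norm(refined)      # re-normalise for matching
```

In training, only `W1` and `W2` would be learned, with the frozen backbone supplying `emb`; keeping `alpha` high biases the output toward the original embedding.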

<section id="ffa" class="section">
<div class="container is-max-desktop">
<!-- 20xScenes. -->
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">Foreground Feature Averaging (FFA)</h2>
<p>FFA <a href="#bib-1">[1]</a> is used to generate the initial instance embeddings.</p>
<div class="content has-text-justified">
<img src="assets/images/FFA3.png" alt="Foreground Feature Averaging (FFA) overview">
</div>
</div>
</div>
<!--/ 20xScenes. -->
</div>
</section>
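Concretely, FFA averages the ViT patch tokens that fall inside the foreground mask and normalises the result into a single instance embedding. A minimal NumPy sketch (function and argument names are ours; the paper obtains the patch embeddings from a DINOv2 backbone):

```python
import numpy as np

def foreground_feature_average(patch_tokens: np.ndarray,
                               fg_mask: np.ndarray) -> np.ndarray:
    """Average patch embeddings over the foreground into one instance embedding.

    patch_tokens: (N, D) patch embeddings from a ViT backbone (e.g. DINOv2),
                  with N the number of patches on the grid.
    fg_mask: (N,) boolean mask, True where a patch lies on the object
             (the object mask downsampled to the patch grid).
    Returns an L2-normalised (D,) embedding suitable for cosine matching.
    """
    fg = patch_tokens[fg_mask]            # drop background patch tokens
    emb = fg.mean(axis=0)                 # average into a single vector
    return emb / np.linalg.norm(emb)      # unit norm
```

Averaging only over the foreground keeps background clutter out of the template and proposal embeddings, which is what makes the later cosine matching reliable.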



<section id="detection" class="section">
<div class="container is-max-desktop">
<!-- 20xScenes. -->
@@ -181,54 +215,39 @@ <h2 class="title is-3">Detection Examples</h2>
</div>
</section>

<section id="BOP" class="section">
<div class="container is-max-desktop">
<!-- 20xScenes. -->
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">BOP Benchmark</h2>
<h3><a href="https://bop.felk.cvut.cz/leaderboards/segmentation-unseen-bop23/core-datasets/">Ranked #1: Model-based 2D segmentation of unseen objects – Core datasets</a></h3>
<div class="content has-text-justified">
<img src="assets/images/leaderboard.png" alt="BOP unseen-object segmentation leaderboard">
</div>
</div>
</div>
<!--/ 20xScenes. -->
</div>
</section>

<section id="segmentation" class="section">
<div class="container is-max-desktop">

<!-- Paper video. -->
<!-- 20xScenes. -->
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">Segmentation Examples of BOP Benchmark</h2>
<div class="content has-text-justified">
<img src="assets/images/seg_1.png" alt="Segmentation examples on the BOP benchmark">
</div>
</div>
</div>

<!--/ 20xScenes. -->
</div>
</section>



<section class="section">
<div class="container is-max-desktop">

@@ -279,6 +298,26 @@ <h2 class="title is-3">
</div>
</section>

<section class="section">
<div class="container is-max-desktop">
<!-- References. -->
<div class="columns is-centered has-text-centered">
<div class="column">
<h2 class="title is-3">References</h2>
<div class="content has-text-justified">
<ol id="references">
<li id="bib-1">
<p>
K. Kotar, S. Tian, H.-X. Yu, D. Yamins, and J. Wu. Are These the Same Apple? Comparing Images Based on Object Intrinsics. Advances in Neural Information Processing Systems, 36, 2024. <a href="https://arxiv.org/abs/2311.00750">arXiv</a>
</p>
</li>
</ol>
</div>
</div>
</div>
<!--/ References. -->
</div>
</section>


<section class="section" id="BibTeX">
