From 5a2de1bb49f667f430d8f32dc70ed0e3431a0701 Mon Sep 17 00:00:00 2001 From: Yunsong Zhou <75066007+zhouyunsong@users.noreply.github.com> Date: Thu, 31 Oct 2024 23:51:45 +0800 Subject: [PATCH] Update component---src-pages-index-tsx-a0cbca3becbcba12f361.js --- component---src-pages-index-tsx-a0cbca3becbcba12f361.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/component---src-pages-index-tsx-a0cbca3becbcba12f361.js b/component---src-pages-index-tsx-a0cbca3becbcba12f361.js index 81c2bc6..8c3e740 100644 --- a/component---src-pages-index-tsx-a0cbca3becbcba12f361.js +++ b/component---src-pages-index-tsx-a0cbca3becbcba12f361.js @@ -1,2 +1,2 @@ -(self.webpackChunkELM=self.webpackChunkELM||[]).push([[691],{9104:function(e,t,i){"use strict";i.r(t),i.d(t,{Head:function(){return N},default:function(){return L}});var n=i(7294),r=i(9583),o=i(8193),a=i(1451),s=i(6529),l=i.p+"./static/teaser-c2a71f1dde356046cbbe0e5e5480c814.png",c=i.p+"static/overview-207c2e5c1076c9d19d0cde294d565f5d.png",u=i.p+"static/datapipeline-b52c081d3a18968a618ab2032f2abe92.png",d=i.p+"static/status-016af8bf626711cb573cf29fd8f7e93e.png",h=i.p+"static/results-4a7b9387e7e17afca7341167ba0bd8cd.png",p=i.p+"static/vis1-342b98486315be4dad64c5f173971081.png",m=i.p+"static/vis2-4fc4b96dd4e04c5a4eb40611fa817f59.png",f=i.p+"static/vis3-004b6204f9dd3bfc307205409d2ff6cc.png",g=i.p+"static/vis4-33f0ce67b804dfdcae294352a2281172.png",v=i.p+"static/vis5-30031627ad193cda766b386d1eb4af05.png",y=i.p+"static/vis6-de64a2dd03b50a5d7b284c9683f04947.png",b=i.p+"static/vis7-a859c5a45af5248f7916816fee9f2ef0.png",E=i.p+"static/vis8-bf7d58452ca36d291e624993f264c42f.png",w=i.p+"static/vis9-64fba565fe8fca0972cff25c92af6db7.png",x=i.p+"static/vis10-af238b6bb0be0d61fe9a8e3c3e5d3f0f.png",S=i.p+"static/vis11-6bdfee004623c2e5c92a7a63c97b174a.png";const T=e=>{let{children:t}=e;return n.createElement("h1",{className:"pb-1 mb-5 sm:mb-4 sm:leading-tight md:leading-tight lg:leading-tight font-bold text-center"},t)},C=e=>{let{website:t,children:i}=e;return n.createElement("div",{className:"flex flex-wrap justify-center text-2xl lg:text-2xl mb-6 sm:mb-5"},n.createElement("a",{className:"no-underline",href:t,target:"_blank"},i))},I=e=>{let{children:t}=e;return n.createElement("div",null,n.createElement("div",{className:"flex justify-center content-center"},n.createElement("p",{className:"font-semibold text-2xl sm:text-3xl m-1 sm:m-2"},"Abstract")),n.createElement("div",{className:"flex justify-center content-center"},n.createElement("p",{className:"text-justify font-light text-base sm:text-lg m-1 sm:m-1 max-w-[100%] sm:max-w-[620px]"},t)))},M=e=>{let{children:t,website:i,firstAuthor:r,affiliations:o,lastAuthor:a}=e;return n.createElement("span",{className:"text-center inline-block"},n.createElement("a",{href:i,target:"_blank",className:"font-normal no-underline text-stone-600 hover:underline underline-offset-3 hover:transition-all"},t),r||o?n.createElement("sup",{className:"pl-0.5"},r?n.createElement("span",{className:"font-bold"},"*"):null,o||null):null,a?null:n.createElement(n.Fragment,null,", "))},P=e=>{let{children:t,website:i,number:r}=e;return n.createElement("span",{className:"text-center inline-block mr-4"},n.createElement("sup",{className:"mr-0.5"},r),n.createElement("a",{href:i,target:"_blank",className:"font-light no-underline text-stone-600 hover:underline underline-offset-3 hover:transition-all"},t))},A=e=>{let{children:t,url:i,icon:r}=e;return n.createElement("span",{className:"text-center inline-block my-3.5 sm:my-2 mx-2"},n.createElement("a",{href:i,target:i.startsWith("#")?"_self":"_blank",className:"text-xl no-underline font-normal text-[#009cff] bg-[#f9f9f9] hover:bg-[#f4f4f4] hover:transition-all px-4 py-3 rounded-xl"},n.createElement("span",{className:"align-middle inline-flex justify-center mr-0.25"},r," "),n.createElement("span",null,t)))},k=e=>{let{children:t}=e;return n.createElement("div",{className:"mx-auto w-full max-w-[90%] format format-md md:format-base lg:max-w-5xl lg:format-lg format-blue dark:format-invert"},t)},R=e=>{let{children:t}=e;return n.createElement("main",{className:"pt-6 lg:pt-12 bg-white dark:bg-gray-900"},t)},N=()=>n.createElement("title",null,"ELM: Embodied Understanding of Driving Scenarios"),O={desktop:{breakpoint:{max:3e3,min:1024},items:3},tablet:{breakpoint:{max:1024,min:464},items:2},mobile:{breakpoint:{max:464,min:0},items:1}};var L=()=>n.createElement(n.Fragment,null,n.createElement(R,null,n.createElement(k,null,n.createElement(T,null,n.createElement("span",{className:"font-extrabold text-transparent bg-clip-text bg-gradient-to-r from-pink-500 via-indigo-600 to-emerald-400"},"ELM:")," ",n.createElement("span",{className:"text-stone-800"},"Embodied Understanding of Driving Scenarios")),n.createElement(C,{website:"https://eccv.ecva.net/"},n.createElement("span",{className:"font-normal text-stone-600 hover:text-transparent hover:bg-clip-text hover:bg-gradient-to-r hover:from-pink-500 hover:via-indigo-600 hover:to-emerald-400 hover:transition-all"},"ECCV 2024")),n.createElement("div",{className:"flex flex-wrap justify-center text-xl lg:text-xl mb-4"},n.createElement(M,{website:"https://zhouyunsong-sjtu.github.io/",firstAuthor:!0,affiliations:"1, 2"},"Yunsong Zhou"),n.createElement(M,{website:"https://github.com/DevLinyan",firstAuthor:!0,affiliations:"1"},"Linyan Huang"),n.createElement(M,{website:"https://github.com/retsuh-bqw",firstAuthor:!0,affiliations:"1, 2"},"Qingwen Bu"),n.createElement(M,{affiliations:"1"},"Jia Zeng"),n.createElement(M,{affiliations:"1"},"Tianyu Li"),n.createElement("br",null),n.createElement(M,{affiliations:"3"},"Hang Qiu"),n.createElement(M,{affiliations:"2"},"Hongzi Zhu"),n.createElement(M,{affiliations:"2"},"Minyi Guo"),n.createElement(M,{affiliations:"1"},"Yu Qiao"),n.createElement(M,{website:"https://lihongyang.info/",affiliations:"1",lastAuthor:!0},"Hongyang Li")),n.createElement("div",{className:"flex flex-wrap justify-center text-xl lg:text-xl mb-1"},n.createElement(P,{website:"https://opendrivelab.com/",number:"1"},"OpenDriveLab at Shanghai AI Lab"),n.createElement(P,{number:"2"},"Shanghai Jiao Tong University"),n.createElement(P,{number:"3"},"University of California, Riverside")),n.createElement("div",{className:"flex flex-wrap justify-center text-l lg:text-l"},n.createElement("span",{className:"text-stone-600 text-center"},n.createElement("sup",{className:"mr-0.5"},"*"),"Indicates equal contribution.")),n.createElement("p",{className:"flex flex-wrap justify-center"},n.createElement(A,{url:"https://arxiv.org/pdf/2403.04593",icon:n.createElement(r.yRW,null)},"Paper"),n.createElement(A,{url:"https://github.com/OpenDriveLab/ELM",icon:n.createElement(o.RrF,null)},"Code")),n.createElement("img",{src:l,alt:"teaser"}),n.createElement("div",{className:"flex justify-center"},n.createElement("p",{className:"text-justify text-xl !mt-0 !mb-2 font-medium max-w-[100%] md:max-w-[100%]"},"ELM is an embodied language model for understanding the long-horizon driving scenarios in space and time. Compared to the vanilla vision-language model (VLM) being confined to the scene description task, we expand a wide spectrum of new tasks to fully leverage the capability of large language models in an embodiment setting. ELM achieves significant improvements in various applications."))),n.createElement("div",{className:"my-6 pt-6 pb-4 bg-gradient-to-r from-pink-100/70 via-indigo-100/70 to-emerald-100/70"},n.createElement("div",{className:"mx-auto w-full max-w-[97.5%] lg:max-w-7xl py-2 md:py-4 px-2 md:px-4"},n.createElement("p",{className:"text-center text-lg md:text-xl md:max-w-[85%] mx-auto"},n.createElement(I,null,"Embodied scene understanding serves as the cornerstone for autonomous agents to perceive, interpret, and respond to open driving scenarios. Such understanding is typically founded upon Vision-Language Models (VLMs). Nevertheless, existing VLMs are restricted to the 2D domain, devoid of spatial awareness and long-horizon extrapolation proficiencies. We revisit the key aspects of autonomous driving and formulate appropriate rubrics. Hereby, we introduce the Embodied Language Model (ELM), a comprehensive framework tailored for agents' understanding of driving scenes with large spatial and temporal spans. ELM incorporates space-aware pre-training to endow the agent with robust spatial localization capabilities. Besides, the model employs time-aware token selection to accurately inquire about temporal cues. We instantiate ELM on the reformulated multi-faced benchmark, and it surpasses previous state-of-the-art approaches in all aspects.")))),n.createElement(k,null,n.createElement("h2",{className:"font-semibold border-b-[1px] !mb-4"},"Methodology"),n.createElement("h3",{id:"language-guided",className:"!mt-4"},"Model Overview"),n.createElement("img",{src:c,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Systematic Pipeline of ELM.")," It consists of Pre-training by open-world data corpus and Fine-tuning on diverse tasks. To initialize the Space-aware Pre-training, we collect extensive image-text pairs from the world, empowering ELM with spatial localization while preserving the description ability in driving scenarios. In the fine-tuning process, the inputs to ELM are videos, timestamps, and text prompts. After encoding the inputs into tokens, ELM leverages the proposed Time-aware Token Selection to gather the appropriate tokens as instructed by prompts. Finally, the tokens are sent to the language model to generate output texts."),n.createElement("h3",{id:"language-guided",className:"!mt-4"},"Space-aware Pre-training"),n.createElement("img",{src:u,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Annotation workflow with human quality check in the loop. For location labeling:")," we first select diverse templates from the GPT generated candidates. Pixel-point pairs as annotated in the nuScenes are then sampled and filled into the templates to form our location pre-training data.",n.createElement("b",null,"For description labeling:")," Node 4 utilizes LLaMA-Adapter V2 to obtain diverse labels on nuScenes, Waymo, YouTube, and Ego4D with predefined prompts. Two rounds of quality check are conducted in Node 3 and 7 by inspectors to guarantee the image and caption quality."),n.createElement("img",{src:d,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Statistics of pre-training data and comparison of data collection with other VLMs. "),"Our pre-train data surpasses that in general vision (top) and autonomous driving (middle) in terms of quantity and diversity. Anno: the type of annotations; Des: description; Loc: localization."),n.createElement("h2",{className:"font-semibold border-b-[1px] !mb-4"},"Experiments")),n.createElement("div",{className:"my-6 pt-6 pb-4 bg-gradient-to-r from-pink-100/70 via-indigo-100/70 to-emerald-100/70"},n.createElement("div",{className:"relative pb-8 mb-3"},n.createElement(s.default,{responsive:O,infinite:!0,showDots:!0,renderDotsOutside:!0},n.createElement("img",{src:m,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:p,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:f,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:g,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:v,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:y,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:b,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:E,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:w,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:x,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:S,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}))),n.createElement("p",{className:"text-center text-lg md:text-xl md:max-w-[85%] mx-auto"},n.createElement("b",null,"Visualization on the benchmark.")," We provide results for ten tasks and E2E planning through videos and corresponding QA pairs.")),n.createElement(k,null,n.createElement("h3",{id:"language-guided",className:"!mt-4"},"Results"),n.createElement("img",{src:h,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Comparison to State-of-the-arts. "),"All methods are fine-tuned on the corresponding tasks. The main metrics (%) are marked in gray. Bold emphasizes top method; underline marks the runner-up. C: CIDEr; R: ROUGE-L; B: BLEU."),n.createElement("h2",{id:"citation",className:"border-b-[1px]"},"Citation"),n.createElement("div",{className:"relative overflow-auto"},n.createElement("pre",{className:"bg-gradient-to-r from-pink-100 via-indigo-100 to-emerald-100 !my-0"},n.createElement("code",{id:"citation-bib",className:"font-medium text-slate-800"},"@article{zhou2024embodied,\n title={Embodied Understanding of Driving Scenarios},\n author={Zhou, Yunsong and Huang, Linyan and Bu, Qingwen and Zeng, Jia and Li, Tianyu and Qiu, Hang and Zhu, Hongzi and Guo, Minyi and Qiao, Yu and Li, Hongyang},\n journal={arXiv preprint arXiv:2403.04593},\n year={2024}\n}")),n.createElement("div",{className:"absolute top-0 right-0"},n.createElement("button",{className:"float-right text-2xl text-indigo-500 bg-white hover:bg-slate-50 hover:text-indigo-600 hover:transition-all rounded-full p-2 m-3 invisible md:visible",onClick:()=>{let e=document.getElementById("citation-bib"),t=document.createRange(),i=window.getSelection();null!=e&&null!=t&&null!=i&&(t.selectNode(e),i.removeAllRanges(),i.addRange(t))}},n.createElement(a.DV2,null))))),n.createElement("footer",{className:"flex flex-col justify-center bg-gray-50 mt-8 py-8"},n.createElement("div",{className:"flex justify-center align-middle text-lg"},n.createElement("a",{role:"button",className:"text-blue-500",onClick:()=>{window.scrollTo({top:0,behavior:"smooth"})}},n.createElement("span",{className:"align-text-top inline-flex justify-center mr-0.25"},n.createElement(r.ZTc,null)," "),n.createElement("span",null,"Back to Top"))))))},4405:function(e,t,i){"use strict";i.d(t,{w_:function(){return c}});var n=i(7294),r={color:void 0,size:void 0,className:void 0,style:void 0,attr:void 0},o=n.createContext&&n.createContext(r),a=function(){return a=Object.assign||function(e){for(var t,i=1,n=arguments.length;iMath.abs(o)))return;var l=a.populateSlidesOnMouseTouchMove(this.state,this.props,this.initialX,this.lastX,i,this.transformPlaceHolder),c=l.direction,u=l.nextPosition,d=l.canContinue;c&&(this.direction=c,d&&void 0!==u&&this.setTransformDirectly(u)),this.lastX=i}}},t.prototype.handleOut=function(e){this.props.autoPlay&&!this.autoPlay&&(this.autoPlay=setInterval(this.next,this.props.autoPlaySpeed));var t="touchend"===e.type&&!this.props.swipeable,i=("mouseleave"===e.type||"mouseup"===e.type)&&!this.props.draggable;if(!t&&!i&&this.onMove){if(this.setAnimationDirectly(!0),"right"===this.direction)if(this.initialX-this.lastX>=this.props.minimumTouchDrag){var n=Math.round((this.initialX-this.lastX)/this.state.itemWidth);this.next(n)}else this.correctItemsPosition(this.state.itemWidth,!0,!0);"left"===this.direction&&(this.lastX-this.initialX>this.props.minimumTouchDrag?(n=Math.round((this.lastX-this.initialX)/this.state.itemWidth),this.previous(n)):this.correctItemsPosition(this.state.itemWidth,!0,!0)),this.resetMoveStatus()}},t.prototype.isInViewport=function(e){var t=e.getBoundingClientRect(),i=t.top,n=void 0===i?0:i,r=t.left,o=void 0===r?0:r,a=t.bottom,s=void 0===a?0:a,l=t.right,c=void 0===l?0:l;return 0<=n&&0<=o&&s<=(window.innerHeight||document.documentElement.clientHeight)&&c<=(window.innerWidth||document.documentElement.clientWidth)},t.prototype.isChildOfCarousel=function(e){return!!(e instanceof Element&&this.listRef&&this.listRef.current)&&this.listRef.current.contains(e)},t.prototype.onKeyUp=function(e){var t=e.target;switch(e.keyCode){case 37:if(this.isChildOfCarousel(t))return this.previous();break;case 39:if(this.isChildOfCarousel(t))return this.next();break;case 9:if(this.isChildOfCarousel(t)&&t instanceof HTMLInputElement&&this.isInViewport(t))return this.next()}},t.prototype.handleEnter=function(e){s.isMouseMoveEvent(e)&&this.autoPlay&&this.props.autoPlay&&this.props.pauseOnHover&&(clearInterval(this.autoPlay),this.autoPlay=void 0)},t.prototype.goToSlide=function(e,i,n){var r=this;if(void 0===n&&(n=!0),!this.isInThrottle){var o=this.state.itemWidth,a=this.props,s=a.afterChange,l=a.beforeChange,c=this.state.currentSlide;"function"!=typeof l||i&&("object"!=typeof i||i.skipBeforeChange)||l(e,this.getState()),this.isAnimationAllowed=n,this.props.shouldResetAutoplay&&this.resetAutoplayInterval(),this.setState({currentSlide:e,transform:-o*e},(function(){r.props.infinite&&r.correctClonesPosition({domLoaded:!0}),"function"!=typeof s||i&&("object"!=typeof i||i.skipAfterChange)||(t.afterChangeTimeout3=setTimeout((function(){s(c,r.getState())}),r.props.transitionDuration||h))}))}},t.prototype.getState=function(){return this.state},t.prototype.renderLeftArrow=function(e){var t=this,i=this.props,n=i.customLeftArrow,r=i.rtl;return o.createElement(c.LeftArrow,{customLeftArrow:n,getState:function(){return t.getState()},previous:this.previous,disabled:e,rtl:r})},t.prototype.renderRightArrow=function(e){var t=this,i=this.props,n=i.customRightArrow,r=i.rtl;return o.createElement(c.RightArrow,{customRightArrow:n,getState:function(){return t.getState()},next:this.next,disabled:e,rtl:r})},t.prototype.renderButtonGroups=function(){var e=this,t=this.props.customButtonGroup;return t?o.cloneElement(t,{previous:function(){return e.previous()},next:function(){return e.next()},goToSlide:function(t,i){return e.goToSlide(t,i)},carouselState:this.getState()}):null},t.prototype.renderDotsList=function(){var e=this;return o.createElement(l.default,{state:this.state,props:this.props,goToSlide:this.goToSlide,getState:function(){return e.getState()}})},t.prototype.renderCarouselItems=function(){var e=[];if(this.props.infinite){var t=o.Children.toArray(this.props.children);e=a.getClones(this.state.slidesToShow,t)}return o.createElement(u.default,{clones:e,goToSlide:this.goToSlide,state:this.state,notEnoughChildren:a.notEnoughChildren(this.state),props:this.props})},t.prototype.render=function(){var e=this.props,t=e.deviceType,i=e.arrows,n=e.renderArrowsWhenDisabled,r=e.removeArrowOnDeviceType,s=e.infinite,l=e.containerClass,c=e.sliderClass,u=e.customTransition,h=e.additionalTransfrom,m=e.renderDotsOutside,f=e.renderButtonGroupOutside,g=e.className,v=e.rtl,y=a.getInitialState(this.state,this.props),b=y.shouldRenderOnSSR,E=y.shouldRenderAtAll,w=a.isInLeftEnd(this.state),x=a.isInRightEnd(this.state),S=i&&!(r&&(t&&-12*n?e+2*n:r>=i.length?i.length+e:e},t.getOriginalIndexLookupTableByClones=function(e,t){if(t.length>2*e){for(var i={},n=t.length-2*e,r=t.length-n,o=n,a=0;a2*e?t.slice(t.length-2*e,t.length).concat(t,t.slice(0,2*e)):t.concat(t,t)},t.getInitialSlideInInfiniteMode=function(e,t){return t.length>2*e?2*e:t.length},t.checkClonesPosition=function(e,t,i){var n,r=e.currentSlide,o=e.slidesToShow,a=e.itemWidth,s=e.totalItems,l=0,c=0,u=0===r,d=t.length-(t.length-2*o);return t.length2*o?((n=r>=d+t.length)&&(c=-a*(l=r-t.length)),u&&(c=-a*(l=d+(t.length-2*o)))):((n=r>=2*t.length)&&(c=-a*(l=r-t.length)),u&&(c=i.showDots?-a*(l=t.length):-a*(l=s/3))),{isReachingTheEnd:n,isReachingTheStart:u,nextSlide:l,nextPosition:c}}},3989:function(e,t,i){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var n=i(6867);function r(e){var t=e.slidesToShow;return e.totalItems=o&&window.innerWidth<=r&&(l=n)})),l}},4949:function(e,t,i){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var n=i(3905),r=i(3989);t.getLookupTableForNextSlides=function(e,t,i,o){var a={},s=r.getSlidesToSlide(t,i);return Array(e).fill(0).forEach((function(e,i){var r=n.getOriginalCounterPart(i,t,o);if(0===i)a[0]=r;else{var l=a[i-1]+s;a[i]=l}})),a}},6867:function(e,t){"use strict";Object.defineProperty(t,"__esModule",{value:!0});t.getPartialVisibilityGutter=function(e,t,i,n){var r=0,o=n||i;return t&&o&&(r=e[o].partialVisibilityGutter||e[o].paritialVisibilityGutter),r},t.getWidthFromDeviceType=function(e,t){var i;return t[e]&&(i=(100/t[e].items).toFixed(1)),i},t.getItemClientSideWidth=function(e,t,i){return Math.round(i/(t+(e.centerMode?1:0)))}},8035:function(e,t,i){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var n=i(3905);t.getOriginalCounterPart=n.getOriginalCounterPart,t.getClones=n.getClones,t.checkClonesPosition=n.checkClonesPosition,t.getInitialSlideInInfiniteMode=n.getInitialSlideInInfiniteMode;var r=i(6867);t.getWidthFromDeviceType=r.getWidthFromDeviceType,t.getPartialVisibilityGutter=r.getPartialVisibilityGutter,t.getItemClientSideWidth=r.getItemClientSideWidth;var o=i(3989);t.getInitialState=o.getInitialState,t.getIfSlideIsVisbile=o.getIfSlideIsVisbile,t.getTransformForCenterMode=o.getTransformForCenterMode,t.getTransformForPartialVsibile=o.getTransformForPartialVsibile,t.isInLeftEnd=o.isInLeftEnd,t.isInRightEnd=o.isInRightEnd,t.notEnoughChildren=o.notEnoughChildren,t.getSlidesToSlide=o.getSlidesToSlide;var a=i(9950);t.throttle=a.default;var s=i(4802);t.throwError=s.default;var l=i(2558);t.populateNextSlides=l.populateNextSlides;var c=i(5210);t.populatePreviousSlides=c.populatePreviousSlides;var u=i(1097);t.populateSlidesOnMouseTouchMove=u.populateSlidesOnMouseTouchMove},1097:function(e,t){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.populateSlidesOnMouseTouchMove=function(e,t,i,n,r,o){var a,s,l=e.itemWidth,c=e.slidesToShow,u=e.totalItems,d=e.currentSlide,h=t.infinite,p=!1,m=Math.round((i-n)/l),f=Math.round((n-i)/l),g=i{let{children:t}=e;return n.createElement("h1",{className:"pb-1 mb-5 sm:mb-4 sm:leading-tight md:leading-tight lg:leading-tight font-bold text-center"},t)},C=e=>{let{website:t,children:i}=e;return n.createElement("div",{className:"flex flex-wrap justify-center text-2xl lg:text-2xl mb-6 sm:mb-5"},n.createElement("a",{className:"no-underline",href:t,target:"_blank"},i))},I=e=>{let{children:t}=e;return n.createElement("div",null,n.createElement("div",{className:"flex justify-center content-center"},n.createElement("p",{className:"font-semibold text-2xl sm:text-3xl m-1 sm:m-2"},"Abstract")),n.createElement("div",{className:"flex justify-center content-center"},n.createElement("p",{className:"text-justify font-light text-base sm:text-lg m-1 sm:m-1 max-w-[100%] sm:max-w-[620px]"},t)))},M=e=>{let{children:t,website:i,firstAuthor:r,affiliations:o,lastAuthor:a}=e;return n.createElement("span",{className:"text-center inline-block"},n.createElement("a",{href:i,target:"_blank",className:"font-normal no-underline text-stone-600 hover:underline underline-offset-3 hover:transition-all"},t),r||o?n.createElement("sup",{className:"pl-0.5"},r?n.createElement("span",{className:"font-bold"},"*"):null,o||null):null,a?null:n.createElement(n.Fragment,null,", "))},P=e=>{let{children:t,website:i,number:r}=e;return n.createElement("span",{className:"text-center inline-block mr-4"},n.createElement("sup",{className:"mr-0.5"},r),n.createElement("a",{href:i,target:"_blank",className:"font-light no-underline text-stone-600 hover:underline underline-offset-3 hover:transition-all"},t))},A=e=>{let{children:t,url:i,icon:r}=e;return n.createElement("span",{className:"text-center inline-block my-3.5 sm:my-2 mx-2"},n.createElement("a",{href:i,target:i.startsWith("#")?"_self":"_blank",className:"text-xl no-underline font-normal text-[#009cff] bg-[#f9f9f9] hover:bg-[#f4f4f4] hover:transition-all px-4 py-3 rounded-xl"},n.createElement("span",{className:"align-middle inline-flex justify-center mr-0.25"},r," "),n.createElement("span",null,t)))},k=e=>{let{children:t}=e;return n.createElement("div",{className:"mx-auto w-full max-w-[90%] format format-md md:format-base lg:max-w-5xl lg:format-lg format-blue dark:format-invert"},t)},R=e=>{let{children:t}=e;return n.createElement("main",{className:"pt-6 lg:pt-12 bg-white dark:bg-gray-900"},t)},N=()=>n.createElement("title",null,"ELM: Embodied Understanding of Driving Scenarios"),O={desktop:{breakpoint:{max:3e3,min:1024},items:3},tablet:{breakpoint:{max:1024,min:464},items:2},mobile:{breakpoint:{max:464,min:0},items:1}};var L=()=>n.createElement(n.Fragment,null,n.createElement(R,null,n.createElement(k,null,n.createElement(T,null,n.createElement("span",{className:"font-extrabold text-transparent bg-clip-text bg-gradient-to-r from-pink-500 via-indigo-600 to-emerald-400"},"ELM:")," ",n.createElement("span",{className:"text-stone-800"},"Embodied Understanding of Driving Scenarios")),n.createElement(C,{website:"https://eccv.ecva.net/"},n.createElement("span",{className:"font-normal text-stone-600 hover:text-transparent hover:bg-clip-text hover:bg-gradient-to-r hover:from-pink-500 hover:via-indigo-600 hover:to-emerald-400 hover:transition-all"},"ECCV 2024")),n.createElement("div",{className:"flex flex-wrap justify-center text-xl lg:text-xl mb-4"},n.createElement(M,{website:"https://zhouyunsong-sjtu.github.io/",firstAuthor:!0,affiliations:"1, 2"},"Yunsong Zhou"),n.createElement(M,{website:"https://github.com/DevLinyan",firstAuthor:!0,affiliations:"1"},"Linyan Huang"),n.createElement(M,{website:"https://github.com/retsuh-bqw",firstAuthor:!0,affiliations:"1, 2"},"Qingwen Bu"),n.createElement(M,{affiliations:"1"},"Jia Zeng"),n.createElement(M,{affiliations:"1"},"Tianyu Li"),n.createElement("br",null),n.createElement(M,{affiliations:"3"},"Hang Qiu"),n.createElement(M,{affiliations:"2"},"Hongzi Zhu"),n.createElement(M,{affiliations:"2"},"Minyi Guo"),n.createElement(M,{affiliations:"1"},"Yu Qiao"),n.createElement(M,{website:"https://lihongyang.info/",affiliations:"1",lastAuthor:!0},"Hongyang Li")),n.createElement("div",{className:"flex flex-wrap justify-center text-xl lg:text-xl mb-1"},n.createElement(P,{website:"https://opendrivelab.com/",number:"1"},"OpenDriveLab at Shanghai AI Lab"),n.createElement(P,{number:"2"},"Shanghai Jiao Tong University"),n.createElement(P,{number:"3"},"University of California, Riverside")),n.createElement("div",{className:"flex flex-wrap justify-center text-l lg:text-l"},n.createElement("span",{className:"text-stone-600 text-center"},n.createElement("sup",{className:"mr-0.5"},"*"),"Indicates equal contribution.")),n.createElement("p",{className:"flex flex-wrap justify-center"},n.createElement(A,{url:"https://arxiv.org/pdf/2403.04593",icon:n.createElement(r.yRW,null)},"Paper"),n.createElement(A,{url:"https://github.com/OpenDriveLab/ELM",icon:n.createElement(o.RrF,null)},"Code")),n.createElement("img",{src:l,alt:"teaser"}),n.createElement("div",{className:"flex justify-center"},n.createElement("p",{className:"text-justify text-xl !mt-0 !mb-2 font-medium max-w-[100%] md:max-w-[100%]"},"ELM is an embodied language model for understanding the long-horizon driving scenarios in space and time. Compared to the vanilla vision-language model (VLM) being confined to the scene description task, we expand a wide spectrum of new tasks to fully leverage the capability of large language models in an embodiment setting. ELM achieves significant improvements in various applications."))),n.createElement("div",{className:"my-6 pt-6 pb-4 bg-gradient-to-r from-pink-100/70 via-indigo-100/70 to-emerald-100/70"},n.createElement("div",{className:"mx-auto w-full max-w-[97.5%] lg:max-w-7xl py-2 md:py-4 px-2 md:px-4"},n.createElement("p",{className:"text-center text-lg md:text-xl md:max-w-[85%] mx-auto"},n.createElement(I,null,"Embodied scene understanding serves as the cornerstone for autonomous agents to perceive, interpret, and respond to open driving scenarios. Such understanding is typically founded upon Vision-Language Models (VLMs). Nevertheless, existing VLMs are restricted to the 2D domain, devoid of spatial awareness and long-horizon extrapolation proficiencies. We revisit the key aspects of autonomous driving and formulate appropriate rubrics. Hereby, we introduce the Embodied Language Model (ELM), a comprehensive framework tailored for agents' understanding of driving scenes with large spatial and temporal spans. ELM incorporates space-aware pre-training to endow the agent with robust spatial localization capabilities. Besides, the model employs time-aware token selection to accurately inquire about temporal cues. We instantiate ELM on the reformulated multi-faced benchmark, and it surpasses previous state-of-the-art approaches in all aspects.")))),n.createElement(k,null,n.createElement("h2",{className:"font-semibold border-b-[1px] !mb-4"},"Methodology"),n.createElement("h3",{id:"language-guided",className:"!mt-4"},"Model Overview"),n.createElement("img",{src:c,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Systematic Pipeline of ELM.")," It consists of Pre-training by open-world data corpus and Fine-tuning on diverse tasks. To initialize the Space-aware Pre-training, we collect extensive image-text pairs from the world, empowering ELM with spatial localization while preserving the description ability in driving scenarios. In the fine-tuning process, the inputs to ELM are videos, timestamps, and text prompts. After encoding the inputs into tokens, ELM leverages the proposed Time-aware Token Selection to gather the appropriate tokens as instructed by prompts. Finally, the tokens are sent to the language model to generate output texts."),n.createElement("h3",{id:"language-guided",className:"!mt-4"},"Space-aware Pre-training"),n.createElement("img",{src:u,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Annotation workflow with human quality check in the loop. For location labeling:")," we first select diverse templates from the GPT generated candidates. Pixel-point pairs as annotated in the nuScenes are then sampled and filled into the templates to form our location pre-training data.",n.createElement("b",null,"For description labeling:")," Node 4 utilizes LLaMA-Adapter V2 to obtain diverse labels on nuScenes, Waymo, YouTube, and Ego4D with predefined prompts. Two rounds of quality check are conducted in Node 3 and 7 by inspectors to guarantee the image and caption quality."),n.createElement("img",{src:d,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Statistics of pre-training data and comparison of data collection with other VLMs. "),"Our pre-train data surpasses that in general vision (top) and autonomous driving (middle) in terms of quantity and diversity. Anno: the type of annotations; Des: description; Loc: localization."),n.createElement("h2",{className:"font-semibold border-b-[1px] !mb-4"},"Experiments")),n.createElement("div",{className:"my-6 pt-6 pb-4 bg-gradient-to-r from-pink-100/70 via-indigo-100/70 to-emerald-100/70"},n.createElement("div",{className:"relative pb-8 mb-3"},n.createElement(s.default,{responsive:O,infinite:!0,showDots:!0,renderDotsOutside:!0},n.createElement("img",{src:m,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:p,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:f,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:g,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:v,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:y,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:b,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:E,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:w,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:x,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}),n.createElement("img",{src:S,alt:"Description of the image",style:{maxHeight:"800px",marginRight:"100px"}}))),n.createElement("p",{className:"text-center text-lg md:text-xl md:max-w-[85%] mx-auto"},n.createElement("b",null,"Visualization on the benchmark.")," We provide results for ten tasks and E2E planning through videos and corresponding QA pairs.")),n.createElement(k,null,n.createElement("h3",{id:"language-guided",className:"!mt-4"},"Results"),n.createElement("img",{src:h,alt:"Description of the image"}),n.createElement("p",null," ",n.createElement("b",null,"Comparison to State-of-the-arts. "),"All methods are fine-tuned on the corresponding tasks. The main metrics (%) are marked in gray. Bold emphasizes top method; underline marks the runner-up. C: CIDEr; R: ROUGE-L; B: BLEU."),n.createElement("h2",{id:"citation",className:"border-b-[1px]"},"Citation"),n.createElement("div",{className:"relative overflow-auto"},n.createElement("pre",{className:"bg-gradient-to-r from-pink-100 via-indigo-100 to-emerald-100 !my-0"},n.createElement("code",{id:"citation-bib",className:"font-medium text-slate-800"},"@article{zhou2024embodied,\n title={Embodied Understanding of Driving Scenarios},\n author={Zhou, Yunsong and Huang, Linyan and Bu, Qingwen and Zeng, Jia and Li, Tianyu and Qiu, Hang and Zhu, Hongzi and Guo, Minyi and Qiao, Yu and Li, Hongyang},\n journal={arXiv preprint arXiv:2403.04593},\n year={2024}\n}")),n.createElement("div",{className:"absolute top-0 right-0"},n.createElement("button",{className:"float-right text-2xl text-indigo-500 bg-white hover:bg-slate-50 hover:text-indigo-600 hover:transition-all rounded-full p-2 m-3 invisible md:visible",onClick:()=>{let e=document.getElementById("citation-bib"),t=document.createRange(),i=window.getSelection();null!=e&&null!=t&&null!=i&&(t.selectNode(e),i.removeAllRanges(),i.addRange(t))}},n.createElement(a.DV2,null))))),n.createElement("footer",{className:"flex flex-col justify-center bg-gray-50 mt-8 py-8"},n.createElement("div",{className:"flex justify-center align-middle text-lg"},n.createElement("a",{role:"button",className:"text-blue-500",onClick:()=>{window.scrollTo({top:0,behavior:"smooth"})}},n.createElement("span",{className:"align-text-top inline-flex justify-center mr-0.25"},n.createElement(r.ZTc,null)," "),n.createElement("span",null,"Back to Top"))))))},4405:function(e,t,i){"use strict";i.d(t,{w_:function(){return c}});var n=i(7294),r={color:void 0,size:void 0,className:void 0,style:void 0,attr:void 0},o=n.createContext&&n.createContext(r),a=function(){return a=Object.assign||function(e){for(var t,i=1,n=arguments.length;iMath.abs(o)))return;var l=a.populateSlidesOnMouseTouchMove(this.state,this.props,this.initialX,this.lastX,i,this.transformPlaceHolder),c=l.direction,u=l.nextPosition,d=l.canContinue;c&&(this.direction=c,d&&void 0!==u&&this.setTransformDirectly(u)),this.lastX=i}}},t.prototype.handleOut=function(e){this.props.autoPlay&&!this.autoPlay&&(this.autoPlay=setInterval(this.next,this.props.autoPlaySpeed));var t="touchend"===e.type&&!this.props.swipeable,i=("mouseleave"===e.type||"mouseup"===e.type)&&!this.props.draggable;if(!t&&!i&&this.onMove){if(this.setAnimationDirectly(!0),"right"===this.direction)if(this.initialX-this.lastX>=this.props.minimumTouchDrag){var n=Math.round((this.initialX-this.lastX)/this.state.itemWidth);this.next(n)}else this.correctItemsPosition(this.state.itemWidth,!0,!0);"left"===this.direction&&(this.lastX-this.initialX>this.props.minimumTouchDrag?(n=Math.round((this.lastX-this.initialX)/this.state.itemWidth),this.previous(n)):this.correctItemsPosition(this.state.itemWidth,!0,!0)),this.resetMoveStatus()}},t.prototype.isInViewport=function(e){var t=e.getBoundingClientRect(),i=t.top,n=void 0===i?0:i,r=t.left,o=void 0===r?0:r,a=t.bottom,s=void 0===a?0:a,l=t.right,c=void 0===l?0:l;return 0<=n&&0<=o&&s<=(window.innerHeight||document.documentElement.clientHeight)&&c<=(window.innerWidth||document.documentElement.clientWidth)},t.prototype.isChildOfCarousel=function(e){return!!(e instanceof Element&&this.listRef&&this.listRef.current)&&this.listRef.current.contains(e)},t.prototype.onKeyUp=function(e){var t=e.target;switch(e.keyCode){case 37:if(this.isChildOfCarousel(t))return this.previous();break;case 39:if(this.isChildOfCarousel(t))return this.next();break;case 9:if(this.isChildOfCarousel(t)&&t instanceof HTMLInputElement&&this.isInViewport(t))return this.next()}},t.prototype.handleEnter=function(e){s.isMouseMoveEvent(e)&&this.autoPlay&&this.props.autoPlay&&this.props.pauseOnHover&&(clearInterval(this.autoPlay),this.autoPlay=void 0)},t.prototype.goToSlide=function(e,i,n){var r=this;if(void 0===n&&(n=!0),!this.isInThrottle){var o=this.state.itemWidth,a=this.props,s=a.afterChange,l=a.beforeChange,c=this.state.currentSlide;"function"!=typeof l||i&&("object"!=typeof i||i.skipBeforeChange)||l(e,this.getState()),this.isAnimationAllowed=n,this.props.shouldResetAutoplay&&this.resetAutoplayInterval(),this.setState({currentSlide:e,transform:-o*e},(function(){r.props.infinite&&r.correctClonesPosition({domLoaded:!0}),"function"!=typeof s||i&&("object"!=typeof i||i.skipAfterChange)||(t.afterChangeTimeout3=setTimeout((function(){s(c,r.getState())}),r.props.transitionDuration||h))}))}},t.prototype.getState=function(){return this.state},t.prototype.renderLeftArrow=function(e){var t=this,i=this.props,n=i.customLeftArrow,r=i.rtl;return o.createElement(c.LeftArrow,{customLeftArrow:n,getState:function(){return t.getState()},previous:this.previous,disabled:e,rtl:r})},t.prototype.renderRightArrow=function(e){var t=this,i=this.props,n=i.customRightArrow,r=i.rtl;return o.createElement(c.RightArrow,{customRightArrow:n,getState:function(){return t.getState()},next:this.next,disabled:e,rtl:r})},t.prototype.renderButtonGroups=function(){var e=this,t=this.props.customButtonGroup;return t?o.cloneElement(t,{previous:function(){return e.previous()},next:function(){return e.next()},goToSlide:function(t,i){return e.goToSlide(t,i)},carouselState:this.getState()}):null},t.prototype.renderDotsList=function(){var e=this;return o.createElement(l.default,{state:this.state,props:this.props,goToSlide:this.goToSlide,getState:function(){return e.getState()}})},t.prototype.renderCarouselItems=function(){var e=[];if(this.props.infinite){var t=o.Children.toArray(this.props.children);e=a.getClones(this.state.slidesToShow,t)}return o.createElement(u.default,{clones:e,goToSlide:this.goToSlide,state:this.state,notEnoughChildren:a.notEnoughChildren(this.state),props:this.props})},t.prototype.render=function(){var e=this.props,t=e.deviceType,i=e.arrows,n=e.renderArrowsWhenDisabled,r=e.removeArrowOnDeviceType,s=e.infinite,l=e.containerClass,c=e.sliderClass,u=e.customTransition,h=e.additionalTransfrom,m=e.renderDotsOutside,f=e.renderButtonGroupOutside,g=e.className,v=e.rtl,y=a.getInitialState(this.state,this.props),b=y.shouldRenderOnSSR,E=y.shouldRenderAtAll,w=a.isInLeftEnd(this.state),x=a.isInRightEnd(this.state),S=i&&!(r&&(t&&-12*n?e+2*n:r>=i.length?i.length+e:e},t.getOriginalIndexLookupTableByClones=function(e,t){if(t.length>2*e){for(var i={},n=t.length-2*e,r=t.length-n,o=n,a=0;a2*e?t.slice(t.length-2*e,t.length).concat(t,t.slice(0,2*e)):t.concat(t,t)},t.getInitialSlideInInfiniteMode=function(e,t){return t.length>2*e?2*e:t.length},t.checkClonesPosition=function(e,t,i){var n,r=e.currentSlide,o=e.slidesToShow,a=e.itemWidth,s=e.totalItems,l=0,c=0,u=0===r,d=t.length-(t.length-2*o);return t.length2*o?((n=r>=d+t.length)&&(c=-a*(l=r-t.length)),u&&(c=-a*(l=d+(t.length-2*o)))):((n=r>=2*t.length)&&(c=-a*(l=r-t.length)),u&&(c=i.showDots?-a*(l=t.length):-a*(l=s/3))),{isReachingTheEnd:n,isReachingTheStart:u,nextSlide:l,nextPosition:c}}},3989:function(e,t,i){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var n=i(6867);function r(e){var t=e.slidesToShow;return e.totalItems=o&&window.innerWidth<=r&&(l=n)})),l}},4949:function(e,t,i){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var n=i(3905),r=i(3989);t.getLookupTableForNextSlides=function(e,t,i,o){var a={},s=r.getSlidesToSlide(t,i);return Array(e).fill(0).forEach((function(e,i){var r=n.getOriginalCounterPart(i,t,o);if(0===i)a[0]=r;else{var l=a[i-1]+s;a[i]=l}})),a}},6867:function(e,t){"use strict";Object.defineProperty(t,"__esModule",{value:!0});t.getPartialVisibilityGutter=function(e,t,i,n){var r=0,o=n||i;return t&&o&&(r=e[o].partialVisibilityGutter||e[o].paritialVisibilityGutter),r},t.getWidthFromDeviceType=function(e,t){var i;return t[e]&&(i=(100/t[e].items).toFixed(1)),i},t.getItemClientSideWidth=function(e,t,i){return Math.round(i/(t+(e.centerMode?1:0)))}},8035:function(e,t,i){"use strict";Object.defineProperty(t,"__esModule",{value:!0});var n=i(3905);t.getOriginalCounterPart=n.getOriginalCounterPart,t.getClones=n.getClones,t.checkClonesPosition=n.checkClonesPosition,t.getInitialSlideInInfiniteMode=n.getInitialSlideInInfiniteMode;var r=i(6867);t.getWidthFromDeviceType=r.getWidthFromDeviceType,t.getPartialVisibilityGutter=r.getPartialVisibilityGutter,t.getItemClientSideWidth=r.getItemClientSideWidth;var o=i(3989);t.getInitialState=o.getInitialState,t.getIfSlideIsVisbile=o.getIfSlideIsVisbile,t.getTransformForCenterMode=o.getTransformForCenterMode,t.getTransformForPartialVsibile=o.getTransformForPartialVsibile,t.isInLeftEnd=o.isInLeftEnd,t.isInRightEnd=o.isInRightEnd,t.notEnoughChildren=o.notEnoughChildren,t.getSlidesToSlide=o.getSlidesToSlide;var a=i(9950);t.throttle=a.default;var s=i(4802);t.throwError=s.default;var l=i(2558);t.populateNextSlides=l.populateNextSlides;var c=i(5210);t.populatePreviousSlides=c.populatePreviousSlides;var u=i(1097);t.populateSlidesOnMouseTouchMove=u.populateSlidesOnMouseTouchMove},1097:function(e,t){"use strict";Object.defineProperty(t,"__esModule",{value:!0}),t.populateSlidesOnMouseTouchMove=function(e,t,i,n,r,o){var a,s,l=e.itemWidth,c=e.slidesToShow,u=e.totalItems,d=e.currentSlide,h=t.infinite,p=!1,m=Math.round((i-n)/l),f=Math.round((n-i)/l),g=i