diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td b/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td index 9d75719e2..4d1e04b1c 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -21,13 +21,33 @@ def XeGPU_ScatteredAttr : XeGPUAttr<"Scattered", "scattered"> { } def XeGPU_SgMapAttr: XeGPUAttr<"SgMap", "sg_map"> { - let parameters = (ins - ArrayRefParameter<"unsigned">:$mmaBlockSize, + let parameters = (ins ArrayRefParameter<"unsigned">:$wiLayout, - ArrayRefParameter<"unsigned">:$wiData); - + ArrayRefParameter<"unsigned">:$wiData, + ArrayRefParameter<"unsigned">:$mmaBlockSize); + // In format of #xegpu.sg_map<{mma_block_size = [2, 4], wi_layout = [2, 4], wi_data = [2, 4]}> let assemblyFormat = "`<` custom($mmaBlockSize, $wiLayout, $wiData) `>`"; + + let extraClassDeclaration = [{ + bool hasMMABlockSizeAttr() { + return getMmaBlockSize().size() == 2; + } + }]; + + let builders = [ + AttrBuilder<(ins + "::llvm::ArrayRef":$wiLayout, + "::llvm::ArrayRef":$wiData, + CArg<"::llvm::ArrayRef", "{}">:$mmaBlockSize + ), [{ + assert(wiLayout.size() == 2 && wiData.size() == 2 && "wiLayout and wiData should be 2D arrays.\n"); + assert((mmaBlockSize.size() == 2 || mmaBlockSize.size() == 0) && "mmaBlockSize can be either empty or a 2D array.\n"); + return $_get($_ctxt, wiLayout, wiData, mmaBlockSize); + }]> + ]; + + let skipDefaultBuilders = 1; } def XeGPU_WgMapAttr: XeGPUAttr<"WgMap", "wg_map"> { @@ -35,6 +55,17 @@ def XeGPU_WgMapAttr: XeGPUAttr<"WgMap", "wg_map"> { ArrayRefParameter<"unsigned">:$sgLayout, ArrayRefParameter<"unsigned">:$sgData); + let builders = [ + AttrBuilder<(ins + "::llvm::ArrayRef":$sgLayout, + "::llvm::ArrayRef":$sgData + ), [{ + assert(sgLayout.size() == 2 && sgData.size() == 2 && "sgLayout and sgData should be 2D arrays.\n"); + return $_get($_ctxt, sgLayout, sgData); + }]> + ]; + let skipDefaultBuilders = 1; + // In format of #xegpu.wg_map<{sg_layout = [2, 4], sg_data = [2, 4]}> let assemblyFormat = "`<` custom($sgLayout, $sgData) `>`"; } @@ -44,7 +75,24 @@ def XeGPU_XeMapAttr: XeGPUAttr<"XeMap", "xe_map"> { XeGPU_WgMapAttr: $wg, XeGPU_SgMapAttr: $sg); - // In format of #xegpu.xe_map + let builders = [ + AttrBuilder<(ins + "::llvm::ArrayRef":$sgLayout, + "::llvm::ArrayRef":$sgData, + "::llvm::ArrayRef":$wiLayout, + "::llvm::ArrayRef":$wiData, + CArg<"::llvm::ArrayRef", "{}">:$mmaBlockSize + ), [{ + assert(sgLayout.size() == 2 && sgData.size() == 2 && "sgLayout and sgData should be 2D arrays.\n"); + assert(wiLayout.size() == 2 && wiData.size() == 2 && "wiLayout and wiData should be 2D arrays.\n"); + assert((mmaBlockSize.size() == 2 || mmaBlockSize.size() == 0) && "mmaBlockSize can be either empty or a 2D array.\n"); + auto wg = WgMapAttr::get($_ctxt, sgLayout, sgData); + auto sg = SgMapAttr::get($_ctxt, wiLayout, wiData, mmaBlockSize); + return $_get($_ctxt, wg, sg); + }]> + ]; + + // In format of #xegpu.xe_map let hasCustomAssemblyFormat = 1; } diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUOps.h b/include/imex/Dialect/XeGPU/IR/XeGPUOps.h index 1248f6964..073587bdd 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUOps.h +++ b/include/imex/Dialect/XeGPU/IR/XeGPUOps.h @@ -46,41 +46,6 @@ class TensorDescType; } // namespace xegpu } // namespace imex -namespace imex { -namespace xegpu { - -class BaseTensorDescType : public mlir::Type, - public mlir::ShapedType::Trait { -public: - using Type::Type; - - /// Returns the element type of this tensor type. 
- mlir::Type getElementType() const; - - /// Returns if this type is ranked, i.e. it has a known number of dimensions. - bool hasRank() const; - - /// Returns the shape of this tensor type. - llvm::ArrayRef getShape() const; - - /// Clone this type with the given shape and element type. If the - /// provided shape is `None`, the current shape of the type is used. - BaseTensorDescType cloneWith(std::optional> shape, - mlir::Type elementType) const; - - /// Return true if the specified element type is ok in a tensor. - static bool isValidElementType(Type type); - - /// Methods for support type inquiry through isa, cast, and dyn_cast. - static bool classof(Type type); - - /// Allow implicit conversion to ShapedType. - operator mlir::ShapedType() const { return cast(); } -}; - -} // namespace xegpu -} // namespace imex - #include #include #define GET_ATTRDEF_CLASSES diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td index e2105c092..725299744 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUOps.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUOps.td @@ -54,8 +54,7 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe both shape and strides are required to to carry the respect information. Otherwise, the operator is invalid. - The operation also supports two attributes: - * memory_scope (MemoryScopeAttr): indicates where the memory is located, "global" for global memory (default), and "slm" for shared memory. + The operation also supports the following attribute: * boundary_check (BoolAttr): indicates whether the operation detects the boundary and pads with zero for out-of-boundary access (default) Example 1 (suppose the tensor shape inferred by the compiler is 8x16): @@ -82,7 +81,6 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe Variadic: $shape, Variadic: $strides, DenseI64ArrayAttr: $static_offsets, - DefaultValuedAttr: $memory_scope, DefaultValuedAttr: $boundary_check, DefaultValuedAttr: $mode); @@ -95,23 +93,40 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe let builders = [ OpBuilder<(ins "::mlir::Type": $TensorDesc, "::mlir::Value": $source, "::mlir::ValueRange": $offsets, "::mlir::ValueRange": $shape, "::mlir::ValueRange": $strides, "::llvm::ArrayRef": $static_offsets, - CArg<"::imex::xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, CArg<"bool", "true">: $boundary_check, CArg<"::imex::xegpu::Mode", "imex::xegpu::Mode::SIMT">: $mode), - [{ $_state.addOperands(source); + [{ + auto staticDims = std::count_if(static_offsets.begin(), static_offsets.end(), + [](int64_t d) { return !mlir::ShapedType::isDynamic(d); }); + auto dynamicDims = std::count_if(static_offsets.begin(), static_offsets.end(), + [](int64_t d) { return mlir::ShapedType::isDynamic(d); }); + + auto dims = offsets.size() + staticDims; + assert((isStaticShapedMemRef(source) && + dims == getRankOf(source) && + shape.size() == 0 && + strides.size() == 0 + ) || + ((!isMemRef(source) || dims == getRankOf(source)) && + shape.size() != 0 && + dims == shape.size() && + shape.size() == strides.size() + ) + ); + assert(offsets.size() == dynamicDims); + + $_state.addOperands(source); $_state.addOperands(offsets); $_state.addOperands(shape); $_state.addOperands(strides); $_state.addAttribute(getOperandSegmentSizesAttrName($_state.name), $_builder.getDenseI32ArrayAttr({1, static_cast(offsets.size()), static_cast(shape.size()), static_cast(strides.size())})); 
$_state.addAttribute(getStaticOffsetsAttrName($_state.name), $_builder.getDenseI64ArrayAttr(static_offsets)); - $_state.addAttribute(getMemoryScopeAttrName($_state.name), ::imex::xegpu::MemoryScopeAttr::get($_builder.getContext(), memory_scope)); $_state.addAttribute(getBoundaryCheckAttrName($_state.name), $_builder.getBoolAttr(boundary_check)); - $_state.addAttribute(getBoundaryCheckAttrName($_state.name), ::imex::xegpu::ModeAttr::get($_builder.getContext(), mode)); + $_state.addAttribute(getModeAttrName($_state.name), ::imex::xegpu::ModeAttr::get($_builder.getContext(), mode)); $_state.addTypes(TensorDesc); }]>, OpBuilder<(ins "::mlir::Type": $tdesc, "::mlir::Value": $source, "::llvm::ArrayRef": $offsets, - CArg<"::imex::xegpu::MemoryScope", "::imex::xegpu::MemoryScope::GLOBAL">:$memory_scope, CArg<"bool", "true">:$boundary_check, CArg<"::imex::xegpu::Mode", "imex::xegpu::Mode::SIMT">: $mode), - [{ assert(offsets.size() == getRankOf(source)); + [{ assert(isStaticShapedMemRef(source) && offsets.size() == getRankOf(source)); llvm::SmallVector staticOffsets; llvm::SmallVector dynamicOffsets; dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); @@ -120,16 +135,14 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe ::mlir::ValueRange({}) /* empty dynamic shape */, ::mlir::ValueRange({}) /* empty dynamic strides */, staticOffsets /* static offsets */, - memory_scope, boundary_check, mode); }]>, OpBuilder<(ins "::mlir::Type": $tdesc, "::mlir::Value": $source, "::llvm::ArrayRef": $offsets, "::mlir::ValueRange": $shape, "::mlir::ValueRange": $stride, - CArg<"::imex::xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">:$memory_scope, CArg<"bool", "true">:$boundary_check, CArg<"::imex::xegpu::Mode", "imex::xegpu::Mode::SIMT">: $mode), - [{ assert((!isMemRef(source) || getRankOf(source) == offsets.size()) && shape.size() == stride.size() && + [{ assert((!isMemRef(source) || getRankOf(source) == offsets.size()) && shape.size() != 0 && shape.size() == stride.size() && offsets.size() == shape.size() && isIntegerOrDynamicShapedMemref(source)); llvm::SmallVector staticOffsets; @@ -141,7 +154,6 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe shape /* empty dynamic shape */, stride /* empty dynamic strides */, staticOffsets /* static offsets */, - memory_scope, boundary_check, mode); }]> ]; @@ -172,6 +184,59 @@ def XeGPU_CreateNdDescOp : XeGPU_Op<"create_nd_tdesc", [Pure, AttrSizedOperandSe assert(0 && "Unreachable"); } + void getOffsets(llvm::SmallVectorImpl &offsets) { + auto dynamicOffsets = getOffsets(); //dynamic offsets + auto staticOffsets = getStaticOffsets(); + + if (staticOffsets.size() == 0) { + offsets.assign(dynamicOffsets.begin(), dynamicOffsets.end()); + return; + } + + for (size_t i = 0, j = 0; i < staticOffsets.size(); i++) { + if (mlir::ShapedType::isDynamic(staticOffsets[i])) { + assert(j < dynamicOffsets.size()); + offsets.push_back(dynamicOffsets[j++]); + } else { + auto attr = mlir::IntegerAttr::get(mlir::IndexType::get(getContext()), staticOffsets[i]); + offsets.push_back(attr); + } + } + } + + void getShape(llvm::SmallVectorImpl &shape) { + if (isIntegerOrDynamicShapedMemref(getSource())) { + shape.append(getShape().begin(), getShape().end()); + } else { + for (auto dim: getSourceType().cast<::mlir::MemRefType>().getShape()) { + auto attr = mlir::IntegerAttr::get(mlir::IndexType::get(getContext()), dim); + shape.push_back(attr); + } + } + } + + void getStrides(llvm::SmallVectorImpl &strides) { + if 
(isIntegerOrDynamicShapedMemref(getSource())) { + strides.append(getStrides().begin(), getStrides().end()); + } else { + auto [staticStrides, offset] = mlir::getStridesAndOffset(getSourceType().cast()); + for (auto dim: staticStrides) { + auto attr = mlir::IntegerAttr::get(mlir::IndexType::get(getContext()), dim); + strides.push_back(attr); + } + } + } + + size_t getNumStaticOffsets() { + return std::count_if(getStaticOffsets().begin(), getStaticOffsets().end(), + [](int64_t dSize) { return !mlir::ShapedType::isDynamic(dSize); }); + } + + size_t getNumDynamicOffsets() { + return std::count_if(getStaticOffsets().begin(), getStaticOffsets().end(), + [](int64_t dSize) { return mlir::ShapedType::isDynamic(dSize); }); + } + size_t getOffsetsRank() { return getOffsets().size() + std::count_if(getStaticOffsets().begin(), getStaticOffsets().end(), [](int64_t dSize) { return !mlir::ShapedType::isDynamic(dSize); }); @@ -259,7 +324,6 @@ def XeGPU_CreateDescOp supportted group size, e.g., vector<16xindex>. And each element in the vector corresponds to a work item (SIMT lane) in the subgroup. In SIMT mode (default), it is an index scalar representing the offset of the access point. - * memory_scope: [optional attribute] indicates where the memory is located, "global" for global memory (default), and "slm" for shared memory. * chunk_size_per_lane: [optional attribute] indicates number of continious elements accessed for each offset, default is 1. Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] @@ -283,7 +347,6 @@ def XeGPU_CreateDescOp let arguments = (ins XeGPU_BaseAddrType: $source, XeGPU_OffsetType: $offsets, - DefaultValuedAttr: $memory_scope, DefaultValuedAttr: $chunk_size_per_lane, DefaultValuedAttr: $mode); @@ -303,7 +366,7 @@ def XeGPU_CreateDescOp }]; - // Format: xegpu.create_tdesc %src, %offsets {mode=simt, memory_scope=slm, chunk_size_per_lane=1} + // Format: xegpu.create_tdesc %src, %offsets {mode=simt, chunk_size_per_lane=1} // : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> let hasCustomAssemblyFormat = 1; let hasVerifier = 1; diff --git a/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td b/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td index e7f0723f4..b15072834 100644 --- a/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/include/imex/Dialect/XeGPU/IR/XeGPUTypes.td @@ -23,18 +23,15 @@ include "imex/Dialect/XeGPU/IR/XeGPUDialect.td" // An Integer array attribute with fixed 2 elements. 
def XeGPU_IntArrayAttr2: ConfinedAttr]>; def XeGPU_IntType: AnyTypeOf<[I1, I8, I16, I32, I64, SI1, SI8, SI16, SI32, SI64, UI1, UI8, UI16, UI32, UI64]>; -def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, F8E4M3FN, F8E5M2, F8E4M3FNUZ, F8E4M3B11FNUZ, F8E5M2FNUZ]>; +def XeGPU_FloatType: AnyTypeOf<[F16, F32, F64, BF16, TF32]>; def XeGPU_ScalarType: AnyTypeOf<[XeGPU_IntType, XeGPU_FloatType]>; -def XeGPU_BaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1, 2]>, UI64]>; +def XeGPU_BaseAddrType: AnyTypeOf<[MemRefRankOf<[XeGPU_ScalarType], [1, 2]>, UI64, UI32, I64, I32]>; def XeGPU_DpasOpType: VectorOfRankAndType<[2, 3], [XeGPU_ScalarType]>; def XeGPU_OffsetType: AnyTypeOf<[VectorOfRankAndType<[1], [Index]>, Index]>; def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1,2], [I1]>, I1]>; def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3], [XeGPU_ScalarType]>, XeGPU_ScalarType]>; -// def XeGPU_VectorType: VectorOfRankAndType<[1,2,3], [XeGPU_ScalarType]>; -// def XeGPU_Vector3DType: VectorOfRankAndType<[3], [XeGPU_ScalarType]>; def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>; -// def XeGPU_Vector1DType: VectorOfRankAndType<[1], [XeGPU_ScalarType]>; // common base class for types in XeGPU dialect class XeGPUTypeDef:$shape, "::mlir::Type":$elementType, + DefaultValuedParameter<"::imex::xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, OptionalParameter<"::mlir::Attribute"> :$encoding); let builders = [ TypeBuilderWithInferredContext<(ins "::llvm::ArrayRef":$shape, "::mlir::Type":$elementType, + CArg<"::imex::xegpu::MemoryScope", "xegpu::MemoryScope::GLOBAL">: $memory_scope, CArg<"::mlir::Attribute", "{}">:$encoding ), [{ - return $_get(elementType.getContext(), shape, elementType, encoding); + return $_get(elementType.getContext(), shape, elementType, memory_scope, encoding); }]> ]; @@ -100,8 +99,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", } }]; - // let assemblyFormat = "`<` custom($shape, $elementType) (`,` custom($encoding)^)? `>`"; - let assemblyFormat = "`<` custom($shape, $elementType) (`,` $encoding^)? 
`>`"; + let assemblyFormat = "`<` custom($shape, $elementType)``custom($memory_scope, $encoding)`>`"; } #endif // _XEGPU_TYPES_TD_INCLUDED_ diff --git a/include/imex/Utils/XeUtils.h b/include/imex/Utils/XeUtils.h index ffb1660b5..1283987cd 100644 --- a/include/imex/Utils/XeUtils.h +++ b/include/imex/Utils/XeUtils.h @@ -38,7 +38,8 @@ template static std::string makeString(T array) { os << "["; for (auto i = 1; i < array.size(); i++) os << array[i - 1] << ", "; - os << array[array.size() - 1] << "]"; + if (array.size()) os << array[array.size() - 1] ; + os << "]"; os.flush(); return buf; } diff --git a/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 83b31b8e2..319d13552 100644 --- a/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -77,9 +77,54 @@ static void printShapeAndType(mlir::AsmPrinter &printer, printer << type; } -template -static mlir::LogicalResult parseArrayList(mlir::AsmParser &parser, - llvm::SmallVector &array, +static mlir::LogicalResult parseTensorDescAttr(mlir::AsmParser &parser, + imex::xegpu::MemoryScope &scope, + mlir::Attribute &encoding) { + // implies no attrbutes + if (mlir::failed(parser.parseOptionalComma())) + return mlir::success(); + + auto parseElt = [&]() -> mlir::ParseResult { + llvm::StringRef nameId; + + if (!parser.parseOptionalKeyword(&nameId, {"memory_scope"})) { + auto loc = parser.getCurrentLocation(); + if(parser.parseEqual()) + return mlir::failure(); + + auto attrOptional = ::mlir::FieldParser<::imex::xegpu::MemoryScope, ::imex::xegpu::MemoryScope>::parse(parser); + if(mlir::failed(attrOptional)) + return parser.emitError(loc, "Invalid memory scope attribute specification.\n"); + scope = *attrOptional; + return mlir::success(); + } else { + auto loc = parser.getCurrentLocation(); + auto attrOptional = ::mlir::FieldParser<::mlir::Attribute>::parse(parser); + if(mlir::failed(attrOptional)) + return parser.emitError(loc, "Failed to parse XeGPU_TensorDesc parameter 'encoding' which is to be a `::mlir::Attribute`.\n"); + encoding = *attrOptional; + return mlir::success(); + } + llvm_unreachable("Unexpected."); + }; + + if (parser.parseCommaSeparatedList(parseElt)) + return mlir::failure(); + + return mlir::success(); +} + +static void printTensorDescAttr(mlir::AsmPrinter &printer, + imex::xegpu::MemoryScope scope, + mlir::Attribute encoding) { + if (scope != imex::xegpu::MemoryScope::GLOBAL) + printer << ", memory_scope = " << scope; + if (encoding) printer << ", " << encoding; +} + +template +static mlir::LogicalResult parseArrayList(mlir::AsmParser &parser, + llvm::SmallVector &array, bool parsePrecedenceEqual = false) { mlir::FailureOr> result; // Parse literal '=' @@ -121,38 +166,27 @@ static mlir::LogicalResult parseSgMapAttrElements( auto loc = parser.getCurrentLocation(); auto parseElt = [&]() -> mlir::LogicalResult { return mlir::AsmParser::KeywordSwitch(parser) - .Case("mma_block_size", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, mmaBlockSize, true); - }) - .Case("wi_layout", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, layout, true); - }) - .Case("wi_data", - [&](llvm::StringRef, llvm::SMLoc) { - return parseArrayList(parser, data, true); - }) - .Default([&](llvm::StringRef keyword, llvm::SMLoc) { - llvm::dbgs() << "\n3. Default currLoc: " - << llvm::StringRef( - parser.getCurrentLocation().getPointer()) - << "\n"; - llvm::dbgs() << "\n3. 
keyword: " << keyword << "\n"; - return mlir::failure(); - }); - }; - - if (parser.parseLBrace()) - return mlir::failure(); - if (parser.parseCommaSeparatedList(parseElt)) - return mlir::failure(); - if (parser.parseRBrace()) - return mlir::failure(); - if (mmaBlockSize.size() != 2) { - parser.emitError(loc, - "failed to parse SgMapAttr: missing mma_block_size which " - "is to be a `llvm::ArrayRef` with size 2"); + .Case("mma_block_size", [&](llvm::StringRef, llvm::SMLoc) { + return parseArrayList(parser, mmaBlockSize, true); + }) + .Case("wi_layout", [&](llvm::StringRef, llvm::SMLoc) { + return parseArrayList(parser, layout, true); + }) + .Case("wi_data", [&](llvm::StringRef, llvm::SMLoc) { + return parseArrayList(parser, data, true); + }) + .Default([&](llvm::StringRef keyword, llvm::SMLoc) { + parser.emitError(loc, "SgMapAttr parser met an unexpected keyword: ") << keyword << "\n"; + return mlir::failure(); + }); + }; + + if (parser.parseLBrace()) return mlir::failure(); + if (parser.parseCommaSeparatedList(parseElt)) return mlir::failure(); + if (parser.parseRBrace()) return mlir::failure(); + if (mmaBlockSize.size() != 2 && mmaBlockSize.size() != 0) { + parser.emitError(loc, "failed to parse SgMapAttr: mma_block_size should be a `llvm::ArrayRef` " + "with size 2 or empty. But it got ") << mmaBlockSize.size() << ".\n" ; return mlir::failure(); } if (layout.size() != 2) { @@ -173,8 +207,10 @@ static void printSgMapAttrElements(mlir::AsmPrinter &printer, llvm::ArrayRef layout, llvm::ArrayRef data) { printer << "{"; - printArrayElement(printer, "mma_block_size", mmaBlockSize); - printer << "," << ' '; + if (mmaBlockSize.size()) { + printArrayElement(printer, "mma_block_size", mmaBlockSize); + printer << "," << ' '; + } printArrayElement(printer, "wi_layout", layout); printer << "," << ' '; printArrayElement(printer, "wi_data", data); @@ -237,39 +273,30 @@ mlir::Attribute XeMapAttr::parse(mlir::AsmParser &parser, mlir::Type type) { if (parser.parseLess()) return {}; - auto parseElt = [&]() -> mlir::ParseResult { - mlir::OptionalParseResult result = - mlir::AsmParser::KeywordSwitch(parser) - .Case("sg", - [&](llvm::StringRef, llvm::SMLoc) { - if (parser.parseEqual()) - return mlir::failure(); - llvm::SmallVector mmaBlockSize; - llvm::SmallVector wiLayout; - llvm::SmallVector wiData; - if (mlir::failed(parseSgMapAttrElements( - parser, mmaBlockSize, wiLayout, wiData))) - return mlir::failure(); - sg = imex::xegpu::SgMapAttr::get( - parser.getContext(), mmaBlockSize, wiLayout, wiData); - return mlir::success(!!sg); - }) - .Case("wg", - [&](llvm::StringRef, llvm::SMLoc) { - if (parser.parseEqual()) - return mlir::failure(); - llvm::SmallVector sgLayout; - llvm::SmallVector sgData; - if (mlir::failed( - parseWgMapAttrElements(parser, sgLayout, sgData))) - return mlir::failure(); - wg = imex::xegpu::WgMapAttr::get(parser.getContext(), - sgLayout, sgData); - return mlir::success(!!wg); - }) - .Default([&](llvm::StringRef keyword, llvm::SMLoc) { - return std::nullopt; - }); + auto parseElt = [&]() -> mlir::ParseResult { + mlir::OptionalParseResult result = mlir::AsmParser::KeywordSwitch(parser) + .Case("sg", [&](llvm::StringRef, llvm::SMLoc) { + if (parser.parseEqual()) return mlir::failure(); + llvm::SmallVector mmaBlockSize; + llvm::SmallVector wiLayout; + llvm::SmallVector wiData; + if (mlir::failed(parseSgMapAttrElements(parser, mmaBlockSize, wiLayout, wiData))) + return mlir::failure(); + sg = imex::xegpu::SgMapAttr::get(parser.getContext(), wiLayout, wiData, mmaBlockSize); + return
mlir::success(!!sg); + }) + .Case("wg", [&](llvm::StringRef, llvm::SMLoc) { + if (parser.parseEqual()) return mlir::failure(); + llvm::SmallVector sgLayout; + llvm::SmallVector sgData; + if(mlir::failed(parseWgMapAttrElements(parser, sgLayout, sgData))) + return mlir::failure(); + wg = imex::xegpu::WgMapAttr::get(parser.getContext(), sgLayout, sgData); + return mlir::success(!!wg); + }) + .Default([&](llvm::StringRef keyword, llvm::SMLoc) { + return std::nullopt; + }); return result.value(); }; diff --git a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index a8f0a8aa6..dcfdbd0b4 100644 --- a/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -70,15 +70,6 @@ static void transpose(llvm::ArrayRef trans, shape[i] = old[trans[i]]; }; -static void dropOnes(std::vector &array) { - std::vector old = array; - array.clear(); - for (auto v : old) { - if (v != 1) - array.push_back(v); - } -}; - static bool isMappingAttr(mlir::Attribute attr) { return attr && (llvm::isa(attr) || llvm::isa(attr) || @@ -129,8 +120,7 @@ static mlir::ParseResult parseCustomEnumAttr(mlir::OpAsmParser &parser, auto loc = parser.getCurrentLocation(); auto attrOptional = mlir::FieldParser::parse(parser); if (mlir::failed(attrOptional)) - return parser.emitError(loc, "invalid ") - << "memory_scope attribute specification"; + return parser.emitError(loc, "invalid ") << "attribute specification"; auto attr = CustomEnumAttr::get(parser.getBuilder().getContext(), *attrOptional); result.addAttribute(attrKeyword, attr); @@ -187,10 +177,6 @@ parseOptionalAttrDict(mlir::OpAsmParser &parser, mlir::OperationState &result, if (parser.parseEqual()) return ::mlir::failure(); - if (nameId == "memory_scope") - return parseCustomEnumAttr(parser, result, - nameId); - if (nameId == "l1_hint" || nameId == "l2_hint" || nameId == "l3_hint") { if (isWrite) return parseCustomEnumAttr( @@ -296,8 +282,7 @@ mlir::ParseResult CreateNdDescOp::parse(mlir::OpAsmParser &parser, return ::mlir::failure(); } - if (parseOptionalAttrDict(parser, result, - {"memory_scope", "boundary_check", "mode"})) + if (parseOptionalAttrDict(parser, result, {"boundary_check", "mode"})) return mlir::failure(); if (parser.parseColon()) @@ -359,8 +344,6 @@ void CreateNdDescOp::print(::mlir::OpAsmPrinter &printer) { printer << ' ' << "{"; printer << "mode = " << getMode(); printer << "," << ' '; - printer << "memory_scope = " << getMemoryScope(); - printer << "," << ' '; printer << "boundary_check = " << getBoundaryCheck(); printer << "}"; @@ -414,8 +397,7 @@ mlir::ParseResult CreateDescOp::parse(mlir::OpAsmParser &parser, if (parser.parseOperand(offsetsRawOperands[0])) return mlir::failure(); - if (parseOptionalAttrDict(parser, result, - {"memory_scope", "chunk_size_per_lane", "mode"})) + if (parseOptionalAttrDict(parser, result, {"chunk_size_per_lane", "mode"})) return mlir::failure(); if (parser.parseColon()) @@ -460,8 +442,6 @@ void CreateDescOp::print(::mlir::OpAsmPrinter &printer) { printer << ' ' << "{"; printer << "mode = " << getMode(); printer << "," << ' '; - printer << "memory_scope = " << getMemoryScope(); - printer << "," << ' '; printer << "chunk_size_per_lane = " << getChunkSizePerLane(); printer << "}"; @@ -664,7 +644,6 @@ mlir::LogicalResult LoadNDOp::verify() { "tdescShape[i] % sgData[i] == 0"); tdescShape[i] /= sgLayout[i]; } - // dropOnes(tdescShape); } if (sgMap) { @@ -675,16 +654,23 @@ mlir::LogicalResult LoadNDOp::verify() { if (tdescShape[i] % blockSize[i] != 0 || blockSize[i] % wiLayout[i] != 0 || 
blockSize[i] % wiData[i] != 0 || blockSize[i] % (wiLayout[i] * wiData[i]) != 0) { - return emitOpError( - "Invalid SgMapAttr. It should meet the following conditions: " - "blockSize[i] % wiLayout[i] == 0 && " - "blockSize[i] % wiData[i] == 0 && " - "blockSize[i] % wiData[i] == 0 && " - "tdescShape[i] % blockSize[i] == 0"); + return emitOpError("Invalid SgMapAttr. It should meet the following conditions: " + "tdescShape[i] % blockSize[i] == 0 && " + "blockSize[i] % wiLayout[i] == 0 && " + "blockSize[i] % wiData[i] == 0 && " + "blockSize[i] % (wiLayout[i] * wiData[i]) == 0 "); + } - auto tmp = blockSize[i] / wiLayout[i]; - tdescShape[i] /= blockSize[i]; - tdescShape[i] *= tmp; + } + + for (size_t i = 0; i < wiLayout.size(); i++) { + if (tdescShape[i] % wiData[i] != 0 || + tdescShape[i] % (wiLayout[i] * wiData[i]) != 0) { + return emitOpError("Invalid SgMapAttr. It should meet the following conditions: " + "tdescShape[i] % wiData[i] == 0 && " + "tdescShape[i] % (wiLayout[i] * wiData[i]) == 0 "); + } + tdescShape[i] /= wiLayout[i]; + } } } @@ -702,8 +688,8 @@ mlir::LogicalResult LoadNDOp::verify() { auto vnni_factor = valueShape.back(); tdescShape[axis] /= vnni_factor; tdescShape.push_back(vnni_factor); - dropOnes(tdescShape); } + if (tdescShape != valueShape) return emitOpError( "Result shape doesn't match TensorDesc shape." @@ -879,14 +865,29 @@ mlir::LogicalResult StoreNDOp::verify() { auto wiLayout = sgMap.getWiLayout(); auto wiData = sgMap.getWiData(); for (size_t i = 0; i < shape.size(); i++) { - assert(blockSize[i] % (wiLayout[i] * wiData[i]) == 0); - assert(blockSize[i] % wiLayout[i] == 0); - assert(blockSize[i] % wiData[i] == 0); - assert(shape[i] % blockSize[i] == 0); - auto tmp = blockSize[i] / wiLayout[i]; - shape[i] /= blockSize[i]; - shape[i] *= tmp; + if (blockSize[i] % (wiLayout[i] * wiData[i]) != 0 || + blockSize[i] % wiLayout[i] != 0 || + blockSize[i] % wiData[i] != 0 || + shape[i] % blockSize[i] != 0) { + return emitOpError("Invalid SgMapAttr. It should meet the following conditions: " + "tdescShape[i] % blockSize[i] == 0 && " + "blockSize[i] % wiLayout[i] == 0 && " + "blockSize[i] % wiData[i] == 0 && " + "blockSize[i] % (wiLayout[i] * wiData[i]) == 0 "); + + } } + + for (size_t i = 0; i < wiLayout.size(); i++) { + if (shape[i] % wiData[i] != 0 || + shape[i] % (wiLayout[i] * wiData[i]) != 0) { + return emitOpError("Invalid SgMapAttr.
It should meet the following conditions: " + "tdescShape[i] % wiData[i] == 0 && " + "tdescShape[i] % (wiLayout[i] * wiData[i]) == 0 "); + } + shape[i] /= wiLayout[i]; + } + } if (shape != valTy.getShape().vec()) @@ -977,8 +978,8 @@ mlir::LogicalResult DpasOp::verify() { // return emitOpError("Incorrect shapes for dpas op"); // } - if (lhsRank != rhsRank) { - return emitOpError("lhs and rhs rank does not match for dpas op"); + if (lhsRank != rhsRank || lhsRank != 3) { + return emitOpError("lhs and rhs rank does not match for dpas op, or their rank is not 3."); } return mlir::success(); @@ -1134,7 +1135,6 @@ mlir::LogicalResult LoadGatherOp::verify() { auto vnni_factor = valueShape.back(); tdescShape[axis] /= vnni_factor; tdescShape.push_back(vnni_factor); - dropOnes(tdescShape); } if (valueShape != tdescShape) diff --git a/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/test/Dialect/XeGPU/IR/XeGPUOps.mlir index 98bb5b15e..cf1f34f60 100644 --- a/test/Dialect/XeGPU/IR/XeGPUOps.mlir +++ b/test/Dialect/XeGPU/IR/XeGPUOps.mlir @@ -25,10 +25,10 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered> - %1 = xegpu.create_tdesc %src, %offsets {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered> + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered> + %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered> return } @@ -53,13 +53,13 @@ func.func @test_store_nd_vc(%src: memref<24x32xf16>, %dst: memref<24x32xf16>) { %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -92,7 +92,7 @@ func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) { %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> @@ -112,7 +112,7 @@ func.func @test_update_nd_offset_vc(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_prefetch_nd_vc({{.*}}) { func.func @test_prefetch_nd_vc(%src: memref<24x32xf16>, %x : index, %y : index) { // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, 
boundary_check = true} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.prefetch_nd diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir index f23e6f659..72d8681d2 100644 --- a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir +++ b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir @@ -67,8 +67,9 @@ func.func @test_create_nd_tdesc_vc_5(%src: memref, %w : index, %h : ind %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc, memory_scope = slm} : memref -> !xegpu.tensor_desc<8x16xf32> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, memory_scope = slm> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc} + : memref -> !xegpu.tensor_desc<8x16xf32, memory_scope = slm> return } @@ -77,8 +78,9 @@ func.func @test_create_nd_tdesc_vc_6(%src: memref, %w : index, %h : ind %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc, memory_scope = slm, boundary_check = true} : memref -> !xegpu.tensor_desc<8x16xf32> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, memory_scope = slm> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] {mode = vc, boundary_check = true} + : memref -> !xegpu.tensor_desc<8x16xf32, memory_scope = slm> return } @@ -96,8 +98,9 @@ func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) { func.func @test_create_nd_tdesc_vc_8(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {mode = vc, memory_scope = slm, boundary_check = true} : memref -> !xegpu.tensor_desc<8x16xf32> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, memory_scope = slm> + %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {mode = vc, boundary_check = true} + : memref -> !xegpu.tensor_desc<8x16xf32, memory_scope = slm> return } @@ -105,9 +108,9 @@ func.func @test_create_nd_tdesc_vc_8(%src: memref, %w : index, %h : ind func.func @test_create_nd_tdesc_vc_9(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = slm, boundary_check = true} - // CHECK-SAME: !xegpu.tensor_desc<64x128xf32, #xegpu.xe_map> - %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {memory_scope = slm, boundary_check = true} : memref - -> !xegpu.tensor_desc<64x128xf32, #xegpu.xe_map> + // CHECK-SAME: {mode = simt, boundary_check = true} + // CHECK-SAME: !xegpu.tensor_desc<64x128xf32, memory_scope = slm, #xegpu.xe_map> + %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] {boundary_check = true} : memref + -> !xegpu.tensor_desc<64x128xf32, memory_scope = slm, #xegpu.xe_map> return } diff --git a/test/Dialect/XeGPU/IR/create_tdesc.mlir b/test/Dialect/XeGPU/IR/create_tdesc.mlir index 98e58b55b..cc6257ecf 100644 --- a/test/Dialect/XeGPU/IR/create_tdesc.mlir +++ b/test/Dialect/XeGPU/IR/create_tdesc.mlir 
@@ -7,8 +7,8 @@ // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 1} + // CHECK: xegpu.create_tdesc %arg0, %arg1 + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> return @@ -16,18 +16,18 @@ func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) { func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, memory_scope = slm, chunk_size_per_lane = 1} - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> - %1 = xegpu.create_tdesc %src, %offsets {mode = vc, memory_scope=slm} - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> + // CHECK: xegpu.create_tdesc %arg0, %arg1 + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, memory_scope = slm, #xegpu.scattered> + %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> + -> !xegpu.tensor_desc<16xf32, memory_scope = slm, #xegpu.scattered> return } // CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) { func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 8} + // CHECK: xegpu.create_tdesc %arg0, %arg1 + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> @@ -36,11 +36,11 @@ func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) { func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %arg0, %arg1 - // CHECK-SAME: {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered> - %1 = xegpu.create_tdesc %src, %offsets {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered> + // CHECK: xegpu.create_tdesc %arg0, %arg1 + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered> + %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered> return } @@ -48,10 +48,10 @@ func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) { // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) { func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) { // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - // CHECK-SAME: memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, 
#xegpu.scattered> - %1 = xegpu.create_tdesc %src, %offsets {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - : memref, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scattered> + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK-SAME: memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered> + %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 2} + : memref, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, memory_scope = slm, #xegpu.scattered> return } @@ -59,19 +59,19 @@ func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x in // CHECK-LABEL: func @test_create_tdesc_vc_6({{.*}}) { func.func @test_create_tdesc_vc_6(%src: memref, %offset : index) { // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - // CHECK-SAME: memref, index -> !xegpu.tensor_desc<2xf32, #xegpu.scattered> - %1 = xegpu.create_tdesc %src, %offset {mode = vc, memory_scope = slm, chunk_size_per_lane = 2} - : memref, index -> !xegpu.tensor_desc<2xf32, #xegpu.scattered> + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 2} + // CHECK-SAME: memref, index -> !xegpu.tensor_desc<2xf32, memory_scope = slm, #xegpu.scattered> + %1 = xegpu.create_tdesc %src, %offset {mode = vc, chunk_size_per_lane = 2} + : memref, index -> !xegpu.tensor_desc<2xf32, memory_scope = slm, #xegpu.scattered> return } // CHECK-LABEL: func @test_create_tdesc_vc_7({{.*}}) { func.func @test_create_tdesc_vc_7(%src: memref, %offset : index) { // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = slm, chunk_size_per_lane = 1} - // CHECK-SAME: memref, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered> - %1 = xegpu.create_tdesc %src, %offset {mode = vc, memory_scope = slm, chunk_size_per_lane = 1} - : memref, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered> + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} + // CHECK-SAME: memref, index -> !xegpu.tensor_desc<1xf32, memory_scope = slm, #xegpu.scattered> + %1 = xegpu.create_tdesc %src, %offset {mode = vc, chunk_size_per_lane = 1} + : memref, index -> !xegpu.tensor_desc<1xf32, memory_scope = slm, #xegpu.scattered> return } diff --git a/test/Dialect/XeGPU/IR/invalid.mlir b/test/Dialect/XeGPU/IR/invalid.mlir index f3d68254c..b8f123d11 100644 --- a/test/Dialect/XeGPU/IR/invalid.mlir +++ b/test/Dialect/XeGPU/IR/invalid.mlir @@ -72,7 +72,7 @@ func.func @test_create_tdesc(%src: ui64, %offsets : vector<16x8xindex>) { func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16x8xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 8} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scattered> diff --git a/test/Dialect/XeGPU/IR/load_gather.mlir b/test/Dialect/XeGPU/IR/load_gather.mlir index b04dd022d..9201aa18c 100644 --- a/test/Dialect/XeGPU/IR/load_gather.mlir +++ b/test/Dialect/XeGPU/IR/load_gather.mlir @@ -9,7 +9,7 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} // 
CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc}: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> @@ -25,7 +25,7 @@ func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16x8xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 8} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 8} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scattered> @@ -43,7 +43,7 @@ func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) { func.func @test_load_gather_vc_3(%src: ui64, %offset : index) { %0 = arith.constant dense<1>: vector<8xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 8} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 8} // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offset {mode = vc, chunk_size_per_lane = 8} : ui64, index -> !xegpu.tensor_desc<8xf32, #xegpu.scattered> @@ -61,7 +61,7 @@ func.func @test_load_gather_vc_3(%src: ui64, %offset : index) { func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc, chunk_size_per_lane = 1} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> diff --git a/test/Dialect/XeGPU/IR/load_nd.mlir b/test/Dialect/XeGPU/IR/load_nd.mlir index c67cab01b..91d2b6025 100644 --- a/test/Dialect/XeGPU/IR/load_nd.mlir +++ b/test/Dialect/XeGPU/IR/load_nd.mlir @@ -7,13 +7,14 @@ #sg_map_fp16_a = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}> #sg_map_fp16_b = #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}> #sg_map_fp16_c = #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}> +#sg_map_fp16_d = #xegpu.sg_map<{wi_layout = [2, 8], wi_data = [1, 2]}> // CHECK-LABEL: func @test_load_nd_fp16({{.*}}) { func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : memref<24x32xf16>) { %c0 = arith.constant 2 : index %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>> %1 = xegpu.create_nd_tdesc %A[%c0, %c1] @@ -22,11 +23,11 @@ func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : // CHECK: xegpu.load_nd // CHECK-SAME: {mode = simt, vnni_axis = 1} // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [2, 8], wi_data = [1, 2]}>> - // CHECK-SAME: -> vector<4x2xf16> - %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, 
#sg_map_fp16_a> -> vector<4x2xf16> + // CHECK-SAME: -> vector<4x1x2xf16> + %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_a> -> vector<4x1x2xf16> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>> %3 = xegpu.create_nd_tdesc %B[%c0, %c1] @@ -35,11 +36,11 @@ func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : // CHECK: xegpu.load_nd // CHECK-SAME: {mode = simt, vnni_axis = 0} // CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.sg_map<{mma_block_size = [16, 16], wi_layout = [1, 16], wi_data = [1, 1]}>> - // CHECK-SAME: -> vector<8x2xf16> - %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x2xf16> + // CHECK-SAME: -> vector<8x1x2xf16> + %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<16x16xf16, #sg_map_fp16_b> -> vector<8x1x2xf16> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<24x32xf16> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>> %5 = xegpu.create_nd_tdesc %C[%c0, %c1] @@ -51,6 +52,18 @@ func.func @test_load_nd_fp16(%A: memref<24x32xf16>, %B : memref<24x32xf16>, %C : // CHECK-SAME: -> vector<8x1xf32> %6 = xegpu.load_nd %5 : !xegpu.tensor_desc<8x16xf32, #sg_map_fp16_c> -> vector<8x1xf32> + // CHECK: xegpu.create_nd_tdesc + // CHECK-SAME: {mode = simt, boundary_check = true} + // CHECK-SAME: memref<24x32xf16> + // CHECK-SAME: -> !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{wi_layout = [2, 8], wi_data = [1, 2]}>> + %7 = xegpu.create_nd_tdesc %A[%c0, %c1] + : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> + // CHECK: xegpu.load_nd + // CHECK-SAME: {mode = simt, vnni_axis = 1} + // CHECK-SAME: !xegpu.tensor_desc<8x16xf16, #xegpu.sg_map<{wi_layout = [2, 8], wi_data = [1, 2]}>> + // CHECK-SAME: -> vector<4x1x2xf16> + %8 = xegpu.load_nd %7 {vnni_axis = 1} : !xegpu.tensor_desc<8x16xf16, #sg_map_fp16_d> -> vector<4x1x2xf16> + return } @@ -63,7 +76,7 @@ func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : mem %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<{mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]}>> %1 = xegpu.create_nd_tdesc %A[%c0, %c1] @@ -72,11 +85,11 @@ func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : mem // CHECK: xegpu.load_nd // CHECK-SAME: {mode = simt, vnni_axis = 1} // CHECK-SAME: !xegpu.tensor_desc<8x32xi8, #xegpu.sg_map<{mma_block_size = [8, 32], wi_layout = [2, 8], wi_data = [1, 4]}>> - // CHECK-SAME: -> vector<4x4xi8> - %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> -> vector<4x4xi8> + // CHECK-SAME: -> vector<4x1x4xi8> + %2 = xegpu.load_nd %1 {vnni_axis = 1} : !xegpu.tensor_desc<8x32xi8, #sg_map_i8_a> -> vector<4x1x4xi8> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check 
= true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<{mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]}>> %3 = xegpu.create_nd_tdesc %B[%c0, %c1] @@ -85,11 +98,11 @@ func.func @test_load_nd_i8(%A: memref<64x64xi8>, %B : memref<64x64xi8>, %C : mem // CHECK: xegpu.load_nd // CHECK-SAME: {mode = simt, vnni_axis = 0} // CHECK-SAME: !xegpu.tensor_desc<32x16xi8, #xegpu.sg_map<{mma_block_size = [32, 16], wi_layout = [1, 16], wi_data = [1, 1]}>> - // CHECK-SAME: -> vector<8x4xi8> - %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> -> vector<8x4xi8> + // CHECK-SAME: -> vector<8x1x4xi8> + %4 = xegpu.load_nd %3 {vnni_axis = 0} : !xegpu.tensor_desc<32x16xi8, #sg_map_i8_b> -> vector<8x1x4xi8> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<64x64xi8> // CHECK-SAME: -> !xegpu.tensor_desc<8x16xi32, #xegpu.sg_map<{mma_block_size = [8, 16], wi_layout = [1, 16], wi_data = [1, 1]}>> %5 = xegpu.create_nd_tdesc %C[%c0, %c1] @@ -114,7 +127,7 @@ func.func @test_load_nd_f64(%A: memref<64x64xf64>, %B : memref<64x64xf64>, %C : %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<64x64xf64> // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>> %1 = xegpu.create_nd_tdesc %A[%c0, %c1] @@ -127,7 +140,7 @@ func.func @test_load_nd_f64(%A: memref<64x64xf64>, %B : memref<64x64xf64>, %C : %2 = xegpu.load_nd %1 : !xegpu.tensor_desc<4x8xf64, #sg_map_f64_a> -> vector<2x1xf64> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<64x64xf64> // CHECK-SAME: -> !xegpu.tensor_desc<8x8xf64, #xegpu.sg_map<{mma_block_size = [8, 8], wi_layout = [2, 8], wi_data = [1, 1]}>> %3 = xegpu.create_nd_tdesc %B[%c0, %c1] @@ -140,7 +153,7 @@ func.func @test_load_nd_f64(%A: memref<64x64xf64>, %B : memref<64x64xf64>, %C : %4 = xegpu.load_nd %3 : !xegpu.tensor_desc<8x8xf64, #sg_map_f64_b> -> vector<4x1xf64> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = simt, boundary_check = true} // CHECK-SAME: memref<64x64xf64> // CHECK-SAME: -> !xegpu.tensor_desc<4x8xf64, #xegpu.sg_map<{mma_block_size = [4, 8], wi_layout = [2, 8], wi_data = [1, 1]}>> %5 = xegpu.create_nd_tdesc %C[%c0, %c1] diff --git a/test/Dialect/XeGPU/IR/prefetch_nd.mlir b/test/Dialect/XeGPU/IR/prefetch_nd.mlir index 3604c91a6..402302610 100644 --- a/test/Dialect/XeGPU/IR/prefetch_nd.mlir +++ b/test/Dialect/XeGPU/IR/prefetch_nd.mlir @@ -19,7 +19,7 @@ func.func @test_prefetch_nd_tdesc_vc_0(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_prefetch_nd_tdesc_vc_1({{.*}}) { func.func @test_prefetch_nd_tdesc_vc_1(%src: memref<24x32xf16>, %x : index, %y : index) { // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%x, %y] {mode = vc} : memref<24x32xf16> -> 
!xegpu.tensor_desc<8x16xf16> diff --git a/test/Dialect/XeGPU/IR/store_nd.mlir b/test/Dialect/XeGPU/IR/store_nd.mlir index ceae5645c..47e714b6c 100644 --- a/test/Dialect/XeGPU/IR/store_nd.mlir +++ b/test/Dialect/XeGPU/IR/store_nd.mlir @@ -9,13 +9,13 @@ func.func @test_store_nd_vc_0(%src: memref<24x32xf16>, %dst: memref<24x32xf16>) %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> %2 = xegpu.create_nd_tdesc %dst[%c0, %c1] {mode = vc} : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> diff --git a/test/Dialect/XeGPU/IR/store_scatter.mlir b/test/Dialect/XeGPU/IR/store_scatter.mlir index 6786692f7..19341dc74 100644 --- a/test/Dialect/XeGPU/IR/store_scatter.mlir +++ b/test/Dialect/XeGPU/IR/store_scatter.mlir @@ -9,13 +9,13 @@ func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) { %0 = arith.constant dense<1>: vector<16xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %2 = xegpu.create_tdesc %dst, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> @@ -38,13 +38,13 @@ func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst func.func @test_store_scatter(%src: ui64, %offsets : index, %dst: ui64) { %0 = arith.constant 1: i1 // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = simt, chunk_size_per_lane = 1} // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets : ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = simt, chunk_size_per_lane = 1} // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered> %2 = xegpu.create_tdesc %dst, %offsets : ui64, index -> !xegpu.tensor_desc<1xf32, #xegpu.scattered> diff --git a/test/Dialect/XeGPU/IR/update_nd_offset.mlir b/test/Dialect/XeGPU/IR/update_nd_offset.mlir index 93403ea5f..4de5560e0 100644 --- a/test/Dialect/XeGPU/IR/update_nd_offset.mlir +++ b/test/Dialect/XeGPU/IR/update_nd_offset.mlir @@ -9,7 +9,7 @@ func.func @test_update_nd_offset_vc_0(%src: memref<24x32xf32>) { %c1 = arith.constant 4 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, boundary_check = true} + // CHECK-SAME: {mode = vc, boundary_check = true} // CHECK-SAME: 
memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> %1 = xegpu.create_nd_tdesc %src[%c0, %c1] {mode = vc} : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> diff --git a/test/Dialect/XeGPU/IR/update_offset.mlir b/test/Dialect/XeGPU/IR/update_offset.mlir index 539a72f48..416fd477a 100644 --- a/test/Dialect/XeGPU/IR/update_offset.mlir +++ b/test/Dialect/XeGPU/IR/update_offset.mlir @@ -9,7 +9,7 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = vc, memory_scope = global, chunk_size_per_lane = 1} + // CHECK-SAME: {mode = vc, chunk_size_per_lane = 1} // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {mode = vc} : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scattered> @@ -36,7 +36,7 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { func.func @test_update_offset(%src: ui64, %offsets : index) { %0 = arith.constant dense<1>: vector<8xi1> // CHECK: xegpu.create_tdesc - // CHECK-SAME: {mode = simt, memory_scope = global, chunk_size_per_lane = 8} + // CHECK-SAME: {mode = simt, chunk_size_per_lane = 8} // CHECK-SAME: ui64, index -> !xegpu.tensor_desc<8xf32, #xegpu.scattered> %1 = xegpu.create_tdesc %src, %offsets {chunk_size_per_lane = 8} : ui64, index -> !xegpu.tensor_desc<8xf32, #xegpu.scattered>