diff --git a/bolt/include/bolt/Profile/YAMLProfileReader.h b/bolt/include/bolt/Profile/YAMLProfileReader.h index bd5a86fd676a59..a6f0fd6f3251f0 100644 --- a/bolt/include/bolt/Profile/YAMLProfileReader.h +++ b/bolt/include/bolt/Profile/YAMLProfileReader.h @@ -105,7 +105,7 @@ class YAMLProfileReader : public ProfileReaderBase { yaml::bolt::BinaryProfile YamlBP; /// Map a function ID from a YAML profile to a BinaryFunction object. - std::vector YamlProfileToFunction; + DenseMap YamlProfileToFunction; using FunctionSet = std::unordered_set; /// To keep track of functions that have a matched profile before the profile @@ -162,8 +162,6 @@ class YAMLProfileReader : public ProfileReaderBase { /// Update matched YAML -> BinaryFunction pair. void matchProfileToFunction(yaml::bolt::BinaryFunctionProfile &YamlBF, BinaryFunction &BF) { - if (YamlBF.Id >= YamlProfileToFunction.size()) - YamlProfileToFunction.resize(YamlBF.Id + 1); YamlProfileToFunction[YamlBF.Id] = &BF; YamlBF.Used = true; diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index fe0fcfdcd42f9b..a5dc8492b59003 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -238,9 +238,7 @@ bool YAMLProfileReader::parseFunctionProfile( BB.setExecutionCount(YamlBB.ExecCount); for (const yaml::bolt::CallSiteInfo &YamlCSI : YamlBB.CallSites) { - BinaryFunction *Callee = YamlCSI.DestId < YamlProfileToFunction.size() - ? YamlProfileToFunction[YamlCSI.DestId] - : nullptr; + BinaryFunction *Callee = YamlProfileToFunction.lookup(YamlCSI.DestId); bool IsFunction = Callee ? true : false; MCSymbol *CalleeSymbol = nullptr; if (IsFunction) @@ -703,7 +701,7 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { break; } } - YamlProfileToFunction.resize(YamlBP.Functions.size() + 1); + YamlProfileToFunction.reserve(YamlBP.Functions.size()); // Computes hash for binary functions. 
if (opts::MatchProfileWithFunctionHash) { @@ -756,12 +754,7 @@ Error YAMLProfileReader::readProfile(BinaryContext &BC) { NormalizeByCalls = usesEvent("branches"); uint64_t NumUnused = 0; for (yaml::bolt::BinaryFunctionProfile &YamlBF : YamlBP.Functions) { - if (YamlBF.Id >= YamlProfileToFunction.size()) { - // Such profile was ignored. - ++NumUnused; - continue; - } - if (BinaryFunction *BF = YamlProfileToFunction[YamlBF.Id]) + if (BinaryFunction *BF = YamlProfileToFunction.lookup(YamlBF.Id)) parseFunctionProfile(*BF, YamlBF); else ++NumUnused; diff --git a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp index 8647df4b0edf82..09aa4fbb66bd42 100644 --- a/bolt/lib/Rewrite/PseudoProbeRewriter.cpp +++ b/bolt/lib/Rewrite/PseudoProbeRewriter.cpp @@ -127,8 +127,8 @@ void PseudoProbeRewriter::parsePseudoProbe(bool ProfiledOnly) { StringRef Contents = PseudoProbeDescSection->getContents(); if (!ProbeDecoder.buildGUID2FuncDescMap( - reinterpret_cast(Contents.data()), - Contents.size())) { + reinterpret_cast(Contents.data()), Contents.size(), + /*IsMMapped*/ true)) { errs() << "BOLT-WARNING: fail in building GUID2FuncDescMap\n"; return; } diff --git a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp index 64c155c29cf8b9..17d2e75e4f666f 100644 --- a/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/ThrowKeywordMissingCheck.cpp @@ -15,9 +15,6 @@ using namespace clang::ast_matchers; namespace clang::tidy::bugprone { void ThrowKeywordMissingCheck::registerMatchers(MatchFinder *Finder) { - auto CtorInitializerList = - cxxConstructorDecl(hasAnyConstructorInitializer(anything())); - Finder->addMatcher( cxxConstructExpr( hasType(cxxRecordDecl( @@ -27,7 +24,7 @@ void ThrowKeywordMissingCheck::registerMatchers(MatchFinder *Finder) { stmt(anyOf(cxxThrowExpr(), callExpr(), returnStmt()))), 
hasAncestor(decl(anyOf(varDecl(), fieldDecl()))), hasAncestor(expr(cxxNewExpr(hasAnyPlacementArg(anything())))), - allOf(hasAncestor(CtorInitializerList), + allOf(hasAncestor(cxxConstructorDecl()), unless(hasAncestor(cxxCatchStmt())))))) .bind("temporary-exception-not-thrown"), this); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp index bdba2314c7056f..3eef2fd12cc8e5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/InitVariablesCheck.cpp @@ -8,9 +8,10 @@ #include "InitVariablesCheck.h" +#include "../utils/LexerUtils.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/Type.h" #include "clang/ASTMatchers/ASTMatchFinder.h" -#include "clang/Lex/PPCallbacks.h" #include "clang/Lex/Preprocessor.h" #include @@ -107,8 +108,9 @@ void InitVariablesCheck::check(const MatchFinder::MatchResult &Result) { << MatchedDecl; if (*InitializationString != nullptr) Diagnostic << FixItHint::CreateInsertion( - MatchedDecl->getLocation().getLocWithOffset( - MatchedDecl->getName().size()), + utils::lexer::findNextTerminator(MatchedDecl->getLocation(), + *Result.SourceManager, + Result.Context->getLangOpts()), *InitializationString); if (AddMathInclude) { Diagnostic << IncludeInserter.createIncludeInsertion( diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index abcdcc25705bf5..442fb7180555ea 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -177,6 +177,10 @@ Changes in existing checks usages of ``sizeof()``, ``alignof()``, and ``offsetof()`` when adding or subtracting from a pointer directly or when used to scale a numeric value. +- Improved :doc:`bugprone-throw-keyword-missing + ` by fixing a false positive + when using non-static member initializers and a constructor. 
+ - Improved :doc:`bugprone-unchecked-optional-access ` to support `bsl::optional` and `bdlb::NullableValue` from @@ -190,6 +194,10 @@ Changes in existing checks fix false positive that floating point variable is only used in increment expression. +- Improved :doc:`cppcoreguidelines-init-variables + ` check by fixing the + insertion location for function pointers. + - Improved :doc:`cppcoreguidelines-prefer-member-initializer ` check to avoid false positive when member initialization depends on a structured @@ -208,9 +216,9 @@ Changes in existing checks false positive for C++23 deducing this. - Improved :doc:`modernize-avoid-c-arrays - ` check to suggest using ``std::span`` - as a replacement for parameters of incomplete C array type in C++20 and - ``std::array`` or ``std::vector`` before C++20. + ` check to suggest using + ``std::span`` as a replacement for parameters of incomplete C array type in + C++20 and ``std::array`` or ``std::vector`` before C++20. - Improved :doc:`modernize-loop-convert ` check to fix false positive when diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp index 49233c0deefdf0..bafd3d19b5a319 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/throw-keyword-missing.cpp @@ -139,6 +139,15 @@ CtorInitializerListTest::CtorInitializerListTest(float) try : exc(RegularExcepti RegularException(); } +namespace GH115055 { +class CtorInitializerListTest2 { + public: + CtorInitializerListTest2() {} + private: + RegularException exc{}; +}; +} // namespace GH115055 + RegularException funcReturningExceptionTest(int i) { return RegularException(); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/init-variables.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/init-variables.cpp index 
e3d50946d1cb8f..824431c1bf52fd 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/init-variables.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/init-variables.cpp @@ -134,3 +134,17 @@ void test_clang_diagnostic_error() { // CHECK-MESSAGES: :[[@LINE-1]]:3: error: unknown type name 'UnknownType' [clang-diagnostic-error] // CHECK-FIXES-NOT: {{^}} UnknownType b = 0;{{$}} } + +namespace gh112089 { + void foo(void*); + using FPtr = void(*)(void*); + void test() { + void(*a1)(void*); + // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: variable 'a1' is not initialized [cppcoreguidelines-init-variables] + // CHECK-FIXES: void(*a1)(void*) = nullptr; + FPtr a2; + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: variable 'a2' is not initialized [cppcoreguidelines-init-variables] + // CHECK-FIXES: FPtr a2 = nullptr; + } +} // namespace gh112089 + diff --git a/clang/docs/AMDGPUSupport.rst b/clang/docs/AMDGPUSupport.rst index e63c0e1ba7d67b..3eada5f900613a 100644 --- a/clang/docs/AMDGPUSupport.rst +++ b/clang/docs/AMDGPUSupport.rst @@ -50,9 +50,9 @@ Predefined Macros * - ``__AMDGCN_UNSAFE_FP_ATOMICS__`` - Defined if unsafe floating-point atomics are allowed. * - ``__AMDGCN_WAVEFRONT_SIZE__`` - - Defines the wavefront size. Allowed values are 32 and 64. + - Defines the wavefront size. Allowed values are 32 and 64 (deprecated). * - ``__AMDGCN_WAVEFRONT_SIZE`` - - Alias to ``__AMDGCN_WAVEFRONT_SIZE__``. To be deprecated. + - Alias to ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated). * - ``__HAS_FMAF__`` - Defined if FMAF instruction is available (deprecated). * - ``__HAS_LDEXPF__`` diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index e26297c7af97ac..e830acd8dd85c0 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -178,7 +178,7 @@ Predefined Macros Note that some architecture specific AMDGPU macros will have default values when used from the HIP host compilation. 
Other :doc:`AMDGPU macros ` -like ``__AMDGCN_WAVEFRONT_SIZE__`` will default to 64 for example. +like ``__AMDGCN_WAVEFRONT_SIZE__`` (deprecated) will default to 64 for example. Compilation Modes ================= diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c3424e0e6f34c9..e817b0ceb3fd06 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -449,6 +449,9 @@ Attribute Changes in Clang - Fix a bug where clang doesn't automatically apply the ``[[gsl::Owner]]`` or ``[[gsl::Pointer]]`` to STL explicit template specialization decls. (#GH109442) +- Clang now supports ``[[clang::lifetime_capture_by(X)]]``. Similar to lifetimebound, this can be + used to specify when a reference to a function parameter is captured by another capturing entity ``X``. + Improvements to Clang's diagnostics ----------------------------------- @@ -740,6 +743,7 @@ X86 Support - Support ISA of ``AMX-FP8``. - Support ISA of ``AMX-TRANSPOSE``. - Support ISA of ``AMX-AVX512``. +- Support ISA of ``AMX-TF32``. Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/SafeBuffers.rst b/clang/docs/SafeBuffers.rst index 144c3a76a5832f..da75907e174a00 100644 --- a/clang/docs/SafeBuffers.rst +++ b/clang/docs/SafeBuffers.rst @@ -58,7 +58,7 @@ A relatively fresh version of C++ is recommended. In particular, the very useful standard view class ``std::span`` requires C++20. Other implementations of the C++ standard library may provide different -flags to enable such hardening hardening. +flags to enable such hardening. If you're using custom containers and views, they will need to be hardened this way as well, but you don't necessarily need to do this ahead of time. 
diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index b3aea09be03ddf..3eb50f9353ed19 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -114,7 +114,6 @@ class OpenACCAssociatedStmtConstruct : public OpenACCConstructStmt { } }; -class OpenACCLoopConstruct; /// This class represents a compute construct, representing a 'Kind' of /// `parallel', 'serial', or 'kernel'. These constructs are associated with a /// 'structured block', defined as: @@ -183,8 +182,7 @@ class OpenACCComputeConstruct final static OpenACCComputeConstruct * Create(const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, SourceLocation DirectiveLoc, SourceLocation EndLoc, - ArrayRef Clauses, Stmt *StructuredBlock, - ArrayRef AssociatedLoopConstructs); + ArrayRef Clauses, Stmt *StructuredBlock); Stmt *getStructuredBlock() { return getAssociatedStmt(); } const Stmt *getStructuredBlock() const { @@ -198,12 +196,10 @@ class OpenACCLoopConstruct final : public OpenACCAssociatedStmtConstruct, public llvm::TrailingObjects { - // The compute construct this loop is associated with, or nullptr if this is - // an orphaned loop construct, or if it hasn't been set yet. Because we - // construct the directives at the end of their statement, the 'parent' - // construct is not yet available at the time of construction, so this needs - // to be set 'later'. - const OpenACCComputeConstruct *ParentComputeConstruct = nullptr; + // The compute/combined construct kind this loop is associated with, or + // invalid if this is an orphaned loop construct. 
+ OpenACCDirectiveKind ParentComputeConstructKind = + OpenACCDirectiveKind::Invalid; friend class ASTStmtWriter; friend class ASTStmtReader; @@ -212,15 +208,9 @@ class OpenACCLoopConstruct final OpenACCLoopConstruct(unsigned NumClauses); - OpenACCLoopConstruct(SourceLocation Start, SourceLocation DirLoc, - SourceLocation End, + OpenACCLoopConstruct(OpenACCDirectiveKind ParentKind, SourceLocation Start, + SourceLocation DirLoc, SourceLocation End, ArrayRef Clauses, Stmt *Loop); - void setLoop(Stmt *Loop); - - void setParentComputeConstruct(OpenACCComputeConstruct *CC) { - assert(!ParentComputeConstruct && "Parent already set?"); - ParentComputeConstruct = CC; - } public: static bool classof(const Stmt *T) { @@ -231,9 +221,9 @@ class OpenACCLoopConstruct final unsigned NumClauses); static OpenACCLoopConstruct * - Create(const ASTContext &C, SourceLocation BeginLoc, SourceLocation DirLoc, - SourceLocation EndLoc, ArrayRef Clauses, - Stmt *Loop); + Create(const ASTContext &C, OpenACCDirectiveKind ParentKind, + SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *Loop); Stmt *getLoop() { return getAssociatedStmt(); } const Stmt *getLoop() const { @@ -246,10 +236,11 @@ class OpenACCLoopConstruct final /// loop construct is the nearest compute construct that lexically contains /// the loop construct. 
bool isOrphanedLoopConstruct() const { - return ParentComputeConstruct == nullptr; + return ParentComputeConstructKind == OpenACCDirectiveKind::Invalid; } - const OpenACCComputeConstruct *getParentComputeConstruct() const { - return ParentComputeConstruct; + + OpenACCDirectiveKind getParentComputeConstructKind() const { + return ParentComputeConstructKind; } }; } // namespace clang diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index a631e81d40aa68..6a77967c32cbcb 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1889,6 +1889,39 @@ def LifetimeBound : DeclOrTypeAttr { let SimpleHandler = 1; } +def LifetimeCaptureBy : DeclOrTypeAttr { + let Spellings = [Clang<"lifetime_capture_by", 0>]; + let Subjects = SubjectList<[ParmVar, ImplicitObjectParameter], ErrorDiag>; + let Args = [VariadicParamOrParamIdxArgument<"Params">]; + let Documentation = [LifetimeCaptureByDocs]; + let AdditionalMembers = [{ +private: + SmallVector ArgIdents; + SmallVector ArgLocs; + +public: + static constexpr int THIS = 0; + static constexpr int INVALID = -1; + static constexpr int UNKNOWN = -2; + static constexpr int GLOBAL = -3; + + void setArgs(SmallVector&& Idents, + SmallVector&& Locs) { + assert(Idents.size() == Locs.size()); + assert(Idents.size() == params_Size); + ArgIdents = std::move(Idents); + ArgLocs = std::move(Locs); + } + + ArrayRef getArgIdents() const { return ArgIdents; } + ArrayRef getArgLocs() const { return ArgLocs; } + void setParamIdx(size_t Idx, int Val) { + assert(Idx < params_Size); + params_[Idx] = Val; + } +}]; +} + def TrivialABI : InheritableAttr { // This attribute does not have a C [[]] spelling because it requires the // CPlusPlus language option. 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index b64dbef6332e6a..21fcd183e8969c 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3918,6 +3918,75 @@ have their lifetimes extended. }]; } +def LifetimeCaptureByDocs : Documentation { + let Category = DocCatFunction; + let Content = [{ + Similar to `lifetimebound`_, the ``lifetime_capture_by(X)`` attribute on a function +parameter or implicit object parameter indicates that objects that are referred to +by that parameter may also be referred to by the capturing entity ``X``. + +By default, a reference is considered to refer to its referenced object, a +pointer is considered to refer to its pointee, a ``std::initializer_list`` +is considered to refer to its underlying array, and aggregates (arrays and +simple ``struct``\s) are considered to refer to all objects that their +transitive subobjects refer to. + +The capturing entity ``X`` can be one of the following: +- Another (named) function parameter. + + .. code-block:: c++ + + void addToSet(std::string_view a [[clang::lifetime_capture_by(s)]], std::set& s) { + s.insert(a); + } + +- ``this`` (in case of member functions). + + .. code-block:: c++ + + class S { + void addToSet(std::string_view a [[clang::lifetime_capture_by(this)]]) { + s.insert(a); + } + std::set s; + }; + +- 'global', 'unknown' (without quotes). + + .. code-block:: c++ + + std::set s; + void addToSet(std::string_view a [[clang::lifetime_capture_by(global)]]) { + s.insert(a); + } + void addSomewhere(std::string_view a [[clang::lifetime_capture_by(unknown)]]); + +The attribute can be applied to the implicit ``this`` parameter of a member +function by writing the attribute after the function type: + +.. code-block:: c++ + + struct S { + const char *data(std::set& s) [[clang::lifetime_capture_by(s)]] { + s.insert(this); + } + }; + +The attribute supports specifying more than one capturing entity: + + .. 
code-block:: c++ + + void addToSets(std::string_view a [[clang::lifetime_capture_by(s1, s2)]], + std::set& s1, + std::set& s2) { + s1.insert(a); + s2.insert(a); + } + +.. _`lifetimebound`: https://clang.llvm.org/docs/AttributeReference.html#lifetimebound + }]; +} + def TrivialABIDocs : Documentation { let Category = DocCatDecl; let Content = [{ diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def index 9f7462b1e0d962..25c10d39df32e2 100644 --- a/clang/include/clang/Basic/BuiltinsX86_64.def +++ b/clang/include/clang/Basic/BuiltinsX86_64.def @@ -139,6 +139,9 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2pbf16l_internal, "V32yUsUsV256iUi", "n", TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl_internal, "V32xUsUsV256iUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tilemovrow_internal, "V16iUsUsV256iUi", "n", "amx-avx512,avx10.2-512") +TARGET_BUILTIN(__builtin_ia32_tmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32") +TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-tf32,amx-transpose") + // AMX TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile") @@ -172,10 +175,6 @@ TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phh, "V32xIUcUi", "n", "amx-avx512,avx10 TARGET_BUILTIN(__builtin_ia32_tcvtrowps2phl, "V32xIUcUi", "n", "amx-avx512,avx10.2-512") TARGET_BUILTIN(__builtin_ia32_tilemovrow, "V16iIUcUi", "n", "amx-avx512,avx10.2-512") -TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") -TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") -TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd") - // AMX_FP16 FP16 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16") @@ -185,6 +184,14 @@ 
TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8") TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8") TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8") +// AMX TF32 +TARGET_BUILTIN(__builtin_ia32_tmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32") +TARGET_BUILTIN(__builtin_ia32_ttmmultf32ps, "vIUcIUcIUc", "n", "amx-tf32,amx-transpose") + +TARGET_BUILTIN(__builtin_ia32_prefetchi, "vvC*Ui", "nc", "prefetchi") +TARGET_BUILTIN(__builtin_ia32_cmpccxadd32, "Siv*SiSiIi", "n", "cmpccxadd") +TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiSLLi*SLLiSLLiIi", "n", "cmpccxadd") + // RAO-INT TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint") TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint") diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a5d97d7e545ffd..f4452fbb57e736 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3383,6 +3383,20 @@ def err_callback_callee_is_variadic : Error< "'callback' attribute callee may not be variadic">; def err_callback_implicit_this_not_available : Error< "'callback' argument at position %0 references unavailable implicit 'this'">; + +def err_capture_by_attribute_multiple : Error< + "multiple 'lifetime_capture' attributes specified">; +def err_capture_by_attribute_no_entity : Error< + "'lifetime_capture_by' attribute specifies no capturing entity">; +def err_capture_by_implicit_this_not_available : Error< + "'lifetime_capture_by' argument references unavailable implicit 'this'">; +def err_capture_by_attribute_argument_unknown : Error< + "'lifetime_capture_by' attribute argument %0 is not a known function parameter" + "; must be a function parameter, 'this', 'global' or 'unknown'">; +def err_capture_by_references_itself : Error<"'lifetime_capture_by' argument references itself">; +def err_capture_by_param_uses_reserved_name : Error< + 
"parameter cannot be named '%select{global|unknown}0' while using 'lifetime_capture_by(%select{global|unknown}0)'">; + def err_init_method_bad_return_type : Error< "init methods must return an object pointer type, not %0">; def err_attribute_invalid_size : Error< diff --git a/clang/include/clang/Basic/MacroBuilder.h b/clang/include/clang/Basic/MacroBuilder.h index 96e67cbbfa3f21..d83f27c236e3d8 100644 --- a/clang/include/clang/Basic/MacroBuilder.h +++ b/clang/include/clang/Basic/MacroBuilder.h @@ -26,8 +26,14 @@ class MacroBuilder { MacroBuilder(raw_ostream &Output) : Out(Output) {} /// Append a \#define line for macro of the form "\#define Name Value\n". - void defineMacro(const Twine &Name, const Twine &Value = "1") { + /// If DeprecationMsg is provided, also append a pragma to deprecate the + /// defined macro. + void defineMacro(const Twine &Name, const Twine &Value = "1", + Twine DeprecationMsg = "") { Out << "#define " << Name << ' ' << Value << '\n'; + if (!DeprecationMsg.isTriviallyEmpty()) + Out << "#pragma clang deprecated(" << Name << ", \"" << DeprecationMsg + << "\")\n"; } /// Append a \#undef line for Name. 
Name should be of the form XXX diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index b4bc4b7f61c347..1f74e11ab06231 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -880,7 +880,9 @@ def SVRSQRTS : SInst<"svrsqrts[_{d}]", "ddd", "hfd", MergeNone, "aarch64_sve_frs //////////////////////////////////////////////////////////////////////////////// // Floating-point reductions -def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda", [VerifyRuntimeMode]>; +let SVETargetGuard = "sve", SMETargetGuard = InvalidMode in { +def SVFADDA : SInst<"svadda[_{d}]", "sPsd", "hfd", MergeNone, "aarch64_sve_fadda">; +} def SVFADDV : SInst<"svaddv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_faddv", [VerifyRuntimeMode]>; def SVFMAXV : SInst<"svmaxv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxv", [VerifyRuntimeMode]>; def SVFMAXNMV : SInst<"svmaxnmv[_{d}]", "sPd", "hfd", MergeNone, "aarch64_sve_fmaxnmv", [VerifyRuntimeMode]>; @@ -1962,7 +1964,7 @@ let SVETargetGuard = "sve2,lut,bf16", SMETargetGuard = "sme2,lut,bf16" in { //////////////////////////////////////////////////////////////////////////////// // SVE2 - Optional -let SVETargetGuard = "sve2,sve-aes", SMETargetGuard = InvalidMode in { +let SVETargetGuard = "sve2-aes", SMETargetGuard = InvalidMode in { def SVAESD : SInst<"svaesd[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_aesd", [IsOverloadNone]>; def SVAESIMC : SInst<"svaesimc[_{d}]", "dd", "Uc", MergeNone, "aarch64_sve_aesimc", [IsOverloadNone]>; def SVAESE : SInst<"svaese[_{d}]", "ddd", "Uc", MergeNone, "aarch64_sve_aese", [IsOverloadNone]>; diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index 74b9a7fc753a62..c4d2afe407516c 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -1651,9 +1651,13 @@ let ManualCodegen = [{ // 13.5. 
Vector Widening Floating-Point Multiply let Log2LMUL = [-2, -1, 0, 1, 2] in { - defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "xf", - [["vv", "w", "wvvu"], + defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "f", + [["vv", "w", "wvvu"], ["vf", "w", "wveu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "x", + [["vv", "w", "wvvu"], + ["vf", "w", "wveu"]]>; } } // 13.3. Vector Widening Floating-Point Add/Subtract Instructions @@ -1663,9 +1667,13 @@ let ManualCodegen = [{ // 13.5. Vector Widening Floating-Point Multiply let Log2LMUL = [-2, -1, 0, 1, 2] in { - defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "xf", - [["vv", "w", "wvv"], + defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "f", + [["vv", "w", "wvv"], ["vf", "w", "wve"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfwmul : RVVOutOp0Op1BuiltinSet<"vfwmul", "x", + [["vv", "w", "wvv"], + ["vf", "w", "wve"]]>; } } } @@ -1846,20 +1854,30 @@ let ManualCodegen = [{ }] in { let HasFRMRoundModeOp = 1 in { // 13.8. Vector Floating-Point Square-Root Instruction - defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "xfd", [["v", "v", "vvu"]]>; + defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "fd", [["v", "v", "vvu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "x", [["v", "v", "vvu"]]>; // 13.10. Vector Floating-Point Reciprocal Estimate Instruction - defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "xfd", [["v", "v", "vvu"]]>; + defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "fd", [["v", "v", "vvu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "x", [["v", "v", "vvu"]]>; } // 13.8. Vector Floating-Point Square-Root Instruction - defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "xfd", [["v", "v", "vv"]]>; + defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "fd", [["v", "v", "vv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfsqrt : RVVOutBuiltinSet<"vfsqrt", "x", [["v", "v", "vv"]]>; // 13.10. 
Vector Floating-Point Reciprocal Estimate Instruction - defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "xfd", [["v", "v", "vv"]]>; + defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "fd", [["v", "v", "vv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfrec7 : RVVOutBuiltinSet<"vfrec7", "x", [["v", "v", "vv"]]>; } // 13.9. Vector Floating-Point Reciprocal Square-Root Estimate Instruction -defm vfrsqrt7 : RVVOutBuiltinSet<"vfrsqrt7", "xfd", [["v", "v", "vv"]]>; +defm vfrsqrt7 : RVVOutBuiltinSet<"vfrsqrt7", "fd", [["v", "v", "vv"]]>; +let RequiredFeatures = ["Zvfh"] in + defm vfrsqrt7 : RVVOutBuiltinSet<"vfrsqrt7", "x", [["v", "v", "vv"]]>; // 13.11. Vector Floating-Point MIN/MAX Instructions defm vfmin : RVVFloatingBinBuiltinSet; @@ -1870,8 +1888,12 @@ defm vfsgnj : RVVFloatingBinBuiltinSet; defm vfsgnjn : RVVFloatingBinBuiltinSet; defm vfsgnjx : RVVFloatingBinBuiltinSet; } -defm vfneg_v : RVVPseudoVFUnaryBuiltin<"vfsgnjn", "xfd">; -defm vfabs_v : RVVPseudoVFUnaryBuiltin<"vfsgnjx", "xfd">; +defm vfneg_v : RVVPseudoVFUnaryBuiltin<"vfsgnjn", "fd">; +let RequiredFeatures = ["Zvfh"] in + defm vfneg_v : RVVPseudoVFUnaryBuiltin<"vfsgnjn", "x">; +defm vfabs_v : RVVPseudoVFUnaryBuiltin<"vfsgnjx", "fd">; +let RequiredFeatures = ["Zvfh"] in + defm vfabs_v : RVVPseudoVFUnaryBuiltin<"vfsgnjx", "x">; // 13.13. Vector Floating-Point Compare Instructions let MaskedPolicyScheme = HasPassthruOperand, @@ -1885,8 +1907,11 @@ defm vmfge : RVVFloatingMaskOutBuiltinSet; } // 13.14. Vector Floating-Point Classify Instruction -let Name = "vfclass_v", UnMaskedPolicyScheme = HasPassthruOperand in - def vfclass : RVVOp0Builtin<"Uv", "Uvv", "xfd">; +let UnMaskedPolicyScheme = HasPassthruOperand in { +defm vfclass : RVVOp0BuiltinSet<"vfclass", "fd", [["v", "Uv", "Uvv"]]>; +let RequiredFeatures = ["Zvfh"] in + defm vfclass : RVVOp0BuiltinSet<"vfclass", "x", [["v", "Uv", "Uvv"]]>; +} // 13.15. 
Vector Floating-Point Merge Instruction // C/C++ Operand: (mask, op1, op2, vl), Builtin: (op1, op2, mask, vl) @@ -1907,8 +1932,11 @@ let HasMasked = false, let RequiredFeatures = ["Zvfbfmin"] in defm vmerge : RVVOutOp1BuiltinSet<"vmerge", "y", [["vvm", "v", "vvvm"]]>; - defm vfmerge : RVVOutOp1BuiltinSet<"vfmerge", "xfd", + defm vfmerge : RVVOutOp1BuiltinSet<"vfmerge", "fd", [["vfm", "v", "vvem"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfmerge : RVVOutOp1BuiltinSet<"vfmerge", "x", + [["vfm", "v", "vvem"]]>; } // 13.16. Vector Floating-Point Move Instruction @@ -1916,55 +1944,71 @@ let HasMasked = false, UnMaskedPolicyScheme = HasPassthruOperand, SupportOverloading = false, MaskedPolicyScheme = NonePolicy, - OverloadedName = "vfmv_v" in - defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "xfd", - [["f", "v", "ve"]]>; + OverloadedName = "vfmv_v" in { + defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "fd", + [["f", "v", "ve"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfmv_v : RVVOutBuiltinSet<"vfmv_v_f", "x", + [["f", "v", "ve"]]>; +} // 13.17. Single-Width Floating-Point/Integer Type-Convert Instructions let UnMaskedPolicyScheme = HasPassthruOperand in { -def vfcvt_rtz_xu_f_v : RVVConvToUnsignedBuiltin<"vfcvt_rtz_xu">; -def vfcvt_rtz_x_f_v : RVVConvToSignedBuiltin<"vfcvt_rtz_x">; +let OverloadedName = "vfcvt_rtz_xu" in { + defm : RVVConvBuiltinSet<"vfcvt_rtz_xu_f_v", "fd", [["Uv", "Uvv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfcvt_rtz_xu_f_v", "x", [["Uv", "Uvv"]]>; +} +let OverloadedName = "vfcvt_rtz_x" in { + defm : RVVConvBuiltinSet<"vfcvt_rtz_x_f_v", "fd", [["Iv", "Ivv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfcvt_rtz_x_f_v", "x", [["Iv", "Ivv"]]>; +} // 13.18. 
Widening Floating-Point/Integer Type-Convert Instructions let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { - def vfwcvt_rtz_xu_f_v : RVVConvToWidenUnsignedBuiltin<"vfwcvt_rtz_xu">; - def vfwcvt_rtz_x_f_v : RVVConvToWidenSignedBuiltin<"vfwcvt_rtz_x">; - def vfwcvt_f_xu_v : RVVConvBuiltin<"Fw", "FwUv", "si", "vfwcvt_f">; - def vfwcvt_f_x_v : RVVConvBuiltin<"Fw", "Fwv", "si", "vfwcvt_f">; - let RequiredFeatures = ["Zvfh"] in { - let Name = "vfwcvt_f_xu_v", - IRName = "vfwcvt_f_xu_v", - MaskedIRName = "vfwcvt_f_xu_v_mask" in - def : RVVConvBuiltin<"Fw", "FwUv", "c", "vfwcvt_f">; - let Name = "vfwcvt_f_x_v", - IRName = "vfwcvt_f_x_v", - MaskedIRName = "vfwcvt_f_x_v_mask" in - def : RVVConvBuiltin<"Fw", "Fwv", "c", "vfwcvt_f">; - } - def vfwcvt_f_f_v : RVVConvBuiltin<"w", "wv", "f", "vfwcvt_f">; - let RequiredFeatures = ["Zvfhmin"] in - def vfwcvt_f_f_v_fp16 : RVVConvBuiltin<"w", "wv", "x", "vfwcvt_f"> { - let Name = "vfwcvt_f_f_v"; - let IRName = "vfwcvt_f_f_v"; - let MaskedIRName = "vfwcvt_f_f_v_mask"; + let OverloadedName = "vfwcvt_rtz_xu" in { + defm : RVVConvBuiltinSet<"vfwcvt_rtz_xu_f_v", "f", [["Uw", "Uwv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfwcvt_rtz_xu_f_v", "x", [["Uw", "Uwv"]]>; + } + let OverloadedName = "vfwcvt_rtz_x" in { + defm : RVVConvBuiltinSet<"vfwcvt_rtz_x_f_v", "f", [["Iw", "Iwv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfwcvt_rtz_x_f_v", "x", [["Iw", "Iwv"]]>; + } + let OverloadedName = "vfwcvt_f" in { + defm : RVVConvBuiltinSet<"vfwcvt_f_xu_v", "si", [["Fw", "FwUv"]]>; + defm : RVVConvBuiltinSet<"vfwcvt_f_x_v", "si", [["Fw", "Fwv"]]>; + let RequiredFeatures = ["Zvfh"] in { + defm : RVVConvBuiltinSet<"vfwcvt_f_xu_v", "c", [["Fw", "FwUv"]]>; + defm : RVVConvBuiltinSet<"vfwcvt_f_x_v", "c", [["Fw", "Fwv"]]>; } + } + let OverloadedName = "vfwcvt_f" in { + defm : RVVConvBuiltinSet<"vfwcvt_f_f_v", "f", [["w", "wv"]]>; + let RequiredFeatures = ["Zvfhmin"] in + defm : RVVConvBuiltinSet<"vfwcvt_f_f_v", 
"x", [["w", "wv"]]>; + } } // 13.19. Narrowing Floating-Point/Integer Type-Convert Instructions let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { - def vfncvt_rtz_xu_f_w : RVVConvToNarrowingUnsignedBuiltin<"vfncvt_rtz_xu">; - def vfncvt_rtz_x_f_w : RVVConvToNarrowingSignedBuiltin<"vfncvt_rtz_x">; - let RequiredFeatures = ["Zvfh"] in { - let Name = "vfncvt_rtz_xu_f_w", - IRName = "vfncvt_rtz_xu_f_w", - MaskedIRName = "vfncvt_rtz_xu_f_w_mask" in - def : RVVConvBuiltin<"Uv", "UvFw", "c", "vfncvt_rtz_xu">; - let Name = "vfncvt_rtz_x_f_w", - IRName = "vfncvt_rtz_x_f_w", - MaskedIRName = "vfncvt_rtz_x_f_w_mask" in - def : RVVConvBuiltin<"Iv", "IvFw", "c", "vfncvt_rtz_x">; + let OverloadedName = "vfncvt_rtz_xu" in { + defm : RVVConvBuiltinSet<"vfncvt_rtz_xu_f_w", "si", [["Uv", "UvFw"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_rtz_xu_f_w", "c", [["Uv", "UvFw"]]>; + } + let OverloadedName = "vfncvt_rtz_x" in { + defm : RVVConvBuiltinSet<"vfncvt_rtz_x_f_w", "si", [["Iv", "IvFw"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_rtz_x_f_w", "c", [["Iv", "IvFw"]]>; + } + let OverloadedName = "vfncvt_rod_f" in { + defm : RVVConvBuiltinSet<"vfncvt_rod_f_f_w", "f", [["v", "vw"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_rod_f_f_w", "x", [["v", "vw"]]>; } - def vfncvt_rod_f_f_w : RVVConvBuiltin<"v", "vw", "xf", "vfncvt_rod_f">; } // Zvfbfmin - Vector convert BF16 to FP32 @@ -2016,54 +2060,62 @@ let ManualCodegen = [{ }] in { let HasFRMRoundModeOp = 1 in { // 14.17. 
Single-Width Floating-Point/Integer Type-Convert Instructions - let OverloadedName = "vfcvt_x" in - defm : - RVVConvBuiltinSet<"vfcvt_x_f_v", "xfd", [["Iv", "Ivvu"]]>; - let OverloadedName = "vfcvt_xu" in - defm : - RVVConvBuiltinSet<"vfcvt_xu_f_v", "xfd", [["Uv", "Uvvu"]]>; + let OverloadedName = "vfcvt_x" in { + defm : RVVConvBuiltinSet<"vfcvt_x_f_v", "fd", [["Iv", "Ivvu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfcvt_x_f_v", "x", [["Iv", "Ivvu"]]>; + } + let OverloadedName = "vfcvt_xu" in { + defm : RVVConvBuiltinSet<"vfcvt_xu_f_v", "fd", [["Uv", "Uvvu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfcvt_xu_f_v", "x", [["Uv", "Uvvu"]]>; + } let OverloadedName = "vfcvt_f" in { - defm : - RVVConvBuiltinSet<"vfcvt_f_x_v", "xfd", [["v", "vIvu"]]>; - defm : - RVVConvBuiltinSet<"vfcvt_f_xu_v", "xfd", [["v", "vUvu"]]>; + defm : RVVConvBuiltinSet<"vfcvt_f_x_v", "fd", [["v", "vIvu"]]>; + defm : RVVConvBuiltinSet<"vfcvt_f_xu_v", "fd", [["v", "vUvu"]]>; + let RequiredFeatures = ["Zvfh"] in { + defm : RVVConvBuiltinSet<"vfcvt_f_x_v", "x", [["v", "vIvu"]]>; + defm : RVVConvBuiltinSet<"vfcvt_f_xu_v", "x", [["v", "vUvu"]]>; + } } // 13.18. Widening Floating-Point/Integer Type-Convert Instructions let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { - let OverloadedName = "vfwcvt_x" in - defm : - RVVConvBuiltinSet<"vfwcvt_x_f_v", "xf", [["Iw", "Iwvu"]]>; - let OverloadedName = "vfwcvt_xu" in - defm : - RVVConvBuiltinSet<"vfwcvt_xu_f_v", "xf", [["Uw", "Uwvu"]]>; + let OverloadedName = "vfwcvt_x" in { + defm : RVVConvBuiltinSet<"vfwcvt_x_f_v", "f", [["Iw", "Iwvu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfwcvt_x_f_v", "x", [["Iw", "Iwvu"]]>; + } + let OverloadedName = "vfwcvt_xu" in { + defm : RVVConvBuiltinSet<"vfwcvt_xu_f_v", "f", [["Uw", "Uwvu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfwcvt_xu_f_v", "x", [["Uw", "Uwvu"]]>; + } } // 13.19. 
Narrowing Floating-Point/Integer Type-Convert Instructions let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { - let OverloadedName = "vfncvt_x" in - defm : - RVVConvBuiltinSet<"vfncvt_x_f_w", "si", [["Iv", "IvFwu"]]>; - let OverloadedName = "vfncvt_xu" in - defm : - RVVConvBuiltinSet<"vfncvt_xu_f_w", "si", [["Uv", "UvFwu"]]>; - let RequiredFeatures = ["Zvfh"] in { - let OverloadedName = "vfncvt_x" in - defm : - RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["Iv", "IvFwu"]]>; - let OverloadedName = "vfncvt_xu" in - defm : - RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["Uv", "UvFwu"]]>; + let OverloadedName = "vfncvt_x" in { + defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "si", [["Iv", "IvFwu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["Iv", "IvFwu"]]>; + } + let OverloadedName = "vfncvt_xu" in { + defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "si", [["Uv", "UvFwu"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["Uv", "UvFwu"]]>; } let OverloadedName = "vfncvt_f" in { - defm : - RVVConvBuiltinSet<"vfncvt_f_x_w", "xf", [["v", "vIwu"]]>; - defm : - RVVConvBuiltinSet<"vfncvt_f_xu_w", "xf", [["v", "vUwu"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_x_w", "f", [["v", "vIwu"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_xu_w", "f", [["v", "vUwu"]]>; + let RequiredFeatures = ["Zvfh"] in { + defm : RVVConvBuiltinSet<"vfncvt_f_x_w", "x", [["v", "vIwu"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_xu_w", "x", [["v", "vUwu"]]>; + } } let OverloadedName = "vfncvt_f" in { defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "f", [["v", "vwu"]]>; let RequiredFeatures = ["Zvfhmin"] in - defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "x", [["v", "vwu"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "x", [["v", "vwu"]]>; } } @@ -2074,54 +2126,62 @@ let ManualCodegen = [{ } // 13.17. 
Single-Width Floating-Point/Integer Type-Convert Instructions - let OverloadedName = "vfcvt_x" in - defm : - RVVConvBuiltinSet<"vfcvt_x_f_v", "xfd", [["Iv", "Ivv"]]>; - let OverloadedName = "vfcvt_xu" in - defm : - RVVConvBuiltinSet<"vfcvt_xu_f_v", "xfd", [["Uv", "Uvv"]]>; + let OverloadedName = "vfcvt_x" in { + defm : RVVConvBuiltinSet<"vfcvt_x_f_v", "fd", [["Iv", "Ivv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfcvt_x_f_v", "x", [["Iv", "Ivv"]]>; + } + let OverloadedName = "vfcvt_xu" in { + defm : RVVConvBuiltinSet<"vfcvt_xu_f_v", "fd", [["Uv", "Uvv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfcvt_xu_f_v", "x", [["Uv", "Uvv"]]>; + } let OverloadedName = "vfcvt_f" in { - defm : - RVVConvBuiltinSet<"vfcvt_f_x_v", "xfd", [["v", "vIv"]]>; - defm : - RVVConvBuiltinSet<"vfcvt_f_xu_v", "xfd", [["v", "vUv"]]>; + defm : RVVConvBuiltinSet<"vfcvt_f_x_v", "fd", [["v", "vIv"]]>; + defm : RVVConvBuiltinSet<"vfcvt_f_xu_v", "fd", [["v", "vUv"]]>; + let RequiredFeatures = ["Zvfh"] in { + defm : RVVConvBuiltinSet<"vfcvt_f_x_v", "x", [["v", "vIv"]]>; + defm : RVVConvBuiltinSet<"vfcvt_f_xu_v", "x", [["v", "vUv"]]>; + } } // 13.18. Widening Floating-Point/Integer Type-Convert Instructions let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { - let OverloadedName = "vfwcvt_x" in - defm : - RVVConvBuiltinSet<"vfwcvt_x_f_v", "xf", [["Iw", "Iwv"]]>; - let OverloadedName = "vfwcvt_xu" in - defm : - RVVConvBuiltinSet<"vfwcvt_xu_f_v", "xf", [["Uw", "Uwv"]]>; + let OverloadedName = "vfwcvt_x" in { + defm : RVVConvBuiltinSet<"vfwcvt_x_f_v", "f", [["Iw", "Iwv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfwcvt_x_f_v", "x", [["Iw", "Iwv"]]>; + } + let OverloadedName = "vfwcvt_xu" in { + defm : RVVConvBuiltinSet<"vfwcvt_xu_f_v", "f", [["Uw", "Uwv"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfwcvt_xu_f_v", "x", [["Uw", "Uwv"]]>; + } } // 13.19. 
Narrowing Floating-Point/Integer Type-Convert Instructions let Log2LMUL = [-3, -2, -1, 0, 1, 2] in { - let OverloadedName = "vfncvt_x" in - defm : - RVVConvBuiltinSet<"vfncvt_x_f_w", "si", [["Iv", "IvFw"]]>; - let OverloadedName = "vfncvt_xu" in - defm : - RVVConvBuiltinSet<"vfncvt_xu_f_w", "si", [["Uv", "UvFw"]]>; - let RequiredFeatures = ["Zvfh"] in { - let OverloadedName = "vfncvt_x" in - defm : - RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["Iv", "IvFw"]]>; - let OverloadedName = "vfncvt_xu" in - defm : - RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["Uv", "UvFw"]]>; + let OverloadedName = "vfncvt_x" in { + defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "si", [["Iv", "IvFw"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_x_f_w", "c", [["Iv", "IvFw"]]>; + } + let OverloadedName = "vfncvt_xu" in { + defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "si", [["Uv", "UvFw"]]>; + let RequiredFeatures = ["Zvfh"] in + defm : RVVConvBuiltinSet<"vfncvt_xu_f_w", "c", [["Uv", "UvFw"]]>; } let OverloadedName = "vfncvt_f" in { - defm : - RVVConvBuiltinSet<"vfncvt_f_x_w", "xf", [["v", "vIw"]]>; - defm : - RVVConvBuiltinSet<"vfncvt_f_xu_w", "xf", [["v", "vUw"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_x_w", "f", [["v", "vIw"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_xu_w", "f", [["v", "vUw"]]>; + let RequiredFeatures = ["Zvfh"] in { + defm : RVVConvBuiltinSet<"vfncvt_f_x_w", "x", [["v", "vIw"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_xu_w", "x", [["v", "vUw"]]>; + } } let OverloadedName = "vfncvt_f" in { defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "f", [["v", "vw"]]>; let RequiredFeatures = ["Zvfhmin"] in - defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "x", [["v", "vw"]]>; + defm : RVVConvBuiltinSet<"vfncvt_f_f_w", "x", [["v", "vw"]]>; } } @@ -2281,15 +2341,24 @@ let HasMasked = false, MaskedPolicyScheme = NonePolicy in { // 16.2. 
Floating-Point Scalar Move Instructions let HasMasked = false, MaskedPolicyScheme = NonePolicy in { - let HasVL = false, OverloadedName = "vfmv_f" in - defm vfmv_f : RVVOp0BuiltinSet<"vfmv_f_s", "xfd", + let HasVL = false, OverloadedName = "vfmv_f" in { + defm vfmv_f : RVVOp0BuiltinSet<"vfmv_f_s", "fd", + [["s", "ve", "ev"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfmv_f : RVVOp0BuiltinSet<"vfmv_f_s", "x", [["s", "ve", "ev"]]>; + } let OverloadedName = "vfmv_s", UnMaskedPolicyScheme = HasPassthruOperand, - SupportOverloading = false in - defm vfmv_s : RVVOutBuiltinSet<"vfmv_s_f", "xfd", + SupportOverloading = false in { + defm vfmv_s : RVVOutBuiltinSet<"vfmv_s_f", "fd", + [["f", "v", "ve"], + ["x", "Uv", "UvUe"]]>; + let RequiredFeatures = ["Zvfh"] in + defm vfmv_s : RVVOutBuiltinSet<"vfmv_s_f", "x", [["f", "v", "ve"], ["x", "Uv", "UvUe"]]>; + } } // 16.3. Vector Slide Instructions @@ -2325,7 +2394,10 @@ let RequiredFeatures = ["Zvfbfmin"] in { defm vrgather : RVVOutBuiltinSet<"vrgather_vx", "y", [["vx", "v", "vvz"]]>; } -defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "csilxfd", +defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "csilfd", + [["vv", "v", "vv(Log2EEW:4)Uv"]]>; +let RequiredFeatures = ["Zvfh"] in +defm vrgatherei16 : RVVOutBuiltinSet<"vrgatherei16_vv", "x", [["vv", "v", "vv(Log2EEW:4)Uv"]]>; // unsigned type defm vrgather : RVVOutBuiltinSet<"vrgather_vv", "csil", diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td index b38ca7341361c4..ee06d740bb1686 100644 --- a/clang/include/clang/Basic/riscv_vector_common.td +++ b/clang/include/clang/Basic/riscv_vector_common.td @@ -458,52 +458,91 @@ let HasMaskedOffOperand = false in { ["vx", "Uv", "UvUvUeUv"]]>; } multiclass RVVFloatingTerBuiltinSet { - defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1BuiltinSet; } multiclass RVVFloatingTerBuiltinSetRoundingMode { - defm "" : 
RVVOutOp1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1BuiltinSet; } } let HasMaskedOffOperand = false, Log2LMUL = [-2, -1, 0, 1, 2] in { multiclass RVVFloatingWidenTerBuiltinSet { - defm "" : RVVOutOp1Op2BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1Op2BuiltinSet; } multiclass RVVFloatingWidenTerBuiltinSetRoundingMode { - defm "" : RVVOutOp1Op2BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1Op2BuiltinSet; } } -multiclass RVVFloatingBinBuiltinSet - : RVVOutOp1BuiltinSet; +multiclass RVVFloatingBinBuiltinSet { + defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1BuiltinSet; +} -multiclass RVVFloatingBinBuiltinSetRoundingMode - : RVVOutOp1BuiltinSet; +multiclass RVVFloatingBinBuiltinSetRoundingMode { + defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1BuiltinSet; +} -multiclass RVVFloatingBinVFBuiltinSet - : RVVOutOp1BuiltinSet; +multiclass RVVFloatingBinVFBuiltinSet { + defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1BuiltinSet; +} -multiclass RVVFloatingBinVFBuiltinSetRoundingMode - : RVVOutOp1BuiltinSet; +multiclass RVVFloatingBinVFBuiltinSetRoundingMode { + defm "" : RVVOutOp1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp1BuiltinSet; +} -multiclass RVVFloatingMaskOutBuiltinSet - : RVVOp0Op1BuiltinSet; +multiclass RVVFloatingMaskOutBuiltinSet { + defm "" : RVVOp0Op1BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOp0Op1BuiltinSet; +} multiclass RVVFloatingMaskOutVFBuiltinSet : RVVOp0Op1BuiltinSet : RVVOp0Builtin<"m", prototype, "c"> { let UnMaskedPolicyScheme = HasPolicyOperand, HasMaskedOffOperand = false in { multiclass RVVSlideUpBuiltinSet { - defm "" : RVVOutBuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutBuiltinSet; defm "" : RVVOutBuiltinSet; } @@ -569,8 +611,11 @@ let UnMaskedPolicyScheme = HasPassthruOperand, 
IntrinsicTypes = {ResultType, Ops.back()->getType()}; }] in { multiclass RVVSlideDownBuiltinSet { - defm "" : RVVOutBuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutBuiltinSet; defm "" : RVVOutBuiltinSet; } @@ -611,20 +656,32 @@ let HasMaskedOffOperand = true in { [["vs", "UvUSv", "USvUvUSv"]]>; } multiclass RVVFloatingReductionBuiltin { - defm "" : RVVOutOp0BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp0BuiltinSet; } multiclass RVVFloatingReductionBuiltinRoundingMode { - defm "" : RVVOutOp0BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp0BuiltinSet; } multiclass RVVFloatingWidenReductionBuiltin { - defm "" : RVVOutOp0BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp0BuiltinSet; } multiclass RVVFloatingWidenReductionBuiltinRoundingMode { - defm "" : RVVOutOp0BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVOutOp0BuiltinSet; } } @@ -684,22 +741,42 @@ multiclass RVVUnsignedWidenOp0BinBuiltinSet [["wv", "Uw", "UwUwUv"], ["wx", "Uw", "UwUwUe"]]>; -multiclass RVVFloatingWidenBinBuiltinSet - : RVVWidenBuiltinSet; +multiclass RVVFloatingWidenBinBuiltinSet { + defm "" : RVVWidenBuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVWidenBuiltinSet; +} -multiclass RVVFloatingWidenBinBuiltinSetRoundingMode - : RVVWidenBuiltinSet; +multiclass RVVFloatingWidenBinBuiltinSetRoundingMode { + defm "" : RVVWidenBuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVWidenBuiltinSet; +} -multiclass RVVFloatingWidenOp0BinBuiltinSet - : RVVWidenWOp0BuiltinSet; +multiclass RVVFloatingWidenOp0BinBuiltinSet { + defm "" : RVVWidenWOp0BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVWidenWOp0BuiltinSet; +} -multiclass RVVFloatingWidenOp0BinBuiltinSetRoundingMode - : RVVWidenWOp0BuiltinSet; +multiclass RVVFloatingWidenOp0BinBuiltinSetRoundingMode { + defm "" : RVVWidenWOp0BuiltinSet; + let RequiredFeatures = ["Zvfh"] in + defm "" : RVVWidenWOp0BuiltinSet; +} diff 
--git a/clang/include/clang/CIR/CIRGenerator.h b/clang/include/clang/CIR/CIRGenerator.h index aa1a7e64459b35..c8ca7e4bfa7285 100644 --- a/clang/include/clang/CIR/CIRGenerator.h +++ b/clang/include/clang/CIR/CIRGenerator.h @@ -25,14 +25,15 @@ namespace clang { class DeclGroupRef; class DiagnosticsEngine; +namespace CIRGen { +class CIRGenModule; +} // namespace CIRGen } // namespace clang namespace mlir { class MLIRContext; } // namespace mlir namespace cir { -class CIRGenModule; - class CIRGenerator : public clang::ASTConsumer { virtual void anchor(); clang::DiagnosticsEngine &diags; @@ -44,7 +45,7 @@ class CIRGenerator : public clang::ASTConsumer { protected: std::unique_ptr mlirCtx; - std::unique_ptr cgm; + std::unique_ptr cgm; public: CIRGenerator(clang::DiagnosticsEngine &diags, diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index 69d6e9774942b9..305a06427ed0e0 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -22,7 +22,7 @@ def CIR_Dialect : Dialect { let summary = "A high-level dialect for analyzing and optimizing Clang " "supported languages"; - let cppNamespace = "::mlir::cir"; + let cppNamespace = "::cir"; let useDefaultAttributePrinterParser = 0; let useDefaultTypePrinterParser = 0; @@ -31,13 +31,15 @@ def CIR_Dialect : Dialect { void registerAttributes(); void registerTypes(); - Type parseType(DialectAsmParser &parser) const override; - void printType(Type type, DialectAsmPrinter &printer) const override; + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const override; - Attribute parseAttribute(DialectAsmParser &parser, - Type type) const override; + mlir::Attribute parseAttribute(mlir::DialectAsmParser &parser, + mlir::Type type) const override; - void printAttribute(Attribute attr, DialectAsmPrinter &os) const override; + void 
printAttribute(mlir::Attribute attr, + mlir::DialectAsmPrinter &os) const override; }]; } diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index c0440faa3c7b17..4462eb6fc00bae 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -51,12 +51,12 @@ include "mlir/Interfaces/SideEffectInterfaces.td" // following: // // class CIRFooOpLowering -// : public mlir::OpConversionPattern { +// : public mlir::OpConversionPattern { // public: -// using OpConversionPattern::OpConversionPattern; +// using OpConversionPattern::OpConversionPattern; // // mlir::LogicalResult matchAndRewrite( -// mlir::cir::FooOp op, +// cir::FooOp op, // OpAdaptor adaptor, // mlir::ConversionPatternRewriter &rewriter) const override { // rewriter.replaceOpWithNewOp( @@ -92,7 +92,7 @@ def FuncOp : CIR_Op<"func"> { let skipDefaultBuilders = 1; - let builders = [OpBuilder<(ins "StringRef":$name)>]; + let builders = [OpBuilder<(ins "llvm::StringRef":$name)>]; let hasCustomAssemblyFormat = 1; let hasVerifier = 1; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0dba5672c5a85d..1304ef3c5a228b 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6297,6 +6297,8 @@ def mamx_int8 : Flag<["-"], "mamx-int8">, Group; def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group; def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group; def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group; +def mamx_tf32 : Flag<["-"], "mamx-tf32">, Group; +def mno_amx_tf32 : Flag<["-"], "mno-amx-tf32">, Group; def mamx_tile : Flag<["-"], "mamx-tile">, Group; def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group; def mamx_transpose : Flag<["-"], "mamx-transpose">, Group; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index fad446a05e782f..d6f3508a5243f3 100644 --- a/clang/include/clang/Sema/Sema.h +++ 
b/clang/include/clang/Sema/Sema.h @@ -1760,6 +1760,14 @@ class Sema final : public SemaBase { /// Add [[gsl::Pointer]] attributes for std:: types. void inferGslPointerAttribute(TypedefNameDecl *TD); + LifetimeCaptureByAttr *ParseLifetimeCaptureByAttr(const ParsedAttr &AL, + StringRef ParamName); + // Processes the argument 'X' in [[clang::lifetime_capture_by(X)]]. Since 'X' + // can be the name of a function parameter, we need to parse the function + // declaration and rest of the parameters before processesing 'X'. Therefore + // do this lazily instead of processing while parsing the annotation itself. + void LazyProcessLifetimeCaptureByParams(FunctionDecl *FD); + /// Add _Nullable attributes for std:: types. void inferNullableClassAttribute(CXXRecordDecl *CRD); diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h index 53a4a6991ad67a..1c6ac4b0b51f40 100644 --- a/clang/include/clang/Sema/SemaOpenACC.h +++ b/clang/include/clang/Sema/SemaOpenACC.h @@ -34,11 +34,6 @@ class OpenACCClause; class SemaOpenACC : public SemaBase { private: - /// A collection of loop constructs in the compute construct scope that - /// haven't had their 'parent' compute construct set yet. Entires will only be - /// made to this list in the case where we know the loop isn't an orphan. - llvm::SmallVector ParentlessLoopConstructs; - struct ComputeConstructInfo { /// Which type of compute construct we are inside of, which we can use to /// determine whether we should add loops to the above collection. 
We can @@ -768,7 +763,6 @@ class SemaOpenACC : public SemaBase { SourceLocation OldLoopWorkerClauseLoc; SourceLocation OldLoopVectorClauseLoc; SourceLocation OldLoopWithoutSeqLoc; - llvm::SmallVector ParentlessLoopConstructs; llvm::SmallVector ActiveReductionClauses; LoopInConstructRAII LoopRAII; diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 047f354b200745..f33d2fb1530d17 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1738,13 +1738,12 @@ void NamedDecl::printNestedNameSpecifier(raw_ostream &OS, // Suppress inline namespace if it doesn't make the result ambiguous. if (Ctx->isInlineNamespace() && NameInScope) { - bool isRedundant = - cast(Ctx)->isRedundantInlineQualifierFor(NameInScope); if (P.SuppressInlineNamespace == PrintingPolicy::SuppressInlineNamespaceMode::All || (P.SuppressInlineNamespace == PrintingPolicy::SuppressInlineNamespaceMode::Redundant && - isRedundant)) { + cast(Ctx)->isRedundantInlineQualifierFor( + NameInScope))) { continue; } } diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index 2d864a28857966..82125538acaaa7 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -28,44 +28,15 @@ OpenACCComputeConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) { OpenACCComputeConstruct *OpenACCComputeConstruct::Create( const ASTContext &C, OpenACCDirectiveKind K, SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation EndLoc, - ArrayRef Clauses, Stmt *StructuredBlock, - ArrayRef AssociatedLoopConstructs) { + ArrayRef Clauses, Stmt *StructuredBlock) { void *Mem = C.Allocate( OpenACCComputeConstruct::totalSizeToAlloc( Clauses.size())); auto *Inst = new (Mem) OpenACCComputeConstruct(K, BeginLoc, DirLoc, EndLoc, Clauses, StructuredBlock); - - llvm::for_each(AssociatedLoopConstructs, [&](OpenACCLoopConstruct *C) { - C->setParentComputeConstruct(Inst); - }); - return Inst; } -void OpenACCComputeConstruct::findAndSetChildLoops() { - struct 
LoopConstructFinder : RecursiveASTVisitor { - OpenACCComputeConstruct *Construct = nullptr; - - LoopConstructFinder(OpenACCComputeConstruct *Construct) - : Construct(Construct) {} - - bool TraverseOpenACCComputeConstruct(OpenACCComputeConstruct *C) { - // Stop searching if we find a compute construct. - return true; - } - bool TraverseOpenACCLoopConstruct(OpenACCLoopConstruct *C) { - // Stop searching if we find a loop construct, after taking ownership of - // it. - C->setParentComputeConstruct(Construct); - return true; - } - }; - - LoopConstructFinder f(this); - f.TraverseStmt(getAssociatedStmt()); -} - OpenACCLoopConstruct::OpenACCLoopConstruct(unsigned NumClauses) : OpenACCAssociatedStmtConstruct( OpenACCLoopConstructClass, OpenACCDirectiveKind::Loop, @@ -79,11 +50,13 @@ OpenACCLoopConstruct::OpenACCLoopConstruct(unsigned NumClauses) } OpenACCLoopConstruct::OpenACCLoopConstruct( - SourceLocation Start, SourceLocation DirLoc, SourceLocation End, + OpenACCDirectiveKind ParentKind, SourceLocation Start, + SourceLocation DirLoc, SourceLocation End, ArrayRef Clauses, Stmt *Loop) : OpenACCAssociatedStmtConstruct(OpenACCLoopConstructClass, OpenACCDirectiveKind::Loop, Start, DirLoc, - End, Loop) { + End, Loop), + ParentComputeConstructKind(ParentKind) { // accept 'nullptr' for the loop. This is diagnosed somewhere, but this gives // us some level of AST fidelity in the error case. 
assert((Loop == nullptr || isa(Loop)) && @@ -96,12 +69,6 @@ OpenACCLoopConstruct::OpenACCLoopConstruct( Clauses.size())); } -void OpenACCLoopConstruct::setLoop(Stmt *Loop) { - assert((isa(Loop)) && - "Associated Loop not a for loop?"); - setAssociatedStmt(Loop); -} - OpenACCLoopConstruct *OpenACCLoopConstruct::CreateEmpty(const ASTContext &C, unsigned NumClauses) { void *Mem = @@ -111,15 +78,14 @@ OpenACCLoopConstruct *OpenACCLoopConstruct::CreateEmpty(const ASTContext &C, return Inst; } -OpenACCLoopConstruct * -OpenACCLoopConstruct::Create(const ASTContext &C, SourceLocation BeginLoc, - SourceLocation DirLoc, SourceLocation EndLoc, - ArrayRef Clauses, - Stmt *Loop) { +OpenACCLoopConstruct *OpenACCLoopConstruct::Create( + const ASTContext &C, OpenACCDirectiveKind ParentKind, + SourceLocation BeginLoc, SourceLocation DirLoc, SourceLocation EndLoc, + ArrayRef Clauses, Stmt *Loop) { void *Mem = C.Allocate(OpenACCLoopConstruct::totalSizeToAlloc( Clauses.size())); - auto *Inst = - new (Mem) OpenACCLoopConstruct(BeginLoc, DirLoc, EndLoc, Clauses, Loop); + auto *Inst = new (Mem) + OpenACCLoopConstruct(ParentKind, BeginLoc, DirLoc, EndLoc, Clauses, Loop); return Inst; } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 01cfb1f63f7087..b54c166fc8d9c4 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -2928,7 +2928,7 @@ void TextNodeDumper::VisitOpenACCLoopConstruct(const OpenACCLoopConstruct *S) { if (S->isOrphanedLoopConstruct()) OS << " "; else - OS << " parent: " << S->getParentComputeConstruct(); + OS << " parent: " << S->getParentComputeConstructKind(); } void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) { diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 6d8db5cf4ffd22..a073a6a4b7d454 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -25,6 +25,7 @@ #include "clang/AST/TextNodeDumper.h" #include "clang/AST/Type.h" #include 
"clang/Basic/AddressSpaces.h" +#include "clang/Basic/AttrKinds.h" #include "clang/Basic/ExceptionSpecificationType.h" #include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" @@ -1909,6 +1910,19 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, OS << " [[clang::lifetimebound]]"; return; } + if (T->getAttrKind() == attr::LifetimeCaptureBy) { + OS << " [[clang::lifetime_capture_by("; + if (auto *attr = dyn_cast_or_null(T->getAttr())) { + auto Idents = attr->getArgIdents(); + for (unsigned I = 0; I < Idents.size(); ++I) { + OS << Idents[I]->getName(); + if (I != Idents.size() - 1) + OS << ", "; + } + } + OS << ")]]"; + return; + } // The printing of the address_space attribute is handled by the qualifier // since it is still stored in the qualifier. Return early to prevent printing @@ -1976,6 +1990,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::SizedBy: case attr::SizedByOrNull: case attr::LifetimeBound: + case attr::LifetimeCaptureBy: case attr::TypeNonNull: case attr::TypeNullable: case attr::TypeNullableResult: diff --git a/clang/lib/Basic/Attributes.cpp b/clang/lib/Basic/Attributes.cpp index 2d18fb3f9d5bb2..6904bce3ac51ec 100644 --- a/clang/lib/Basic/Attributes.cpp +++ b/clang/lib/Basic/Attributes.cpp @@ -18,6 +18,7 @@ #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSwitch.h" using namespace clang; @@ -155,26 +156,17 @@ std::string AttributeCommonInfo::getNormalizedFullName() const { normalizeName(getAttrName(), getScopeName(), getSyntax())); } -// Sorted list of attribute scope names -static constexpr std::pair ScopeList[] = - {{"", AttributeCommonInfo::Scope::NONE}, - {"clang", AttributeCommonInfo::Scope::CLANG}, - {"gnu", AttributeCommonInfo::Scope::GNU}, - {"gsl", AttributeCommonInfo::Scope::GSL}, - {"hlsl", AttributeCommonInfo::Scope::HLSL}, - {"msvc", AttributeCommonInfo::Scope::MSVC}, - {"omp", AttributeCommonInfo::Scope::OMP}, - {"riscv", 
AttributeCommonInfo::Scope::RISCV}}; - AttributeCommonInfo::Scope getScopeFromNormalizedScopeName(StringRef ScopeName) { - auto It = std::lower_bound( - std::begin(ScopeList), std::end(ScopeList), ScopeName, - [](const std::pair &Element, - StringRef Value) { return Element.first < Value; }); - assert(It != std::end(ScopeList) && It->first == ScopeName); - - return It->second; + return llvm::StringSwitch(ScopeName) + .Case("", AttributeCommonInfo::Scope::NONE) + .Case("clang", AttributeCommonInfo::Scope::CLANG) + .Case("gnu", AttributeCommonInfo::Scope::GNU) + .Case("gsl", AttributeCommonInfo::Scope::GSL) + .Case("hlsl", AttributeCommonInfo::Scope::HLSL) + .Case("msvc", AttributeCommonInfo::Scope::MSVC) + .Case("omp", AttributeCommonInfo::Scope::OMP) + .Case("riscv", AttributeCommonInfo::Scope::RISCV); } unsigned AttributeCommonInfo::calculateAttributeSpellingListIndex() const { diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 3fd43373cab445..e35ee2b7b9c385 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -473,7 +473,7 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasSVE2p1) Builder.defineMacro("__ARM_FEATURE_SVE2p1", "1"); - if (HasSVE2 && HasSVEAES) + if (HasSVE2 && HasSVE2AES) Builder.defineMacro("__ARM_FEATURE_SVE2_AES", "1"); if (HasSVE2 && HasSVE2BitPerm) @@ -769,7 +769,7 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("f32mm", FPU & SveMode && HasMatmulFP32) .Case("f64mm", FPU & SveMode && HasMatmulFP64) .Case("sve2", FPU & SveMode && HasSVE2) - .Case("sve2-pmull128", FPU & SveMode && HasSVEAES && HasSVE2) + .Case("sve2-pmull128", FPU & SveMode && HasSVE2AES) .Case("sve2-bitperm", FPU & SveMode && HasSVE2BitPerm) .Case("sve2-sha3", FPU & SveMode && HasSVE2SHA3) .Case("sve2-sm4", FPU & SveMode && HasSVE2SM4) @@ -861,10 +861,12 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, HasSVE2 = true; HasSVE2p1 
= true; } - if (Feature == "+sve-aes") { + if (Feature == "+sve2-aes") { FPU |= NeonMode; - HasAES = true; - HasSVEAES = true; + FPU |= SveMode; + HasFullFP16 = true; + HasSVE2 = true; + HasSVE2AES = true; } if (Feature == "+sve2-sha3") { FPU |= NeonMode; diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index 4c25bdb5bb16df..ea3e4015d84265 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -78,7 +78,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { bool HasBFloat16 = false; bool HasSVE2 = false; bool HasSVE2p1 = false; - bool HasSVEAES = false; + bool HasSVE2AES = false; bool HasSVE2SHA3 = false; bool HasSVE2SM4 = false; bool HasSVEB16B16 = false; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 078819183afdac..99f8f2944e2796 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -337,9 +337,12 @@ void AMDGPUTargetInfo::getTargetDefines(const LangOptions &Opts, if (hasFastFMA()) Builder.defineMacro("FP_FAST_FMA"); - Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE__", Twine(WavefrontSize)); - // ToDo: deprecate this macro for naming consistency. 
- Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize)); + Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE__", Twine(WavefrontSize), + "compile-time-constant access to the wavefront size will " + "be removed in a future release"); + Builder.defineMacro("__AMDGCN_WAVEFRONT_SIZE", Twine(WavefrontSize), + "compile-time-constant access to the wavefront size will " + "be removed in a future release"); Builder.defineMacro("__AMDGCN_CUMODE__", Twine(CUMode)); } diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 3c3dbfa13e452b..dc85e9aa77cd3d 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -434,6 +434,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAMXTRANSPOSE = true; } else if (Feature == "+amx-avx512") { HasAMXAVX512 = true; + } else if (Feature == "+amx-tf32") { + HasAMXTF32 = true; } else if (Feature == "+cmpccxadd") { HasCMPCCXADD = true; } else if (Feature == "+raoint") { @@ -959,6 +961,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__AMX_TRANSPOSE__"); if (HasAMXAVX512) Builder.defineMacro("__AMX_AVX512__"); + if (HasAMXTF32) + Builder.defineMacro("__AMX_TF32__"); if (HasCMPCCXADD) Builder.defineMacro("__CMPCCXADD__"); if (HasRAOINT) @@ -1090,6 +1094,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const { .Case("amx-fp16", true) .Case("amx-fp8", true) .Case("amx-int8", true) + .Case("amx-tf32", true) .Case("amx-tile", true) .Case("amx-transpose", true) .Case("avx", true) @@ -1211,6 +1216,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const { .Case("amx-fp16", HasAMXFP16) .Case("amx-fp8", HasAMXFP8) .Case("amx-int8", HasAMXINT8) + .Case("amx-tf32", HasAMXTF32) .Case("amx-tile", HasAMXTILE) .Case("amx-transpose", HasAMXTRANSPOSE) .Case("avx", SSELevel >= AVX) diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 70047731b17295..04b1d5d33ea231 100644 --- 
a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -160,6 +160,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo { bool HasAMXFP8 = false; bool HasAMXTRANSPOSE = false; bool HasAMXAVX512 = false; + bool HasAMXTF32 = false; bool HasSERIALIZE = false; bool HasTSXLDTRK = false; bool HasUSERMSR = false; diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index c1adc7ecbf74dd..4e8a8cc3f4c524 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -22,7 +22,9 @@ #include "mlir/IR/Location.h" #include "mlir/IR/MLIRContext.h" -using namespace cir; +using namespace clang; +using namespace clang::CIRGen; + CIRGenModule::CIRGenModule(mlir::MLIRContext &context, clang::ASTContext &astctx, const clang::CodeGenOptions &cgo, @@ -75,7 +77,7 @@ void CIRGenModule::buildGlobal(clang::GlobalDecl gd) { void CIRGenModule::buildGlobalFunctionDefinition(clang::GlobalDecl gd, mlir::Operation *op) { auto const *funcDecl = cast(gd.getDecl()); - auto funcOp = builder.create( + auto funcOp = builder.create( getLoc(funcDecl->getSourceRange()), funcDecl->getIdentifier()->getName()); theModule.push_back(funcOp); } diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 2bf6a5d9c8f597..9e5950ff71c528 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -31,10 +31,8 @@ class LangOptions; class SourceLocation; class SourceRange; class TargetInfo; -} // namespace clang -using namespace clang; -namespace cir { +namespace CIRGen { /// This class organizes the cross-function state that is used while generating /// CIR code. 
@@ -91,6 +89,8 @@ class CIRGenModule : public CIRGenTypeCache { DiagnosticBuilder errorNYI(SourceRange, llvm::StringRef); DiagnosticBuilder errorNYI(SourceRange, llvm::StringRef, llvm::StringRef); }; -} // namespace cir +} // namespace CIRGen + +} // namespace clang #endif // LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENMODULE_H diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h index 6478e0a0780994..fde9a355f52416 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -13,7 +13,7 @@ #ifndef LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H #define LLVM_CLANG_LIB_CIR_CIRGENTYPECACHE_H -namespace cir { +namespace clang::CIRGen { /// This structure provides a set of types that are commonly used /// during IR emission. It's initialized once in CodeGenModule's @@ -22,6 +22,6 @@ struct CIRGenTypeCache { CIRGenTypeCache() = default; }; -} // namespace cir +} // namespace clang::CIRGen #endif // LLVM_CLANG_LIB_CIR_CODEGEN_CIRGENTYPECACHE_H diff --git a/clang/lib/CIR/CodeGen/CIRGenerator.cpp b/clang/lib/CIR/CodeGen/CIRGenerator.cpp index 152124a00b2bbd..85367a916ef783 100644 --- a/clang/lib/CIR/CodeGen/CIRGenerator.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenerator.cpp @@ -35,9 +35,9 @@ void CIRGenerator::Initialize(ASTContext &astCtx) { this->astCtx = &astCtx; mlirCtx = std::make_unique(); - mlirCtx->loadDialect(); - cgm = std::make_unique(*mlirCtx.get(), astCtx, codeGenOpts, - diags); + mlirCtx->loadDialect(); + cgm = std::make_unique(*mlirCtx.get(), astCtx, + codeGenOpts, diags); } mlir::ModuleOp CIRGenerator::getModule() const { return cgm->getModule(); } diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp index 6d74d72b77dca7..7d42da1ab20d76 100644 --- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp @@ -13,7 +13,7 @@ #include "clang/CIR/Dialect/IR/CIRDialect.h" using namespace mlir; -using namespace mlir::cir; +using namespace cir; 
//===----------------------------------------------------------------------===// // General CIR parsing / printing diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index e0b38a2902bdbb..f666e5ab4b9990 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -17,13 +17,13 @@ #include "clang/CIR/Dialect/IR/CIROpsDialect.cpp.inc" using namespace mlir; -using namespace mlir::cir; +using namespace cir; //===----------------------------------------------------------------------===// // CIR Dialect //===----------------------------------------------------------------------===// -void mlir::cir::CIRDialect::initialize() { +void cir::CIRDialect::initialize() { registerTypes(); registerAttributes(); addOperations< @@ -36,8 +36,8 @@ void mlir::cir::CIRDialect::initialize() { // FuncOp //===----------------------------------------------------------------------===// -void mlir::cir::FuncOp::build(OpBuilder &builder, OperationState &result, - StringRef name) { +void cir::FuncOp::build(OpBuilder &builder, OperationState &result, + StringRef name) { result.addAttribute(SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)); } @@ -56,7 +56,7 @@ void cir::FuncOp::print(OpAsmPrinter &p) { p.printSymbolName(getSymName()); } -mlir::LogicalResult mlir::cir::FuncOp::verify() { return success(); } +mlir::LogicalResult cir::FuncOp::verify() { return success(); } //===----------------------------------------------------------------------===// // TableGen'd op method definitions diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp index 167c237ae5515c..4eeb70f06f5f76 100644 --- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp @@ -13,7 +13,7 @@ #include "clang/CIR/Dialect/IR/CIRDialect.h" using namespace mlir; -using namespace mlir::cir; +using namespace cir; 
//===----------------------------------------------------------------------===// // General CIR parsing / printing diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 1c32a675380c7f..390516fea38498 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -5556,12 +5556,14 @@ void CodeGenFunction::EmitOMPDepobjDirective(const OMPDepobjDirective &S) { const auto *DO = S.getSingleClause(); LValue DOLVal = EmitLValue(DO->getDepobj()); if (const auto *DC = S.getSingleClause()) { - OMPTaskDataTy::DependData Dependencies(DC->getDependencyKind(), - DC->getModifier()); - Dependencies.DepExprs.append(DC->varlist_begin(), DC->varlist_end()); - Address DepAddr = CGM.getOpenMPRuntime().emitDepobjDependClause( - *this, Dependencies, DC->getBeginLoc()); - EmitStoreOfScalar(DepAddr.emitRawPointer(*this), DOLVal); + // Build list and emit dependences + OMPTaskDataTy Data; + buildDependences(S, Data); + for (auto &Dep : Data.Dependences) { + Address DepAddr = CGM.getOpenMPRuntime().emitDepobjDependClause( + *this, Dep, DC->getBeginLoc()); + EmitStoreOfScalar(DepAddr.emitRawPointer(*this), DOLVal); + } return; } if (const auto *DC = S.getSingleClause()) { diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 60fde03289cf35..3f95a1efb2eed7 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -67,7 +67,7 @@ CreateFrontendBaseAction(CompilerInstance &CI) { case EmitBC: return std::make_unique(); case EmitCIR: #if CLANG_ENABLE_CIR - return std::make_unique<::cir::EmitCIRAction>(); + return std::make_unique(); #else llvm_unreachable("CIR suppport not built into clang"); #endif diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 76366ca1f108e9..225bf131aeab41 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ 
b/clang/lib/Headers/CMakeLists.txt @@ -151,6 +151,8 @@ set(x86_files amxfp16intrin.h amxfp8intrin.h amxintrin.h + amxtf32intrin.h + amxtf32transposeintrin.h amxtransposeintrin.h avx10_2_512bf16intrin.h avx10_2_512convertintrin.h diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h index 945edea543e706..e4d77e503015af 100644 --- a/clang/lib/Headers/amxavx512intrin.h +++ b/clang/lib/Headers/amxavx512intrin.h @@ -12,7 +12,7 @@ #ifndef __AMX_AVX512INTRIN_H #define __AMX_AVX512INTRIN_H -#ifdef __x86_64__ +#if defined(__x86_64__) && defined(__SSE2__) #define __DEFAULT_FN_ATTRS_AVX512 \ __attribute__((__always_inline__, __nodebug__, \ @@ -378,5 +378,5 @@ static __m512i __tile_movrow(__tile1024i src0, unsigned src1) { return (__m512i)_tile_movrow_internal(src0.row, src0.col, src0.tile, src1); } -#endif // __x86_64__ +#endif // __x86_64__ && __SSE2__ #endif // __AMX_AVX512INTRIN_H diff --git a/clang/lib/Headers/amxtf32intrin.h b/clang/lib/Headers/amxtf32intrin.h new file mode 100644 index 00000000000000..44d002c6600d6f --- /dev/null +++ b/clang/lib/Headers/amxtf32intrin.h @@ -0,0 +1,108 @@ +/*===------------- amxtf32intrin.h - AMX_TF32 intrinsics -*- C++ -*---------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_TF32INTRIN_H +#define __AMX_TF32INTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TF32 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-tf32"))) + +/// Do Matrix Multiplication of \a a and \a b, and then do Matrix Plus +/// with \a srcdst. +/// All the calculation is base on float32 but with the lower 13-bit set to 0. 
+/// +/// \headerfile +/// +/// \code +/// void _tile_mmultf32ps(constexpr int srcdst, constexpr int a, \ +/// constexpr int b); +/// \endcode +/// +/// This intrinsic corresponds to the TMMULTF32PS instruction. +/// +/// \param srcdst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. +/// +/// \code{.operation} +/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { +/// dword[12:0] := 0 +/// dword[31:13] := x[31:13] +/// return dword +/// } +/// +/// DEFINE silence_snan_fp32(x[31:0]) { +/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) +/// x.fraction[22] := 1 +/// return x +/// } +/// +/// elements_a := a.colsb / 4 +/// elements_dest := srcdst.colsb / 4 +/// +/// FOR m = 0 TO (srcdst.rows-1) +/// tmp[511:0] := 0 +/// FOR k = 0 TO (elements_a-1) +/// FOR n = 0 TO (elements_dest-1) +/// af := silence_snan_fp32(a.row[m].fp32[k]) +/// bf := silence_snan_fp32(b.row[k].fp32[n]) +/// tmp.fp32[n] += zero_lower_mantissa_bits_fp32(af) +/// * zero_lower_mantissa_bits_fp32(bf) +/// ENDFOR +/// ENDFOR +/// +/// FOR n = 0 TO (elements_dest-1) +/// tmp.fp32[n] += srcdst.row[m].fp32[n] +/// ENDFOR +/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) +/// +/// ENDFOR +/// +/// zero_upper_rows(srcdst, srcdst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_mmultf32ps(srcdst, a, b) \ + __builtin_ia32_tmmultf32ps((srcdst), (a), (b)) + +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32 +_tile_mmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tmmultf32ps_internal(m, n, k, dst, src1, src2); +} + +/// Do Matrix Multiplication of src0 and src1, and then do Matrix Plus with dst. +/// All the calculation is base on float32 but with the lower 13-bit set to 0. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the TMMULTF32PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_TF32 +static void __tile_mmultf32ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_mmultf32ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#endif // __x86_64__ +#endif // __AMX_TF32INTRIN_H diff --git a/clang/lib/Headers/amxtf32transposeintrin.h b/clang/lib/Headers/amxtf32transposeintrin.h new file mode 100644 index 00000000000000..60336f953ecb7a --- /dev/null +++ b/clang/lib/Headers/amxtf32transposeintrin.h @@ -0,0 +1,105 @@ +/*===--------- amxtf32transposeintrin.h - AMX-TF32 and AMX-TRANSPOSE --------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif // __IMMINTRIN_H + +#ifndef __AMX_TF32TRANSPOSEINTRIN_H +#define __AMX_TF32TRANSPOSEINTRIN_H +#ifdef __x86_64__ + +#define __DEFAULT_FN_ATTRS_TF32_TRANSPOSE \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("amx-tf32,amx-transpose"))) + +/// \code +/// void _tile_tmmultf32ps(constexpr int srcdst, constexpr int a, \ +/// constexpr int b); +/// \endcode +/// +/// This intrinsic corresponds to the TTMMULTF32PS instruction. +/// +/// \param srcdst +/// The destination tile. Max size is 1024 Bytes. +/// \param a +/// The 1st source tile. Max size is 1024 Bytes. +/// \param b +/// The 2nd source tile. Max size is 1024 Bytes. 
+/// +/// \code{.operation} +/// DEFINE zero_lower_mantissa_bits_fp32(x[31:0]) { +/// dword[12:0] := 0 +/// dword[31:13] := x[31:13] +/// return dword +/// } +/// +/// DEFINE silence_snan_fp32(x[31:0]) { +/// IF (x.exponent == 255 and x.fraction != 0 and x.fraction[22] == 0) +/// x.fraction[22] := 1 +/// return x +/// } +/// +/// elements_dest:= srcdst.colsb/4 +/// +/// FOR m := 0 TO (srcdst.rows-1) +/// tmp[511:0] := 0 +/// FOR k := 0 TO (a.rows-1) +/// FOR n := 0 TO (elements_dest-1) +/// a1e := silence_snan_fp32(a.row[k].fp32[m]) +/// a2e := silence_snan_fp32(b.row[k].fp32[n]) +/// s1e := zero_lower_mantissa_bits_fp32(a1e) +/// s2e := zero_lower_mantissa_bits_fp32(a2e) +/// tmp.fp32[n] += s1e * s2e +/// ENDFOR +/// ENDFOR +/// +/// FOR n := 0 TO (elements_dest-1) +/// tmp.fp32[n] += srcdst.row[m].fp32[n] +/// ENDFOR +/// write_row_and_zero(srcdst, m, tmp, srcdst.colsb) +/// +/// ENDFOR +/// +/// zero_upper_rows(srcdst, srcdst.rows) +/// zero_tileconfig_start() +/// \endcode +#define _tile_tmmultf32ps(srcdst, a, b) \ + __builtin_ia32_ttmmultf32ps((srcdst), (a), (b)) + +// dst = m x n (srcdest), src1 = k x m, src2 = k x n +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_TF32_TRANSPOSE +_tile_tmmultf32ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_ttmmultf32ps_internal(m, n, k, dst, src1, src2); +} + +/// Compute transpose and do Matrix Multiplication of src0 and src1, and then do +/// Matrix Plus with dst. All the calculation is base on float32 but with the +/// lower 13-bit set to 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TTMMULTF32PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. 
+__DEFAULT_FN_ATTRS_TF32_TRANSPOSE +static void __tile_tmmultf32ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_tmmultf32ps_internal(src0.row, src1.col, src0.col, + dst->tile, src0.tile, src1.tile); +} + +#endif // __x86_64__ +#endif // __AMX_TF32TRANSPOSEINTRIN_H diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index bc240e28d59142..87a502238ae162 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -660,6 +660,15 @@ _storebe_i64(void * __P, long long __D) { #include #endif +#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_TF32__) +#include +#endif + +#if !defined(__SCE__) || __has_feature(modules) || \ + (defined(__AMX_TF32__) && defined(__AMX_TRANSPOSE__)) +#include +#endif + #if !defined(__SCE__) || __has_feature(modules) || \ defined(__AVX512VP2INTERSECT__) #include diff --git a/clang/lib/Headers/openmp_wrappers/complex_cmath.h b/clang/lib/Headers/openmp_wrappers/complex_cmath.h index e3d9aebbbc2436..cee36bde3f522e 100644 --- a/clang/lib/Headers/openmp_wrappers/complex_cmath.h +++ b/clang/lib/Headers/openmp_wrappers/complex_cmath.h @@ -64,8 +64,13 @@ template __DEVICE__ _Tp norm(const std::complex<_Tp> &__c) { } // conj - -template std::complex<_Tp> conj(const std::complex<_Tp> &__c) { +#ifdef _GLIBCXX20_CONSTEXPR +#define CXX20_CONSTEXPR_DEVICE __DEVICE__ +#else +#define CXX20_CONSTEXPR_DEVICE +#endif +template +CXX20_CONSTEXPR_DEVICE std::complex<_Tp> conj(const std::complex<_Tp> &__c) { return std::complex<_Tp>(__c.real(), -__c.imag()); } diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 6eb24c578f602c..cac15b974a276e 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -502,8 +502,8 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .addSimpleTemplateParams(*SemaPtr, {"element_type"}) .Record; onCompletion(Decl, 
[this](CXXRecordDecl *Decl) { - setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, - ResourceKind::TypedBuffer, /*IsROV=*/false, + setupBufferType(Decl, *SemaPtr, ResourceClass::SRV, ResourceKind::RawBuffer, + /*IsROV=*/false, /*RawBuffer=*/true) .addArraySubscriptOperators() .completeDefinition(); @@ -513,13 +513,35 @@ void HLSLExternalSemaSource::defineHLSLTypesWithForwardDeclarations() { .addSimpleTemplateParams(*SemaPtr, {"element_type"}) .Record; onCompletion(Decl, [this](CXXRecordDecl *Decl) { - setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, - ResourceKind::TypedBuffer, /*IsROV=*/false, + setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer, + /*IsROV=*/false, /*RawBuffer=*/true) .addArraySubscriptOperators() .completeDefinition(); }); + Decl = + BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "AppendStructuredBuffer") + .addSimpleTemplateParams(*SemaPtr, {"element_type"}) + .Record; + onCompletion(Decl, [this](CXXRecordDecl *Decl) { + setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer, + /*IsROV=*/false, + /*RawBuffer=*/true) + .completeDefinition(); + }); + + Decl = + BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "ConsumeStructuredBuffer") + .addSimpleTemplateParams(*SemaPtr, {"element_type"}) + .Record; + onCompletion(Decl, [this](CXXRecordDecl *Decl) { + setupBufferType(Decl, *SemaPtr, ResourceClass::UAV, ResourceKind::RawBuffer, + /*IsROV=*/false, + /*RawBuffer=*/true) + .completeDefinition(); + }); + Decl = BuiltinTypeDeclBuilder(*SemaPtr, HLSLNamespace, "RasterizerOrderedStructuredBuffer") .addSimpleTemplateParams(*SemaPtr, {"element_type"}) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 61c29e320d5c73..a3bc8e4191c819 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -16687,6 +16687,7 @@ void Sema::AddKnownFunctionAttributes(FunctionDecl *FD) { } } + LazyProcessLifetimeCaptureByParams(FD); inferLifetimeBoundAttribute(FD); 
AddKnownFunctionAttributesForReplaceableGlobalAllocationFunction(FD); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index d05d326178e1b8..b4aaa58c082002 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -14,6 +14,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" #include "clang/AST/CXXInheritance.h" +#include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" @@ -3867,6 +3868,113 @@ static void handleCallbackAttr(Sema &S, Decl *D, const ParsedAttr &AL) { S.Context, AL, EncodingIndices.data(), EncodingIndices.size())); } +LifetimeCaptureByAttr *Sema::ParseLifetimeCaptureByAttr(const ParsedAttr &AL, + StringRef ParamName) { + // At least one capture-by entity is required. + if (AL.getNumArgs() == 0) { + Diag(AL.getLoc(), diag::err_capture_by_attribute_no_entity) + << AL.getRange(); + return nullptr; + } + SmallVector ParamIdents; + SmallVector ParamLocs; + for (unsigned I = 0; I < AL.getNumArgs(); ++I) { + if (AL.isArgExpr(I)) { + Expr *E = AL.getArgAsExpr(I); + Diag(E->getExprLoc(), diag::err_capture_by_attribute_argument_unknown) + << E << E->getExprLoc(); + continue; + } + assert(AL.isArgIdent(I)); + IdentifierLoc *IdLoc = AL.getArgAsIdent(I); + if (IdLoc->Ident->getName() == ParamName) { + Diag(IdLoc->Loc, diag::err_capture_by_references_itself) << IdLoc->Loc; + continue; + } + ParamIdents.push_back(IdLoc->Ident); + ParamLocs.push_back(IdLoc->Loc); + } + SmallVector FakeParamIndices(ParamIdents.size(), + LifetimeCaptureByAttr::INVALID); + LifetimeCaptureByAttr *CapturedBy = ::new (Context) LifetimeCaptureByAttr( + Context, AL, FakeParamIndices.data(), FakeParamIndices.size()); + CapturedBy->setArgs(std::move(ParamIdents), std::move(ParamLocs)); + return CapturedBy; +} + +static void HandleLifetimeCaptureByAttr(Sema &S, Decl *D, + const ParsedAttr &AL) { + // Do not allow multiple attributes. 
+ if (D->hasAttr()) { + S.Diag(AL.getLoc(), diag::err_capture_by_attribute_multiple) + << AL.getRange(); + return; + } + auto *PVD = dyn_cast(D); + assert(PVD); + auto *CaptureByAttr = S.ParseLifetimeCaptureByAttr(AL, PVD->getName()); + if (CaptureByAttr) + D->addAttr(CaptureByAttr); +} + +void Sema::LazyProcessLifetimeCaptureByParams(FunctionDecl *FD) { + bool HasImplicitThisParam = isInstanceMethod(FD); + + llvm::StringMap NameIdxMapping; + NameIdxMapping["global"] = LifetimeCaptureByAttr::GLOBAL; + NameIdxMapping["unknown"] = LifetimeCaptureByAttr::UNKNOWN; + int Idx = 0; + if (HasImplicitThisParam) { + NameIdxMapping["this"] = 0; + Idx++; + } + for (const ParmVarDecl *PVD : FD->parameters()) + NameIdxMapping[PVD->getName()] = Idx++; + auto DisallowReservedParams = [&](StringRef Reserved) { + for (const ParmVarDecl *PVD : FD->parameters()) + if (PVD->getName() == Reserved) + Diag(PVD->getLocation(), diag::err_capture_by_param_uses_reserved_name) + << (PVD->getName() == "unknown"); + }; + auto HandleCaptureBy = [&](LifetimeCaptureByAttr *CapturedBy) { + if (!CapturedBy) + return; + const auto &Entities = CapturedBy->getArgIdents(); + for (size_t I = 0; I < Entities.size(); ++I) { + StringRef Name = Entities[I]->getName(); + auto It = NameIdxMapping.find(Name); + if (It == NameIdxMapping.end()) { + auto Loc = CapturedBy->getArgLocs()[I]; + if (!HasImplicitThisParam && Name == "this") + Diag(Loc, diag::err_capture_by_implicit_this_not_available) << Loc; + else + Diag(Loc, diag::err_capture_by_attribute_argument_unknown) + << Entities[I] << Loc; + continue; + } + if (Name == "unknown" || Name == "global") + DisallowReservedParams(Name); + CapturedBy->setParamIdx(I, It->second); + } + }; + for (ParmVarDecl *PVD : FD->parameters()) + HandleCaptureBy(PVD->getAttr()); + if (!HasImplicitThisParam) + return; + TypeSourceInfo *TSI = FD->getTypeSourceInfo(); + if (!TSI) + return; + AttributedTypeLoc ATL; + for (TypeLoc TL = TSI->getTypeLoc(); + (ATL = TL.getAsAdjusted()); + 
TL = ATL.getModifiedLoc()) { + auto *A = ATL.getAttrAs(); + if (!A) + continue; + HandleCaptureBy(const_cast(A)); + } +} + static bool isFunctionLike(const Type &T) { // Check for explicit function types. // 'called_once' is only supported in Objective-C and it has @@ -6644,6 +6752,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_Callback: handleCallbackAttr(S, D, AL); break; + case ParsedAttr::AT_LifetimeCaptureBy: + HandleLifetimeCaptureByAttr(S, D, AL); + break; case ParsedAttr::AT_CalledOnce: handleCalledOnceAttr(S, D, AL); break; diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index ab728f24d8a271..a76a0a41276896 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -972,6 +972,7 @@ class Analyzer { CallableInfo &CurrentCaller; ViolationSite VSite; const Expr *TrailingRequiresClause = nullptr; + const Expr *NoexceptExpr = nullptr; FunctionBodyASTVisitor(Analyzer &Outer, PendingFunctionAnalysis &CurrentFunction, @@ -986,9 +987,22 @@ class Analyzer { if (auto *Dtor = dyn_cast(CurrentCaller.CDecl)) followDestructor(dyn_cast(Dtor->getParent()), Dtor); - if (auto *FD = dyn_cast(CurrentCaller.CDecl)) + if (auto *FD = dyn_cast(CurrentCaller.CDecl)) { TrailingRequiresClause = FD->getTrailingRequiresClause(); + // Note that FD->getType->getAs() can yield a + // noexcept Expr which has been boiled down to a constant expression. + // Going through the TypeSourceInfo obtains the actual expression which + // will be traversed as part of the function -- unless we capture it + // here and have TraverseStmt skip it. 
+ if (TypeSourceInfo *TSI = FD->getTypeSourceInfo()) { + if (FunctionProtoTypeLoc TL = + TSI->getTypeLoc().getAs()) + if (const FunctionProtoType *FPT = TL.getTypePtr()) + NoexceptExpr = FPT->getNoexceptExpr(); + } + } + // Do an AST traversal of the function/block body TraverseDecl(const_cast(CurrentCaller.CDecl)); } @@ -1269,7 +1283,8 @@ class Analyzer { // We skip the traversal of lambdas (beyond their captures, see // TraverseLambdaExpr below), so just caching this from our constructor // should suffice. - if (Statement != TrailingRequiresClause) + // The exact same is true for a conditional `noexcept()` clause. + if (Statement != TrailingRequiresClause && Statement != NoexceptExpr) return Base::TraverseStmt(Statement); return true; } diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index b9706400f07202..25427bf2309bf7 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -1538,7 +1538,6 @@ SemaOpenACC::AssociatedStmtRAII::AssociatedStmtRAII( CollectActiveReductionClauses(S.ActiveReductionClauses, Clauses); SemaRef.ActiveComputeConstructInfo.Kind = DirKind; SemaRef.ActiveComputeConstructInfo.Clauses = Clauses; - SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs); // OpenACC 3.3 2.9.2: When the parent compute construct is a kernels // construct, the gang clause behaves as follows. ... The region of a loop @@ -1668,9 +1667,8 @@ SemaOpenACC::AssociatedStmtRAII::~AssociatedStmtRAII() { if (DirKind == OpenACCDirectiveKind::Parallel || DirKind == OpenACCDirectiveKind::Serial || DirKind == OpenACCDirectiveKind::Kernels) { - assert(SemaRef.ParentlessLoopConstructs.empty() && - "Didn't consume loop construct list?"); - SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs); + // Nothing really to do here, the restorations above should be enough for + // now. 
} else if (DirKind == OpenACCDirectiveKind::Loop) { // Nothing really to do here, the LoopInConstruct should handle restorations // correctly. @@ -3171,27 +3169,14 @@ StmtResult SemaOpenACC::ActOnEndStmtDirective(OpenACCDirectiveKind K, case OpenACCDirectiveKind::Parallel: case OpenACCDirectiveKind::Serial: case OpenACCDirectiveKind::Kernels: { - auto *ComputeConstruct = OpenACCComputeConstruct::Create( + return OpenACCComputeConstruct::Create( getASTContext(), K, StartLoc, DirLoc, EndLoc, Clauses, - AssocStmt.isUsable() ? AssocStmt.get() : nullptr, - ParentlessLoopConstructs); - - ParentlessLoopConstructs.clear(); - - return ComputeConstruct; + AssocStmt.isUsable() ? AssocStmt.get() : nullptr); } case OpenACCDirectiveKind::Loop: { - auto *LoopConstruct = OpenACCLoopConstruct::Create( - getASTContext(), StartLoc, DirLoc, EndLoc, Clauses, - AssocStmt.isUsable() ? AssocStmt.get() : nullptr); - - // If we are in the scope of a compute construct, add this to the list of - // loop constructs that need assigning to the next closing compute - // construct. - if (isInComputeConstruct()) - ParentlessLoopConstructs.push_back(LoopConstruct); - - return LoopConstruct; + return OpenACCLoopConstruct::Create( + getASTContext(), ActiveComputeConstructInfo.Kind, StartLoc, DirLoc, + EndLoc, Clauses, AssocStmt.isUsable() ? AssocStmt.get() : nullptr); } } llvm_unreachable("Unhandled case in directive handling?"); diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index e63d605349e060..163f7129a7b42b 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -282,20 +282,6 @@ void RISCVIntrinsicManagerImpl::ConstructRVVIntrinsics( if ((BaseTypeI & Record.TypeRangeMask) != BaseTypeI) continue; - // TODO: Remove the check below and use RequiredFeatures in - // riscv_vector.td to check the intrinsics instead, the type check should - // be done in checkRVVTypeSupport. 
This check also not able to work on the - // intrinsics that have Float16 but the BaseType is not Float16 such as - // `vfcvt_f_x_v`. - if (BaseType == BasicType::Float16) { - if ((Record.RequiredExtensions & RVV_REQ_Zvfhmin) == RVV_REQ_Zvfhmin) { - if (!TI.hasFeature("zvfhmin")) - continue; - } else if (!TI.hasFeature("zvfh")) { - continue; - } - } - // Expanded with different LMUL. for (int Log2LMUL = -3; Log2LMUL <= 3; Log2LMUL++) { if (!(Record.Log2LMULMask & (1 << (Log2LMUL + 3)))) diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 515b9f689a248a..eb7516b3ef1ece 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -8609,6 +8609,15 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State, } } +static void HandleLifetimeCaptureByAttr(TypeProcessingState &State, + QualType &CurType, ParsedAttr &PA) { + if (State.getDeclarator().isDeclarationOfFunction()) { + auto *Attr = State.getSema().ParseLifetimeCaptureByAttr(PA, "this"); + if (Attr) + CurType = State.getAttributedType(Attr, CurType, CurType); + } +} + static void HandleHLSLParamModifierAttr(TypeProcessingState &State, QualType &CurType, const ParsedAttr &Attr, Sema &S) { @@ -8770,6 +8779,10 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, if (TAL == TAL_DeclChunk) HandleLifetimeBoundAttr(state, type, attr); break; + case ParsedAttr::AT_LifetimeCaptureBy: + if (TAL == TAL_DeclChunk) + HandleLifetimeCaptureByAttr(state, type, attr); + break; case ParsedAttr::AT_NoDeref: { // FIXME: `noderef` currently doesn't work correctly in [[]] syntax. 
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp index 1155a5edc73c34..d7c8ed351f410a 100644 --- a/clang/lib/Sema/SemaX86.cpp +++ b/clang/lib/Sema/SemaX86.cpp @@ -654,6 +654,8 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) { case X86::BI__builtin_ia32_tdpbhf8ps: case X86::BI__builtin_ia32_tdphbf8ps: case X86::BI__builtin_ia32_tdphf8ps: + case X86::BI__builtin_ia32_tmmultf32ps: + case X86::BI__builtin_ia32_ttmmultf32ps: return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2}); case X86::BI__builtin_ia32_ttransposed: return CheckBuiltinTileArgumentsRange(TheCall, {0, 1}); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 6aaafb2e8d71cc..df54cb0c6fe4c4 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -2836,12 +2836,12 @@ void ASTStmtReader::VisitOpenACCAssociatedStmtConstruct( void ASTStmtReader::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) { VisitStmt(S); VisitOpenACCAssociatedStmtConstruct(S); - S->findAndSetChildLoops(); } void ASTStmtReader::VisitOpenACCLoopConstruct(OpenACCLoopConstruct *S) { VisitStmt(S); VisitOpenACCAssociatedStmtConstruct(S); + S->ParentComputeConstructKind = Record.readEnum(); } //===----------------------------------------------------------------------===// diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 7f700c2977e09c..326c2ef21e5688 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -2915,6 +2915,7 @@ void ASTStmtWriter::VisitOpenACCComputeConstruct(OpenACCComputeConstruct *S) { void ASTStmtWriter::VisitOpenACCLoopConstruct(OpenACCLoopConstruct *S) { VisitStmt(S); VisitOpenACCAssociatedStmtConstruct(S); + Record.writeEnum(S->getParentComputeConstructKind()); Code = serialization::STMT_OPENACC_LOOP_CONSTRUCT; } diff --git 
a/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl new file mode 100644 index 00000000000000..5a13ca7735f999 --- /dev/null +++ b/clang/test/AST/HLSL/AppendStructuredBuffer-AST.hlsl @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + + +// This test tests two different AST generations. The "EMPTY" test mode verifies +// the AST generated by forward declaration of the HLSL types which happens on +// initializing the HLSL external AST with an AST Context. + +// The non-empty mode has a use that requires the AppendStructuredBuffer type be complete, +// which results in the AST being populated by the external AST source. That +// case covers the full implementation of the template declaration and the +// instantiated specialization. + +// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit AppendStructuredBuffer +// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class AppendStructuredBuffer +// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final + +// There should be no more occurrences of AppendStructuredBuffer +// EMPTY-NOT: {{[^[:alnum:]]}}AppendStructuredBuffer + +#ifndef EMPTY + +AppendStructuredBuffer Buffer; + +#endif + +// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit AppendStructuredBuffer +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class AppendStructuredBuffer definition + +// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: 
[[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer + +// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' +// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class AppendStructuredBuffer definition +// CHECK: TemplateArgument type 'int' +// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' +// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer diff --git a/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl new file mode 100644 index 00000000000000..b75f3fcb959cfc --- /dev/null +++ b/clang/test/AST/HLSL/ConsumeStructuredBuffer-AST.hlsl @@ -0,0 +1,51 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + + +// This test tests two different AST generations. The "EMPTY" test mode verifies +// the AST generated by forward declaration of the HLSL types which happens on +// initializing the HLSL external AST with an AST Context. + +// The non-empty mode has a use that requires the ConsumeStructuredBuffer type be complete, +// which results in the AST being populated by the external AST source. That +// case covers the full implementation of the template declaration and the +// instantiated specialization. 
+ +// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit ConsumeStructuredBuffer +// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class ConsumeStructuredBuffer +// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final + +// There should be no more occurrences of ConsumeStructuredBuffer +// EMPTY-NOT: {{[^[:alnum:]]}}ConsumeStructuredBuffer + +#ifndef EMPTY + +ConsumeStructuredBuffer Buffer; + +#endif + +// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit ConsumeStructuredBuffer +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class ConsumeStructuredBuffer definition + +// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer + +// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' +// CHECK-NOT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class ConsumeStructuredBuffer definition + +// CHECK: TemplateArgument type 'int' +// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' +// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer diff --git 
a/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl index 4104250225aec3..4a1e1d7570e5e9 100644 --- a/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWStructuredBuffer-AST.hlsl @@ -34,7 +34,7 @@ RWStructuredBuffer Buffer; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' @@ -61,4 +61,4 @@ RWStructuredBuffer Buffer; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index 42a7d1b5617397..521c3d45b20225 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -34,7 +34,7 @@ StructuredBuffer Buffer; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' // CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' @@ -61,4 +61,4 @@ StructuredBuffer Buffer; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] 
// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit RawBuffer diff --git a/clang/test/AST/attr-lifetime-capture-by.cpp b/clang/test/AST/attr-lifetime-capture-by.cpp new file mode 100644 index 00000000000000..da2eb0cf3d592e --- /dev/null +++ b/clang/test/AST/attr-lifetime-capture-by.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 %s -ast-dump | FileCheck %s + +// Verify that we print the [[clang::lifetime_capture_by(X)]] attribute. + +struct S { + void foo(int &a, int &b) [[clang::lifetime_capture_by(a, b, global)]]; +}; + +// CHECK: CXXMethodDecl {{.*}}clang::lifetime_capture_by(a, b, global) diff --git a/clang/test/C/C2y/n3341.c b/clang/test/C/C2y/n3341.c index 523c3dd945ac1d..4cff7f08cc2320 100644 --- a/clang/test/C/C2y/n3341.c +++ b/clang/test/C/C2y/n3341.c @@ -10,7 +10,9 @@ // expected-no-diagnostics struct R {}; // gnu-warning {{empty struct is a GNU extension}} +#if __STDC_VERSION__ >= 201112L struct S { struct { }; }; // gnu-warning {{empty struct is a GNU extension}} +#endif struct T { int : 0; }; // gnu-warning {{struct without named members is a GNU extension}} union U {}; // gnu-warning {{empty union is a GNU extension}} diff --git a/clang/test/C/C2y/n3346.c b/clang/test/C/C2y/n3346.c index d649181f100448..a6fc3325e0c8f7 100644 --- a/clang/test/C/C2y/n3346.c +++ b/clang/test/C/C2y/n3346.c @@ -1,5 +1,5 @@ // RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic -ffreestanding %s -// RUN: %clang_cc1 -verify=expected,ped -Wall -pedantic -ffreestanding %s +// RUN: %clang_cc1 -std=c99 -verify=expected,ped -Wall -pedantic -ffreestanding %s /* WG14 N3346: Yes * Slay Some Earthly Demons VIII @@ -33,8 +33,6 @@ void test1(void) { void test2(void) { typedef __WCHAR_TYPE__ wchar_t; - typedef __CHAR16_TYPE__ char16_t; - typedef __CHAR32_TYPE__ char32_t; // The initializer for an array shall be 
either a string literal, optionally // enclosed in braces, or a brace-enclosed list of initializers for the @@ -46,23 +44,31 @@ void test2(void) { // respectively). char str1[] = "string literal"; char str2[] = { "string literal" }; - char str3[] = u8"string literal"; - char str4[] = { u8"string literal" }; float str5[] = "this doesn't work"; // expected-error {{array initializer must be an initializer list}} float str6[] = { "this also doesn't work" }; // expected-error {{initializing 'float' with an expression of incompatible type 'char[23]'}} wchar_t str7[] = L"string literal"; wchar_t str8[] = { L"string literal" }; + +#if __STDC_VERSION__ >= 201112L + typedef __CHAR16_TYPE__ char16_t; + typedef __CHAR32_TYPE__ char32_t; + + char str3[] = u8"string literal"; + char str4[] = { u8"string literal" }; + char16_t str9[] = u"string literal"; char16_t str10[] = { u"string literal" }; char32_t str11[] = U"string literal"; char32_t str12[] = { U"string literal" }; - wchar_t str13[] = "nope"; // expected-error {{initializing wide char array with non-wide string literal}} - wchar_t str14[] = { "nope" }; // expected-error-re {{incompatible pointer to integer conversion initializing 'wchar_t' (aka '{{.*}}') with an expression of type 'char[5]'}} char16_t str15[] = "nope"; // expected-error {{initializing wide char array with non-wide string literal}} char16_t str16[] = { "nope" }; // expected-error-re {{incompatible pointer to integer conversion initializing 'char16_t' (aka '{{.*}}') with an expression of type 'char[5]'}} char32_t str17[] = "nope"; // expected-error {{initializing wide char array with non-wide string literal}} char32_t str18[] = { "nope" }; // expected-error-re {{incompatible pointer to integer conversion initializing 'char32_t' (aka '{{.*}}') with an expression of type 'char[5]'}} +#endif + + wchar_t str13[] = "nope"; // expected-error {{initializing wide char array with non-wide string literal}} + wchar_t str14[] = { "nope" }; // expected-error-re 
{{incompatible pointer to integer conversion initializing 'wchar_t' (aka '{{.*}}') with an expression of type 'char[5]'}} } diff --git a/clang/test/CodeGen/X86/amx_tf32.c b/clang/test/CodeGen/X86/amx_tf32.c new file mode 100644 index 00000000000000..661a9dfbc673b2 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_tf32.c @@ -0,0 +1,17 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-tf32 \ +// RUN: -target-feature +amx-transpose -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression | FileCheck %s + +#include +#include + +void test_tile_mmultf32ps(void) { + // CHECK-LABEL: @test_tile_mmultf32ps( + // CHECK: call void @llvm.x86.tmmultf32ps(i8 1, i8 2, i8 3) + _tile_mmultf32ps(1, 2, 3); +} + +void test_tile_tmmultf32ps(void) { + // CHECK-LABEL: @test_tile_tmmultf32ps( + // CHECK: call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) + _tile_tmmultf32ps(1, 2, 3); +} diff --git a/clang/test/CodeGen/X86/amx_tf32_api.c b/clang/test/CodeGen/X86/amx_tf32_api.c new file mode 100644 index 00000000000000..2ac8489e3e0baf --- /dev/null +++ b/clang/test/CodeGen/X86/amx_tf32_api.c @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-tf32 -target-feature +amx-transpose \ +// RUN: -target-feature +amx-bf16 -target-feature +avx512f \ +// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s + +#include + +char buf[1024]; +#define STRIDE 32 + +char buf2[1024]; + +void test_tile_mmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) { + //CHECK-LABEL: @test_tile_mmultf32ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.tmmultf32ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_mmultf32ps(&c, a, b); +} + +void test_tile_tmmultf32ps(__tile1024i a, __tile1024i b, __tile1024i c) { + 
//CHECK-LABEL: @test_tile_tmmultf32ps + //CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}}) + //CHECK-DAG: call x86_amx @llvm.x86.ttmmultf32ps.internal + //CHECK-DAG: call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx {{%.*}}) + __tile_tmmultf32ps(&c, a, b); +} diff --git a/clang/test/CodeGen/X86/amx_tf32_errors.c b/clang/test/CodeGen/X86/amx_tf32_errors.c new file mode 100644 index 00000000000000..45021306921150 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_tf32_errors.c @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown \ +// RUN: -target-feature +amx-tf32 -target-feature +amx-transpose -verify + +#include +#include + +void test_tile_mmultf32ps() { + _tile_mmultf32ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_mmultf32ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} + _tile_mmultf32ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} + _tile_mmultf32ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_mmultf32ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}} + _tile_mmultf32ps(1, 3, 3); // expected-error {{tile arguments must refer to different tiles}} +} + +void test_tile_tmmultf32ps() { + _tile_tmmultf32ps(16, 2, 3); // expected-error {{argument value 16 is outside the valid range [0, 7]}} + _tile_tmmultf32ps(1, 26, 3); // expected-error {{argument value 26 is outside the valid range [0, 7]}} + _tile_tmmultf32ps(1, 2, 36); // expected-error {{argument value 36 is outside the valid range [0, 7]}} + _tile_tmmultf32ps(1, 1, 3); // expected-error {{tile arguments must refer to different tiles}} + _tile_tmmultf32ps(1, 2, 1); // expected-error {{tile arguments must refer to different tiles}} + _tile_tmmultf32ps(1, 2, 2); // expected-error {{tile arguments must refer to different tiles}} +} diff --git 
a/clang/test/CodeGen/X86/amx_tf32_inline_asm.c b/clang/test/CodeGen/X86/amx_tf32_inline_asm.c new file mode 100644 index 00000000000000..76d164737d88b6 --- /dev/null +++ b/clang/test/CodeGen/X86/amx_tf32_inline_asm.c @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tf32 -target-feature +amx-transpose -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s + +void f_tilemul(short a) +{ + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09tmmultf32ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "tmmultf32ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); + + //CHECK: call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0 \0A\09tileloadd 0(%rdx,%r14,4), %tmm6 \0A\09ttmmultf32ps %tmm6, %tmm0, %tmm7 \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"() + __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0 \n\t" + "tileloadd 0(%%rdx,%%r14,4), %%tmm6 \n\t" + "ttmmultf32ps %%tmm6, %%tmm0, %%tmm7 \n\t" + "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t" + ::: "memory", "tmm0", "tmm6", "tmm7"); +} diff --git a/clang/test/CodeGen/aarch64-fmv-dependencies.c b/clang/test/CodeGen/aarch64-fmv-dependencies.c index e1105f35282b7a..137f64d7c9c32c 100644 --- a/clang/test/CodeGen/aarch64-fmv-dependencies.c +++ b/clang/test/CodeGen/aarch64-fmv-dependencies.c @@ -189,7 +189,7 @@ int caller() { // CHECK: attributes #[[ssbs]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+ssbs,+v8a" // CHECK: attributes #[[sve]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+v8a" // CHECK: attributes #[[sve2]] = { {{.*}} 
"target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+v8a" -// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve-aes,+sve2,+v8a" +// CHECK: attributes #[[sve2_aes]] = { {{.*}} "target-features"="+aes,+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-aes,+v8a" // CHECK: attributes #[[sve2_bitperm]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-bitperm,+v8a" // CHECK: attributes #[[sve2_sha3]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-sha3,+v8a" // CHECK: attributes #[[sve2_sm4]] = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+outline-atomics,+sve,+sve2,+sve2-sm4,+v8a" diff --git a/clang/test/CodeGen/aarch64-pure-scalable-args.c b/clang/test/CodeGen/aarch64-pure-scalable-args.c index 851159ada76749..53d5ce4e8c9d9b 100644 --- a/clang/test/CodeGen/aarch64-pure-scalable-args.c +++ b/clang/test/CodeGen/aarch64-pure-scalable-args.c @@ -405,7 +405,7 @@ void test_va_arg(int n, ...) 
{ // CHECK-AAPCS-NEXT: %new_reg_offs = add nsw i32 %gr_offs, 8 // CHECK-AAPCS-NEXT: store i32 %new_reg_offs, ptr %gr_offs_p, align 8 -// CHECK-AAPCS-NEXT: %inreg = icmp ult i32 %gr_offs, -7 +// CHECK-AAPCS-NEXT: %inreg = icmp samesign ult i32 %gr_offs, -7 // CHECK-AAPCS-NEXT: br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack // CHECK-AAPCS-EMPTY: // CHECK-AAPCS-NEXT: vaarg.in_reg: ; preds = %vaarg.maybe_reg diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c index 0839b32fecb78e..5ea27aa3b768c5 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesd.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall 
-emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c index 08ca748c96fe76..9442d14de83633 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aese.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 
-fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c index 78d3debad4b34a..23f838c5bb30ec 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesimc.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 
-triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c index 48d130174788ac..575c09c325f78f 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_aesmc.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature 
+sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK // REQUIRES: aarch64-registered-target diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c index 09583f98393a35..a4935d8dadd542 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullb_128.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | 
FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c index a4ffc3165ec8b1..a712a4f847f427 100644 --- a/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c +++ b/clang/test/CodeGen/aarch64-sve2-intrinsics/acle_sve2_pmullt_128.c @@ -1,10 +1,10 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: 
%clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +sve2-aes -O1 -Werror -Wall -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK #include diff --git a/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl new file mode 100644 index 00000000000000..1e8aae588fc33d --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/AppendStructuredBuffer-elementtype.hlsl @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL + +struct MyStruct { + float4 a; + int2 b; +}; + +// DXIL: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.1" = type { target("dx.RawBuffer", i32, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.5" = type { 
target("dx.RawBuffer", half, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.9" = type { target("dx.RawBuffer", <3 x i32>, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.10" = type { target("dx.RawBuffer", <2 x half>, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.11" = type { target("dx.RawBuffer", <3 x float>, 1, 0) +// DXIL: %"class.hlsl::AppendStructuredBuffer.12" = type { target("dx.RawBuffer", %struct.MyStruct = type { <4 x float>, <2 x i32>, [8 x i8] }, 1, 0) + +AppendStructuredBuffer BufI16; +AppendStructuredBuffer BufU16; +AppendStructuredBuffer BufI32; +AppendStructuredBuffer BufU32; +AppendStructuredBuffer BufI64; +AppendStructuredBuffer BufU64; +AppendStructuredBuffer BufF16; +AppendStructuredBuffer BufF32; +AppendStructuredBuffer BufF64; +AppendStructuredBuffer< vector > BufI16x4; +AppendStructuredBuffer< vector > BufU32x3; +AppendStructuredBuffer BufF16x2; +AppendStructuredBuffer BufF32x3; +// TODO: AppendStructuredBuffer BufSNormF16; +// TODO: AppendStructuredBuffer BufUNormF16; +// TODO: AppendStructuredBuffer BufSNormF32; +// TODO: AppendStructuredBuffer BufUNormF32; +// TODO: AppendStructuredBuffer BufSNormF64; +// TODO: AppendStructuredBuffer BufUNormF64; +AppendStructuredBuffer BufMyStruct; + +[numthreads(1,1,1)] +void main(int GI : SV_GroupIndex) { +} diff --git a/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl new file mode 100644 index 00000000000000..f8574c6460d4e1 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/ConsumeStructuredBuffer-elementtype.hlsl @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute 
-finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s -check-prefixes=DXIL + +struct MyStruct { + float4 a; + int2 b; +}; + +// DXIL: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", i16, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.0" = type { target("dx.RawBuffer", i16, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.1" = type { target("dx.RawBuffer", i32, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.3" = type { target("dx.RawBuffer", i64, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.5" = type { target("dx.RawBuffer", half, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.6" = type { target("dx.RawBuffer", float, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.7" = type { target("dx.RawBuffer", double, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.8" = type { target("dx.RawBuffer", <4 x i16>, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.9" = type { target("dx.RawBuffer", <3 x i32>, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.10" = type { target("dx.RawBuffer", <2 x half>, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.11" = type { target("dx.RawBuffer", <3 x float>, 1, 0) +// DXIL: %"class.hlsl::ConsumeStructuredBuffer.12" = type { target("dx.RawBuffer", %struct.MyStruct = type { <4 x float>, <2 x i32>, [8 x i8] }, 1, 0) + +ConsumeStructuredBuffer BufI16; +ConsumeStructuredBuffer BufU16; +ConsumeStructuredBuffer BufI32; +ConsumeStructuredBuffer BufU32; +ConsumeStructuredBuffer BufI64; +ConsumeStructuredBuffer BufU64; +ConsumeStructuredBuffer BufF16; +ConsumeStructuredBuffer BufF32; +ConsumeStructuredBuffer BufF64; +ConsumeStructuredBuffer< vector > BufI16x4; +ConsumeStructuredBuffer< vector > BufU32x3; +ConsumeStructuredBuffer BufF16x2; 
+ConsumeStructuredBuffer BufF32x3; +// TODO: ConsumeStructuredBuffer BufSNormF16; +// TODO: ConsumeStructuredBuffer BufUNormF16; +// TODO: ConsumeStructuredBuffer BufSNormF32; +// TODO: ConsumeStructuredBuffer BufUNormF32; +// TODO: ConsumeStructuredBuffer BufSNormF64; +// TODO: ConsumeStructuredBuffer BufUNormF64; +ConsumeStructuredBuffer BufMyStruct; + +[numthreads(1,1,1)] +void main(int GI : SV_GroupIndex) { +} diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl index 8d3d6abb37f02a..2e141b9279fa61 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffers-constructors.hlsl @@ -5,20 +5,29 @@ StructuredBuffer Buf : register(t10); RWStructuredBuffer Buf2 : register(u5, space1); +AppendStructuredBuffer Buf3 : register(u3); +ConsumeStructuredBuffer Buf4 : register(u4); RasterizerOrderedStructuredBuffer Buf5 : register(u1, space2); -// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0), float } -// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), float } +// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 0, 0) +// CHECK: %"class.hlsl::RWStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) +// CHECK: %"class.hlsl::AppendStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) +// CHECK: %"class.hlsl::ConsumeStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0) // CHECK: %"class.hlsl::RasterizerOrderedStructuredBuffer" = type { target("dx.RawBuffer", float, 1, 1), float } // CHECK: @Buf = global %"class.hlsl::StructuredBuffer" zeroinitializer, align 4 // CHECK: @Buf2 = global %"class.hlsl::RWStructuredBuffer" zeroinitializer, align 4 +// CHECK: @Buf3 = global %"class.hlsl::AppendStructuredBuffer" zeroinitializer, align 4 +// CHECK: @Buf4 = global 
%"class.hlsl::ConsumeStructuredBuffer" zeroinitializer, align 4 // CHECK: @Buf5 = global %"class.hlsl::RasterizerOrderedStructuredBuffer" zeroinitializer, align 4 // CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this) // CHECK-NEXT: entry: // CHECK: define linkonce_odr void @_ZN4hlsl18RWStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this) // CHECK-NEXT: entry: +// CHECK: define linkonce_odr void @_ZN4hlsl22AppendStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) +// CHECK-NEXT: entry: +// CHECK: define linkonce_odr void @_ZN4hlsl23ConsumeStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(4) %this) // CHECK: define linkonce_odr void @_ZN4hlsl33RasterizerOrderedStructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this) // CHECK-NEXT: entry: @@ -32,11 +41,20 @@ RasterizerOrderedStructuredBuffer Buf5 : register(u1, space2); // CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 0, 0) %Buf_h, ptr @Buf, align 4 // CHECK-DXIL-NEXT: %Buf2_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 5, i32 1, i32 0, i1 false) // CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf2_h, ptr @Buf2, align 4 +// CHECK-DXIL-NEXT: %Buf3_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 3, i32 1, i32 0, i1 false) +// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf3_h, ptr @Buf3, align 4 +// CHECK-DXIL-NEXT: %Buf4_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 4, i32 1, i32 0, i1 false) +// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf4_h, ptr @Buf4, align 4 // CHECK-DXIL-NEXT: %Buf5_h = call target("dx.RawBuffer", float, 1, 1) @llvm.dx.handle.fromBinding.tdx.RawBuffer_f32_1_1t(i32 2, i32 1, i32 1, i32 0, i1 false) // 
CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 1) %Buf5_h, ptr @Buf5, align 4 + // CHECK-SPIRV-NEXT: %Buf_h = call target("dx.RawBuffer", float, 0, 0) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_0_0t(i32 0, i32 10, i32 1, i32 0, i1 false) // CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 0, 0) %Buf_h, ptr @Buf", align 4 // CHECK-SPIRV-NEXT: %Buf2_h = call target("dx.RawBuffer", float, 1, 0) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 5, i32 1, i32 0, i1 false) // CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf2_h, ptr @Buf2", align 4 +// CHECK-SPIRV-NEXT: %Buf3_h = call target("dx.RawBuffer", float, 0, 0) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 3, i32 1, i32 0, i1 false) +// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 0, 0) %Buf3_h, ptr @Buf3, align 4 +// CHECK-SPIRV-NEXT: %Buf4_h = call target("dx.RawBuffer", float, 1, 0) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 4, i32 1, i32 0, i1 false) +// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf4_h, ptr @Buf4, align 4 // CHECK-SPIRV-NEXT: %Buf5_h = call target("dx.RawBuffer", float, 1, 1) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_1_1t(i32 2, i32 1, i32 1, i32 0, i1 false) // CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 1) %Buf5_h, ptr @Buf5, align 4 diff --git a/clang/test/Driver/aarch64-implied-sve-features.c b/clang/test/Driver/aarch64-implied-sve-features.c index 2bb0ee88330ba9..f04e1a785673b8 100644 --- a/clang/test/Driver/aarch64-implied-sve-features.c +++ b/clang/test/Driver/aarch64-implied-sve-features.c @@ -36,7 +36,7 @@ // SVE2-BITPERM-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-bitperm" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes+nosve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES-REVERT -// SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" 
"-target-feature" "-sve2-aes" +// SVE2-AES-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-aes" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sha3+nosve2-sha3 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SHA3-REVERT // SVE2-SHA3-REVERT: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "-sve2-sha3" @@ -47,11 +47,8 @@ // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sha3 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SHA3 // SVE2-SHA3: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sha3" -// RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-AES -// SVE-AES: "-target-feature" "+aes"{{.*}} "-target-feature" "+sve-aes" - // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-AES -// SVE2-AES: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes" +// SVE2-AES: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-aes" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+sve2-sm4 %s -### 2>&1 | FileCheck %s --check-prefix=SVE2-SM4 // SVE2-SM4: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-sm4" @@ -69,7 +66,7 @@ // SVE-SUBFEATURE-CONFLICT-NOT: "-target-feature" "+sve" // RUN: %clang --target=aarch64-linux-gnu -march=armv8-a+nosve+sve2-aes %s -### 2>&1 | FileCheck %s --check-prefix=SVE-SUBFEATURE-CONFLICT-REV -// SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve-aes" "-target-feature" "+sve2" "-target-feature" "+sve2-aes" +// SVE-SUBFEATURE-CONFLICT-REV: "-target-feature" "+sve" "-target-feature" "+sve2" "-target-feature" "+sve2-aes" // RUN: %clang --target=aarch64-linux-gnu -mcpu=neoverse-n2+nosve2 %s -### 2>&1 | FileCheck %s --check-prefix=SVE-MCPU-FEATURES // SVE-MCPU-FEATURES-NOT: "-target-feature" 
"+sve2-bitperm" diff --git a/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip b/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip new file mode 100644 index 00000000000000..8a60f5a150048f --- /dev/null +++ b/clang/test/Driver/hip-wavefront-size-deprecation-diagnostics.hip @@ -0,0 +1,115 @@ +// REQUIRES: amdgpu-registered-target +// RUN: %clang -xhip --offload-arch=gfx1030 --offload-host-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s +// RUN: %clang -xhip --offload-arch=gfx1030 --offload-device-only -pedantic -nogpuinc -nogpulib -nobuiltininc -fsyntax-only -Xclang -verify %s + +// Test that deprecation warnings for the wavefront size macro are emitted properly. + +#define WRAPPED __AMDGCN_WAVEFRONT_SIZE__ + +#define DOUBLE_WRAPPED (WRAPPED) + +template struct my_enable_if {}; + +template struct my_enable_if { + typedef T type; +}; + +__attribute__((host, device)) void use(int, const char*); + +template __attribute__((host, device)) int templatify(int x) { + return x + N; +} + +__attribute__((device)) const int GlobalConst = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +constexpr int GlobalConstExpr = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + +#if defined(__HIP_DEVICE_COMPILE__) && (__AMDGCN_WAVEFRONT_SIZE__ == 64) // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +int foo(void); +#endif + +__attribute__((device)) int device_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + +__attribute__((device)) +void device_fun() { + use(__AMDGCN_WAVEFRONT_SIZE, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}} + use(__AMDGCN_WAVEFRONT_SIZE__, "device function"); // expected-warning {{macro 
'__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(DOUBLE_WRAPPED, "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(GlobalConst, "device function"); + use(GlobalConstExpr, "device function"); +} + +__attribute__((global)) +void global_fun() { + // no warnings expected + use(__AMDGCN_WAVEFRONT_SIZE, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}} + use(__AMDGCN_WAVEFRONT_SIZE__, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(DOUBLE_WRAPPED, "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "global function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +} + +int host_var = __AMDGCN_WAVEFRONT_SIZE__; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +int host_var_alt = __AMDGCN_WAVEFRONT_SIZE; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been marked as deprecated}} +int host_var_wrapped = WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +int host_var_double_wrapped = DOUBLE_WRAPPED; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + +__attribute__((host)) +void host_fun() { + use(__AMDGCN_WAVEFRONT_SIZE, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE' has been 
marked as deprecated}} + use(__AMDGCN_WAVEFRONT_SIZE__, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(DOUBLE_WRAPPED, "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(GlobalConst, "host function"); + use(GlobalConstExpr, "host function"); +} + +__attribute((host, device)) +void host_device_fun() { + use(__AMDGCN_WAVEFRONT_SIZE__, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(DOUBLE_WRAPPED, "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + use(templatify<__AMDGCN_WAVEFRONT_SIZE__>(42), "host device function"); // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +} + +template // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +class FunSelector { +public: + template + __attribute__((device)) + auto fun(void) + -> typename my_enable_if<(FunWarpSize <= __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + { + use(1, "yay!"); + } + + template + __attribute__((device)) + auto fun(void) + -> typename my_enable_if<(FunWarpSize > __AMDGCN_WAVEFRONT_SIZE__), void>::type // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + { + use(0, "nay!"); + } +}; + +__attribute__((device)) +void device_fun_selector_user() { + FunSelector<> f; + 
f.fun<>(); + f.fun<1>(); + f.fun<1000>(); + + my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x = 42; // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} +} + +__attribute__((device)) my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type DeviceFunTemplateRet(void) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + return 42; +} + +__attribute__((device)) int DeviceFunTemplateArg(my_enable_if<(1 <= __AMDGCN_WAVEFRONT_SIZE__), int>::type x) { // expected-warning {{macro '__AMDGCN_WAVEFRONT_SIZE__' has been marked as deprecated}} + return x; +} + +// expected-note@* 0+ {{macro marked 'deprecated' here}} diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index 0396718b3ec704..03eacf99736f9e 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -77,18 +77,17 @@ // CHECK-NEXT: profile FEAT_SPE Enable Statistical Profiling extension // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit -// CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE AES support in streaming SVE mode +// CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE2 AES support in streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions // CHECK-NEXT: sve FEAT_SVE Enable Scalable Vector Extension (SVE) instructions -// CHECK-NEXT: sve-aes FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions -// CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and 
multi-vector quadword polynomial multiply instructions +// CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions // CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions // CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply // CHECK-NEXT: sve2 FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions -// CHECK-NEXT: sve2-aes An alias of +sve2+sve-aes +// CHECK-NEXT: sve2-aes FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable AES SVE2 instructions // CHECK-NEXT: sve2-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve2-sha3 FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions // CHECK-NEXT: sve2-sm4 FEAT_SVE_SM4 Enable SM4 SVE2 instructions diff --git a/clang/test/Driver/riscv-profiles.c b/clang/test/Driver/riscv-profiles.c index 67e09d0e69ebc3..c87ec5a27822c5 100644 --- a/clang/test/Driver/riscv-profiles.c +++ b/clang/test/Driver/riscv-profiles.c @@ -57,6 +57,7 @@ // RVA22U64: "-target-feature" "+f" // RVA22U64: "-target-feature" "+d" // RVA22U64: "-target-feature" "+c" +// RVA22U64: "-target-feature" "+b" // RVA22U64: "-target-feature" "+zic64b" // RVA22U64: "-target-feature" "+zicbom" // RVA22U64: "-target-feature" "+zicbop" @@ -83,6 +84,7 @@ // RVA22S64: "-target-feature" "+f" // RVA22S64: "-target-feature" "+d" // RVA22S64: "-target-feature" "+c" +// RVA22S64: "-target-feature" "+b" // RVA22S64: "-target-feature" "+zic64b" // RVA22S64: "-target-feature" "+zicbom" // RVA22S64: "-target-feature" "+zicbop" @@ -118,6 +120,7 @@ // RVA23U64: "-target-feature" "+f" // RVA23U64: "-target-feature" "+d" // RVA23U64: "-target-feature" "+c" +// RVA23U64: "-target-feature" "+b" // RVA23U64: "-target-feature" "+v" // RVA23U64: "-target-feature" "+zic64b" // RVA23U64: "-target-feature" "+zicbom" @@ -156,6 +159,7 
@@ // RVA23S64: "-target-feature" "+f" // RVA23S64: "-target-feature" "+d" // RVA23S64: "-target-feature" "+c" +// RVA23S64: "-target-feature" "+b" // RVA23S64: "-target-feature" "+v" // RVA23S64: "-target-feature" "+h" // RVA23S64: "-target-feature" "+zic64b" @@ -217,6 +221,7 @@ // RVB23U64: "-target-feature" "+f" // RVB23U64: "-target-feature" "+d" // RVB23U64: "-target-feature" "+c" +// RVB23U64: "-target-feature" "+b" // RVB23U64: "-target-feature" "+zic64b" // RVB23U64: "-target-feature" "+zicbom" // RVB23U64: "-target-feature" "+zicbop" @@ -249,6 +254,7 @@ // RVB23S64: "-target-feature" "+f" // RVB23S64: "-target-feature" "+d" // RVB23S64: "-target-feature" "+c" +// RVB23S64: "-target-feature" "+b" // RVB23S64: "-target-feature" "+zic64b" // RVB23S64: "-target-feature" "+zicbom" // RVB23S64: "-target-feature" "+zicbop" @@ -290,6 +296,7 @@ // RUN: %clang --target=riscv32 -### -c %s 2>&1 -march=rvm23u32 -menable-experimental-extensions \ // RUN: | FileCheck -check-prefix=RVM23U32 %s // RVM23U32: "-target-feature" "+m" +// RVM23U32: "-target-feature" "+b" // RVM23U32: "-target-feature" "+zicbop" // RVM23U32: "-target-feature" "+zicond" // RVM23U32: "-target-feature" "+zicsr" @@ -309,6 +316,7 @@ // PROFILE-WITH-ADDITIONAL: "-target-feature" "+f" // PROFILE-WITH-ADDITIONAL: "-target-feature" "+d" // PROFILE-WITH-ADDITIONAL: "-target-feature" "+c" +// PROFILE-WITH-ADDITIONAL: "-target-feature" "+b" // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbom" // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbop" // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicboz" diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 822c997f71744f..339f593dc760a8 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -318,6 +318,13 @@ // AMX-AVX512: "-target-feature" "+amx-avx512" // NO-AMX-AVX512: "-target-feature" "-amx-avx512" +// RUN: %clang -target x86_64-unknown-linux-gnu -mamx-tf32 
%s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=AMX-TF32 %s +// RUN: %clang -target x86_64-unknown-linux-gnu -mno-amx-tf32 %s \ +// RUN: -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AMX-TF32 %s +// AMX-TF32: "-target-feature" "+amx-tf32" +// NO-AMX-TF32: "-target-feature" "-amx-tf32" + // RUN: %clang --target=i386 -march=i386 -mhreset %s -### 2>&1 | FileCheck -check-prefix=HRESET %s // RUN: %clang --target=i386 -march=i386 -mno-hreset %s -### 2>&1 | FileCheck -check-prefix=NO-HRESET %s // HRESET: "-target-feature" "+hreset" diff --git a/clang/test/OpenMP/depobj_codegen.cpp b/clang/test/OpenMP/depobj_codegen.cpp index 92751ac44b8c78..b3bcd76e237dc8 100644 --- a/clang/test/OpenMP/depobj_codegen.cpp +++ b/clang/test/OpenMP/depobj_codegen.cpp @@ -17,6 +17,15 @@ typedef void *omp_depend_t; void foo() {} +void tmainc(){ + omp_depend_t obj; +#pragma omp depobj(obj) depend(inout: omp_all_memory) +{ + volatile omp_depend_t temp = obj; + char* char_ptr = reinterpret_cast(temp); + char_ptr[0] = 1; +} +} template T tmain(T argc) { @@ -35,10 +44,25 @@ int main(int argc, char **argv) { #pragma omp depobj(b) destroy #pragma omp depobj(b) update(mutexinoutset) #pragma omp depobj(a) depend(iterator(char *p = argv[argc]:argv[0]:-1), out: p[0]) - (void)tmain(a), tmain(b); + (void)tmain(a), tmain(b); + tmainc(); return 0; } - +// CHECK-LABEL: tmainc +// CHECK: [[D_ADDR:%obj]] = alloca ptr, +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num( +// CHECK: [[DEP_ADDR_ADDR2:%.+]] = call ptr @__kmpc_alloc(i32 [[GTID]], i64 48, ptr null) +// CHECK: [[SZ_DEOOBJ:%.+]] = getelementptr inbounds nuw %struct.kmp_depend_info, ptr [[DEP_ADDR_ADDR2]], i{{.+}} 0, i{{.+}} 0 +// CHECK: store i64 1, ptr [[SZ_DEOOBJ]], align 8 +// CHECK: [[DEPOBJ_BASE_ADDR:%.+]] = getelementptr %struct.kmp_depend_info, ptr [[DEP_ADDR_ADDR2]], i{{.+}} 1 +// CHECK: [[ADDR_ONE:%.+]] = getelementptr inbounds nuw %struct.kmp_depend_info, ptr [[DEPOBJ_BASE_ADDR]], i{{.+}} 0, i{{.+}} 0 +// CHECK: store i64 
0, ptr [[ADDR_ONE]], align 8 +// CHECK: [[SZ_ADDR:%.+]] = getelementptr inbounds nuw %struct.kmp_depend_info, ptr [[DEPOBJ_BASE_ADDR]], i{{.+}} 0, i{{.+}} 1 +// CHECK: store i64 0, ptr [[SZ_ADDR]], align 8 +// CHECK: [[SZ_ADDR_NEW:%.+]] = getelementptr inbounds nuw %struct.kmp_depend_info, ptr [[DEPOBJ_BASE_ADDR]], i{{.+}} 0, i{{.+}} 2 +// CHECK: store {{i[0-9]+}} {{-?[0-9]+}}, ptr [[SZ_ADDR_NEW]], align 8 +// CHECK: [[DEP_NEW:%.+]] = getelementptr %struct.kmp_depend_info, ptr [[DEP_ADDR_ADDR2]], i{{.+}} 1 +// CHECK: store ptr [[DEP_NEW]], ptr [[D_ADDR]], align 8 // CHECK-LABEL: @main // CHECK: [[B_ADDR:%b]] = alloca ptr, // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num( @@ -69,6 +93,8 @@ int main(int argc, char **argv) { // CHECK: [[NUMDEPS_ADDR:%.+]] = getelementptr inbounds nuw %struct.kmp_depend_info, ptr [[NUMDEPS_BASE]], i{{.+}} 0, i{{.+}} 0 // CHECK: [[NUMDEPS:%.+]] = load i64, ptr [[NUMDEPS_ADDR]], align 8 // CHECK: [[END:%.+]] = getelementptr %struct.kmp_depend_info, ptr [[B_BASE]], i64 [[NUMDEPS]] + + // CHECK: br label %[[BODY:.+]] // CHECK: [[BODY]]: // CHECK: [[EL:%.+]] = phi ptr [ [[B_BASE]], %{{.+}} ], [ [[EL_NEXT:%.+]], %[[BODY]] ] @@ -228,6 +254,8 @@ int main(int argc, char **argv) { // CHECK: [[EL_NEXT]] = getelementptr %struct.kmp_depend_info, ptr [[EL]], i{{.+}} 1 // CHECK: [[IS_DONE:%.+]] = icmp eq ptr [[EL_NEXT]], [[END]] // CHECK: br i1 [[IS_DONE]], label %[[DONE:.+]], label %[[BODY]] + // CHECK: [[DONE]]: + #endif diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index fc786f4b2e9b4d..418430b0b19b89 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -227,20 +227,8 @@ // CHECK-NONEON-NOT: __ARM_FEATURE_SVE 1 // CHECK-NONEON-NOT: __ARM_NEON 1 -// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-aes -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEAES %s -// CHECK-SVEAES: 
__ARM_FEATURE_AES 1 - // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-aes -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2AES %s -// CHECK-SVE2AES: __ARM_FEATURE_AES 1 -// CHECK-SVE2AES: __ARM_FEATURE_SVE2 1 // CHECK-SVE2AES: __ARM_FEATURE_SVE2_AES 1 - -// RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve-aes+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVEAES-SVE2 %s -// CHECK-SVEAES-SVE2: __ARM_FEATURE_AES 1 -// CHECK-SVEAES-SVE2: __ARM_FEATURE_SVE2 1 -// CHECK-SVEAES-SVE2: __ARM_FEATURE_SVE2_AES 1 - - // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sha3 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SHA3 %s // CHECK-SVE2SHA3: __ARM_FEATURE_SVE2_SHA3 1 // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2-sm4 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2SM4 %s diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 8e4ddb1526626e..fa3d0038f05a93 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -570,6 +570,15 @@ // NO-AMX-AVX512-NOT: #define __AMX_AVX512__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-tf32 -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=AMX-TF32 %s +// AMX-TF32: #define __AMX_TF32__ 1 +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mno-amx-tf32 -x c \ +// RUN: -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TF32 %s +// RUN: %clang -target x86_64-unknown-linux-gnu -march=x86-64 -mamx-tf32 -mno-amx-tile \ +// RUN: -x c -E -dM -o - %s | FileCheck -check-prefix=NO-AMX-TF32 %s +// NO-AMX-TF32-NOT: #define __AMX_TF32__ 1 + // RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s // AVXVNNI: #define __AVX2__ 1 diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c 
b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 381a21be7bbb9d..27fa8f7c9dccb2 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -28,6 +28,16 @@ void incompat_sme_smc(svbool_t pg, void const *ptr) __arm_streaming_compatible _ return __builtin_sme_svld1_hor_za128(0, 0, pg, ptr); } +float incomp_sve_sm_fadda_sm(void) __arm_streaming { + // expected-error@+1 {{builtin can only be called from a non-streaming function}} + return svadda(svptrue_b32(), 0, svdup_f32(1)); +} + +float incomp_sve_sm_fadda_smc(void) __arm_streaming_compatible { + // expected-error@+1 {{builtin can only be called from a non-streaming function}} + return svadda(svptrue_b32(), 0, svdup_f32(1)); +} + svuint32_t incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) __arm_streaming { // expected-error@+1 {{builtin can only be called from a non-streaming function}} return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp index 93d4b007016937..795bb760533034 100644 --- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp +++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_aes_bitperm_sha3_sm4.cpp @@ -14,17 +14,17 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) { - // expected-error@+2 {{'svaesd_u8' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svaesd' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svaesd_u8' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svaesd' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svaesd,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svaese_u8' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svaese' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svaese_u8' needs 
target feature sve,sve2-aes}} + // overload-error@+1 {{'svaese' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svaese,_u8,,)(svundef_u8(), svundef_u8()); - // expected-error@+2 {{'svaesimc_u8' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svaesimc' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svaesimc_u8' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svaesimc' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svaesimc,_u8,,)(svundef_u8()); - // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svaesmc' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svaesmc_u8' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svaesmc' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svaesmc,_u8,,)(svundef_u8()); // expected-error@+2 {{'svbdep_u8' needs target feature sve,sve2-bitperm}} // overload-error@+1 {{'svbdep' needs target feature sve,sve2-bitperm}} @@ -107,17 +107,17 @@ void test(uint8_t u8, uint16_t u16, uint32_t u32, uint64_t u64) // expected-error@+2 {{'svbgrp_n_u64' needs target feature sve,sve2-bitperm}} // overload-error@+1 {{'svbgrp' needs target feature sve,sve2-bitperm}} SVE_ACLE_FUNC(svbgrp,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svpmullb_pair_u64' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svpmullb_pair,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svpmullb_pair_n_u64' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svpmullb_pair_n_u64' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svpmullb_pair' needs target feature sve,sve2-aes}} 
SVE_ACLE_FUNC(svpmullb_pair,_n_u64,,)(svundef_u64(), u64); - // expected-error@+2 {{'svpmullt_pair_u64' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svpmullt_pair_u64' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svpmullt_pair,_u64,,)(svundef_u64(), svundef_u64()); - // expected-error@+2 {{'svpmullt_pair_n_u64' needs target feature sve,sve2,sve-aes}} - // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2,sve-aes}} + // expected-error@+2 {{'svpmullt_pair_n_u64' needs target feature sve,sve2-aes}} + // overload-error@+1 {{'svpmullt_pair' needs target feature sve,sve2-aes}} SVE_ACLE_FUNC(svpmullt_pair,_n_u64,,)(svundef_u64(), u64); // expected-error@+2 {{'svrax1_u64' needs target feature sve,sve2-sha3}} // overload-error@+1 {{'svrax1' needs target feature sve,sve2-sha3}} diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index 19a4c3b7942b12..cc9108c0a4fbd6 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -388,7 +388,7 @@ void nb26() [[clang::nonblocking]] { abort_wrapper(); // no diagnostic } -// --- Make sure we don't traverse a requires clause. --- +// --- Make sure we don't traverse requires and noexcept clauses. --- // Apparently some requires clauses are able to be collapsed into a constant before the nonblocking // analysis sees any function calls. This example (extracted from a real-world case where @@ -420,6 +420,7 @@ class expected { constexpr expected() {} + // This is a deliberate corruption of the real implementation for simplicity. 
constexpr expected(const expected&) requires(is_copy_constructible_v<_Tp> && is_copy_constructible_v<_Err>) = default; @@ -428,11 +429,20 @@ class expected { void test() [[clang::nonblocking]] { expected a; - auto b = a; + auto b = a; // Copy constructor. } } // namespace ExpectedTest +// Make sure a function call in a noexcept() clause is ignored. +constexpr bool foo() [[clang::nonblocking(false)]] { return true; } +void nb27() noexcept(foo()) [[clang::nonblocking]] {} + +// Make sure that simple type traits don't cause violations. +void nb28() [[clang::nonblocking]] { + bool x = __is_constructible(int, const int&); +} + // --- nonblocking implies noexcept --- #pragma clang diagnostic warning "-Wperf-constraint-implies-noexcept" diff --git a/clang/test/SemaCXX/attr-lifetime-capture-by.cpp b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp new file mode 100644 index 00000000000000..3115dc8d6150c9 --- /dev/null +++ b/clang/test/SemaCXX/attr-lifetime-capture-by.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -std=c++23 -verify %s + +struct S { + const int *x; + void captureInt(const int&x [[clang::lifetime_capture_by(this)]]) { this->x = &x; } +}; + +/////////////////////////// +// Test for valid usages. 
+/////////////////////////// +[[clang::lifetime_capture_by(unknown)]] // expected-error {{'lifetime_capture_by' attribute only applies to parameters and implicit object parameters}} +void nonMember( + const int &x1 [[clang::lifetime_capture_by(s, t)]], + S &s, + S &t, + const int &x2 [[clang::lifetime_capture_by(12345 + 12)]], // expected-error {{'lifetime_capture_by' attribute argument 12345 + 12 is not a known function parameter; must be a function parameter, 'this', 'global' or 'unknown'}} + const int &x3 [[clang::lifetime_capture_by(abcdefgh)]], // expected-error {{'lifetime_capture_by' attribute argument 'abcdefgh' is not a known function parameter; must be a function parameter, 'this', 'global' or 'unknown'}} + const int &x4 [[clang::lifetime_capture_by("abcdefgh")]], // expected-error {{'lifetime_capture_by' attribute argument "abcdefgh" is not a known function parameter; must be a function parameter, 'this', 'global' or 'unknown'}} + const int &x5 [[clang::lifetime_capture_by(this)]], // expected-error {{'lifetime_capture_by' argument references unavailable implicit 'this'}} + const int &x6 [[clang::lifetime_capture_by()]], // expected-error {{'lifetime_capture_by' attribute specifies no capturing entity}} + const int& x7 [[clang::lifetime_capture_by(u, + x7)]], // expected-error {{'lifetime_capture_by' argument references itself}} + const int &x8 [[clang::lifetime_capture_by(global)]], + const int &x9 [[clang::lifetime_capture_by(unknown)]], + const S& u + ) +{ + s.captureInt(x1); +} + +void unknown_param_name(const int& unknown, // expected-error {{parameter cannot be named 'unknown' while using 'lifetime_capture_by(unknown)'}} + const int& s [[clang::lifetime_capture_by(unknown)]]); +void global_param_name(const int& global, // expected-error {{parameter cannot be named 'global' while using 'lifetime_capture_by(global)'}} + const int& s [[clang::lifetime_capture_by(global)]]); +struct T { + void member( + const int &x [[clang::lifetime_capture_by(s)]], + 
S &s, + S &t, + const int &y [[clang::lifetime_capture_by(s)]], + const int &z [[clang::lifetime_capture_by(this, x, y)]], + const int &u [[clang::lifetime_capture_by(global, unknown, x, s)]]) + { + s.captureInt(x); + } +}; diff --git a/clang/test/SemaOpenACC/compute-construct-default-clause.c b/clang/test/SemaOpenACC/compute-construct-default-clause.c index be6a705883429e..a8d0c2b705d22d 100644 --- a/clang/test/SemaOpenACC/compute-construct-default-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-default-clause.c @@ -23,18 +23,18 @@ void SingleOnly() { // expected-warning@+2{{OpenACC clause 'default' not yet implemented}} // expected-warning@+1{{OpenACC clause 'copy' not yet implemented}} #pragma acc parallel loop self default(present) private(i) default(none) copy(i) - while(0); + for(int i = 0; i < 5; ++i); // expected-warning@+3{{OpenACC clause 'self' not yet implemented, clause ignored}} // expected-warning@+2{{OpenACC construct 'serial loop' not yet implemented}} // expected-error@+1{{expected '('}} #pragma acc serial loop self default private(i) default(none) if(i) - while(0); + for(int i = 0; i < 5; ++i); // expected-warning@+2{{OpenACC construct 'kernels loop' not yet implemented}} // expected-warning@+1{{OpenACC clause 'default' not yet implemented}} #pragma acc kernels loop default(none) - while(0); + for(int i = 0; i < 5; ++i); // expected-warning@+2{{OpenACC construct 'data' not yet implemented}} // expected-warning@+1{{OpenACC clause 'default' not yet implemented}} diff --git a/clang/test/SemaOpenACC/compute-construct-if-clause.c b/clang/test/SemaOpenACC/compute-construct-if-clause.c index 41a929e23676cd..2fbf49f207a74e 100644 --- a/clang/test/SemaOpenACC/compute-construct-if-clause.c +++ b/clang/test/SemaOpenACC/compute-construct-if-clause.c @@ -50,15 +50,15 @@ void BoolExpr(int *I, float *F) { // expected-warning@+2{{OpenACC construct 'parallel loop' not yet implemented}} // expected-warning@+1{{OpenACC clause 'if' not yet implemented}} 
#pragma acc parallel loop if (*I < *F) - while(0); + for(int i = 0; i < 5; ++i); // expected-warning@+2{{OpenACC construct 'serial loop' not yet implemented}} // expected-warning@+1{{OpenACC clause 'if' not yet implemented}} #pragma acc serial loop if (*I < *F) - while(0); + for(int i = 0; i < 5; ++i); // expected-warning@+2{{OpenACC construct 'kernels loop' not yet implemented}} // expected-warning@+1{{OpenACC clause 'if' not yet implemented}} #pragma acc kernels loop if (*I < *F) - while(0); + for(int i = 0; i < 5; ++i); // expected-error@+1{{OpenACC 'if' clause is not valid on 'loop' directive}} #pragma acc loop if(I) diff --git a/clang/test/SemaOpenACC/loop-ast.cpp b/clang/test/SemaOpenACC/loop-ast.cpp index cc8bbfa09df3bb..d8ecef9741deec 100644 --- a/clang/test/SemaOpenACC/loop-ast.cpp +++ b/clang/test/SemaOpenACC/loop-ast.cpp @@ -42,12 +42,12 @@ void NormalFunc() { // CHECK-NEXT: CompoundStmt { #pragma acc parallel - // CHECK-NEXT: OpenACCComputeConstruct [[PAR_ADDR:[0-9a-fx]+]] {{.*}}parallel + // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel // CHECK-NEXT: CompoundStmt { #pragma acc loop for(int i = 0; i < 5;++i); - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR]] + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl {{.*}} used i 'int' @@ -91,16 +91,16 @@ void TemplFunc() { } -#pragma acc parallel +#pragma acc serial { - // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel + // CHECK-NEXT: OpenACCComputeConstruct {{.*}}serial // CHECK-NEXT: CompoundStmt #pragma acc parallel { - // CHECK-NEXT: OpenACCComputeConstruct [[PAR_ADDR_UNINST:[0-9a-fx]+]] {{.*}}parallel + // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel // CHECK-NEXT: CompoundStmt #pragma acc loop - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_UNINST]] + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl 
{{.*}} i 'int' @@ -116,7 +116,7 @@ void TemplFunc() { for(int i = 0; i < 5;++i); #pragma acc loop - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_UNINST]] + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl {{.*}} i 'int' @@ -166,13 +166,13 @@ void TemplFunc() { // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl{{.*}} I 'typename S::type':'int' - // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel + // CHECK-NEXT: OpenACCComputeConstruct {{.*}}serial // CHECK-NEXT: CompoundStmt // - // CHECK-NEXT: OpenACCComputeConstruct [[PAR_ADDR_INST:[0-9a-fx]+]] {{.*}}parallel + // CHECK-NEXT: OpenACCComputeConstruct {{.*}}parallel // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_INST]] + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl {{.*}} i 'int' @@ -186,7 +186,7 @@ void TemplFunc() { // CHECK-NEXT: DeclRefExpr{{.*}} 'i' 'int' // CHECK-NEXT: NullStmt - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: [[PAR_ADDR_INST]] + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl {{.*}} i 'int' diff --git a/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp b/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp index 738fe7c9d680bb..9ee2faa655d047 100644 --- a/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp +++ b/clang/test/SemaOpenACC/loop-construct-gang-ast.cpp @@ -53,8 +53,8 @@ void NormalUses() { #pragma acc loop gang(static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: gang clause num // CHECK-NEXT: 
IntegerLiteral{{.*}}'int' 1 // CHECK-NEXT: gang clause static @@ -76,8 +76,8 @@ void NormalUses() { #pragma acc loop gang(num:1) gang(static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: gang clause dim static // CHECK-NEXT: ConstantExpr{{.*}} 'int' // CHECK-NEXT: value: Int 1 @@ -100,8 +100,8 @@ void NormalUses() { #pragma acc loop gang(dim:1, static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: gang clause static // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}} 'Val' 'int' @@ -121,8 +121,8 @@ void NormalUses() { #pragma acc loop gang(static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}}serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: gang clause static // CHECK-NEXT: OpenACCAsteriskSizeExpr // CHECK-NEXT: ForStmt @@ -141,8 +141,8 @@ void NormalUses() { #pragma acc loop gang(static:*) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: gang clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -224,8 +224,8 @@ void TemplateUses(T 
Val) { #pragma acc loop gang(static:*) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: gang clause dim // CHECK-NEXT: DeclRefExpr{{.*}}'unsigned int' NonTypeTemplateParm{{.*}} 'One' 'unsigned int' // CHECK-NEXT: gang clause static @@ -246,8 +246,8 @@ void TemplateUses(T Val) { #pragma acc loop gang(dim:One) gang(static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: gang clause dim static // CHECK-NEXT: DeclRefExpr{{.*}}'unsigned int' NonTypeTemplateParm{{.*}} 'One' 'unsigned int' // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 'Val' 'T' @@ -267,8 +267,8 @@ void TemplateUses(T Val) { #pragma acc loop gang(dim:One, static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: gang clause static // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 'Val' 'T' // CHECK-NEXT: ForStmt @@ -287,8 +287,8 @@ void TemplateUses(T Val) { #pragma acc loop gang(static:Val) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: 
gang clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -367,8 +367,8 @@ void TemplateUses(T Val) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: gang clause dim // CHECK-NEXT: ConstantExpr{{.*}} 'unsigned int' // CHECK-NEXT: value: Int 1 @@ -391,8 +391,8 @@ void TemplateUses(T Val) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: gang clause dim static // CHECK-NEXT: ConstantExpr{{.*}} 'unsigned int' // CHECK-NEXT: value: Int 1 @@ -414,8 +414,8 @@ void TemplateUses(T Val) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: gang clause static // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 'Val' 'int' @@ -432,8 +432,8 @@ void TemplateUses(T Val) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: gang 
clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt diff --git a/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp b/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp index cbf5ac952045db..b841bebf7ebf4f 100644 --- a/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp +++ b/clang/test/SemaOpenACC/loop-construct-vector-ast.cpp @@ -70,8 +70,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop vector(length:CTI) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: vector clause // CHECK-NEXT: DeclRefExpr{{.*}}'Int' lvalue ParmVar{{.*}}'IsI' 'Int' // CHECK-NEXT: ForStmt @@ -90,8 +90,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop vector(length:IsI) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: vector clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -109,8 +109,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop vector for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: vector clause // CHECK-NEXT: DeclRefExpr{{.*}}'Int' lvalue ParmVar{{.*}}'IsI' 'Int' // CHECK-NEXT: ForStmt @@ -194,8 +194,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: 
OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: vector clause // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}}'IsI' 'int' @@ -212,8 +212,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: vector clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -228,8 +228,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: vector clause // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}}'IsI' 'int' @@ -330,8 +330,8 @@ void uses() { #pragma acc loop vector(length:C) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: vector clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -349,8 +349,8 @@ void uses() { #pragma acc loop vector for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 
0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: vector clause // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int' @@ -372,8 +372,8 @@ void uses() { #pragma acc loop vector(C) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: vector clause // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var @@ -393,8 +393,8 @@ void uses() { #pragma acc loop vector(length:i) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: vector clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -412,8 +412,8 @@ void uses() { #pragma acc loop vector for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: vector clause // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: CXXMemberCallExpr{{.*}}'int' @@ -435,8 +435,8 @@ void uses() { #pragma acc loop vector(C) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // 
CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: vector clause // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var @@ -456,8 +456,8 @@ void uses() { #pragma acc loop vector(length:i) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: vector clause // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt diff --git a/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp b/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp index d5c14e2ee85057..9fbe9af146709c 100644 --- a/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp +++ b/clang/test/SemaOpenACC/loop-construct-worker-ast.cpp @@ -34,8 +34,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop worker for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -53,8 +53,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop worker for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -72,8 +72,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop worker for(int i = 0; i < 5; 
++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: DeclRefExpr{{.*}} 'ConvertsToInt' lvalue ParmVar // CHECK-NEXT: ForStmt @@ -92,8 +92,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop worker(CTI) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: DeclRefExpr{{.*}} 'Int' lvalue ParmVar // CHECK-NEXT: ForStmt @@ -112,8 +112,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { #pragma acc loop worker(num:IsI) for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: DeclRefExpr{{.*}} 'unsigned int' NonTypeTemplateParm{{.*}}'I' 'unsigned int' // CHECK-NEXT: ForStmt @@ -159,8 +159,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -175,8 +175,8 @@ void TemplUses(ConvertsToInt CTI, Int 
IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -191,8 +191,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int' @@ -211,8 +211,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' // CHECK-NEXT: DeclRefExpr{{.*}} 'int' lvalue ParmVar{{.*}} 'IsI' 'int' @@ -229,8 +229,8 @@ void TemplUses(ConvertsToInt CTI, Int IsI) { // CHECK-NEXT: DeclRefExpr{{.*}}'i' 'int' // CHECK-NEXT: NullStmt // - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: 
SubstNonTypeTemplateParmExpr{{.*}}'unsigned int' // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}}'unsigned int' depth 0 index 0 I @@ -277,8 +277,8 @@ void uses() { #pragma acc loop worker for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} parallel - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} parallel + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: parallel // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -296,8 +296,8 @@ void uses() { #pragma acc loop worker for(int i = 0; i < 5; ++i); - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} serial - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} serial + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: serial // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ForStmt // CHECK-NEXT: DeclStmt @@ -320,8 +320,8 @@ void uses() { // CHECK-NEXT: VarDecl // CHECK-NEXT: CXXConstructExpr - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: CXXMemberCallExpr{{.*}} 'int' @@ -347,8 +347,8 @@ void uses() { // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl - // CHECK-NEXT: OpenACCComputeConstruct 0x[[COMPUTE_ADDR:[0-9a-f]+]]{{.*}} kernels - // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: 0x[[COMPUTE_ADDR]] + // CHECK-NEXT: OpenACCComputeConstruct {{.*}} kernels + // CHECK-NEXT: OpenACCLoopConstruct{{.*}} parent: kernels // CHECK-NEXT: worker clause{{.*}} // CHECK-NEXT: ImplicitCastExpr{{.*}}'int' // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var diff --git 
a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index c6d82646b40de2..68ca31e27e0a41 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -340,7 +340,7 @@ class Intrinsic { /// The index of the key type passed to CGBuiltin.cpp for polymorphic calls. int PolymorphicKeyType; /// The local variables defined. - std::map Variables; + std::map> Variables; /// NeededEarly - set if any other intrinsic depends on this intrinsic. bool NeededEarly; /// UseMacro - set if we should implement using a macro or unset for a @@ -578,7 +578,7 @@ class Intrinsic { class NeonEmitter { const RecordKeeper &Records; DenseMap ClassMap; - std::map> IntrinsicMap; + std::map, std::less<>> IntrinsicMap; unsigned UniqueNumber; void createIntrinsic(const Record *R, SmallVectorImpl &Out); @@ -1548,8 +1548,8 @@ Intrinsic::DagEmitter::emitDagCast(const DagInit *DI, bool IsBitCast) { // 5. The value "H" or "D" to half or double the bitwidth. // 6. The value "8" to convert to 8-bit (signed) integer lanes. if (!DI->getArgNameStr(ArgIdx).empty()) { - assert_with_loc(Intr.Variables.find(std::string( - DI->getArgNameStr(ArgIdx))) != Intr.Variables.end(), + assert_with_loc(Intr.Variables.find(DI->getArgNameStr(ArgIdx)) != + Intr.Variables.end(), "Variable not found"); castToType = Intr.Variables[std::string(DI->getArgNameStr(ArgIdx))].getType(); @@ -1937,9 +1937,9 @@ void Intrinsic::indexBody() { Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types, std::optional MangledName) { // First, look up the name in the intrinsic map. - assert_with_loc(IntrinsicMap.find(Name.str()) != IntrinsicMap.end(), + assert_with_loc(IntrinsicMap.find(Name) != IntrinsicMap.end(), ("Intrinsic '" + Name + "' not found!").str()); - auto &V = IntrinsicMap.find(Name.str())->second; + auto &V = IntrinsicMap.find(Name)->second; std::vector GoodVec; // Create a string to print if we end up failing. 
diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index 0e81a5501e9fcf..1d0cf9a8a17a8e 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -49,3 +49,6 @@ config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ " config.substitutions.append(("%clang_skip_driver", config.clang)) config.substitutions.append(("%clang", config.clang)) config.substitutions.append(("%test_root", config.test_exec_root)) +config.substitutions.append(('%cmake_generator', config.cmake_generator)) +config.substitutions.append(('%cmake', config.cmake_exe)) +config.substitutions.append(('%llvm_src_dir', config.llvm_src_dir)) diff --git a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in index 54de12701c1ae9..3de5026e4792ae 100644 --- a/clang/utils/perf-training/bolt.lit.site.cfg.in +++ b/clang/utils/perf-training/bolt.lit.site.cfg.in @@ -11,6 +11,9 @@ config.python_exe = "@Python3_EXECUTABLE@" config.clang_obj_root = path(r"@CLANG_BINARY_DIR@") config.clang_bolt_mode = "@CLANG_BOLT@" config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@" +config.cmake_exe = "@CMAKE_COMMAND@" +config.llvm_src_dir ="@CMAKE_SOURCE_DIR@" +config.cmake_generator ="@CMAKE_GENERATOR@" # Let the main config do the real work. 
lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg") diff --git a/clang/utils/perf-training/lit.cfg b/clang/utils/perf-training/lit.cfg index 0bd06c0d44f650..654961e215da68 100644 --- a/clang/utils/perf-training/lit.cfg +++ b/clang/utils/perf-training/lit.cfg @@ -34,8 +34,11 @@ config.test_format = lit.formats.ShTest(use_lit_shell == "0") config.substitutions.append( ('%clang_cpp_skip_driver', ' %s %s %s ' % (cc1_wrapper, config.clang, sysroot_flags))) config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ %s ' % (config.clang, sysroot_flags))) config.substitutions.append( ('%clang_skip_driver', ' %s %s %s ' % (cc1_wrapper, config.clang, sysroot_flags))) -config.substitutions.append( ('%clang', ' %s %s ' % (config.clang, sysroot_flags) ) ) +config.substitutions.append( ('%clang', '%s %s ' % (config.clang, sysroot_flags) ) ) config.substitutions.append( ('%test_root', config.test_exec_root ) ) +config.substitutions.append( ('%cmake_generator', config.cmake_generator ) ) +config.substitutions.append( ('%cmake', config.cmake_exe ) ) +config.substitutions.append( ('%llvm_src_dir', config.llvm_src_dir ) ) config.environment['LLVM_PROFILE_FILE'] = 'perf-training-%4m.profraw' diff --git a/clang/utils/perf-training/lit.site.cfg.in b/clang/utils/perf-training/lit.site.cfg.in index fae93065a4edf2..9d279d552919ac 100644 --- a/clang/utils/perf-training/lit.site.cfg.in +++ b/clang/utils/perf-training/lit.site.cfg.in @@ -8,6 +8,9 @@ config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@" config.test_source_root = "@CLANG_PGO_TRAINING_DATA@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.python_exe = "@Python3_EXECUTABLE@" +config.cmake_exe = "@CMAKE_COMMAND@" +config.llvm_src_dir ="@CMAKE_SOURCE_DIR@" +config.cmake_generator ="@CMAKE_GENERATOR@" # Let the main config do the real work. 
lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/lit.cfg") diff --git a/clang/utils/perf-training/llvm-support/build.test b/clang/utils/perf-training/llvm-support/build.test new file mode 100644 index 00000000000000..f29a594c846869 --- /dev/null +++ b/clang/utils/perf-training/llvm-support/build.test @@ -0,0 +1,2 @@ +RUN: %cmake -G %cmake_generator -B %t -S %llvm_src_dir -DCMAKE_C_COMPILER=%clang -DCMAKE_CXX_COMPILER=%clang -DCMAKE_CXX_FLAGS="--driver-mode=g++" -DCMAKE_BUILD_TYPE=Release +RUN: %cmake --build %t -v --target LLVMSupport diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index 6b96a5147753e7..77261f631ea117 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -706,16 +706,31 @@ macro(add_custom_libcxx name prefix) -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_DOCS=OFF ${LIBCXX_CMAKE_ARGS} - STEP_TARGETS configure build install + STEP_TARGETS configure build BUILD_ALWAYS 1 USES_TERMINAL_CONFIGURE 1 USES_TERMINAL_BUILD 1 USES_TERMINAL_INSTALL 1 LIST_SEPARATOR | EXCLUDE_FROM_ALL TRUE - INSTALL_BYPRODUCTS "${prefix}/lib/libc++.a" "${prefix}/lib/libc++abi.a" ) + # Once we depend on CMake 3.26, we can use the INSTALL_BYPRODUCTS argument + # instead of having to fall back to ExternalProject_Add_Step() + # Note: We can't use the normal name "install" here since that interferes + # with the default ExternalProject_Add() logic and causes errors. + ExternalProject_Add_Step(${name} install-cmake326-workaround + # Ensure that DESTDIR=... set in the outer environment does not affect this + # target (we always need to install to the build directory). + COMMAND env DESTDIR= ${CMAKE_COMMAND} --build ${prefix}/build --target install + COMMENT "Installing ${name}..."
+ BYPRODUCTS "${prefix}/lib/libc++.a" "${prefix}/lib/libc++abi.a" + DEPENDEES build + EXCLUDE_FROM_MAIN 1 + USES_TERMINAL 1 + ) + ExternalProject_Add_StepTargets(${name} install-cmake326-workaround) + if (CMAKE_GENERATOR MATCHES "Make") set(run_clean "$(MAKE)" "-C" "${prefix}" "clean") else() diff --git a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h index 3b0cbcdd49c254..36a996632b71e3 100644 --- a/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h +++ b/compiler-rt/lib/ctx_profile/CtxInstrContextNode.h @@ -8,9 +8,9 @@ //============================================================================== // // NOTE! -// llvm/lib/ProfileData/CtxInstrContextNode.h and +// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and // compiler-rt/lib/ctx_profile/CtxInstrContextNode.h -// must be exact copies of each other +// must be exact copies of each other. // // compiler-rt creates these objects as part of the instrumentation runtime for // contextual profiling. 
LLVM only consumes them to convert a contextual tree @@ -114,4 +114,4 @@ class ContextNode final { }; } // namespace ctx_profile } // namespace llvm -#endif \ No newline at end of file +#endif diff --git a/compiler-rt/lib/fuzzer/CMakeLists.txt b/compiler-rt/lib/fuzzer/CMakeLists.txt index a6175564e55a20..6db24610df1f06 100644 --- a/compiler-rt/lib/fuzzer/CMakeLists.txt +++ b/compiler-rt/lib/fuzzer/CMakeLists.txt @@ -166,11 +166,11 @@ if(OS_NAME MATCHES "Android|Linux|Fuchsia" AND -DLIBCXX_ABI_NAMESPACE=__Fuzzer -DLIBCXX_ENABLE_EXCEPTIONS=OFF) target_compile_options(RTfuzzer.${arch} PRIVATE -isystem ${LIBCXX_${arch}_PREFIX}/include/c++/v1) - add_dependencies(RTfuzzer.${arch} libcxx_fuzzer_${arch}-install) + add_dependencies(RTfuzzer.${arch} libcxx_fuzzer_${arch}-install-cmake326-workaround) target_compile_options(RTfuzzer_main.${arch} PRIVATE -isystem ${LIBCXX_${arch}_PREFIX}/include/c++/v1) - add_dependencies(RTfuzzer_main.${arch} libcxx_fuzzer_${arch}-install) + add_dependencies(RTfuzzer_main.${arch} libcxx_fuzzer_${arch}-install-cmake326-workaround) target_compile_options(RTfuzzer_interceptors.${arch} PRIVATE -isystem ${LIBCXX_${arch}_PREFIX}/include/c++/v1) - add_dependencies(RTfuzzer_interceptors.${arch} libcxx_fuzzer_${arch}-install) + add_dependencies(RTfuzzer_interceptors.${arch} libcxx_fuzzer_${arch}-install-cmake326-workaround) partially_link_libcxx(fuzzer_no_main ${LIBCXX_${arch}_PREFIX} ${arch}) partially_link_libcxx(fuzzer_interceptors ${LIBCXX_${arch}_PREFIX} ${arch}) partially_link_libcxx(fuzzer ${LIBCXX_${arch}_PREFIX} ${arch}) diff --git a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt index 73ebc135312090..adfae3d63e648a 100644 --- a/compiler-rt/lib/fuzzer/tests/CMakeLists.txt +++ b/compiler-rt/lib/fuzzer/tests/CMakeLists.txt @@ -64,7 +64,7 @@ if(COMPILER_RT_DEFAULT_TARGET_ARCH IN_LIST FUZZER_SUPPORTED_ARCH) COMPILER_RT_LIBCXX_PATH AND COMPILER_RT_LIBCXXABI_PATH) file(GLOB libfuzzer_headers ../*.h) - 
set(LIBFUZZER_TEST_RUNTIME_DEPS libcxx_fuzzer_${arch}-install ${libfuzzer_headers}) + set(LIBFUZZER_TEST_RUNTIME_DEPS libcxx_fuzzer_${arch}-install-cmake326-workaround ${libfuzzer_headers}) set(LIBFUZZER_TEST_RUNTIME_CFLAGS -isystem ${LIBCXX_${arch}_PREFIX}/include/c++/v1) set(LIBFUZZER_TEST_RUNTIME_LINK_FLAGS ${LIBCXX_${arch}_PREFIX}/lib/libc++.a) endif() diff --git a/compiler-rt/lib/msan/tests/CMakeLists.txt b/compiler-rt/lib/msan/tests/CMakeLists.txt index 3ddae6d08b7f67..a8500225337e62 100644 --- a/compiler-rt/lib/msan/tests/CMakeLists.txt +++ b/compiler-rt/lib/msan/tests/CMakeLists.txt @@ -69,7 +69,7 @@ macro(msan_compile obj_list source arch kind cflags) sanitizer_test_compile( ${obj_list} ${source} ${arch} KIND ${kind} - COMPILE_DEPS ${MSAN_UNITTEST_HEADERS} libcxx_msan_${arch}-install + COMPILE_DEPS ${MSAN_UNITTEST_HEADERS} libcxx_msan_${arch}-install-cmake326-workaround DEPS msan CFLAGS -isystem ${MSAN_LIBCXX_DIR}/../include/c++/v1 ${MSAN_UNITTEST_INSTRUMENTED_CFLAGS} ${cflags} @@ -117,10 +117,10 @@ macro(add_msan_tests_for_arch arch kind cflags) DEPS ${MSAN_INST_LOADABLE_OBJECTS}) set(MSAN_TEST_OBJECTS ${MSAN_INST_TEST_OBJECTS} ${MSAN_INST_GTEST}) - set(MSAN_TEST_DEPS ${MSAN_TEST_OBJECTS} libcxx_msan_${arch}-install + set(MSAN_TEST_DEPS ${MSAN_TEST_OBJECTS} libcxx_msan_${arch}-install-cmake326-workaround ${MSAN_LOADABLE_SO} "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a") - list(APPEND MSAN_TEST_DEPS msan libcxx_msan_${arch}-install) + list(APPEND MSAN_TEST_DEPS msan libcxx_msan_${arch}-install-cmake326-workaround) get_target_flags_for_arch(${arch} TARGET_LINK_FLAGS) add_compiler_rt_test(MsanUnitTests "Msan-${arch}${kind}-Test" ${arch} OBJECTS ${MSAN_TEST_OBJECTS} "${MSAN_LIBCXX_DIR}/libc++.a" "${MSAN_LIBCXX_DIR}/libc++abi.a" diff --git a/compiler-rt/lib/tsan/CMakeLists.txt b/compiler-rt/lib/tsan/CMakeLists.txt index f7e2b5b6a35631..7928116879c09e 100644 --- a/compiler-rt/lib/tsan/CMakeLists.txt +++ b/compiler-rt/lib/tsan/CMakeLists.txt @@ 
-31,7 +31,7 @@ if(COMPILER_RT_LIBCXX_PATH AND DEPS ${TSAN_RUNTIME_LIBRARIES} CFLAGS ${TARGET_CFLAGS} -fsanitize=thread USE_TOOLCHAIN) - list(APPEND libcxx_tsan_deps libcxx_tsan_${arch}-install) + list(APPEND libcxx_tsan_deps libcxx_tsan_${arch}-install-cmake326-workaround) endforeach() add_custom_target(libcxx_tsan DEPENDS ${libcxx_tsan_deps}) diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 62f8d39a8abaa5..dc68f160f5d924 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -470,13 +470,17 @@ struct NodeVisitor { READ_FEATURE(OmpDefaultmapClause::ImplicitBehavior) READ_FEATURE(OmpDefaultmapClause::VariableCategory) READ_FEATURE(OmpDependClause) - READ_FEATURE(OmpDependClause::InOut) - READ_FEATURE(OmpDependClause::Sink) - READ_FEATURE(OmpDependClause::Source) + READ_FEATURE(OmpDependClause::TaskDep) + READ_FEATURE(OmpDoacross::Sink) + READ_FEATURE(OmpDoacross::Source) + READ_FEATURE(OmpDoacrossClause) + READ_FEATURE(OmpDependenceType) + READ_FEATURE(OmpDependenceType::Type) READ_FEATURE(OmpTaskDependenceType) READ_FEATURE(OmpTaskDependenceType::Type) - READ_FEATURE(OmpDependSinkVec) - READ_FEATURE(OmpDependSinkVecLength) + READ_FEATURE(OmpIteration) + READ_FEATURE(OmpIterationOffset) + READ_FEATURE(OmpIterationVector) READ_FEATURE(OmpEndAllocators) READ_FEATURE(OmpEndAtomic) READ_FEATURE(OmpEndBlockDirective) diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 456d53389cddfd..4bbf9777a54ccb 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -177,6 +177,7 @@ class ParseTreeDumper { NODE(parser, Call) NODE(parser, CallStmt) NODE(CallStmt, Chevrons) + NODE(CallStmt, StarOrExpr) NODE(parser, CaseConstruct) NODE(CaseConstruct, Case) NODE(parser, CaseSelector) @@ -509,15 +510,20 @@ class ParseTreeDumper { NODE(parser, OmpDefaultmapClause) 
NODE_ENUM(OmpDefaultmapClause, ImplicitBehavior) NODE_ENUM(OmpDefaultmapClause, VariableCategory) - NODE(parser, OmpDependClause) - NODE(parser, OmpDetachClause) - NODE(OmpDependClause, InOut) - NODE(OmpDependClause, Sink) - NODE(OmpDependClause, Source) + NODE(parser, OmpDependenceType) + NODE_ENUM(OmpDependenceType, Type) NODE(parser, OmpTaskDependenceType) NODE_ENUM(OmpTaskDependenceType, Type) - NODE(parser, OmpDependSinkVec) - NODE(parser, OmpDependSinkVecLength) + NODE(parser, OmpIterationOffset) + NODE(parser, OmpIteration) + NODE(parser, OmpIterationVector) + NODE(parser, OmpDoacross) + NODE(OmpDoacross, Sink) + NODE(OmpDoacross, Source) + NODE(parser, OmpDependClause) + NODE(OmpDependClause, TaskDep) + NODE(parser, OmpDetachClause) + NODE(parser, OmpDoacrossClause) NODE(parser, OmpDestroyClause) NODE(parser, OmpEndAllocators) NODE(parser, OmpEndAtomic) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index e85187479380df..5f5650304f9987 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3247,13 +3247,14 @@ struct FunctionReference { // R1521 call-stmt -> CALL procedure-designator [ chevrons ] // [( [actual-arg-spec-list] )] -// (CUDA) chevrons -> <<< scalar-expr, scalar-expr [, +// (CUDA) chevrons -> <<< * | scalar-expr, scalar-expr [, // scalar-int-expr [, scalar-int-expr ] ] >>> struct CallStmt { BOILERPLATE(CallStmt); + WRAPPER_CLASS(StarOrExpr, std::optional); struct Chevrons { TUPLE_CLASS_BOILERPLATE(Chevrons); - std::tuple, + std::tuple, std::optional> t; }; @@ -3439,16 +3440,35 @@ struct OmpObject { WRAPPER_CLASS(OmpObjectList, std::list); +// Ref: [4.5:169-170], [5.0:255-256], [5.1:288-289] +// +// dependence-type -> +// SINK | SOURCE | // since 4.5 +// IN | OUT | INOUT | // since 4.5, until 5.1 +// MUTEXINOUTSET | DEPOBJ | // since 5.0, until 5.1 +// INOUTSET // since 5.1, until 5.1 +// +// All of these, except SINK and SOURCE became 
task-dependence-type in 5.2. +// +// Keeping these two as separate types, since having them all together +// creates conflicts when parsing the DEPEND clause. For DEPEND(SINK: ...), +// the SINK may be parsed as 'task-dependence-type', and the list after +// the ':' would then be parsed as OmpObjectList (instead of the iteration +// vector). This would accept the vector "i, j, k" (although interpreted +// incorrectly), while flagging a syntax error for "i+1, j, k". +struct OmpDependenceType { + ENUM_CLASS(Type, Sink, Source); + WRAPPER_CLASS_BOILERPLATE(OmpDependenceType, Type); +}; + // Ref: [4.5:169-170], [5.0:254-256], [5.1:287-289], [5.2:321] // // task-dependence-type -> // "dependence-type" in 5.1 and before // IN | OUT | INOUT | // since 4.5 -// SOURCE | SINK | // since 4.5, until 5.1 // MUTEXINOUTSET | DEPOBJ | // since 5.0 // INOUTSET // since 5.2 struct OmpTaskDependenceType { - ENUM_CLASS( - Type, In, Out, Inout, Inoutset, Mutexinoutset, Source, Sink, Depobj) + ENUM_CLASS(Type, In, Out, Inout, Inoutset, Mutexinoutset, Depobj) WRAPPER_CLASS_BOILERPLATE(OmpTaskDependenceType, Type); }; @@ -3528,41 +3548,55 @@ struct OmpDefaultmapClause { std::tuple> t; }; -// 2.13.9 depend-vec-length -> +/- non-negative-constant -struct OmpDependSinkVecLength { - TUPLE_CLASS_BOILERPLATE(OmpDependSinkVecLength); +// 2.13.9 iteration-offset -> +/- non-negative-constant +struct OmpIterationOffset { + TUPLE_CLASS_BOILERPLATE(OmpIterationOffset); std::tuple t; }; -// 2.13.9 depend-vec -> induction-variable [depend-vec-length], ... 
-struct OmpDependSinkVec { - TUPLE_CLASS_BOILERPLATE(OmpDependSinkVec); - std::tuple> t; +// 2.13.9 iteration -> induction-variable [iteration-offset] +struct OmpIteration { + TUPLE_CLASS_BOILERPLATE(OmpIteration); + std::tuple> t; +}; + +WRAPPER_CLASS(OmpIterationVector, std::list); + +// Extract this into a separate structure (instead of having it directly in +// OmpDoacrossClause), so that the context in TYPE_CONTEXT_PARSER can be set +// separately for OmpDependClause and OmpDoacrossClause. +struct OmpDoacross { + OmpDependenceType::Type GetDepType() const; + + WRAPPER_CLASS(Sink, OmpIterationVector); + EMPTY_CLASS(Source); + UNION_CLASS_BOILERPLATE(OmpDoacross); + std::variant u; }; // Ref: [4.5:169-170], [5.0:255-256], [5.1:288-289], [5.2:323-324] // // depend-clause -> // DEPEND(SOURCE) | // since 4.5, until 5.1 -// DEPEND(SINK: depend-vec) | // since 4.5, until 5.1 -// DEPEND([depend-modifier,]dependence-type: locator-list) // since 4.5 +// DEPEND(SINK: iteration-vector) | // since 4.5, until 5.1 +// DEPEND([depend-modifier,] +// task-dependence-type: locator-list) // since 4.5 // // depend-modifier -> iterator-modifier // since 5.0 struct OmpDependClause { - OmpTaskDependenceType::Type GetDepType() const; - UNION_CLASS_BOILERPLATE(OmpDependClause); - EMPTY_CLASS(Source); - WRAPPER_CLASS(Sink, std::list); - struct InOut { - TUPLE_CLASS_BOILERPLATE(InOut); + struct TaskDep { + OmpTaskDependenceType::Type GetTaskDepType() const; + TUPLE_CLASS_BOILERPLATE(TaskDep); std::tuple, OmpTaskDependenceType, OmpObjectList> t; }; - std::variant u; + std::variant u; }; +WRAPPER_CLASS(OmpDoacrossClause, OmpDoacross); + // Ref: [5.0:254-255], [5.1:287-288], [5.2:73] // // destroy-clause -> @@ -3694,9 +3728,8 @@ struct OmpMapClause { // 2.9.5 order-clause -> ORDER ([order-modifier :]concurrent) struct OmpOrderModifier { - UNION_CLASS_BOILERPLATE(OmpOrderModifier); ENUM_CLASS(Kind, Reproducible, Unconstrained) - std::variant u; + WRAPPER_CLASS_BOILERPLATE(OmpOrderModifier, 
Kind); }; struct OmpOrderClause { @@ -3775,8 +3808,12 @@ struct OmpNumTasksClause { // Ref: [5.0:254-255], [5.1:287-288], [5.2:321-322] // -// update-clause -> UPDATE(task-dependence-type) // since 5.0 -WRAPPER_CLASS(OmpUpdateClause, OmpTaskDependenceType); +// update-clause -> UPDATE(dependence-type) // since 5.0, until 5.1 +// update-clause -> UPDATE(task-dependence-type) // since 5.2 +struct OmpUpdateClause { + UNION_CLASS_BOILERPLATE(OmpUpdateClause); + std::variant u; +}; // OMP 5.2 11.7.1 bind-clause -> // BIND( PARALLEL | TEAMS | THREAD ) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index e768c1cbc0784a..72b9018f2d2808 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -122,28 +122,28 @@ genProcBindKindAttr(fir::FirOpBuilder &firOpBuilder, static mlir::omp::ClauseTaskDependAttr genDependKindAttr(lower::AbstractConverter &converter, - const omp::clause::Depend::TaskDependenceType kind) { + const omp::clause::DependenceType kind) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); mlir::Location currentLocation = converter.getCurrentLocation(); mlir::omp::ClauseTaskDepend pbKind; switch (kind) { - case omp::clause::Depend::TaskDependenceType::In: + case omp::clause::DependenceType::In: pbKind = mlir::omp::ClauseTaskDepend::taskdependin; break; - case omp::clause::Depend::TaskDependenceType::Out: + case omp::clause::DependenceType::Out: pbKind = mlir::omp::ClauseTaskDepend::taskdependout; break; - case omp::clause::Depend::TaskDependenceType::Inout: + case omp::clause::DependenceType::Inout: pbKind = mlir::omp::ClauseTaskDepend::taskdependinout; break; - case omp::clause::Depend::TaskDependenceType::Mutexinoutset: - case omp::clause::Depend::TaskDependenceType::Inoutset: + case omp::clause::DependenceType::Mutexinoutset: + case omp::clause::DependenceType::Inoutset: TODO(currentLocation, "INOUTSET and MUTEXINOUTSET are not supported 
yet"); break; - case omp::clause::Depend::TaskDependenceType::Depobj: - case omp::clause::Depend::TaskDependenceType::Sink: - case omp::clause::Depend::TaskDependenceType::Source: + case omp::clause::DependenceType::Depobj: + case omp::clause::DependenceType::Sink: + case omp::clause::DependenceType::Source: llvm_unreachable("unhandled parser task dependence type"); break; } @@ -803,20 +803,20 @@ bool ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { auto process = [&](const omp::clause::Depend &clause, const parser::CharBlock &) { using Depend = omp::clause::Depend; - if (!std::holds_alternative(clause.u)) { + if (!std::holds_alternative(clause.u)) { TODO(converter.getCurrentLocation(), "DEPEND clause with SINK or SOURCE is not supported yet"); } - auto &depType = std::get(clause.u); - auto kind = std::get(depType.t); - auto &objects = std::get(depType.t); + auto &taskDep = std::get(clause.u); + auto depType = std::get(taskDep.t); + auto &objects = std::get(taskDep.t); - if (std::get>(depType.t)) { + if (std::get>(taskDep.t)) { TODO(converter.getCurrentLocation(), "Support for iterator modifiers is not implemented yet"); } mlir::omp::ClauseTaskDependAttr dependTypeOperand = - genDependKindAttr(converter, kind); + genDependKindAttr(converter, depType); result.dependKinds.append(objects.size(), dependTypeOperand); for (const omp::Object &object : objects) { diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 46caafeef8e4a8..3dedd4864bafc5 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -338,27 +338,32 @@ ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp, inp.u); } -clause::TaskDependenceType -makeDepType(const parser::OmpTaskDependenceType &inp) { +clause::DependenceType makeDepType(const parser::OmpDependenceType &inp) { + switch (inp.v) { + case parser::OmpDependenceType::Type::Sink: + return clause::DependenceType::Sink; + case 
parser::OmpDependenceType::Type::Source: + return clause::DependenceType::Source; + } + llvm_unreachable("Unexpected dependence type"); +} + +clause::DependenceType makeDepType(const parser::OmpTaskDependenceType &inp) { switch (inp.v) { case parser::OmpTaskDependenceType::Type::Depobj: - return clause::TaskDependenceType::Depobj; + return clause::DependenceType::Depobj; case parser::OmpTaskDependenceType::Type::In: - return clause::TaskDependenceType::In; + return clause::DependenceType::In; case parser::OmpTaskDependenceType::Type::Inout: - return clause::TaskDependenceType::Inout; + return clause::DependenceType::Inout; case parser::OmpTaskDependenceType::Type::Inoutset: - return clause::TaskDependenceType::Inoutset; + return clause::DependenceType::Inoutset; case parser::OmpTaskDependenceType::Type::Mutexinoutset: - return clause::TaskDependenceType::Mutexinoutset; + return clause::DependenceType::Mutexinoutset; case parser::OmpTaskDependenceType::Type::Out: - return clause::TaskDependenceType::Out; - case parser::OmpTaskDependenceType::Type::Sink: - return clause::TaskDependenceType::Sink; - case parser::OmpTaskDependenceType::Type::Source: - return clause::TaskDependenceType::Source; + return clause::DependenceType::Out; } - llvm_unreachable("Unexpected dependence type"); + llvm_unreachable("Unexpected task dependence type"); } // -------------------------------------------------------------------- @@ -569,54 +574,63 @@ Defaultmap make(const parser::OmpClause::Defaultmap &inp, /*VariableCategory=*/maybeApply(convert2, t1)}}; } +Doacross makeDoacross(const parser::OmpDoacross &doa, + semantics::SemanticsContext &semaCtx) { + // Iteration is the equivalent of parser::OmpIteration + using Iteration = Doacross::Vector::value_type; // LoopIterationT + + auto visitSource = [&](const parser::OmpDoacross::Source &) { + return Doacross{{/*DependenceType=*/Doacross::DependenceType::Source, + /*Vector=*/{}}}; + }; + + auto visitSink = [&](const parser::OmpDoacross::Sink 
&s) { + using IterOffset = parser::OmpIterationOffset; + auto convert2 = [&](const parser::OmpIteration &v) { + auto &t0 = std::get(v.t); + auto &t1 = std::get>(v.t); + + auto convert3 = [&](const IterOffset &u) { + auto &s0 = std::get(u.t); + auto &s1 = std::get(u.t); + return Iteration::Distance{ + {makeDefinedOperator(s0, semaCtx), makeExpr(s1, semaCtx)}}; + }; + return Iteration{{makeObject(t0, semaCtx), maybeApply(convert3, t1)}}; + }; + return Doacross{{/*DependenceType=*/Doacross::DependenceType::Sink, + /*Vector=*/makeList(s.v.v, convert2)}}; + }; + + return common::visit(common::visitors{visitSink, visitSource}, doa.u); +} + Depend make(const parser::OmpClause::Depend &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpDependClause using wrapped = parser::OmpDependClause; using Variant = decltype(Depend::u); - // Iteration is the equivalent of parser::OmpDependSinkVec - using Iteration = Doacross::Vector::value_type; // LoopIterationT - return Depend{Fortran::common::visit( // + auto visitTaskDep = [&](const wrapped::TaskDep &s) -> Variant { + auto &t0 = std::get>(s.t); + auto &t1 = std::get(s.t); + auto &t2 = std::get(s.t); + + auto &&maybeIter = + maybeApply([&](auto &&s) { return makeIterator(s, semaCtx); }, t0); + return Depend::TaskDep{{/*DependenceType=*/makeDepType(t1), + /*Iterator=*/std::move(maybeIter), + /*LocatorList=*/makeObjects(t2, semaCtx)}}; + }; + + return Depend{common::visit( // common::visitors{ // Doacross - [&](const wrapped::Source &s) -> Variant { - return Doacross{ - {/*DependenceType=*/Doacross::DependenceType::Source, - /*Vector=*/{}}}; - }, - // Doacross - [&](const wrapped::Sink &s) -> Variant { - using DependLength = parser::OmpDependSinkVecLength; - auto convert2 = [&](const parser::OmpDependSinkVec &v) { - auto &t0 = std::get(v.t); - auto &t1 = std::get>(v.t); - - auto convert3 = [&](const DependLength &u) { - auto &s0 = std::get(u.t); - auto &s1 = std::get(u.t); - return Iteration::Distance{ - 
{makeDefinedOperator(s0, semaCtx), makeExpr(s1, semaCtx)}}; - }; - return Iteration{ - {makeObject(t0, semaCtx), maybeApply(convert3, t1)}}; - }; - return Doacross{{/*DependenceType=*/Doacross::DependenceType::Sink, - /*Vector=*/makeList(s.v, convert2)}}; - }, - // Depend::DepType - [&](const wrapped::InOut &s) -> Variant { - auto &t0 = - std::get>(s.t); - auto &t1 = std::get(s.t); - auto &t2 = std::get(s.t); - - auto &&maybeIter = maybeApply( - [&](auto &&s) { return makeIterator(s, semaCtx); }, t0); - return Depend::DepType{{/*TaskDependenceType=*/makeDepType(t1), - /*Iterator=*/std::move(maybeIter), - /*LocatorList=*/makeObjects(t2, semaCtx)}}; + [&](const parser::OmpDoacross &s) -> Variant { + return makeDoacross(s, semaCtx); }, + // Depend::TaskDep + visitTaskDep, }, inp.v.u)}; } @@ -684,8 +698,8 @@ DistSchedule make(const parser::OmpClause::DistSchedule &inp, Doacross make(const parser::OmpClause::Doacross &inp, semantics::SemanticsContext &semaCtx) { - // inp -> empty - llvm_unreachable("Empty: doacross"); + // inp.v -> OmpDoacrossClause + return makeDoacross(inp.v.v, semaCtx); } // DynamicAllocators: empty @@ -1092,8 +1106,7 @@ Order make(const parser::OmpClause::Order &inp, auto &t1 = std::get(inp.v.t); auto convert3 = [&](const parser::OmpOrderModifier &s) { - return Fortran::common::visit( - [&](parser::OmpOrderModifier::Kind k) { return convert1(k); }, s.u); + return convert1(s.v); }; return Order{ {/*OrderModifier=*/maybeApply(convert3, t0), /*Ordering=*/convert2(t1)}}; @@ -1356,7 +1369,9 @@ Uniform make(const parser::OmpClause::Uniform &inp, Update make(const parser::OmpClause::Update &inp, semantics::SemanticsContext &semaCtx) { // inp.v -> parser::OmpUpdateClause - return Update{/*TaskDependenceType=*/makeDepType(inp.v.v)}; + auto depType = + common::visit([](auto &&s) { return makeDepType(s); }, inp.v.u); + return Update{/*DependenceType=*/depType}; } Use make(const parser::OmpClause::Use &inp, diff --git a/flang/lib/Lower/OpenMP/Clauses.h 
b/flang/lib/Lower/OpenMP/Clauses.h index 51180ebfe5745e..514f0d1ee466ac 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -152,7 +152,7 @@ using IteratorSpecifier = tomp::type::IteratorSpecifierT; using DefinedOperator = tomp::type::DefinedOperatorT; using ProcedureDesignator = tomp::type::ProcedureDesignatorT; using ReductionOperator = tomp::type::ReductionIdentifierT; -using TaskDependenceType = tomp::type::TaskDependenceType; +using DependenceType = tomp::type::DependenceType; // "Requires" clauses are handled early on, and the aggregated information // is stored in the Symbol details of modules, programs, and subprograms. diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 4f9e2347308aa1..91f99ba4b0ca55 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1916,10 +1916,36 @@ genTaskOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::omp::TaskOperands clauseOps; genTaskClauses(converter, semaCtx, stmtCtx, item->clauses, loc, clauseOps); + if (!enableDelayedPrivatization) + return genOpWithBody( + OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_task) + .setClauses(&item->clauses), + queue, item, clauseOps); + + DataSharingProcessor dsp(converter, semaCtx, item->clauses, eval, + lower::omp::isLastItemInQueue(item, queue), + /*useDelayedPrivatization=*/true, &symTable); + dsp.processStep1(&clauseOps); + + EntryBlockArgs taskArgs; + taskArgs.priv.syms = dsp.getDelayedPrivSymbols(); + taskArgs.priv.vars = clauseOps.privateVars; + + auto genRegionEntryCB = [&](mlir::Operation *op) { + genEntryBlock(converter, taskArgs, op->getRegion(0)); + bindEntryBlockArgs(converter, + llvm::cast(op), + taskArgs); + return llvm::to_vector(taskArgs.priv.syms); + }; + return genOpWithBody( OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval, llvm::omp::Directive::OMPD_task) - .setClauses(&item->clauses), + 
.setClauses(&item->clauses) + .setDataSharingProcessor(&dsp) + .setGenRegionEntryCb(genRegionEntryCB), queue, item, clauseOps); } diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 881f54133ce732..8e9de3d3281525 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -337,6 +337,13 @@ struct CUFAllocOpConversion : public mlir::OpRewritePattern { seqTy.getConstantArraySize()); } bytes = rewriter.create(loc, nbElem, width); + } else if (fir::isa_derived(op.getInType())) { + mlir::Type structTy = typeConverter->convertType(op.getInType()); + std::size_t structSize = dl->getTypeSizeInBits(structTy) / 8; + bytes = builder.createIntegerConstant(loc, builder.getIndexType(), + structSize); + } else { + mlir::emitError(loc, "unsupported type in cuf.alloc\n"); } mlir::func::FuncOp func = fir::runtime::getRuntimeFunc(loc, builder); diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 1fa7ffb6af3aeb..a46b06dce001f9 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -392,12 +392,9 @@ TYPE_PARSER(construct( ":"), Parser{})) -// 2.13.9 DEPEND (SOURCE | SINK : vec | (IN | OUT | INOUT) : list -TYPE_PARSER(construct( - Parser{}, scalarIntConstantExpr)) - -TYPE_PARSER( - construct(name, maybe(Parser{}))) +TYPE_PARSER(construct( + "SINK" >> pure(OmpDependenceType::Type::Sink) || + "SOURCE" >> pure(OmpDependenceType::Type::Source))) TYPE_PARSER(construct( "DEPOBJ" >> pure(OmpTaskDependenceType::Type::Depobj) || @@ -405,18 +402,31 @@ TYPE_PARSER(construct( "INOUT"_id >> pure(OmpTaskDependenceType::Type::Inout) || "INOUTSET"_id >> pure(OmpTaskDependenceType::Type::Inoutset) || "MUTEXINOUTSET" >> pure(OmpTaskDependenceType::Type::Mutexinoutset) || - "OUT" >> pure(OmpTaskDependenceType::Type::Out) || - "SINK" >> pure(OmpTaskDependenceType::Type::Sink) || - "SOURCE" >> 
pure(OmpTaskDependenceType::Type::Source))) + "OUT" >> pure(OmpTaskDependenceType::Type::Out))) + +// iteration-offset -> +/- non-negative-constant-expr +TYPE_PARSER(construct( + Parser{}, scalarIntConstantExpr)) + +// iteration -> iteration-variable [+/- nonnegative-scalar-integer-constant] +TYPE_PARSER(construct(name, maybe(Parser{}))) + +TYPE_PARSER(construct(nonemptyList(Parser{}))) + +TYPE_PARSER(construct( + construct(construct( + "SINK"_tok >> ":"_tok >> Parser{})) || + construct(construct("SOURCE"_tok)))) TYPE_CONTEXT_PARSER("Omp Depend clause"_en_US, - construct(construct( - "SINK :" >> nonemptyList(Parser{}))) || - construct( - construct("SOURCE"_tok)) || - construct(construct( + construct( + construct(construct( maybe(Parser{} / ","_tok), - Parser{} / ":", Parser{}))) + Parser{} / ":", Parser{})) || + construct(Parser{}))) + +TYPE_CONTEXT_PARSER("Omp Doacross clause"_en_US, + construct(Parser{})) TYPE_PARSER(construct( "PRESENT" >> pure(OmpFromClause::Expectation::Present))) @@ -466,6 +476,10 @@ TYPE_PARSER(construct(Parser{})) TYPE_PARSER(construct( Parser{}, maybe(":" >> scalarIntConstantExpr))) +TYPE_PARSER(construct( + construct(Parser{}) || + construct(Parser{}))) + // 2.9.5 ORDER ([order-modifier :]concurrent) TYPE_PARSER(construct( "REPRODUCIBLE" >> pure(OmpOrderModifier::Kind::Reproducible)) || @@ -539,6 +553,8 @@ TYPE_PARSER( "DIST_SCHEDULE" >> construct(construct( parenthesized("STATIC" >> maybe("," >> scalarIntExpr)))) || + "DOACROSS" >> + construct(parenthesized(Parser{})) || "DYNAMIC_ALLOCATORS" >> construct(construct()) || "ENTER" >> construct(construct( @@ -642,7 +658,7 @@ TYPE_PARSER( parenthesized(nonemptyList(name)))) || "UNTIED" >> construct(construct()) || "UPDATE" >> construct(construct( - parenthesized(Parser{})))) + parenthesized(Parser{})))) // [Clause, [Clause], ...] 
TYPE_PARSER(sourced(construct( diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp index 60aef1666e9ba7..574e5fd84862e7 100644 --- a/flang/lib/Parser/parse-tree.cpp +++ b/flang/lib/Parser/parse-tree.cpp @@ -253,22 +253,23 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &os, const Name &x) { return os << x.ToString(); } -OmpTaskDependenceType::Type OmpDependClause::GetDepType() const { - return common::visit( +OmpDependenceType::Type OmpDoacross::GetDepType() const { + return common::visit( // common::visitors{ - [&](const parser::OmpDependClause::Source &) { - return parser::OmpTaskDependenceType::Type::Source; - }, - [&](const parser::OmpDependClause::Sink &) { - return parser::OmpTaskDependenceType::Type::Sink; + [](const OmpDoacross::Sink &) { + return OmpDependenceType::Type::Sink; }, - [&](const parser::OmpDependClause::InOut &y) { - return std::get(y.t).v; + [](const OmpDoacross::Source &) { + return OmpDependenceType::Type::Source; }, }, u); } +OmpTaskDependenceType::Type OmpDependClause::TaskDep::GetTaskDepType() const { + return std::get(t).v; +} + } // namespace Fortran::parser template static llvm::omp::Clause getClauseIdForClass(C &&) { diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp index 2b7da18a09bb30..e365cd24a6aed0 100644 --- a/flang/lib/Parser/program-parsers.cpp +++ b/flang/lib/Parser/program-parsers.cpp @@ -474,10 +474,13 @@ TYPE_CONTEXT_PARSER("function reference"_en_US, // R1521 call-stmt -> CALL procedure-designator [chevrons] /// [( [actual-arg-spec-list] )] -// (CUDA) chevrons -> <<< scalar-expr, scalar-expr [, scalar-int-expr +// (CUDA) chevrons -> <<< * | scalar-expr, scalar-expr [, scalar-int-expr // [, scalar-int-expr ] ] >>> +constexpr auto starOrExpr{ + construct("*" >> pure>() || + applyFunction(presentOptional, scalarExpr))}; TYPE_PARSER(extension( - "<<<" >> construct(scalarExpr, "," >> scalarExpr, + "<<<" >> construct(starOrExpr, ", " >> scalarExpr, maybe("," 
>> scalarIntExpr), maybe("," >> scalarIntExpr)) / ">>>")) constexpr auto actualArgSpecList{optionalList(actualArgSpec)}; diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index bbb126dcdb6d5e..20022f8fa984ce 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1703,6 +1703,13 @@ class UnparseVisitor { void Unparse(const IntrinsicStmt &x) { // R1519 Word("INTRINSIC :: "), Walk(x.v, ", "); } + void Unparse(const CallStmt::StarOrExpr &x) { + if (x.v) { + Walk(*x.v); + } else { + Word("*"); + } + } void Unparse(const CallStmt::Chevrons &x) { // CUDA Walk(std::get<0>(x.t)); // grid Word(","), Walk(std::get<1>(x.t)); // block @@ -2228,36 +2235,16 @@ class UnparseVisitor { std::get>(x.t), ":"); Walk(std::get(x.t)); } - void Unparse(const OmpDependSinkVecLength &x) { - Walk(std::get(x.t)); - Walk(std::get(x.t)); - } - void Unparse(const OmpDependSinkVec &x) { - Walk(std::get(x.t)); - Walk(std::get>(x.t)); + void Unparse(const OmpDoacross::Sink &x) { + Word("SINK: "); + Walk(x.v.v); } - void Unparse(const OmpDependClause::InOut &x) { + void Unparse(const OmpDoacross::Source &) { Word("SOURCE"); } + void Unparse(const OmpDependClause::TaskDep &x) { Walk(std::get(x.t)); Put(":"); Walk(std::get(x.t)); } - bool Pre(const OmpDependClause &x) { - return common::visit( - common::visitors{ - [&](const OmpDependClause::Source &) { - Word("SOURCE"); - return false; - }, - [&](const OmpDependClause::Sink &y) { - Word("SINK:"); - Walk(y.v); - Put(")"); - return false; - }, - [&](const OmpDependClause::InOut &) { return true; }, - }, - x.u); - } void Unparse(const OmpDefaultmapClause &x) { Walk(std::get(x.t)); Walk(":", diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 7098b710d23ffb..dc90b4cccabd26 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -575,6 +575,7 @@ void OmpStructureChecker::Leave(const 
parser::OpenMPConstruct &) { } void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) { + loopStack_.push_back(&x); const auto &beginLoopDir{std::get(x.t)}; const auto &beginDir{std::get(beginLoopDir.t)}; @@ -968,11 +969,19 @@ void OmpStructureChecker::CheckDistLinear( } } -void OmpStructureChecker::Leave(const parser::OpenMPLoopConstruct &) { +void OmpStructureChecker::Leave(const parser::OpenMPLoopConstruct &x) { if (llvm::omp::allSimdSet.test(GetContext().directive)) { ExitDirectiveNest(SIMDNest); } dirContext_.pop_back(); + + assert(!loopStack_.empty() && "Expecting non-empty loop stack"); + const LoopConstruct &top{loopStack_.back()}; +#ifndef NDEBUG + auto *loopc{std::get_if(&top)}; + assert(loopc != nullptr && *loopc == &x && "Mismatched loop constructs"); +#endif + loopStack_.pop_back(); } void OmpStructureChecker::Enter(const parser::OmpEndLoopDirective &x) { @@ -1103,8 +1112,7 @@ void OmpStructureChecker::Leave(const parser::OpenMPBlockConstruct &) { void OmpStructureChecker::ChecksOnOrderedAsBlock() { if (FindClause(llvm::omp::Clause::OMPC_depend)) { context_.Say(GetContext().clauseSource, - "DEPEND(*) clauses are not allowed when ORDERED construct is a block" - " construct with an ORDERED region"_err_en_US); + "DEPEND clauses are not allowed when ORDERED construct is a block construct with an ORDERED region"_err_en_US); return; } @@ -1654,42 +1662,48 @@ void OmpStructureChecker::ChecksOnOrderedAsStandalone() { if (FindClause(llvm::omp::Clause::OMPC_threads) || FindClause(llvm::omp::Clause::OMPC_simd)) { context_.Say(GetContext().clauseSource, - "THREADS, SIMD clauses are not allowed when ORDERED construct is a " - "standalone construct with no ORDERED region"_err_en_US); + "THREADS and SIMD clauses are not allowed when ORDERED construct is a standalone construct with no ORDERED region"_err_en_US); } - bool isSinkPresent{false}; - int dependSourceCount{0}; - auto clauseAll = FindClauses(llvm::omp::Clause::OMPC_depend); - for (auto itr = 
clauseAll.first; itr != clauseAll.second; ++itr) { + int dependSinkCount{0}, dependSourceCount{0}; + bool exclusiveShown{false}, duplicateSourceShown{false}; + + auto visitDoacross{[&](const parser::OmpDoacross &doa, + const parser::CharBlock &src) { + common::visit( + common::visitors{ + [&](const parser::OmpDoacross::Source &) { dependSourceCount++; }, + [&](const parser::OmpDoacross::Sink &) { dependSinkCount++; }}, + doa.u); + if (!exclusiveShown && dependSinkCount > 0 && dependSourceCount > 0) { + exclusiveShown = true; + context_.Say(src, + "The SINK and SOURCE dependence types are mutually exclusive"_err_en_US); + } + if (!duplicateSourceShown && dependSourceCount > 1) { + duplicateSourceShown = true; + context_.Say(src, + "At most one SOURCE dependence type can appear on the ORDERED directive"_err_en_US); + } + }}; + + // Visit the DEPEND and DOACROSS clauses. + auto depClauses{FindClauses(llvm::omp::Clause::OMPC_depend)}; + for (auto itr{depClauses.first}; itr != depClauses.second; ++itr) { const auto &dependClause{ std::get(itr->second->u)}; - if (std::get_if(&dependClause.v.u)) { - dependSourceCount++; - if (isSinkPresent) { - context_.Say(itr->second->source, - "DEPEND(SOURCE) is not allowed when DEPEND(SINK: vec) is present " - "on ORDERED directive"_err_en_US); - } - if (dependSourceCount > 1) { - context_.Say(itr->second->source, - "At most one DEPEND(SOURCE) clause can appear on the ORDERED " - "directive"_err_en_US); - } - } else if (std::get_if(&dependClause.v.u)) { - isSinkPresent = true; - if (dependSourceCount > 0) { - context_.Say(itr->second->source, - "DEPEND(SINK: vec) is not allowed when DEPEND(SOURCE) is present " - "on ORDERED directive"_err_en_US); - } + if (auto *doAcross{std::get_if(&dependClause.v.u)}) { + visitDoacross(*doAcross, itr->second->source); } else { context_.Say(itr->second->source, - "Only DEPEND(SOURCE) or DEPEND(SINK: vec) are allowed when ORDERED " - "construct is a standalone construct with no ORDERED " - 
"region"_err_en_US); + "Only SINK or SOURCE dependence types are allowed when ORDERED construct is a standalone construct with no ORDERED region"_err_en_US); } } + auto doaClauses{FindClauses(llvm::omp::Clause::OMPC_doacross)}; + for (auto itr{doaClauses.first}; itr != doaClauses.second; ++itr) { + auto &doaClause{std::get(itr->second->u)}; + visitDoacross(doaClause.v.v, itr->second->source); + } bool isNestedInDoOrderedWithPara{false}; if (CurrentDirectiveIsNested() && @@ -1716,20 +1730,28 @@ void OmpStructureChecker::ChecksOnOrderedAsStandalone() { } void OmpStructureChecker::CheckOrderedDependClause( - std::optional orderedValue) { - auto clauseAll{FindClauses(llvm::omp::Clause::OMPC_depend)}; - for (auto itr = clauseAll.first; itr != clauseAll.second; ++itr) { - const auto &dependClause{ - std::get(itr->second->u)}; - if (const auto *sinkVectors{ - std::get_if(&dependClause.v.u)}) { - std::int64_t numVar = sinkVectors->v.size(); + std::optional orderedValue) { + auto visitDoacross{[&](const parser::OmpDoacross &doa, + const parser::CharBlock &src) { + if (auto *sinkVector{std::get_if(&doa.u)}) { + int64_t numVar = sinkVector->v.v.size(); if (orderedValue != numVar) { - context_.Say(itr->second->source, - "The number of variables in DEPEND(SINK: vec) clause does not " - "match the parameter specified in ORDERED clause"_err_en_US); + context_.Say(src, + "The number of variables in the SINK iteration vector does not match the parameter specified in ORDERED clause"_err_en_US); } } + }}; + auto depClauses{FindClauses(llvm::omp::Clause::OMPC_depend)}; + for (auto itr{depClauses.first}; itr != depClauses.second; ++itr) { + auto &dependClause{std::get(itr->second->u)}; + if (auto *doAcross{std::get_if(&dependClause.v.u)}) { + visitDoacross(*doAcross, itr->second->source); + } + } + auto doaClauses = FindClauses(llvm::omp::Clause::OMPC_doacross); + for (auto itr{doaClauses.first}; itr != doaClauses.second; ++itr) { + auto &doaClause{std::get(itr->second->u)}; + 
visitDoacross(doaClause.v.v, itr->second->source); } } @@ -1771,17 +1793,13 @@ void OmpStructureChecker::CheckTaskDependenceType( const parser::OmpTaskDependenceType::Type &x) { // Common checks for task-dependence-type (DEPEND and UPDATE clauses). unsigned version{context_.langOptions().OpenMPVersion}; - unsigned since{0}, deprecatedIn{~0u}; + unsigned since{0}; switch (x) { case parser::OmpTaskDependenceType::Type::In: case parser::OmpTaskDependenceType::Type::Out: case parser::OmpTaskDependenceType::Type::Inout: break; - case parser::OmpTaskDependenceType::Type::Source: - case parser::OmpTaskDependenceType::Type::Sink: - deprecatedIn = 52; - break; case parser::OmpTaskDependenceType::Type::Mutexinoutset: case parser::OmpTaskDependenceType::Type::Depobj: since = 50; @@ -1791,21 +1809,36 @@ void OmpStructureChecker::CheckTaskDependenceType( break; } - if (version >= deprecatedIn) { + if (version < since) { context_.Say(GetContext().clauseSource, - "%s task-dependence-type is deprecated in %s"_warn_en_US, - parser::ToUpperCaseLetters( - parser::OmpTaskDependenceType::EnumToString(x)), - ThisVersion(deprecatedIn)); - } else if (version < since) { - context_.Say(GetContext().clauseSource, - "%s task-dependence-type is not supported in %s, %s"_warn_en_US, + "%s task dependence type is not supported in %s, %s"_warn_en_US, parser::ToUpperCaseLetters( parser::OmpTaskDependenceType::EnumToString(x)), ThisVersion(version), TryVersion(since)); } } +void OmpStructureChecker::CheckDependenceType( + const parser::OmpDependenceType::Type &x) { + // Common checks for dependence-type (DEPEND and UPDATE clauses). 
+ unsigned version{context_.langOptions().OpenMPVersion}; + unsigned deprecatedIn{~0u}; + + switch (x) { + case parser::OmpDependenceType::Type::Source: + case parser::OmpDependenceType::Type::Sink: + deprecatedIn = 52; + break; + } + + if (version >= deprecatedIn) { + context_.Say(GetContext().clauseSource, + "%s dependence type is deprecated in %s"_warn_en_US, + parser::ToUpperCaseLetters(parser::OmpDependenceType::EnumToString(x)), + ThisVersion(deprecatedIn)); + } +} + void OmpStructureChecker::Enter( const parser::OpenMPSimpleStandaloneConstruct &x) { const auto &dir{std::get(x.t)}; @@ -2697,7 +2730,6 @@ CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) CHECK_SIMPLE_CLAUSE(Align, OMPC_align) CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) CHECK_SIMPLE_CLAUSE(CancellationConstructType, OMPC_cancellation_construct_type) -CHECK_SIMPLE_CLAUSE(Doacross, OMPC_doacross) CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) CHECK_SIMPLE_CLAUSE(OmpxBare, OMPC_ompx_bare) CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) @@ -3469,41 +3501,51 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Device &x) { void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_depend); - llvm::omp::Directive directive{GetContext().directive}; + llvm::omp::Directive dir{GetContext().directive}; unsigned version{context_.langOptions().OpenMPVersion}; - using DepType = parser::OmpTaskDependenceType::Type; - DepType depType = x.v.GetDepType(); + auto *doaDep{std::get_if(&x.v.u)}; + auto *taskDep{std::get_if(&x.v.u)}; + assert(((doaDep == nullptr) != (taskDep == nullptr)) && + "Unexpected alternative in update clause"); - CheckTaskDependenceType(depType); + if (doaDep) { + CheckDoacross(*doaDep); + CheckDependenceType(doaDep->GetDepType()); + } else { + CheckTaskDependenceType(taskDep->GetTaskDepType()); + } - if (directive == llvm::omp::OMPD_depobj) { + if (dir == llvm::omp::OMPD_depobj) { // [5.0:255:11], [5.1:288:3] // A depend clause on a depobj 
construct must not have source, sink [or // depobj](5.0) as dependence-type. if (version >= 50) { - bool invalidDep{depType == DepType::Source || depType == DepType::Sink}; - if (version == 50) { - invalidDep = invalidDep || depType == DepType::Depobj; + bool invalidDep{false}; + if (taskDep) { + if (version == 50) { + invalidDep = taskDep->GetTaskDepType() == + parser::OmpTaskDependenceType::Type::Depobj; + } + } else { + invalidDep = true; } if (invalidDep) { context_.Say(GetContext().clauseSource, - "A DEPEND clause on a DEPOBJ construct must not have SOURCE%s " - "as dependence-type"_err_en_US, - version == 50 ? ", SINK or DEPOBJ" : " or SINK"); + "A DEPEND clause on a DEPOBJ construct must not have %s as dependence type"_err_en_US, + version == 50 ? "SINK, SOURCE or DEPOBJ" : "SINK or SOURCE"); } } - } else if (directive != llvm::omp::OMPD_ordered) { - if (depType == DepType::Source || depType == DepType::Sink) { + } else if (dir != llvm::omp::OMPD_ordered) { + if (doaDep) { context_.Say(GetContext().clauseSource, - "DEPEND(SOURCE) or DEPEND(SINK : vec) can be used only with the " - "ordered directive. Used here in the %s construct."_err_en_US, - parser::ToUpperCaseLetters(getDirectiveName(directive))); + "The SINK and SOURCE dependence types can only be used with the ORDERED directive, used here in the %s construct"_err_en_US, + parser::ToUpperCaseLetters(getDirectiveName(dir))); } } - if (const auto *inOut{std::get_if(&x.v.u)}) { - auto &objList{std::get(inOut->t)}; - if (directive == llvm::omp::OMPD_depobj) { + if (taskDep) { + auto &objList{std::get(taskDep->t)}; + if (dir == llvm::omp::OMPD_depobj) { // [5.0:255:13], [5.1:288:6], [5.2:322:26] // A depend clause on a depobj construct must only specify one locator. 
if (objList.v.size() != 1) { @@ -3530,14 +3572,14 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) { } } } - if (std::get>(inOut->t)) { + if (std::get>(taskDep->t)) { unsigned allowedInVersion{50}; if (version < allowedInVersion) { context_.Say(GetContext().clauseSource, "Iterator modifiers are not supported in %s, %s"_warn_en_US, ThisVersion(version), TryVersion(allowedInVersion)); } else { - if (directive == llvm::omp::OMPD_depobj) { + if (dir == llvm::omp::OMPD_depobj) { context_.Say(GetContext().clauseSource, "An iterator-modifier may specify multiple locators, " "a DEPEND clause on a DEPOBJ construct must only specify " @@ -3548,6 +3590,93 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Depend &x) { } } +void OmpStructureChecker::Enter(const parser::OmpClause::Doacross &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_doacross); + CheckDoacross(x.v.v); +} + +void OmpStructureChecker::CheckDoacross(const parser::OmpDoacross &doa) { + if (std::holds_alternative(doa.u)) { + // Nothing to check here. + return; + } + + // Process SINK dependence type. SINK may only appear in an ORDER construct, + // which references a prior ORDERED(n) clause on a DO or SIMD construct + // that marks the top of the loop nest. + + auto &sink{std::get(doa.u)}; + const std::list &vec{sink.v.v}; + + // Check if the variables in the iteration vector are unique. + struct Less { + bool operator()( + const parser::OmpIteration *a, const parser::OmpIteration *b) const { + auto namea{std::get(a->t)}; + auto nameb{std::get(b->t)}; + assert(namea.symbol && nameb.symbol && "Unresolved symbols"); + // The non-determinism of the "<" doesn't matter, we only care about + // equality, i.e. 
a == b <=> !(a < b) && !(b < a) + return reinterpret_cast(namea.symbol) < + reinterpret_cast(nameb.symbol); + } + }; + if (auto *duplicate{FindDuplicateEntry(vec)}) { + auto name{std::get(duplicate->t)}; + context_.Say(name.source, + "Duplicate variable '%s' in the iteration vector"_err_en_US, + name.ToString()); + } + + // Check if the variables in the iteration vector are induction variables. + // Ignore any mismatch between the size of the iteration vector and the + // number of DO constructs on the stack. This is checked elsewhere. + + auto GetLoopDirective{[](const parser::OpenMPLoopConstruct &x) { + auto &begin{std::get(x.t)}; + return std::get(begin.t).v; + }}; + auto GetLoopClauses{[](const parser::OpenMPLoopConstruct &x) + -> const std::list & { + auto &begin{std::get(x.t)}; + return std::get(begin.t).v; + }}; + + std::set inductionVars; + for (const LoopConstruct &loop : llvm::reverse(loopStack_)) { + if (auto *doc{std::get_if(&loop)}) { + // Do-construct, collect the induction variable. + if (auto &control{(*doc)->GetLoopControl()}) { + if (auto *b{std::get_if(&control->u)}) { + inductionVars.insert(b->name.thing.symbol); + } + } + } else { + // Omp-loop-construct, check if it's do/simd with an ORDERED clause. + auto *loopc{std::get_if(&loop)}; + assert(loopc && "Expecting OpenMPLoopConstruct"); + llvm::omp::Directive loopDir{GetLoopDirective(**loopc)}; + if (loopDir == llvm::omp::OMPD_do || loopDir == llvm::omp::OMPD_simd) { + auto IsOrdered{[](const parser::OmpClause &c) { + return c.Id() == llvm::omp::OMPC_ordered; + }}; + // If it has ORDERED clause, stop the traversal. 
+ if (llvm::any_of(GetLoopClauses(**loopc), IsOrdered)) { + break; + } + } + } + } + for (const parser::OmpIteration &iter : vec) { + auto &name{std::get(iter.t)}; + if (!inductionVars.count(name.symbol)) { + context_.Say(name.source, + "The iteration vector element '%s' is not an induction variable within the ORDERED loop nest"_err_en_US, + name.ToString()); + } + } +} + void OmpStructureChecker::CheckCopyingPolymorphicAllocatable( SymbolSourceMap &symbols, const llvm::omp::Clause clause) { if (context_.ShouldWarn(common::UsageWarning::Portability)) { @@ -3659,29 +3788,36 @@ void OmpStructureChecker::CheckStructureElement( void OmpStructureChecker::Enter(const parser::OmpClause::Update &x) { CheckAllowedClause(llvm::omp::Clause::OMPC_update); - llvm::omp::Directive directive{GetContext().directive}; + llvm::omp::Directive dir{GetContext().directive}; unsigned version{context_.langOptions().OpenMPVersion}; - CheckTaskDependenceType(x.v.v.v); + auto *depType{std::get_if(&x.v.u)}; + auto *taskType{std::get_if(&x.v.u)}; + assert(((depType == nullptr) != (taskType == nullptr)) && + "Unexpected alternative in update clause"); + + if (depType) { + CheckDependenceType(depType->v); + } else if (taskType) { + CheckTaskDependenceType(taskType->v); + } // [5.1:288:4-5] // An update clause on a depobj construct must not have source, sink or depobj // as dependence-type. // [5.2:322:3] // task-dependence-type must not be depobj. 
- if (directive == llvm::omp::OMPD_depobj) { + if (dir == llvm::omp::OMPD_depobj) { if (version >= 51) { - // Update -> OmpUpdateClause -> OmpTaskDependenceType -> Type - switch (x.v.v.v) { - case parser::OmpTaskDependenceType::Type::Source: - case parser::OmpTaskDependenceType::Type::Sink: - case parser::OmpTaskDependenceType::Type::Depobj: + bool invalidDep{false}; + if (taskType) { + invalidDep = taskType->v == parser::OmpTaskDependenceType::Type::Depobj; + } else { + invalidDep = true; + } + if (invalidDep) { context_.Say(GetContext().clauseSource, - "An UPDATE clause on a DEPOBJ construct must not have SOURCE, " - "SINK or DEPOBJ as dependence-type"_err_en_US); - break; - default: - break; + "An UPDATE clause on a DEPOBJ construct must not have SINK, SOURCE or DEPOBJ as dependence type"_err_en_US); } } } @@ -4295,6 +4431,22 @@ void OmpStructureChecker::Enter( CheckAllowedRequiresClause(llvm::omp::Clause::OMPC_unified_shared_memory); } +void OmpStructureChecker::Enter(const parser::DoConstruct &x) { + Base::Enter(x); + loopStack_.push_back(&x); +} + +void OmpStructureChecker::Leave(const parser::DoConstruct &x) { + assert(!loopStack_.empty() && "Expecting non-empty loop stack"); + const LoopConstruct &top = loopStack_.back(); +#ifndef NDEBUG + auto *doc{std::get_if(&top)}; + assert(doc != nullptr && *doc == &x && "Mismatched loop constructs"); +#endif + loopStack_.pop_back(); + Base::Leave(x); +} + void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { CheckAllowedClause(clause); diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index af7231a95e4370..8c13dd20d1e399 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -60,6 +60,9 @@ class OmpStructureChecker : public DirectiveStructureChecker { public: + using Base = DirectiveStructureChecker; + OmpStructureChecker(SemanticsContext &context) : DirectiveStructureChecker(context, #define 
GEN_FLANG_DIRECTIVE_CLAUSE_MAP @@ -131,6 +134,9 @@ class OmpStructureChecker void Enter(const parser::OmpAtomicCapture &); void Leave(const parser::OmpAtomic &); + void Enter(const parser::DoConstruct &); + void Leave(const parser::DoConstruct &); + #define GEN_FLANG_CLAUSE_CHECK_ENTER #include "llvm/Frontend/OpenMP/OMP.inc" @@ -157,13 +163,19 @@ class OmpStructureChecker const parser::OmpScheduleModifierType::ModType &); void CheckAllowedMapTypes(const parser::OmpMapClause::Type &, const std::list &); - template const T *FindDuplicateEntry(const std::list &); llvm::StringRef getClauseName(llvm::omp::Clause clause) override; llvm::StringRef getDirectiveName(llvm::omp::Directive directive) override; + template struct DefaultLess { + bool operator()(const T *a, const T *b) const { return *a < *b; } + }; + template > + const T *FindDuplicateEntry(const std::list &); + void CheckDependList(const parser::DataRef &); void CheckDependArraySection( const common::Indirection &, const parser::Name &); + void CheckDoacross(const parser::OmpDoacross &doa); bool IsDataRefTypeParamInquiry(const parser::DataRef *dataRef); void CheckIsVarPartOfAnotherVar(const parser::CharBlock &source, const parser::OmpObjectList &objList, llvm::StringRef clause = ""); @@ -203,6 +215,7 @@ class OmpStructureChecker void CheckSIMDNest(const parser::OpenMPConstruct &x); void CheckTargetNest(const parser::OpenMPConstruct &x); void CheckTargetUpdate(); + void CheckDependenceType(const parser::OmpDependenceType::Type &x); void CheckTaskDependenceType(const parser::OmpTaskDependenceType::Type &x); void CheckCancellationNest( const parser::CharBlock &source, const parser::OmpCancelType::Type &type); @@ -254,9 +267,13 @@ class OmpStructureChecker int directiveNest_[LastType + 1] = {0}; SymbolSourceMap deferredNonVariables_; + + using LoopConstruct = std::variant; + std::vector loopStack_; }; -template +template const T *OmpStructureChecker::FindDuplicateEntry(const std::list &list) { // Add elements of 
the list to a set. If the insertion fails, return // the address of the failing element. @@ -264,10 +281,7 @@ const T *OmpStructureChecker::FindDuplicateEntry(const std::list &list) { // The objects of type T may not be copyable, so add their addresses // to the set. The set will need to compare the actual objects, so // the custom comparator is provided. - struct less { - bool operator()(const T *a, const T *b) const { return *a < *b; } - }; - std::set uniq; + std::set uniq; for (const T &item : list) { if (!uniq.insert(&item).second) { diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index c70c8a8aecc2f8..ead99821126787 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -3066,11 +3066,17 @@ std::optional ExpressionAnalyzer::AnalyzeChevrons( return false; }}; if (const auto &chevrons{call.chevrons}) { - if (auto expr{Analyze(std::get<0>(chevrons->t))}; - expr && checkLaunchArg(*expr, "grid")) { - result.emplace_back(*expr); + auto &starOrExpr{std::get<0>(chevrons->t)}; + if (starOrExpr.v) { + if (auto expr{Analyze(*starOrExpr.v)}; + expr && checkLaunchArg(*expr, "grid")) { + result.emplace_back(*expr); + } else { + return std::nullopt; + } } else { - return std::nullopt; + result.emplace_back( + AsGenericExpr(evaluate::Constant{-1})); } if (auto expr{Analyze(std::get<1>(chevrons->t))}; expr && checkLaunchArg(*expr, "block")) { diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 20885587074775..83d666283a48c8 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -553,9 +553,17 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { return false; } - void Post(const parser::OmpDependSinkVec &x) { - const auto &name{std::get(x.t)}; - ResolveName(&name); + void Post(const parser::OmpIteration &x) { + if (const auto &name{std::get(x.t)}; !name.symbol) { + auto 
*symbol{currScope().FindSymbol(name.source)}; + if (!symbol) { + // OmpIteration must use an existing object. If there isn't one, + // create a fake one and flag an error later. + symbol = &currScope().MakeSymbol( + name.source, Attrs{}, EntityDetails(/*isDummy=*/true)); + } + Resolve(name, symbol); + } } bool Pre(const parser::OmpClause::UseDevicePtr &x) { diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp index abb7ebb72e5923..88cdf3cf426229 100644 --- a/flang/runtime/CUDA/kernel.cpp +++ b/flang/runtime/CUDA/kernel.cpp @@ -25,6 +25,55 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY, blockDim.x = blockX; blockDim.y = blockY; blockDim.z = blockZ; + unsigned nbNegGridDim{0}; + if (gridX < 0) { + ++nbNegGridDim; + } + if (gridY < 0) { + ++nbNegGridDim; + } + if (gridZ < 0) { + ++nbNegGridDim; + } + if (nbNegGridDim == 1) { + int maxBlocks, nbBlocks, dev, multiProcCount; + cudaError_t err1, err2; + nbBlocks = blockDim.x * blockDim.y * blockDim.z; + cudaGetDevice(&dev); + err1 = cudaDeviceGetAttribute( + &multiProcCount, cudaDevAttrMultiProcessorCount, dev); + err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &maxBlocks, kernel, nbBlocks, smem); + if (err1 == cudaSuccess && err2 == cudaSuccess) { + maxBlocks = multiProcCount * maxBlocks; + } + if (maxBlocks > 0) { + if (gridDim.x > 0) { + maxBlocks = maxBlocks / gridDim.x; + } + if (gridDim.y > 0) { + maxBlocks = maxBlocks / gridDim.y; + } + if (gridDim.z > 0) { + maxBlocks = maxBlocks / gridDim.z; + } + if (maxBlocks < 1) { + maxBlocks = 1; + } + if (gridX < 0) { + gridDim.x = maxBlocks; + } + if (gridY < 0) { + gridDim.y = maxBlocks; + } + if (gridZ < 0) { + gridDim.z = maxBlocks; + } + } + } else if (nbNegGridDim > 1) { + Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Too many invalid grid dimensions"); + } cudaStream_t stream = 0; // TODO stream managment CUDA_REPORT_IF_ERROR( cudaLaunchKernel(kernel, gridDim, 
blockDim, params, smem, stream)); @@ -41,6 +90,55 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, config.blockDim.x = blockX; config.blockDim.y = blockY; config.blockDim.z = blockZ; + unsigned nbNegGridDim{0}; + if (gridX < 0) { + ++nbNegGridDim; + } + if (gridY < 0) { + ++nbNegGridDim; + } + if (gridZ < 0) { + ++nbNegGridDim; + } + if (nbNegGridDim == 1) { + int maxBlocks, nbBlocks, dev, multiProcCount; + cudaError_t err1, err2; + nbBlocks = config.blockDim.x * config.blockDim.y * config.blockDim.z; + cudaGetDevice(&dev); + err1 = cudaDeviceGetAttribute( + &multiProcCount, cudaDevAttrMultiProcessorCount, dev); + err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &maxBlocks, kernel, nbBlocks, smem); + if (err1 == cudaSuccess && err2 == cudaSuccess) { + maxBlocks = multiProcCount * maxBlocks; + } + if (maxBlocks > 0) { + if (config.gridDim.x > 0) { + maxBlocks = maxBlocks / config.gridDim.x; + } + if (config.gridDim.y > 0) { + maxBlocks = maxBlocks / config.gridDim.y; + } + if (config.gridDim.z > 0) { + maxBlocks = maxBlocks / config.gridDim.z; + } + if (maxBlocks < 1) { + maxBlocks = 1; + } + if (gridX < 0) { + config.gridDim.x = maxBlocks; + } + if (gridY < 0) { + config.gridDim.y = maxBlocks; + } + if (gridZ < 0) { + config.gridDim.z = maxBlocks; + } + } + } else if (nbNegGridDim > 1) { + Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Too many invalid grid dimensions"); + } config.dynamicSmemBytes = smem; config.stream = 0; // TODO stream managment cudaLaunchAttribute launchAttr[1]; diff --git a/flang/test/Fir/CUDA/cuda-alloc-free.fir b/flang/test/Fir/CUDA/cuda-alloc-free.fir index 25821418a40f11..88b1a00e4a5b25 100644 --- a/flang/test/Fir/CUDA/cuda-alloc-free.fir +++ b/flang/test/Fir/CUDA/cuda-alloc-free.fir @@ -61,4 +61,16 @@ func.func @_QPsub3(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref< // CHECK: %{{.*}} = fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, 
%{{.*}}) : (i64, i32, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: fir.call @_FortranACUFMemFree +func.func @_QPtest_type() { + %0 = cuf.alloc !fir.type<_QMbarTcmplx{id:i32,c:complex}> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFtest_typeEa"} -> !fir.ref}>> + %1 = fir.declare %0 {data_attr = #cuf.cuda, uniq_name = "_QFtest_typeEa"} : (!fir.ref}>>) -> !fir.ref}>> + cuf.free %1 : !fir.ref}>> {data_attr = #cuf.cuda} + return +} + +// CHECK-LABEL: func.func @_QPtest_type() +// CHECK: %[[BYTES:.*]] = arith.constant 12 : index +// CHECK: %[[CONV_BYTES:.*]] = fir.convert %[[BYTES]] : (index) -> i64 +// CHECK: fir.call @_FortranACUFMemAlloc(%[[CONV_BYTES]], %c0{{.*}}, %{{.*}}, %{{.*}}) : (i64, i32, !fir.ref, i32) -> !fir.llvm_ptr + } // end module diff --git a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf index 82d1a61f8e157c..eb24138638e960 100644 --- a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf +++ b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf @@ -47,7 +47,10 @@ contains ! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>() call dev_kernel1<<<1, 32>>>(a) -! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1) : (!fir.ref) +! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}}) : (!fir.ref) + + call dev_kernel1<<<*, 32>>>(a) +! 
CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}}) end end diff --git a/flang/test/Lower/OpenMP/Todo/ordered.f90 b/flang/test/Lower/OpenMP/Todo/ordered.f90 new file mode 100644 index 00000000000000..2f91e5ed28a1a0 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/ordered.f90 @@ -0,0 +1,20 @@ +!RUN: %not_todo_cmd bbc -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s +!RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -o - %s 2>&1 | FileCheck %s + +!CHECK: not yet implemented: OMPD_ordered +subroutine f00(x) + integer :: a(10) + + do i = 1, 10 + !$omp do ordered(3) + do j = 1, 10 + do k = 1, 10 + do m = 1, 10 + !$omp ordered doacross(sink: m+1, k+0, j-2) + a(i) = i + enddo + enddo + enddo + !$omp end do + enddo +end diff --git a/flang/test/Lower/OpenMP/implicit-dsa.f90 b/flang/test/Lower/OpenMP/implicit-dsa.f90 index 53d6483a7b1b98..a1912a46f9ae7e 100644 --- a/flang/test/Lower/OpenMP/implicit-dsa.f90 +++ b/flang/test/Lower/OpenMP/implicit-dsa.f90 @@ -3,6 +3,103 @@ ! Checks lowering of OpenMP variables with implicitly determined DSAs. +! Privatizers + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST6_Y_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "y" +! CHECK-NOT: } copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST6_X_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK-NOT: } copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST6_Z_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "z" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST6_Y_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "y" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST6_X_FIRSTPRIV:.*]] : !fir.ref +! 
CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST5_X_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST5_X_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK-NOT: } copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST4_Y_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "y" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST4_Z_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "z" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST4_X_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST4_Y_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "y" +! CHECK-NOT: } copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST4_Z_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "z" +! CHECK-NOT: } copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST4_X_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK-NOT: } copy { + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST3_X_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST2_X_FIRSTPRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = firstprivate} @[[TEST1_X_FIRSTPRIV:.*]] : !fir.ref +! 
CHECK: fir.alloca i32 {bindc_name = "x" +! CHECK: } copy { +! CHECK: hlfir.assign + +! CHECK-LABEL: omp.private +! CHECK-SAME: {type = private} @[[TEST1_Y_PRIV:.*]] : !fir.ref +! CHECK: fir.alloca i32 {bindc_name = "y" +! CHECK-NOT: } copy { + ! Basic cases. !CHECK-LABEL: func @_QPimplicit_dsa_test1 !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test1Ex"} @@ -11,17 +108,11 @@ !CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test1Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_test1Ez"} !CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_test1Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.task { -!CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test1Ey"} +!CHECK: omp.task private(@[[TEST1_Y_PRIV]] %[[Y_DECL]]#0 -> %[[PRIV_Y:.*]], @[[TEST1_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X:.*]] : !fir.ref, !fir.ref) { !CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test1Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test1Ex"} !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test1Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref -!CHECK-NOT: fir.alloca !CHECK: } !CHECK: omp.task { -!CHECK-NOT: fir.alloca !CHECK: } subroutine implicit_dsa_test1 integer :: x, y, z @@ -40,11 +131,8 @@ subroutine implicit_dsa_test1 !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFimplicit_dsa_test2Ex"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test2Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: omp.task { 
-!CHECK: omp.task { -!CHECK: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test2Ex"} +!CHECK: omp.task private(@[[TEST2_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X:.*]] : !fir.ref) { !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test2Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: } !CHECK: } subroutine implicit_dsa_test2 @@ -72,11 +160,8 @@ subroutine implicit_dsa_test2 !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[Y_DECL]]#0 : i32, !fir.ref !CHECK: } -!CHECK: omp.task { -!CHECK: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test3Ex"} +!CHECK: omp.task private(@[[TEST3_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X]] : !fir.ref) { !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test3Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 @@ -111,29 +196,17 @@ subroutine implicit_dsa_test3 !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[PRIV_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV_Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.task { -!CHECK-NEXT: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"} +!CHECK: omp.task private(@[[TEST4_X_FIRSTPRIV]] 
%[[PRIV_X_DECL]]#0 -> %[[PRIV2_X:.*]], @[[TEST4_Z_FIRSTPRIV]] %[[PRIV_Z_DECL]]#0 -> %[[PRIV2_Z:.*]] : !fir.ref, !fir.ref) { !CHECK-NEXT: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref -!CHECK-NEXT: %[[PRIV2_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test4Ez"} !CHECK-NEXT: %[[PRIV2_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Z]] {uniq_name = "_QFimplicit_dsa_test4Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[PRIV_Z_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV2_Z_DECL]]#0 : i32, !fir.ref !CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 !CHECK-NEXT: hlfir.assign %[[ZERO]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK-NEXT: hlfir.assign %[[ONE]] to %[[PRIV2_Z_DECL]]#0 : i32, !fir.ref !CHECK: } -!CHECK: omp.task { -!CHECK-NEXT: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test4Ex"} +!CHECK: omp.task private(@[[TEST4_X_FIRSTPRIV]] %[[PRIV_X_DECL]]#0 -> %[[PRIV2_X:.*]], @[[TEST4_Y_FIRSTPRIV]] %[[PRIV_Y_DECL]]#0 -> %[[PRIV2_Y:.*]] : !fir.ref, !fir.ref) { !CHECK-NEXT: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test4Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref -!CHECK-NEXT: %[[PRIV2_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test4Ey"} !CHECK-NEXT: %[[PRIV2_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV2_Y]] {uniq_name = "_QFimplicit_dsa_test4Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[PRIV_Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: 
hlfir.assign %[[TEMP2]] to %[[PRIV2_Y_DECL]]#0 : i32, !fir.ref !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK-NEXT: hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: %[[ZERO:.*]] = arith.constant 0 : i32 @@ -162,11 +235,8 @@ subroutine implicit_dsa_test4 !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: omp.parallel private({{.*}} %{{.*}}#0 -> %[[PRIV_X:.*]] : {{.*}}) { !CHECK: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.task { -!CHECK: %[[PRIV2_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test5Ex"} +!CHECK: omp.task private(@[[TEST5_X_FIRSTPRIV]] %[[PRIV_X_DECL]]#0 -> %[[PRIV2_X:.*]] : !fir.ref) { !CHECK: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test5Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TEMP:.*]] = fir.load %[[PRIV_X_DECL]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref !CHECK: omp.parallel { !CHECK: %[[ONE:.*]] = arith.constant 1 : i32 !CHECK: hlfir.assign %[[ONE]] to %[[PRIV2_X_DECL]]#0 : i32, !fir.ref @@ -193,19 +263,10 @@ subroutine implicit_dsa_test5 !CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFimplicit_dsa_test6Ez"} !CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.task { -!CHECK-NEXT: %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFimplicit_dsa_test6Ex"} +!CHECK: omp.task private(@[[TEST6_X_FIRSTPRIV]] %[[X_DECL]]#0 -> %[[PRIV_X:.*]], @[[TEST6_Y_FIRSTPRIV]] %[[Y_DECL]]#0 -> %[[PRIV_Y:.*]], @[[TEST6_Z_FIRSTPRIV]] %[[Z_DECL]]#0 -> %[[PRIV_Z:.*]] : !fir.ref, !fir.ref, 
!fir.ref) { !CHECK-NEXT: %[[PRIV_X_DECL:.*]]:2 = hlfir.declare %[[PRIV_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP]] to %[[PRIV_X_DECL]]#0 : i32, !fir.ref -!CHECK-NEXT: %[[PRIV_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFimplicit_dsa_test6Ey"} !CHECK-NEXT: %[[PRIV_Y_DECL:.*]]:2 = hlfir.declare %[[PRIV_Y]] {uniq_name = "_QFimplicit_dsa_test6Ey"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP2]] to %[[PRIV_Y_DECL]]#0 : i32, !fir.ref -!CHECK-NEXT: %[[PRIV_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFimplicit_dsa_test6Ez"} !CHECK-NEXT: %[[PRIV_Z_DECL:.*]]:2 = hlfir.declare %[[PRIV_Z]] {uniq_name = "_QFimplicit_dsa_test6Ez"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK-NEXT: %[[TEMP3:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref -!CHECK-NEXT: hlfir.assign %[[TEMP3]] to %[[PRIV_Z_DECL]]#0 : i32, !fir.ref !CHECK: omp.parallel private({{.*}} %{{.*}}#0 -> %[[PRIV2_X:.*]], {{.*}} %{{.*}}#0 -> %[[PRIV2_Y:.*]] : {{.*}}) { !CHECK: %[[PRIV2_X_DECL:.*]]:2 = hlfir.declare %[[PRIV2_X]] {uniq_name = "_QFimplicit_dsa_test6Ex"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK-NOT: hlfir.assign diff --git a/flang/test/Lower/OpenMP/statement-function.f90 b/flang/test/Lower/OpenMP/statement-function.f90 index 56601de2f4f0b8..817ad48c9b562c 100644 --- a/flang/test/Lower/OpenMP/statement-function.f90 +++ b/flang/test/Lower/OpenMP/statement-function.f90 @@ -21,10 +21,8 @@ subroutine test_implicit_use() !CHECK-LABEL: func @_QPtest_implicit_use2 !CHECK: %[[IEXP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiexp"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[IIMP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiimp"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: omp.task +!CHECK: omp.task 
private({{.*firstprivate.*}} %[[IEXP]]#0 -> %[[PRIV_IEXP:[^,]+]] : !fir.ref) { !CHECK: %[[PRIV_IEXP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiexp"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TEMP0:.*]] = fir.load %[[IEXP]]#0 : !fir.ref -!CHECK: hlfir.assign %[[TEMP0]] to %[[PRIV_IEXP]]#0 : i32, !fir.ref !CHECK-NOT: %[[PRIV_IIMP:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtest_implicit_use2Eiimp"} : (!fir.ref) -> (!fir.ref, !fir.ref) subroutine test_implicit_use2() implicit none diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 63a43e750979d5..54189cdef1e815 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -24,7 +24,7 @@ subroutine omp_target_enter_depend !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_enter_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) - !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) private({{.*}}) { !$omp task depend(out: a) call foo(a) !$omp end task @@ -163,7 +163,7 @@ end subroutine omp_target_exit_device subroutine omp_target_exit_depend !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_exit_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) - !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) private({{.*}}) { !$omp task depend(out: a) call foo(a) !$omp end task @@ -183,7 +183,7 @@ subroutine omp_target_update_depend !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_update_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) - !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) 
private({{.*}}) { !$omp task depend(out: a) call foo(a) !$omp end task @@ -363,7 +363,7 @@ subroutine omp_target_depend !CHECK: %[[EXTENT_A:.*]] = arith.constant 1024 : index !CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFomp_target_dependEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) integer :: a(1024) - !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) { + !CHECK: omp.task depend(taskdependout -> %[[A]]#1 : !fir.ref>) private({{.*}}) { !$omp task depend(out: a) call foo(a) !$omp end task diff --git a/flang/test/Lower/OpenMP/task.f90 b/flang/test/Lower/OpenMP/task.f90 index 4f00f261fe57df..7ad260cba84dfd 100644 --- a/flang/test/Lower/OpenMP/task.f90 +++ b/flang/test/Lower/OpenMP/task.f90 @@ -114,7 +114,7 @@ end subroutine task_depend_all_kinds_one_task subroutine task_depend_multi_var() integer :: x integer :: y - !CHECK: omp.task depend(taskdependin -> %{{.*}} : !fir.ref, taskdependin -> %{{.+}} : !fir.ref) { + !CHECK: omp.task depend(taskdependin -> %{{.*}} : !fir.ref, taskdependin -> %{{.+}} : !fir.ref) private({{.*x_firstprivate.*}}, {{.*y_firstprivate.*}}) { !$omp task depend(in :x,y) !CHECK: arith.addi x = x + 12 @@ -165,12 +165,10 @@ subroutine task_private !CHECK: fir.call @_QPbar(%[[INT_VAR]]#1, %[[MYTYPE_VAR]]#1) {{.*}}: (!fir.ref, !fir.ref>) -> () call bar(int_var, mytype_var) - !CHECK: omp.task { + !CHECK: omp.task private(@{{.*int_var_private.*}} %[[INT_VAR]]#0 -> %[[INT_VAR_ARG:.*]], @{{.*mytype_var_private.*}} %[[MYTYPE_VAR]]#0 -> %[[MYTYPE_VAR_ARG:.*]] : !fir.ref, !fir.ref) { !$omp task private(int_var, mytype_var) -!CHECK: %[[INT_PRIVATE_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "int_var", pinned, uniq_name = "_QFtask_privateEint_var"} -!CHECK: %[[INT_VAR_PRIVATE:.+]]:2 = hlfir.declare %[[INT_PRIVATE_ALLOCA]] {uniq_name = "_QFtask_privateEint_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[MYTYPE_PRIVATE_ALLOCA:.+]] = fir.alloca !fir.type<_QFtask_privateTmytype{x:i32}> {bindc_name = 
"mytype_var", pinned, uniq_name = "_QFtask_privateEmytype_var"} -!CHECK: %[[MYTYPE_VAR_PRIVATE:.+]]:2 = hlfir.declare %[[MYTYPE_PRIVATE_ALLOCA]] {uniq_name = "_QFtask_privateEmytype_var"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +!CHECK: %[[INT_VAR_PRIVATE:.+]]:2 = hlfir.declare %[[INT_VAR_ARG]] {uniq_name = "_QFtask_privateEint_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[MYTYPE_VAR_PRIVATE:.+]]:2 = hlfir.declare %[[MYTYPE_VAR_ARG]] {uniq_name = "_QFtask_privateEmytype_var"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) !CHECK: fir.call @_QPbar(%[[INT_VAR_PRIVATE]]#1, %[[MYTYPE_VAR_PRIVATE]]#1) fastmath : (!fir.ref, !fir.ref>) -> () call bar(int_var, mytype_var) !CHECK: omp.terminator @@ -196,15 +194,10 @@ subroutine task_firstprivate !CHECK: fir.call @_QPbaz(%[[INT_VAR]]#1, %[[MYTYPE_VAR]]#1) fastmath : (!fir.ref, !fir.ref>) -> () call baz(int_var, mytype_var) - !CHECK: omp.task { + !CHECK: omp.task private(@{{.*int_var_firstprivate.*}} %[[INT_VAR]]#0 -> %[[INT_VAR_ARG:.*]], @{{.*mytype_var_firstprivate.*}} %[[MYTYPE_VAR]]#0 -> %[[MYTYPE_VAR_ARG:.*]] : !fir.ref, !fir.ref<{{.*}}) { !$omp task firstprivate(int_var, mytype_var) -!CHECK: %[[INT_FIRSTPRIVATE_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "int_var", pinned, uniq_name = "_QFtask_firstprivateEint_var"} -!CHECK: %[[INT_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[INT_FIRSTPRIVATE_ALLOCA]] {uniq_name = "_QFtask_firstprivateEint_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[INT_VAR_LOAD:.+]] = fir.load %[[INT_VAR]]#0 : !fir.ref -!CHECK: hlfir.assign %[[INT_VAR_LOAD]] to %[[INT_VAR_FIRSTPRIVATE]]#0 : i32, !fir.ref -!CHECK: %[[MYTYPE_FIRSTPRIVATE_ALLOCA:.+]] = fir.alloca !fir.type<_QFtask_firstprivateTmytype{x:i32}> {bindc_name = "mytype_var", pinned, uniq_name = "_QFtask_firstprivateEmytype_var"} -!CHECK: %[[MYTYPE_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[MYTYPE_FIRSTPRIVATE_ALLOCA]] {uniq_name = "_QFtask_firstprivateEmytype_var"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) -!CHECK: hlfir.assign 
%[[MYTYPE_VAR]]#0 to %[[MYTYPE_VAR_FIRSTPRIVATE]]#0 : !fir.ref>, !fir.ref> +!CHECK: %[[INT_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[INT_VAR_ARG]] {uniq_name = "_QFtask_firstprivateEint_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[MYTYPE_VAR_FIRSTPRIVATE:.+]]:2 = hlfir.declare %[[MYTYPE_VAR_ARG]] {uniq_name = "_QFtask_firstprivateEmytype_var"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) call baz(int_var, mytype_var) !CHECK: omp.terminator !$omp end task @@ -227,15 +220,11 @@ subroutine task_multiple_clauses() integer :: x, y, z logical :: buzz - !CHECK: omp.task allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref) final(%{{.+}}) if(%{{.+}}) priority(%{{.+}}) { + !CHECK: omp.task allocate(%{{.+}} : i64 -> %{{.+}} : !fir.ref) final(%{{.+}}) if(%{{.+}}) priority(%{{.+}}) private({{.*}}) { !$omp task if(buzz) final(buzz) priority(z) allocate(omp_high_bw_mem_alloc: x) private(x) firstprivate(y) -!CHECK: %[[X_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFtask_multiple_clausesEx"} -!CHECK: %[[X_PRIV:.+]]:2 = hlfir.declare %[[X_PRIV_ALLOCA]] {uniq_name = "_QFtask_multiple_clausesEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[Y_PRIV_ALLOCA:.+]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFtask_multiple_clausesEy"} -!CHECK: %[[Y_PRIV:.+]]:2 = hlfir.declare %[[Y_PRIV_ALLOCA]] {uniq_name = "_QFtask_multiple_clausesEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[Y_LOAD:.+]] = fir.load %[[Y]]#0 : !fir.ref -!CHECK: hlfir.assign %[[Y_LOAD]] to %[[Y_PRIV]]#0 : i32, !fir.ref +!CHECK: %[[X_PRIV:.+]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtask_multiple_clausesEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[Y_PRIV:.+]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFtask_multiple_clausesEy"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: arith.addi x = x + 12 diff --git a/flang/test/Lower/OpenMP/task2.f90 b/flang/test/Lower/OpenMP/task2.f90 index cff9ebdd375b39..734e75c5bba06f 100644 --- a/flang/test/Lower/OpenMP/task2.f90 
+++ b/flang/test/Lower/OpenMP/task2.f90 @@ -1,5 +1,15 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +!CHECK-LABEL: omp.private +!CHECK-SAME: {type = firstprivate} @[[PRIVATIZER:.*]] : !fir.ref>>> alloc { +!CHECK: fir.if +!CHECK: } copy { +!CHECK: fir.if +!CHECK: } dealloc { +!CHECK: fir.if +!CHECK: } + !CHECK-LABEL: func @_QPomp_task_nested_allocatable_firstprivate subroutine omp_task_nested_allocatable_firstprivate integer, allocatable :: a(:) @@ -13,14 +23,11 @@ subroutine omp_task_nested_allocatable_firstprivate !CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) !CHECK: omp.task { !$omp task default(firstprivate) -!CHECK: omp.task { +!CHECK: omp.task private(@[[PRIVATIZER]] %[[A]]#0 -> %[[A_ARG:.*]] : !fir.ref>>>) { !CHECK: %[[PRIV_A:.*]]:2 = hlfir.declare %{{.*}} {fortran_attrs = #fir.var_attrs, !CHECK-SAME: uniq_name = "_QFomp_task_nested_allocatable_firstprivateEa"} : !CHECK-SAME: (!fir.ref>>>) -> !CHECK-SAME: (!fir.ref>>>, !fir.ref>>>) -!CHECK: %[[TEMP:.*]] = fir.load %[[A]]#0 : !fir.ref>>> -!CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_A]]#0 realloc : -!CHECK-SAME: !fir.box>>, !fir.ref>>> !$omp task default(firstprivate) a = 2 !CHECK: } diff --git a/flang/test/Parser/OpenMP/depobj-construct.f90 b/flang/test/Parser/OpenMP/depobj-construct.f90 index 7c474071bc1e67..3de190c95bb734 100644 --- a/flang/test/Parser/OpenMP/depobj-construct.f90 +++ b/flang/test/Parser/OpenMP/depobj-construct.f90 @@ -14,7 +14,7 @@ subroutine f00 !PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPDepobjConstruct !PARSE-TREE: | Verbatim !PARSE-TREE: | OmpObject -> Designator -> DataRef -> Name = 'x' -!PARSE-TREE: | OmpClause -> Depend -> OmpDependClause -> InOut +!PARSE-TREE: | OmpClause -> Depend -> OmpDependClause -> TaskDep !PARSE-TREE: | | OmpTaskDependenceType -> Type = In !PARSE-TREE: | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'y' diff --git a/flang/test/Parser/OpenMP/doacross-clause.f90 
b/flang/test/Parser/OpenMP/doacross-clause.f90 new file mode 100644 index 00000000000000..afd27d9d727e04 --- /dev/null +++ b/flang/test/Parser/OpenMP/doacross-clause.f90 @@ -0,0 +1,90 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=52 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=52 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00(x) + integer :: x(10, 10) + !$omp do ordered(2) + do i = 1, 10 + do j = 1, 10 + !$omp ordered doacross(source) + x(i, j) = i + j + enddo + enddo + !$omp end do +end + +!UNPARSE: SUBROUTINE f00 (x) +!UNPARSE: INTEGER x(10_4,10_4) +!UNPARSE: !$OMP DO ORDERED(2_4) +!UNPARSE: DO i=1_4,10_4 +!UNPARSE: DO j=1_4,10_4 +!UNPARSE: !$OMP ORDERED DOACROSS(SOURCE) +!UNPARSE: x(int(i,kind=8),int(j,kind=8))=i+j +!UNPARSE: END DO +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram +!PARSE-TREE: OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' +!PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' +![...] 
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct +!PARSE-TREE: | OmpSimpleStandaloneDirective -> llvm::omp::Directive = ordered +!PARSE-TREE: | OmpClauseList -> OmpClause -> Doacross -> OmpDoacrossClause -> OmpDoacross -> Source + +subroutine f01(x) + integer :: x(10, 10) + !$omp do ordered(2) + do i = 1, 10 + do j = 1, 10 + !$omp ordered doacross(sink: i+1, j-2), doacross(sink: i, j+3) + x(i, j) = i + j + enddo + enddo + !$omp end do +end + +!UNPARSE: SUBROUTINE f01 (x) +!UNPARSE: INTEGER x(10_4,10_4) +!UNPARSE: !$OMP DO ORDERED(2_4) +!UNPARSE: DO i=1_4,10_4 +!UNPARSE: DO j=1_4,10_4 +!UNPARSE: !$OMP ORDERED DOACROSS(SINK: i+1_4, j-2_4) DOACROSS(SINK: i, j+3_4) +!UNPARSE: x(int(i,kind=8),int(j,kind=8))=i+j +!UNPARSE: END DO +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram +!PARSE-TREE: OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' +!PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' +![...] 
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct +!PARSE-TREE: | OmpSimpleStandaloneDirective -> llvm::omp::Directive = ordered +!PARSE-TREE: | OmpClauseList -> OmpClause -> Doacross -> OmpDoacrossClause -> OmpDoacross -> Sink -> OmpIterationVector -> OmpIteration +!PARSE-TREE: | | Name = 'i' +!PARSE-TREE: | | OmpIterationOffset +!PARSE-TREE: | | | DefinedOperator -> IntrinsicOperator = Add +!PARSE-TREE: | | | Scalar -> Integer -> Constant -> Expr = '1_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | OmpIteration +!PARSE-TREE: | | Name = 'j' +!PARSE-TREE: | | OmpIterationOffset +!PARSE-TREE: | | | DefinedOperator -> IntrinsicOperator = Subtract +!PARSE-TREE: | | | Scalar -> Integer -> Constant -> Expr = '2_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '2' +!PARSE-TREE: | OmpClause -> Doacross -> OmpDoacrossClause -> OmpDoacross -> Sink -> OmpIterationVector -> OmpIteration +!PARSE-TREE: | | Name = 'i' +!PARSE-TREE: | OmpIteration +!PARSE-TREE: | | Name = 'j' +!PARSE-TREE: | | OmpIterationOffset +!PARSE-TREE: | | | DefinedOperator -> IntrinsicOperator = Add +!PARSE-TREE: | | | Scalar -> Integer -> Constant -> Expr = '3_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '3' + diff --git a/flang/test/Parser/OpenMP/ordered-depend.f90 b/flang/test/Parser/OpenMP/ordered-depend.f90 new file mode 100644 index 00000000000000..9e0946af0f09ae --- /dev/null +++ b/flang/test/Parser/OpenMP/ordered-depend.f90 @@ -0,0 +1,90 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=45 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=45 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00(x) + integer :: x(10, 10) + !$omp do ordered(2) + do i = 1, 10 + do j = 1, 10 + !$omp ordered depend(source) + x(i, j) = i + j + enddo + enddo + 
!$omp end do +end + +!UNPARSE: SUBROUTINE f00 (x) +!UNPARSE: INTEGER x(10_4,10_4) +!UNPARSE: !$OMP DO ORDERED(2_4) +!UNPARSE: DO i=1_4,10_4 +!UNPARSE: DO j=1_4,10_4 +!UNPARSE: !$OMP ORDERED DEPEND(SOURCE) +!UNPARSE: x(int(i,kind=8),int(j,kind=8))=i+j +!UNPARSE: END DO +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram +!PARSE-TREE: OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' +!PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' +![...] +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct +!PARSE-TREE: | OmpSimpleStandaloneDirective -> llvm::omp::Directive = ordered +!PARSE-TREE: | OmpClauseList -> OmpClause -> Depend -> OmpDependClause -> OmpDoacross -> Source + +subroutine f01(x) + integer :: x(10, 10) + !$omp do ordered(2) + do i = 1, 10 + do j = 1, 10 + !$omp ordered depend(sink: i+1, j-2), depend(sink: i, j+3) + x(i, j) = i + j + enddo + enddo + !$omp end do +end + +!UNPARSE: SUBROUTINE f01 (x) +!UNPARSE: INTEGER x(10_4,10_4) +!UNPARSE: !$OMP DO ORDERED(2_4) +!UNPARSE: DO i=1_4,10_4 +!UNPARSE: DO j=1_4,10_4 +!UNPARSE: !$OMP ORDERED DEPEND(SINK: i+1_4, j-2_4) DEPEND(SINK: i, j+3_4) +!UNPARSE: x(int(i,kind=8),int(j,kind=8))=i+j +!UNPARSE: END DO +!UNPARSE: END DO +!UNPARSE: !$OMP END DO +!UNPARSE: END SUBROUTINE + +!PARSE-TREE-LABEL: ProgramUnit -> SubroutineSubprogram +!PARSE-TREE: OmpBeginLoopDirective +!PARSE-TREE: | OmpLoopDirective -> llvm::omp::Directive = do +!PARSE-TREE: | OmpClauseList -> OmpClause -> Ordered -> Scalar -> Integer -> Constant -> Expr = '2_4' +!PARSE-TREE: | | LiteralConstant -> IntLiteralConstant = '2' +![...] 
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPStandaloneConstruct -> OpenMPSimpleStandaloneConstruct +!PARSE-TREE: | OmpSimpleStandaloneDirective -> llvm::omp::Directive = ordered +!PARSE-TREE: | OmpClauseList -> OmpClause -> Depend -> OmpDependClause -> OmpDoacross -> Sink -> OmpIterationVector -> OmpIteration +!PARSE-TREE: | | Name = 'i' +!PARSE-TREE: | | OmpIterationOffset +!PARSE-TREE: | | | DefinedOperator -> IntrinsicOperator = Add +!PARSE-TREE: | | | Scalar -> Integer -> Constant -> Expr = '1_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '1' +!PARSE-TREE: | OmpIteration +!PARSE-TREE: | | Name = 'j' +!PARSE-TREE: | | OmpIterationOffset +!PARSE-TREE: | | | DefinedOperator -> IntrinsicOperator = Subtract +!PARSE-TREE: | | | Scalar -> Integer -> Constant -> Expr = '2_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '2' +!PARSE-TREE: | OmpClause -> Depend -> OmpDependClause -> OmpDoacross -> Sink -> OmpIterationVector -> OmpIteration +!PARSE-TREE: | | Name = 'i' +!PARSE-TREE: | OmpIteration +!PARSE-TREE: | | Name = 'j' +!PARSE-TREE: | | OmpIterationOffset +!PARSE-TREE: | | | DefinedOperator -> IntrinsicOperator = Add +!PARSE-TREE: | | | Scalar -> Integer -> Constant -> Expr = '3_4' +!PARSE-TREE: | | | | LiteralConstant -> IntLiteralConstant = '3' + diff --git a/flang/test/Parser/cuf-sanity-common b/flang/test/Parser/cuf-sanity-common index 9341f054d79d46..7005ef07b22650 100644 --- a/flang/test/Parser/cuf-sanity-common +++ b/flang/test/Parser/cuf-sanity-common @@ -40,6 +40,7 @@ module m call globalsub<<<1, 2>>> call globalsub<<<1, 2, 3>>> call globalsub<<<1, 2, 3, 4>>> + call globalsub<<<*,5>>> allocate(pa(32), pinned = isPinned) end subroutine end module diff --git a/flang/test/Parser/cuf-sanity-tree.CUF b/flang/test/Parser/cuf-sanity-tree.CUF index 2820441d5b5f0a..a8b2f93913ca9e 100644 --- a/flang/test/Parser/cuf-sanity-tree.CUF +++ b/flang/test/Parser/cuf-sanity-tree.CUF @@ -166,7 
+166,7 @@ include "cuf-sanity-common" !CHECK: | | | | | Call !CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub' !CHECK: | | | | | Chevrons -!CHECK: | | | | | | Scalar -> Expr = '1_4' +!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4' !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' !CHECK: | | | | | | Scalar -> Expr = '2_4' !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2' @@ -174,7 +174,7 @@ include "cuf-sanity-common" !CHECK: | | | | | Call !CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub' !CHECK: | | | | | Chevrons -!CHECK: | | | | | | Scalar -> Expr = '1_4' +!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4' !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' !CHECK: | | | | | | Scalar -> Expr = '2_4' !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2' @@ -184,7 +184,7 @@ include "cuf-sanity-common" !CHECK: | | | | | Call !CHECK: | | | | | | ProcedureDesignator -> Name = 'globalsub' !CHECK: | | | | | Chevrons -!CHECK: | | | | | | Scalar -> Expr = '1_4' +!CHECK: | | | | | | StarOrExpr -> Scalar -> Expr = '1_4' !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' !CHECK: | | | | | | Scalar -> Expr = '2_4' !CHECK: | | | | | | | LiteralConstant -> IntLiteralConstant = '2' diff --git a/flang/test/Parser/cuf-sanity-unparse.CUF b/flang/test/Parser/cuf-sanity-unparse.CUF index d4be347dd044ea..2e2df9ac6646a8 100644 --- a/flang/test/Parser/cuf-sanity-unparse.CUF +++ b/flang/test/Parser/cuf-sanity-unparse.CUF @@ -43,6 +43,7 @@ include "cuf-sanity-common" !CHECK: CALL globalsub<<<1_4,2_4>>>() !CHECK: CALL globalsub<<<1_4,2_4,3_4>>>() !CHECK: CALL globalsub<<<1_4,2_4,3_4,4_4>>>() +!CHECK: CALL globalsub<<<-1_4,5_4>>>() !CHECK: ALLOCATE(pa(32_4), PINNED=ispinned) !CHECK: END SUBROUTINE !CHECK: END MODULE diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 124f1a02d99fba..406d30b38948ea 100644 --- 
a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -495,7 +495,7 @@ !$omp taskyield !$omp barrier !$omp taskwait - !ERROR: DEPEND(SOURCE) or DEPEND(SINK : vec) can be used only with the ordered directive. Used here in the TASKWAIT construct. + !ERROR: The SINK and SOURCE dependence types can only be used with the ORDERED directive, used here in the TASKWAIT construct !$omp taskwait depend(source) ! !$omp taskwait depend(sink:i-1) ! !$omp target enter data map(to:arrayA) map(alloc:arrayB) diff --git a/flang/test/Semantics/OpenMP/depend06.f90 b/flang/test/Semantics/OpenMP/depend06.f90 index a9668c552f967c..d2e6a114676c3a 100644 --- a/flang/test/Semantics/OpenMP/depend06.f90 +++ b/flang/test/Semantics/OpenMP/depend06.f90 @@ -2,7 +2,7 @@ subroutine f00(x) integer :: x -!WARNING: INOUTSET task-dependence-type is not supported in OpenMP v4.5, try -fopenmp-version=52 +!WARNING: INOUTSET task dependence type is not supported in OpenMP v4.5, try -fopenmp-version=52 !$omp task depend(inoutset: x) x = x + 1 !$omp end task @@ -10,7 +10,7 @@ subroutine f00(x) subroutine f01(x) integer :: x -!WARNING: MUTEXINOUTSET task-dependence-type is not supported in OpenMP v4.5, try -fopenmp-version=50 +!WARNING: MUTEXINOUTSET task dependence type is not supported in OpenMP v4.5, try -fopenmp-version=50 !$omp task depend(mutexinoutset: x) x = x + 1 !$omp end task diff --git a/flang/test/Semantics/OpenMP/depobj-construct-v50.f90 b/flang/test/Semantics/OpenMP/depobj-construct-v50.f90 index e87d86ca54bee7..76661785826b4e 100644 --- a/flang/test/Semantics/OpenMP/depobj-construct-v50.f90 +++ b/flang/test/Semantics/OpenMP/depobj-construct-v50.f90 @@ -2,7 +2,7 @@ subroutine f00 integer :: obj -!ERROR: A DEPEND clause on a DEPOBJ construct must not have SOURCE, SINK or DEPOBJ as dependence-type +!ERROR: A DEPEND clause on a DEPOBJ construct must not have SINK, SOURCE or DEPOBJ as dependence type !$omp depobj(obj) depend(source) end diff 
--git a/flang/test/Semantics/OpenMP/depobj-construct-v51.f90 b/flang/test/Semantics/OpenMP/depobj-construct-v51.f90 index fa0c025a110100..fc403f0b2db220 100644 --- a/flang/test/Semantics/OpenMP/depobj-construct-v51.f90 +++ b/flang/test/Semantics/OpenMP/depobj-construct-v51.f90 @@ -2,12 +2,12 @@ subroutine f04 integer :: obj -!ERROR: An UPDATE clause on a DEPOBJ construct must not have SOURCE, SINK or DEPOBJ as dependence-type +!ERROR: An UPDATE clause on a DEPOBJ construct must not have SINK, SOURCE or DEPOBJ as dependence type !$omp depobj(obj) update(source) end subroutine f05 integer :: obj -!ERROR: An UPDATE clause on a DEPOBJ construct must not have SOURCE, SINK or DEPOBJ as dependence-type +!ERROR: An UPDATE clause on a DEPOBJ construct must not have SINK, SOURCE or DEPOBJ as dependence type !$omp depobj(obj) update(depobj) end diff --git a/flang/test/Semantics/OpenMP/depobj-construct-v52.f90 b/flang/test/Semantics/OpenMP/depobj-construct-v52.f90 index 42a2102500ea75..644090d7f7e8b8 100644 --- a/flang/test/Semantics/OpenMP/depobj-construct-v52.f90 +++ b/flang/test/Semantics/OpenMP/depobj-construct-v52.f90 @@ -2,8 +2,8 @@ subroutine f00 integer :: obj -!WARNING: SOURCE task-dependence-type is deprecated in OpenMP v5.2 -!ERROR: A DEPEND clause on a DEPOBJ construct must not have SOURCE or SINK as dependence-type +!WARNING: SOURCE dependence type is deprecated in OpenMP v5.2 +!ERROR: A DEPEND clause on a DEPOBJ construct must not have SINK or SOURCE as dependence type !$omp depobj(obj) depend(source) end diff --git a/flang/test/Semantics/OpenMP/doacross.f90 b/flang/test/Semantics/OpenMP/doacross.f90 new file mode 100644 index 00000000000000..381a4118ce7bfd --- /dev/null +++ b/flang/test/Semantics/OpenMP/doacross.f90 @@ -0,0 +1,28 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=52 + +subroutine f00(x) + integer :: x(10, 10) + !$omp do ordered(2) + do i = 1, 10 + do j = 1, 10 +!ERROR: Duplicate variable 'i' in the iteration vector + 
!$omp ordered doacross(sink: i+1, i-2) + x(i, j) = 0 + enddo + enddo + !$omp end do +end + +subroutine f01(x) + integer :: x(10, 10) + do i = 1, 10 + !$omp do ordered(1) + do j = 1, 10 +!ERROR: The iteration vector element 'i' is not an induction variable within the ORDERED loop nest + !$omp ordered doacross(sink: i+1) + x(i, j) = 0 + enddo + !$omp end do + enddo +end + diff --git a/flang/test/Semantics/OpenMP/ordered01.f90 b/flang/test/Semantics/OpenMP/ordered01.f90 index 9433120fab10f6..12543acb2916b3 100644 --- a/flang/test/Semantics/OpenMP/ordered01.f90 +++ b/flang/test/Semantics/OpenMP/ordered01.f90 @@ -37,17 +37,16 @@ program main !$omp do ordered(1) do i = 2, N - !ERROR: Only DEPEND(SOURCE) or DEPEND(SINK: vec) are allowed when ORDERED construct is a standalone construct with no ORDERED region - !ERROR: At most one DEPEND(SOURCE) clause can appear on the ORDERED directive + !ERROR: Only SINK or SOURCE dependence types are allowed when ORDERED construct is a standalone construct with no ORDERED region + !ERROR: At most one SOURCE dependence type can appear on the ORDERED directive !$omp ordered depend(source) depend(inout: arrayA) depend(source) arrayA(i) = foo(i) - !ERROR: DEPEND(SOURCE) is not allowed when DEPEND(SINK: vec) is present on ORDERED directive - !ERROR: DEPEND(SOURCE) is not allowed when DEPEND(SINK: vec) is present on ORDERED directive - !ERROR: At most one DEPEND(SOURCE) clause can appear on the ORDERED directive + !ERROR: The SINK and SOURCE dependence types are mutually exclusive + !ERROR: At most one SOURCE dependence type can appear on the ORDERED directive !$omp ordered depend(sink: i - 1) depend(source) depend(source) arrayB(i) = bar(arrayA(i), arrayB(i-1)) - !ERROR: Only DEPEND(SOURCE) or DEPEND(SINK: vec) are allowed when ORDERED construct is a standalone construct with no ORDERED region - !ERROR: Only DEPEND(SOURCE) or DEPEND(SINK: vec) are allowed when ORDERED construct is a standalone construct with no ORDERED region + !ERROR: Only 
SINK or SOURCE dependence types are allowed when ORDERED construct is a standalone construct with no ORDERED region + !ERROR: Only SINK or SOURCE dependence types are allowed when ORDERED construct is a standalone construct with no ORDERED region !$omp ordered depend(out: arrayC) depend(in: arrayB) arrayC(i) = baz(arrayB(i-1)) end do @@ -55,11 +54,11 @@ program main !$omp do ordered(1) do i = 2, N - !ERROR: DEPEND(*) clauses are not allowed when ORDERED construct is a block construct with an ORDERED region + !ERROR: DEPEND clauses are not allowed when ORDERED construct is a block construct with an ORDERED region !$omp ordered depend(source) arrayA(i) = foo(i) !$omp end ordered - !ERROR: DEPEND(*) clauses are not allowed when ORDERED construct is a block construct with an ORDERED region + !ERROR: DEPEND clauses are not allowed when ORDERED construct is a block construct with an ORDERED region !$omp ordered depend(sink: i - 1) arrayB(i) = bar(arrayA(i), arrayB(i-1)) !$omp end ordered @@ -68,12 +67,12 @@ program main contains subroutine work1() - !ERROR: THREADS, SIMD clauses are not allowed when ORDERED construct is a standalone construct with no ORDERED region + !ERROR: THREADS and SIMD clauses are not allowed when ORDERED construct is a standalone construct with no ORDERED region !$omp ordered simd end subroutine work1 subroutine work2() - !ERROR: THREADS, SIMD clauses are not allowed when ORDERED construct is a standalone construct with no ORDERED region + !ERROR: THREADS and SIMD clauses are not allowed when ORDERED construct is a standalone construct with no ORDERED region !$omp ordered threads end subroutine work2 diff --git a/flang/test/Semantics/OpenMP/ordered03.f90 b/flang/test/Semantics/OpenMP/ordered03.f90 index 18f85fc24a9fb4..6a7037e2b750c5 100644 --- a/flang/test/Semantics/OpenMP/ordered03.f90 +++ b/flang/test/Semantics/OpenMP/ordered03.f90 @@ -99,7 +99,8 @@ subroutine sub1() !$omp do ordered(1) do i = 1, N - !ERROR: The number of variables in 
DEPEND(SINK: vec) clause does not match the parameter specified in ORDERED clause + !ERROR: The number of variables in the SINK iteration vector does not match the parameter specified in ORDERED clause + !ERROR: The iteration vector element 'j' is not an induction variable within the ORDERED loop nest !$omp ordered depend(sink: i - 1) depend(sink: i - 1, j) arrayB(i) = bar(i - 1, j) end do @@ -108,7 +109,7 @@ subroutine sub1() !$omp do ordered(2) do i = 1, N do j = 1, N - !ERROR: The number of variables in DEPEND(SINK: vec) clause does not match the parameter specified in ORDERED clause + !ERROR: The number of variables in the SINK iteration vector does not match the parameter specified in ORDERED clause !$omp ordered depend(sink: i - 1) depend(sink: i - 1, j) arrayB(i) = foo(i - 1) + bar(i - 1, j) end do @@ -119,5 +120,6 @@ subroutine sub1() !$omp ordered depend(source) !ERROR: An ORDERED construct with the DEPEND clause must be closely nested in a worksharing-loop (or parallel worksharing-loop) construct with ORDERED clause with a parameter + !ERROR: The iteration vector element 'i' is not an induction variable within the ORDERED loop nest !$omp ordered depend(sink: i - 1) end diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index 43eb8a924aef47..b9c58b843a2409 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -822,8 +822,8 @@ constexpr Float128 BIG_COEFFS[4]{ {Sign::NEG, -128, 0x80000000'00000000'00000000'00000000_u128}, }; -LIBC_INLINE double log1p_accurate(int e_x, int index, - fputil::DoubleDouble m_x) { +[[maybe_unused]] LIBC_INLINE double log1p_accurate(int e_x, int index, + fputil::DoubleDouble m_x) { Float128 e_x_f128(static_cast(e_x)); Float128 sum = fputil::quick_mul(LOG_2, e_x_f128); sum = fputil::quick_add(sum, LOG_R1[index]); @@ -882,7 +882,6 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { constexpr int EXP_BIAS = FPBits_t::EXP_BIAS; constexpr int FRACTION_LEN = 
FPBits_t::FRACTION_LEN; - constexpr uint64_t FRACTION_MASK = FPBits_t::FRACTION_MASK; FPBits_t xbits(x); uint64_t x_u = xbits.uintval(); @@ -954,12 +953,12 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { // |x_dd.lo| < ulp(x_dd.hi) FPBits_t xhi_bits(x_dd.hi); + uint64_t xhi_frac = xhi_bits.get_mantissa(); x_u = xhi_bits.uintval(); // Range reduction: // Find k such that |x_hi - k * 2^-7| <= 2^-8. - int idx = - static_cast(((x_u & FRACTION_MASK) + (1ULL << (FRACTION_LEN - 8))) >> - (FRACTION_LEN - 7)); + int idx = static_cast((xhi_frac + (1ULL << (FRACTION_LEN - 8))) >> + (FRACTION_LEN - 7)); int x_e = xhi_bits.get_exponent() + (idx >> 7); double e_x = static_cast(x_e); @@ -974,17 +973,21 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { constexpr double ERR_HI[2] = {0x1.0p-85, 0.0}; double err_hi = ERR_HI[hi == 0.0]; - // Scaling factior = 2^(-xh_bits.get_exponent()) - uint64_t s_u = (static_cast(EXP_BIAS) << (FRACTION_LEN + 1)) - - (x_u & FPBits_t::EXP_MASK); - // When the exponent of x is 2^1023, its inverse, 2^(-1023), is subnormal. - const double EXPONENT_CORRECTION[2] = {0.0, 0x1.0p-1023}; - double scaling = FPBits_t(s_u).get_val() + EXPONENT_CORRECTION[s_u == 0]; + // Scale x_dd by 2^(-xh_bits.get_exponent()). + int64_t s_u = static_cast(x_u & FPBits_t::EXP_MASK) - + (static_cast(EXP_BIAS) << FRACTION_LEN); // Normalize arguments: // 1 <= m_dd.hi < 2 // |m_dd.lo| < 2^-52. // This is exact. - fputil::DoubleDouble m_dd{scaling * x_dd.lo, scaling * x_dd.hi}; + uint64_t m_hi = FPBits_t::one().uintval() | xhi_frac; + + uint64_t m_lo = + FPBits_t(x_dd.lo).abs().get_val() > x_dd.hi * 0x1.0p-127 + ? 
static_cast(cpp::bit_cast(x_dd.lo) - s_u) + : 0; + + fputil::DoubleDouble m_dd{FPBits_t(m_lo).get_val(), FPBits_t(m_hi).get_val()}; // Perform range reduction: // r * m - 1 = r * (m_dd.hi + m_dd.lo) - 1 diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp index eba65f56df7396..b98c0f26a8bcae 100644 --- a/libc/test/src/math/smoke/log1p_test.cpp +++ b/libc/test/src/math/smoke/log1p_test.cpp @@ -13,8 +13,6 @@ #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" -#include - using LlvmLibcLog1pTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog1pTest, SpecialNumbers) { @@ -26,6 +24,9 @@ TEST_F(LlvmLibcLog1pTest, SpecialNumbers) { EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::log1p(-0.0)); EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, LIBC_NAMESPACE::log1p(-1.0), FE_DIVBYZERO); + + EXPECT_FP_EQ(0x1.62c829bf8fd9dp9, + LIBC_NAMESPACE::log1p(0x1.9b536cac3a09dp1023)); } #ifdef LIBC_TEST_FTZ_DAZ @@ -36,18 +37,24 @@ TEST_F(LlvmLibcLog1pTest, FTZMode) { ModifyMXCSR mxcsr(FTZ); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::log1p(min_denormal)); + EXPECT_FP_EQ(0x1.62c829bf8fd9dp9, + LIBC_NAMESPACE::log1p(0x1.9b536cac3a09dp1023)); } TEST_F(LlvmLibcLog1pTest, DAZMode) { ModifyMXCSR mxcsr(DAZ); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::log1p(min_denormal)); + EXPECT_FP_EQ(0x1.62c829bf8fd9dp9, + LIBC_NAMESPACE::log1p(0x1.9b536cac3a09dp1023)); } TEST_F(LlvmLibcLog1pTest, FTZDAZMode) { ModifyMXCSR mxcsr(FTZ | DAZ); EXPECT_FP_EQ(0.0, LIBC_NAMESPACE::log1p(min_denormal)); + EXPECT_FP_EQ(0x1.62c829bf8fd9dp9, + LIBC_NAMESPACE::log1p(0x1.9b536cac3a09dp1023)); } #endif diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 58d50afc31e43e..86bd89dd4d561e 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -738,12 +738,10 @@ set(files __tuple/tuple_like_no_subrange.h __tuple/tuple_size.h __tuple/tuple_types.h - __type_traits/add_const.h - __type_traits/add_cv.h + __type_traits/add_cv_quals.h 
__type_traits/add_lvalue_reference.h __type_traits/add_pointer.h __type_traits/add_rvalue_reference.h - __type_traits/add_volatile.h __type_traits/aligned_storage.h __type_traits/aligned_union.h __type_traits/alignment_of.h diff --git a/libcxx/include/__iterator/bounded_iter.h b/libcxx/include/__iterator/bounded_iter.h index ae6fbb6b59bcff..d12750d1f81ac7 100644 --- a/libcxx/include/__iterator/bounded_iter.h +++ b/libcxx/include/__iterator/bounded_iter.h @@ -16,9 +16,13 @@ #include <__config> #include <__iterator/iterator_traits.h> #include <__memory/pointer_traits.h> +#include <__type_traits/conjunction.h> +#include <__type_traits/disjunction.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> +#include <__type_traits/is_same.h> +#include <__type_traits/make_const_lvalue_ref.h> #include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -70,7 +74,12 @@ struct __bounded_iter { _LIBCPP_HIDE_FROM_ABI __bounded_iter(__bounded_iter const&) = default; _LIBCPP_HIDE_FROM_ABI __bounded_iter(__bounded_iter&&) = default; - template ::value, int> = 0> + template < class _OtherIterator, + __enable_if_t< + _And< is_convertible, + _Or >, + is_same > > > >::value, + int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __bounded_iter(__bounded_iter<_OtherIterator> const& __other) _NOEXCEPT : __current_(__other.__current_), __begin_(__other.__begin_), diff --git a/libcxx/include/__iterator/static_bounded_iter.h b/libcxx/include/__iterator/static_bounded_iter.h index 9794c220384f55..8f4fbdf6dff961 100644 --- a/libcxx/include/__iterator/static_bounded_iter.h +++ b/libcxx/include/__iterator/static_bounded_iter.h @@ -17,9 +17,13 @@ #include <__cstddef/size_t.h> #include <__iterator/iterator_traits.h> #include <__memory/pointer_traits.h> +#include <__type_traits/conjunction.h> +#include <__type_traits/disjunction.h> #include <__type_traits/enable_if.h> #include 
<__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> +#include <__type_traits/is_same.h> +#include <__type_traits/make_const_lvalue_ref.h> #include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -93,7 +97,12 @@ struct __static_bounded_iter { _LIBCPP_HIDE_FROM_ABI __static_bounded_iter(__static_bounded_iter const&) = default; _LIBCPP_HIDE_FROM_ABI __static_bounded_iter(__static_bounded_iter&&) = default; - template ::value, int> = 0> + template , + _Or >, + is_same > > > >::value, + int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __static_bounded_iter(__static_bounded_iter<_OtherIterator, _Size> const& __other) _NOEXCEPT : __storage_(__other.__storage_.__current(), __other.__storage_.__begin()) {} @@ -264,7 +273,7 @@ struct __static_bounded_iter { private: template friend struct pointer_traits; - template + template friend struct __static_bounded_iter; __static_bounded_iter_storage<_Iterator, _Size> __storage_; diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index 2856833e600798..966c4675b7049a 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -17,9 +17,13 @@ #include <__iterator/iterator_traits.h> #include <__memory/addressof.h> #include <__memory/pointer_traits.h> +#include <__type_traits/conjunction.h> +#include <__type_traits/disjunction.h> #include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_convertible.h> +#include <__type_traits/is_same.h> +#include <__type_traits/make_const_lvalue_ref.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -45,9 +49,14 @@ class __wrap_iter { public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter() _NOEXCEPT : __i_() {} - template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter(const __wrap_iter<_Up>& __u) _NOEXCEPT - : __i_(__u.base()) {} + template 
< + class _OtherIter, + __enable_if_t< _And< is_convertible, + _Or >, + is_same > > > >::value, + int> = 0> + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 __wrap_iter(const __wrap_iter<_OtherIter>& __u) _NOEXCEPT + : __i_(__u.__i_) {} _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 reference operator*() const _NOEXCEPT { return *__i_; } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pointer operator->() const _NOEXCEPT { return std::__to_address(__i_); diff --git a/libcxx/include/__type_traits/add_const.h b/libcxx/include/__type_traits/add_const.h deleted file mode 100644 index 9a6f1c10299f7f..00000000000000 --- a/libcxx/include/__type_traits/add_const.h +++ /dev/null @@ -1,32 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_ADD_CONST_H -#define _LIBCPP___TYPE_TRAITS_ADD_CONST_H - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -template -struct _LIBCPP_TEMPLATE_VIS add_const { - typedef _LIBCPP_NODEBUG const _Tp type; -}; - -#if _LIBCPP_STD_VER >= 14 -template -using add_const_t = typename add_const<_Tp>::type; -#endif - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_ADD_CONST_H diff --git a/libcxx/include/__type_traits/add_cv.h b/libcxx/include/__type_traits/add_cv_quals.h similarity index 66% rename from libcxx/include/__type_traits/add_cv.h rename to libcxx/include/__type_traits/add_cv_quals.h index 9e23e5ceb7a3bd..1d35b89f42c2d1 100644 --- a/libcxx/include/__type_traits/add_cv.h +++ b/libcxx/include/__type_traits/add_cv_quals.h @@ -17,6 +17,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD 
+template +struct _LIBCPP_TEMPLATE_VIS add_const { + typedef _LIBCPP_NODEBUG const _Tp type; +}; + +#if _LIBCPP_STD_VER >= 14 +template +using add_const_t = typename add_const<_Tp>::type; +#endif + template struct _LIBCPP_TEMPLATE_VIS add_cv { typedef _LIBCPP_NODEBUG const volatile _Tp type; @@ -27,6 +37,16 @@ template using add_cv_t = typename add_cv<_Tp>::type; #endif +template +struct _LIBCPP_TEMPLATE_VIS add_volatile { + typedef _LIBCPP_NODEBUG volatile _Tp type; +}; + +#if _LIBCPP_STD_VER >= 14 +template +using add_volatile_t = typename add_volatile<_Tp>::type; +#endif + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___TYPE_TRAITS_ADD_CV_H diff --git a/libcxx/include/__type_traits/add_volatile.h b/libcxx/include/__type_traits/add_volatile.h deleted file mode 100644 index 56b7dfaac026e7..00000000000000 --- a/libcxx/include/__type_traits/add_volatile.h +++ /dev/null @@ -1,32 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_ADD_VOLATILE_H -#define _LIBCPP___TYPE_TRAITS_ADD_VOLATILE_H - -#include <__config> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -template -struct _LIBCPP_TEMPLATE_VIS add_volatile { - typedef _LIBCPP_NODEBUG volatile _Tp type; -}; - -#if _LIBCPP_STD_VER >= 14 -template -using add_volatile_t = typename add_volatile<_Tp>::type; -#endif - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_ADD_VOLATILE_H diff --git a/libcxx/include/__type_traits/is_trivially_assignable.h b/libcxx/include/__type_traits/is_trivially_assignable.h index 201333b0fa0b33..7720c3e637506a 100644 --- a/libcxx/include/__type_traits/is_trivially_assignable.h +++ b/libcxx/include/__type_traits/is_trivially_assignable.h @@ -10,7 +10,6 @@ #define _LIBCPP___TYPE_TRAITS_IS_TRIVIALLY_ASSIGNABLE_H #include <__config> -#include <__type_traits/add_const.h> #include <__type_traits/add_lvalue_reference.h> #include <__type_traits/add_rvalue_reference.h> #include <__type_traits/integral_constant.h> diff --git a/libcxx/include/__utility/as_const.h b/libcxx/include/__utility/as_const.h index 582dd42f407915..0f54b984725c60 100644 --- a/libcxx/include/__utility/as_const.h +++ b/libcxx/include/__utility/as_const.h @@ -10,9 +10,6 @@ #define _LIBCPP___UTILITY_AS_CONST_H #include <__config> -#include <__type_traits/add_const.h> -#include <__utility/forward.h> -#include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -22,7 +19,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 17 template -[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr add_const_t<_Tp>& as_const(_Tp& __t) noexcept { +[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& as_const(_Tp& __t) noexcept { return __t; } diff --git 
a/libcxx/include/any b/libcxx/include/any index e32aa7f8e8a420..719dc2cf999e50 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -85,7 +85,7 @@ namespace std { #include <__memory/allocator_destructor.h> #include <__memory/allocator_traits.h> #include <__memory/unique_ptr.h> -#include <__type_traits/add_const.h> +#include <__type_traits/add_cv_quals.h> #include <__type_traits/add_pointer.h> #include <__type_traits/aligned_storage.h> #include <__type_traits/conditional.h> diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 6b0cc07fca0787..5465d603b2c4d0 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -63,12 +63,10 @@ module std_core [system] { } module type_traits { - module add_const { header "__type_traits/add_const.h" } - module add_cv { header "__type_traits/add_cv.h" } + module add_cv_quals { header "__type_traits/add_cv_quals.h" } module add_lvalue_reference { header "__type_traits/add_lvalue_reference.h" } module add_pointer { header "__type_traits/add_pointer.h" } module add_rvalue_reference { header "__type_traits/add_rvalue_reference.h" } - module add_volatile { header "__type_traits/add_volatile.h" } module aligned_storage { header "__type_traits/aligned_storage.h" } module aligned_union { header "__type_traits/aligned_union.h" } module alignment_of { header "__type_traits/alignment_of.h" } diff --git a/libcxx/include/string b/libcxx/include/string index b1cedbe68f7956..1034be8fdad8b3 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -3393,7 +3393,7 @@ basic_string<_CharT, _Traits, _Allocator>::__shrink_or_extend(size_type __target // The Standard mandates shrink_to_fit() does not increase the capacity. // With equal capacity keep the existing buffer. This avoids extra work // due to swapping the elements. 
- if (__allocation.count - 1 > __target_capacity) { + if (__allocation.count - 1 > capacity()) { __alloc_traits::deallocate(__alloc_, __allocation.ptr, __allocation.count); __annotate_new(__sz); // Undoes the __annotate_delete() return; diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index baeed35ca8508b..cc2b7511d24d3b 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -425,12 +425,10 @@ namespace std */ #include <__config> -#include <__type_traits/add_const.h> -#include <__type_traits/add_cv.h> +#include <__type_traits/add_cv_quals.h> #include <__type_traits/add_lvalue_reference.h> #include <__type_traits/add_pointer.h> #include <__type_traits/add_rvalue_reference.h> -#include <__type_traits/add_volatile.h> #include <__type_traits/aligned_storage.h> #include <__type_traits/aligned_union.h> #include <__type_traits/alignment_of.h> diff --git a/libcxx/include/variant b/libcxx/include/variant index 6e752556a888dd..f604527cd22569 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -226,10 +226,8 @@ namespace std { #include <__memory/construct_at.h> #include <__tuple/find_index.h> #include <__tuple/sfinae_helpers.h> -#include <__type_traits/add_const.h> -#include <__type_traits/add_cv.h> +#include <__type_traits/add_cv_quals.h> #include <__type_traits/add_pointer.h> -#include <__type_traits/add_volatile.h> #include <__type_traits/common_type.h> #include <__type_traits/conditional.h> #include <__type_traits/conjunction.h> diff --git a/libcxx/test/benchmarks/GenerateInput.h b/libcxx/test/benchmarks/GenerateInput.h index cc1694311473ed..0f3e9309271bb1 100644 --- a/libcxx/test/benchmarks/GenerateInput.h +++ b/libcxx/test/benchmarks/GenerateInput.h @@ -45,30 +45,29 @@ inline std::string getRandomString(std::size_t Len) { } template -inline std::vector getDuplicateIntegerInputs(size_t N) { +inline std::vector getDuplicateIntegerInputs(std::size_t N) { std::vector inputs(N, static_cast(-1)); return inputs; } 
template -inline std::vector getSortedIntegerInputs(size_t N) { +inline std::vector getSortedIntegerInputs(std::size_t N) { std::vector inputs; - for (size_t i = 0; i < N; i += 1) + for (std::size_t i = 0; i < N; i += 1) inputs.push_back(i); return inputs; } template -std::vector getSortedLargeIntegerInputs(size_t N) { +std::vector getSortedLargeIntegerInputs(std::size_t N) { std::vector inputs; - for (size_t i = 0; i < N; ++i) { + for (std::size_t i = 0; i < N; ++i) inputs.push_back(i + N); - } return inputs; } template -std::vector getSortedTopBitsIntegerInputs(size_t N) { +std::vector getSortedTopBitsIntegerInputs(std::size_t N) { std::vector inputs = getSortedIntegerInputs(N); for (auto& E : inputs) E <<= ((sizeof(IntT) / 2) * CHAR_BIT); @@ -76,7 +75,7 @@ std::vector getSortedTopBitsIntegerInputs(size_t N) { } template -inline std::vector getReverseSortedIntegerInputs(size_t N) { +inline std::vector getReverseSortedIntegerInputs(std::size_t N) { std::vector inputs; std::size_t i = N; while (i > 0) { @@ -87,61 +86,58 @@ inline std::vector getReverseSortedIntegerInputs(size_t N) { } template -std::vector getPipeOrganIntegerInputs(size_t N) { +std::vector getPipeOrganIntegerInputs(std::size_t N) { std::vector v; v.reserve(N); - for (size_t i = 0; i < N / 2; ++i) + for (std::size_t i = 0; i < N / 2; ++i) v.push_back(i); - for (size_t i = N / 2; i < N; ++i) + for (std::size_t i = N / 2; i < N; ++i) v.push_back(N - i); return v; } template -std::vector getRandomIntegerInputs(size_t N) { +std::vector getRandomIntegerInputs(std::size_t N) { std::vector inputs; - for (size_t i = 0; i < N; ++i) { + for (std::size_t i = 0; i < N; ++i) inputs.push_back(getRandomInteger(0, std::numeric_limits::max())); - } return inputs; } -inline std::vector getDuplicateStringInputs(size_t N) { +inline std::vector getDuplicateStringInputs(std::size_t N) { std::vector inputs(N, getRandomString(1024)); return inputs; } -inline std::vector getRandomStringInputs(size_t N) { +inline std::vector 
getRandomStringInputs(std::size_t N) { std::vector inputs; - for (size_t i = 0; i < N; ++i) { + for (std::size_t i = 0; i < N; ++i) inputs.push_back(getRandomString(1024)); - } return inputs; } -inline std::vector getPrefixedRandomStringInputs(size_t N) { +inline std::vector getPrefixedRandomStringInputs(std::size_t N) { std::vector inputs; constexpr int kSuffixLength = 32; const std::string prefix = getRandomString(1024 - kSuffixLength); - for (size_t i = 0; i < N; ++i) { + for (std::size_t i = 0; i < N; ++i) inputs.push_back(prefix + getRandomString(kSuffixLength)); - } return inputs; } -inline std::vector getSortedStringInputs(size_t N) { +inline std::vector getSortedStringInputs(std::size_t N) { std::vector inputs = getRandomStringInputs(N); std::sort(inputs.begin(), inputs.end()); return inputs; } -inline std::vector getReverseSortedStringInputs(size_t N) { +inline std::vector getReverseSortedStringInputs(std::size_t N) { std::vector inputs = getSortedStringInputs(N); std::reverse(inputs.begin(), inputs.end()); return inputs; } -inline std::vector getRandomCStringInputs(size_t N) { +inline std::vector getRandomCStringInputs(std::size_t N) { static std::vector inputs = getRandomStringInputs(N); std::vector cinputs; for (auto const& str : inputs) diff --git a/libcxx/test/libcxx/iterators/contiguous_iterators.conv.compile.pass.cpp b/libcxx/test/libcxx/iterators/contiguous_iterators.conv.compile.pass.cpp new file mode 100644 index 00000000000000..372559594143ef --- /dev/null +++ b/libcxx/test/libcxx/iterators/contiguous_iterators.conv.compile.pass.cpp @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// + +// + +// __bounded_iter<_Iter> +// __static_bounded_iter<_Iter> +// __wrap_iter<_Iter> + +// Verify that libc++-wrapped iterators do not permit slicing conversion or construction. + +#include +#include +#include +#include + +#include "test_macros.h" + +struct Base {}; +struct Derived : Base {}; + +template ::iterator>::value> +struct test_array_helper : std::true_type { + typedef typename std::array::iterator BaseIter; + typedef typename std::array::iterator DerivedIter; + typedef typename std::array::const_iterator BaseConstIter; + typedef typename std::array::const_iterator DerivedConstIter; + + static_assert(!std::is_convertible::value, ""); + static_assert(!std::is_convertible::value, ""); + static_assert(!std::is_convertible::value, ""); + static_assert(!std::is_constructible::value, ""); + static_assert(!std::is_constructible::value, ""); + static_assert(!std::is_constructible::value, ""); +}; + +template +struct test_array_helper : std::true_type {}; + +static_assert(test_array_helper::value, ""); + +static_assert(!std::is_convertible::iterator, std::vector::iterator>::value, ""); +static_assert(!std::is_convertible::iterator, std::vector::const_iterator>::value, ""); +static_assert(!std::is_convertible::const_iterator, std::vector::const_iterator>::value, ""); +static_assert(!std::is_constructible::iterator, std::vector::iterator>::value, ""); +static_assert(!std::is_constructible::const_iterator, std::vector::iterator>::value, ""); +static_assert(!std::is_constructible::const_iterator, std::vector::const_iterator>::value, + ""); + +#if TEST_STD_VER >= 20 +static_assert(!std::is_convertible_v::iterator, std::span::iterator>); +static_assert(!std::is_convertible_v::iterator, std::span::iterator>); +static_assert(!std::is_convertible_v::iterator, std::span::iterator>); 
+static_assert(!std::is_constructible_v::iterator, std::span::iterator>); +static_assert(!std::is_constructible_v::iterator, std::span::iterator>); +static_assert(!std::is_constructible_v::iterator, std::span::iterator>); +#endif diff --git a/libcxx/test/libcxx/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp new file mode 100644 index 00000000000000..73b70d6f10bd5e --- /dev/null +++ b/libcxx/test/libcxx/strings/basic.string/string.capacity/shrink_to_fit.pass.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// void shrink_to_fit(); // constexpr since C++20 + +// Make sure we use an allocation returned by allocate_at_least if it is smaller than the current allocation +// even if it contains more bytes than we requested + +#include +#include + +template +struct oversizing_allocator { + using value_type = T; + oversizing_allocator() = default; + template + oversizing_allocator(const oversizing_allocator&) noexcept {} + std::allocation_result allocate_at_least(std::size_t n) { + ++n; + return {static_cast(::operator new(n * sizeof(T))), n}; + } + T* allocate(std::size_t n) { return allocate_at_least(n).ptr; } + void deallocate(T* p, std::size_t) noexcept { ::operator delete(static_cast(p)); } +}; + +template +bool operator==(oversizing_allocator, oversizing_allocator) { + return true; +} + +void test_oversizing_allocator() { + std::basic_string, oversizing_allocator> s{ + "String does not fit in the internal buffer and is a bit longer"}; + s = "String does not fit in the 
internal buffer"; + std::size_t capacity = s.capacity(); + std::size_t size = s.size(); + s.shrink_to_fit(); + assert(s.capacity() < capacity); + assert(s.size() == size); +} + +int main(int, char**) { + test_oversizing_allocator(); + + return 0; +} diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 4b2d6e511df1a1..6b5efb34b3f3e7 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1279,7 +1279,8 @@ void BitcodeFile::parseLazy() { } MachineTypes BitcodeFile::getMachineType() const { - switch (Triple(obj->getTargetTriple()).getArch()) { + Triple t(obj->getTargetTriple()); + switch (t.getArch()) { case Triple::x86_64: return AMD64; case Triple::x86: @@ -1288,7 +1289,7 @@ MachineTypes BitcodeFile::getMachineType() const { case Triple::thumb: return ARMNT; case Triple::aarch64: - return ARM64; + return t.isWindowsArm64EC() ? ARM64EC : ARM64; default: return IMAGE_FILE_MACHINE_UNKNOWN; } diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index ed93029721ecc1..fed6b21ddc5168 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2792,6 +2792,8 @@ static void readSecurityNotes(Ctx &ctx) { referenceFileName = (*it)->getName(); } } + bool hasValidPauthAbiCoreInfo = llvm::any_of( + ctx.aarch64PauthAbiCoreInfo, [](uint8_t c) { return c != 0; }); for (ELFFileBase *f : ctx.objectFiles) { uint32_t features = f->andFeatures; @@ -2830,10 +2832,12 @@ static void readSecurityNotes(Ctx &ctx) { "GNU_PROPERTY_X86_FEATURE_1_IBT property"; features |= GNU_PROPERTY_X86_FEATURE_1_IBT; } - if (ctx.arg.zPacPlt && !(features & GNU_PROPERTY_AARCH64_FEATURE_1_PAC)) { + if (ctx.arg.zPacPlt && !(hasValidPauthAbiCoreInfo || + (features & GNU_PROPERTY_AARCH64_FEATURE_1_PAC))) { Warn(ctx) << f << ": -z pac-plt: file does not have " - "GNU_PROPERTY_AARCH64_FEATURE_1_PAC property"; + "GNU_PROPERTY_AARCH64_FEATURE_1_PAC property and no valid " + "PAuth core info present for this link job"; features |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC; } ctx.arg.andFeatures 
&= features; diff --git a/lld/test/COFF/Inputs/loadconfig-arm64ec.s b/lld/test/COFF/Inputs/loadconfig-arm64ec.s index 548634cfcfb4fa..92e052a62771f5 100644 --- a/lld/test/COFF/Inputs/loadconfig-arm64ec.s +++ b/lld/test/COFF/Inputs/loadconfig-arm64ec.s @@ -26,6 +26,7 @@ __guard_dispatch_icall_fptr: .xword 0 __os_arm64x_dispatch_call_no_redirect: .xword 0 + .globl __os_arm64x_dispatch_ret __os_arm64x_dispatch_ret: .xword 0 __os_arm64x_check_call: diff --git a/lld/test/COFF/arm64ec-pdb.test b/lld/test/COFF/arm64ec-pdb.test index 97b77039153c6f..3836d242ccc9e3 100644 --- a/lld/test/COFF/arm64ec-pdb.test +++ b/lld/test/COFF/arm64ec-pdb.test @@ -74,15 +74,15 @@ CHECK-NEXT: pdb file ni: 1 `{{.*}}out.pdb`, src file ni: 0 `` CHECK: Public Symbols CHECK-NEXT: ============================================================ CHECK-NEXT: Records -CHECK-NEXT: 544 | S_PUB32 [size = 28] `x86_64_sym` +CHECK-NEXT: 584 | S_PUB32 [size = 28] `x86_64_sym` CHECK-NEXT: flags = none, addr = 0005:0008 -CHECK-NEXT: 496 | S_PUB32 [size = 28] `arm64ec_sym` +CHECK-NEXT: 536 | S_PUB32 [size = 28] `arm64ec_sym` CHECK-NEXT: flags = none, addr = 0005:0000 CHECK-NEXT: 168 | S_PUB32 [size = 44] `__hybrid_auxiliary_iat_copy` CHECK-NEXT: flags = none, addr = 0002: CHECK-NEXT: 96 | S_PUB32 [size = 32] `__chpe_metadata` CHECK-NEXT: flags = none, addr = 0003:0000 -CHECK-NEXT: 416 | S_PUB32 [size = 48] `__x64_code_ranges_to_entry_points` +CHECK-NEXT: 456 | S_PUB32 [size = 48] `__x64_code_ranges_to_entry_points` CHECK-NEXT: flags = none, addr = 0002: CHECK-NEXT: 0 | S_PUB32 [size = 20] `#func` CHECK-NEXT: flags = function, addr = 0001:0008 @@ -90,9 +90,9 @@ CHECK-NEXT: 244 | S_PUB32 [size = 40] `__icall_helper_arm64ec` CHECK-NEXT: flags = none, addr = 0001:0000 CHECK-NEXT: 64 | S_PUB32 [size = 32] `__auximpcopy_func` CHECK-NEXT: flags = none, addr = 0002: -CHECK-NEXT: 464 | S_PUB32 [size = 32] `_load_config_used` +CHECK-NEXT: 504 | S_PUB32 [size = 32] `_load_config_used` CHECK-NEXT: flags = none, addr = 0002: 
-CHECK-NEXT: 524 | S_PUB32 [size = 20] `func` +CHECK-NEXT: 564 | S_PUB32 [size = 20] `func` CHECK-NEXT: flags = function, addr = 0001:4096 CHECK-NEXT: 128 | S_PUB32 [size = 40] `__hybrid_auxiliary_iat` CHECK-NEXT: flags = none, addr = 0002:8192 @@ -106,6 +106,8 @@ CHECK-NEXT: 212 | S_PUB32 [size = 32] `__hybrid_code_map` CHECK-NEXT: flags = none, addr = 0002: CHECK-NEXT: 20 | S_PUB32 [size = 44] `__arm64x_redirection_metadata` CHECK-NEXT: flags = none, addr = 0004:0000 +CHECK-NEXT: 416 | S_PUB32 [size = 40] `__os_arm64x_dispatch_ret` +CHECK-NEXT: flags = none, addr = 0002: CHECK-NEXT: 316 | S_PUB32 [size = 28] `__imp_func` CHECK-NEXT: flags = none, addr = 0002:8192 diff --git a/lld/test/COFF/lto-arm64ec.ll b/lld/test/COFF/lto-arm64ec.ll new file mode 100644 index 00000000000000..c521449fa39ebe --- /dev/null +++ b/lld/test/COFF/lto-arm64ec.ll @@ -0,0 +1,29 @@ +; REQUIRES: aarch64, x86 + +; RUN: llvm-as %s -o %t.obj +; RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o %t-loadconfig.obj + +; RUN: lld-link -machine:arm64ec %t.obj %t-loadconfig.obj -out:%t.exe -subsystem:console +; RUN: llvm-objdump -d %t.exe | FileCheck %s + +; CHECK: 0000000140001000 <.text>: +; CHECK-NEXT: 140001000: 00000009 udf #0x9 +; CHECK-NEXT: 140001004: 52800020 mov w0, #0x1 // =1 +; CHECK-NEXT: 140001008: d65f03c0 ret + +; CHECK: 0000000140002000 <.hexpthk>: +; CHECK-NEXT: 140002000: 48 8b c4 movq %rsp, %rax +; CHECK-NEXT: 140002003: 48 89 58 20 movq %rbx, 0x20(%rax) +; CHECK-NEXT: 140002007: 55 pushq %rbp +; CHECK-NEXT: 140002008: 5d popq %rbp +; CHECK-NEXT: 140002009: e9 f6 ef ff ff jmp 0x140001004 <.text+0x4> +; CHECK-NEXT: 14000200e: cc int3 +; CHECK-NEXT: 14000200f: cc int3 + +target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "arm64ec-unknown-windows-msvc" + +define dso_local i32 @mainCRTStartup() { +entry: + ret i32 1 +} diff --git a/lld/test/ELF/aarch64-feature-pac.s 
b/lld/test/ELF/aarch64-feature-pac.s index beafe58887db3f..4fd1fd2acea737 100644 --- a/lld/test/ELF/aarch64-feature-pac.s +++ b/lld/test/ELF/aarch64-feature-pac.s @@ -76,12 +76,14 @@ # PACDYN-NOT: 0x0000000070000001 (AARCH64_BTI_PLT) # PACDYN-NOT: 0x0000000070000003 (AARCH64_PAC_PLT) -## Turn on PAC entries with the -z pac-plt command line option. There are no -## warnings in this case as the choice to use PAC in PLT entries is orthogonal -## to the choice of using PAC in relocatable objects. The presence of the PAC -## .note.gnu.property is an indication of preference by the relocatable object. +## Turn on PAC entries with the -z pac-plt command line option. For files w/o +## GNU_PROPERTY_AARCH64_FEATURE_1_PAC set in GNU_PROPERTY_AARCH64_FEATURE_1_AND +## property, emit a warning. + +# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe 2>&1 | FileCheck -DFILE=%t2.o --check-prefix WARN %s + +# WARN: warning: [[FILE]]: -z pac-plt: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_PAC property and no valid PAuth core info present for this link job -# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe # RUN: llvm-readelf -n %tpacplt.exe | FileCheck --check-prefix=PACPROP %s # RUN: llvm-readelf --dynamic-table %tpacplt.exe | FileCheck --check-prefix PACDYN2 %s # RUN: llvm-objdump --no-print-imm-hex -d --mattr=+v8.3a --no-show-raw-insn %tpacplt.exe | FileCheck --check-prefix PACPLT %s diff --git a/lld/test/ELF/aarch64-feature-pauth.s b/lld/test/ELF/aarch64-feature-pauth.s index 699a650d72295a..3150c130d460f5 100644 --- a/lld/test/ELF/aarch64-feature-pauth.s +++ b/lld/test/ELF/aarch64-feature-pauth.s @@ -33,13 +33,53 @@ # RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu no-info.s -o noinfo1.o # RUN: cp noinfo1.o noinfo2.o # RUN: not ld.lld -z pauth-report=error noinfo1.o tag1.o noinfo2.o -o /dev/null 2>&1 | FileCheck --check-prefix ERR5 %s -# RUN: ld.lld -z pauth-report=warning noinfo1.o tag1.o noinfo2.o -o /dev/null 2>&1 | FileCheck --check-prefix WARN %s 
+# RUN: ld.lld -z pauth-report=warning noinfo1.o tag1.o noinfo2.o -o /dev/null 2>&1 | FileCheck --check-prefix WARN1 %s # RUN: ld.lld -z pauth-report=none noinfo1.o tag1.o noinfo2.o --fatal-warnings -o /dev/null # ERR5: error: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one # ERR5-NEXT: error: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one -# WARN: warning: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one -# WARN-NEXT: warning: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one +# WARN1: warning: noinfo1.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one +# WARN1-NEXT: warning: noinfo2.o: -z pauth-report: file does not have AArch64 PAuth core info while 'tag1.o' has one + +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu abi-tag-zero.s -o tag-zero.o +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu %p/Inputs/aarch64-func2.s -o func2.o +# RUN: llvm-mc -filetype=obj -triple=aarch64-linux-gnu %p/Inputs/aarch64-func3.s -o func3.o +# RUN: ld.lld func3.o --shared -o func3.so +# RUN: ld.lld tag1.o func2.o func3.so -z pac-plt --shared -o pacplt-nowarn --fatal-warnings +# RUN: ld.lld tag-zero.o func2.o func3.so -z pac-plt --shared -o pacplt-warn 2>&1 | FileCheck --check-prefix WARN2 %s + +# WARN2: warning: tag-zero.o: -z pac-plt: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_PAC property and no valid PAuth core info present for this link job +# WARN2-NEXT: warning: func2.o: -z pac-plt: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_PAC property and no valid PAuth core info present for this link job + +# RUN: llvm-readelf -d pacplt-nowarn | FileCheck --check-prefix=PACPLTTAG %s +# RUN: llvm-readelf -d pacplt-warn | FileCheck --check-prefix=PACPLTTAG %s + +# PACPLTTAG: 0x0000000070000003 (AARCH64_PAC_PLT) + +# RUN: llvm-objdump -d pacplt-nowarn 
| FileCheck --check-prefix PACPLT -DA=10380 -DB=478 -DC=480 %s +# RUN: llvm-objdump -d pacplt-warn | FileCheck --check-prefix PACPLT -DA=10390 -DB=488 -DC=490 %s + +# PACPLT: Disassembly of section .text: +# PACPLT: : +# PACPLT-NEXT: bl 0x[[A]] +# PACPLT-NEXT: ret +# PACPLT: Disassembly of section .plt: +# PACPLT: <.plt>: +# PACPLT-NEXT: stp x16, x30, [sp, #-0x10]! +# PACPLT-NEXT: adrp x16, 0x30000 +# PACPLT-NEXT: ldr x17, [x16, #0x[[B]]] +# PACPLT-NEXT: add x16, x16, #0x[[B]] +# PACPLT-NEXT: br x17 +# PACPLT-NEXT: nop +# PACPLT-NEXT: nop +# PACPLT-NEXT: nop +# PACPLT: : +# PACPLT-NEXT: adrp x16, 0x30000 +# PACPLT-NEXT: ldr x17, [x16, #0x[[C]]] +# PACPLT-NEXT: add x16, x16, #0x[[C]] +# PACPLT-NEXT: autia1716 +# PACPLT-NEXT: br x17 +# PACPLT-NEXT: nop #--- abi-tag-short.s @@ -106,6 +146,18 @@ .quad 42 // platform .quad 2 // version +#--- abi-tag-zero.s + +.section ".note.gnu.property", "a" +.long 4 +.long 24 +.long 5 +.asciz "GNU" +.long 0xc0000001 +.long 16 +.quad 0 // platform +.quad 0 // version + #--- no-info.s ## define _start to avoid missing entry warning and use --fatal-warnings to assert no diagnostic diff --git a/lld/test/wasm/lto/thinlto-emit-index.ll b/lld/test/wasm/lto/thinlto-emit-index.ll new file mode 100644 index 00000000000000..a0af9492b81d92 --- /dev/null +++ b/lld/test/wasm/lto/thinlto-emit-index.ll @@ -0,0 +1,108 @@ +;; Copied from ELF/lto/thinlto-index-only.ll +;; First ensure that the ThinLTO handling in lld handles +;; bitcode without summary sections gracefully and generates index file. +; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: mkdir d +; RUN: llvm-as %s -o 1.o +; RUN: llvm-as %p/Inputs/thinlto.ll -o d/2.o +; RUN: wasm-ld --thinlto-emit-index-files -shared 1.o d/2.o -o 3 +; RUN: ls d/2.o.thinlto.bc +; RUN: ls 3 +; RUN: wasm-ld -shared 1.o d/2.o -o 3 +; RUN: llvm-nm 3 | FileCheck %s --check-prefix=NM + +;; Basic ThinLTO tests. 
+; RUN: opt -module-summary %s -o 1.o +; RUN: opt -module-summary %p/Inputs/thinlto.ll -o d/2.o +; RUN: opt -module-summary %p/Inputs/thinlto_empty.ll -o 3.o +; RUN: cp 3.o 4.o + +;; Ensure lld generates an index and also a binary if requested. +; RUN: wasm-ld --thinlto-emit-index-files -shared 1.o --start-lib d/2.o 3.o --end-lib 4.o -o 4 +; RUN: ls 4 +; RUN: llvm-bcanalyzer -dump 1.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND1 +; RUN: llvm-bcanalyzer -dump d/2.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND2 +; RUN: llvm-dis < 3.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND3 +; RUN: llvm-dis < 4.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND4 + +; IMPORTS1: d/2.o + +;; Ensure lld generates an index and not a binary if both emit-index and index-only are present. +; RUN: wasm-ld --thinlto-emit-index-files --thinlto-index-only -shared 1.o d/2.o -o 5 +; RUN: not ls 5 + +;; Test that LLD generates an empty index even for lazy object file that is not added to link. +;; Test that LLD also generates empty imports file with the --thinlto-emit-imports-files option. +; RUN: rm -f 1.o.thinlto.bc 1.o.imports +; RUN: wasm-ld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ +; RUN: --thinlto-emit-imports-files -o 7 +; RUN: ls 7 +; RUN: ls 1.o.thinlto.bc +; RUN: ls 1.o.imports + +;; Ensure LLD generates an empty index for each bitcode file even if all bitcode files are lazy. 
+; RUN: rm -f 1.o.thinlto.bc +; RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-linux /dev/null -o dummy.o +; RUN: wasm-ld --thinlto-emit-index-files -shared dummy.o --start-lib 1.o --end-lib -o 8 +; RUN: ls 8 +; RUN: ls 1.o.thinlto.bc + +;; Test that LLD errors out when run with suffix replacement, or prefix replacement +; RUN: not wasm-ld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ +; RUN: --thinlto-prefix-replace="abc;xyz" 2>&1 | FileCheck %s --check-prefix=ERR1 +; ERR1: --thinlto-prefix-replace is not supported with --thinlto-emit-index-files + +; RUN: not wasm-ld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ +; RUN: --thinlto-object-suffix-replace="abc;xyz" 2>&1 | FileCheck %s --check-prefix=ERR2 +; ERR2: --thinlto-object-suffix-replace is not supported with --thinlto-emit-index-files + +;; But not when passed with index only as well +; RUN: wasm-ld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ +; RUN: --thinlto-prefix-replace="abc;xyz" --thinlto-index-only + +; RUN: wasm-ld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ +; RUN: --thinlto-object-suffix-replace="abc;xyz" --thinlto-index-only + +; NM: T f + +;; The backend index for this module contains summaries from itself and +;; Inputs/thinlto.ll, as it imports from the latter. +; BACKEND1: &1 | FileCheck %s --check-prefix=ERR1 +; ERR1: --thinlto-object-suffix-replace= expects 'old;new' format, but got abc:def + +;; If filename does not end with old suffix, no suffix change should occur, +;; so ".thinlto.bc" will simply be appended to the input file name. 
+; RUN: rm -f 1.thinlink.bc.thinlto.bc +; RUN: wasm-ld --thinlto-index-only --thinlto-object-suffix-replace=".abc;.o" -shared 1.thinlink.bc -o /dev/null +; RUN: ls 1.thinlink.bc.thinlto.bc + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-unknown" + +define void @f() { +entry: + ret void +} + +!llvm.dbg.cu = !{} + +!1 = !{i32 2, !"Debug Info Version", i32 3} +!llvm.module.flags = !{!1} diff --git a/lld/test/wasm/lto/thinlto-prefix-replace.ll b/lld/test/wasm/lto/thinlto-prefix-replace.ll new file mode 100644 index 00000000000000..dcb6af35f129e9 --- /dev/null +++ b/lld/test/wasm/lto/thinlto-prefix-replace.ll @@ -0,0 +1,23 @@ +; Copied from ELF/lto/thinlto-prefix-replace.ll +; Check that changing the output path via thinlto-prefix-replace works +; RUN: mkdir -p %t/oldpath +; RUN: opt -module-summary %s -o %t/oldpath/thinlto_prefix_replace.o + +; Ensure that there is no existing file at the new path, so we properly +; test the creation of the new file there. +; RUN: rm -f %t/newpath/thinlto_prefix_replace.o.thinlto.bc +; RUN: wasm-ld --thinlto-index-only --thinlto-prefix-replace="%t/oldpath/;%t/newpath/" -shared %t/oldpath/thinlto_prefix_replace.o -o %t/thinlto_prefix_replace +; RUN: ls %t/newpath/thinlto_prefix_replace.o.thinlto.bc + +; Ensure that lld generates error if prefix replace option does not have 'old;new' format. 
+; RUN: rm -f %t/newpath/thinlto_prefix_replace.o.thinlto.bc +; RUN: not wasm-ld --thinlto-index-only --thinlto-prefix-replace=abc:def -shared %t/oldpath/thinlto_prefix_replace.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR +; ERR: --thinlto-prefix-replace= expects 'old;new' format, but got abc:def + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-unknown" + +define void @f() { +entry: + ret void +} diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index 18966f630e3dc2..eb32ce80f4a3d9 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -110,6 +110,10 @@ struct Configuration { llvm::StringRef thinLTOCacheDir; llvm::StringRef thinLTOJobs; llvm::StringRef thinLTOIndexOnlyArg; + std::pair thinLTOObjectSuffixReplace; + llvm::StringRef thinLTOPrefixReplaceOld; + llvm::StringRef thinLTOPrefixReplaceNew; + llvm::StringRef thinLTOPrefixReplaceNativeObject; llvm::StringRef whyExtract; llvm::StringSet<> allowUndefinedSymbols; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 65d412aa3c9833..43e13c3a5ca22d 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -425,6 +425,33 @@ void LinkerDriver::createFiles(opt::InputArgList &args) { error("no input files"); } +static StringRef getAliasSpelling(opt::Arg *arg) { + if (const opt::Arg *alias = arg->getAlias()) + return alias->getSpelling(); + return arg->getSpelling(); +} + +static std::pair getOldNewOptions(opt::InputArgList &args, + unsigned id) { + auto *arg = args.getLastArg(id); + if (!arg) + return {"", ""}; + + StringRef s = arg->getValue(); + std::pair ret = s.split(';'); + if (ret.second.empty()) + error(getAliasSpelling(arg) + " expects 'old;new' format, but got " + s); + return ret; +} + +// Parse options of the form "old;new[;extra]". 
+static std::tuple +getOldNewOptionsExtra(opt::InputArgList &args, unsigned id) { + auto [oldDir, second] = getOldNewOptions(args, id); + auto [newDir, extraDir] = second.split(';'); + return {oldDir, newDir, extraDir}; +} + static StringRef getEntry(opt::InputArgList &args) { auto *arg = args.getLastArg(OPT_entry, OPT_no_entry); if (!arg) { @@ -577,6 +604,24 @@ static void readConfigs(opt::InputArgList &args) { config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); + config->thinLTOObjectSuffixReplace = + getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq); + std::tie(config->thinLTOPrefixReplaceOld, config->thinLTOPrefixReplaceNew, + config->thinLTOPrefixReplaceNativeObject) = + getOldNewOptionsExtra(args, OPT_thinlto_prefix_replace_eq); + if (config->thinLTOEmitIndexFiles && !config->thinLTOIndexOnly) { + if (args.hasArg(OPT_thinlto_object_suffix_replace_eq)) + error("--thinlto-object-suffix-replace is not supported with " + "--thinlto-emit-index-files"); + else if (args.hasArg(OPT_thinlto_prefix_replace_eq)) + error("--thinlto-prefix-replace is not supported with " + "--thinlto-emit-index-files"); + } + if (!config->thinLTOPrefixReplaceNativeObject.empty() && + config->thinLTOIndexOnlyArg.empty()) { + error("--thinlto-prefix-replace=old_dir;new_dir;obj_dir must be used with " + "--thinlto-index-only="); + } config->unresolvedSymbols = getUnresolvedSymbolPolicy(args); config->whyExtract = args.getLastArgValue(OPT_why_extract); errorHandler().verbose = args.hasArg(OPT_verbose); @@ -721,7 +766,7 @@ static void checkOptions(opt::InputArgList &args) { if (config->pie && config->shared) error("-shared and -pie may not be used together"); - if (config->outputFile.empty()) + if (config->outputFile.empty() && !config->thinLTOIndexOnly) error("no output file specified"); if (config->importTable && config->exportTable) diff --git 
a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 420865e2aea8e3..fd06788457966a 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -46,6 +46,13 @@ std::string toString(const wasm::InputFile *file) { namespace wasm { +std::string replaceThinLTOSuffix(StringRef path) { + auto [suffix, repl] = config->thinLTOObjectSuffixReplace; + if (path.consume_back(suffix)) + return (path + repl).str(); + return std::string(path); +} + void InputFile::checkArch(Triple::ArchType arch) const { bool is64 = arch == Triple::wasm64; if (is64 && !config->is64) { @@ -837,6 +844,8 @@ BitcodeFile::BitcodeFile(MemoryBufferRef m, StringRef archiveName, this->archiveName = std::string(archiveName); std::string path = mb.getBufferIdentifier().str(); + if (config->thinLTOIndexOnly) + path = replaceThinLTOSuffix(mb.getBufferIdentifier()); // ThinLTO assumes that all MemoryBufferRefs given to it have a unique // name. If two archives define two members with the same name, this diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h index c3a667523ee021..1b1de98d2d17a2 100644 --- a/lld/wasm/InputFiles.h +++ b/lld/wasm/InputFiles.h @@ -195,6 +195,8 @@ InputFile *createObjectFile(MemoryBufferRef mb, StringRef archiveName = "", // Opens a given file. 
std::optional readFile(StringRef path); +std::string replaceThinLTOSuffix(StringRef path); + } // namespace wasm std::string toString(const wasm::InputFile *file); diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp index 94f50eae317014..d9fff748bdb657 100644 --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -43,6 +43,11 @@ using namespace llvm; using namespace lld::wasm; using namespace lld; +static std::string getThinLTOOutputFile(StringRef modulePath) { + return lto::getThinLTOOutputFile(modulePath, config->thinLTOPrefixReplaceOld, + config->thinLTOPrefixReplaceNew); +} + static lto::Config createConfig() { lto::Config c; c.Options = initTargetOptionsFromCodeGenFlags(); @@ -84,7 +89,10 @@ BitcodeCompiler::BitcodeCompiler() { auto onIndexWrite = [&](StringRef s) { thinIndices.erase(s); }; if (config->thinLTOIndexOnly) { backend = lto::createWriteIndexesThinBackend( - llvm::hardware_concurrency(config->thinLTOJobs), "", "", "", + llvm::hardware_concurrency(config->thinLTOJobs), + std::string(config->thinLTOPrefixReplaceOld), + std::string(config->thinLTOPrefixReplaceNew), + std::string(config->thinLTOPrefixReplaceNativeObject), config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); } else { backend = lto::createInProcessThinBackend( @@ -158,7 +166,8 @@ static void thinLTOCreateEmptyIndexFiles() { continue; if (linkedBitCodeFiles.contains(f->getName())) continue; - std::string path(f->obj->getName()); + std::string path = + replaceThinLTOSuffix(getThinLTOOutputFile(f->obj->getName())); std::unique_ptr os = openFile(path + ".thinlto.bc"); if (!os) continue; diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 1a17452fbe8a7b..1316dc5c70d936 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -309,6 +309,8 @@ def thinlto_index_only: FF<"thinlto-index-only">; def thinlto_index_only_eq: JJ<"thinlto-index-only=">; def thinlto_jobs: JJ<"thinlto-jobs=">, HelpText<"Number of ThinLTO jobs. 
Default to --threads=">; +def thinlto_object_suffix_replace_eq: JJ<"thinlto-object-suffix-replace=">; +def thinlto_prefix_replace_eq: JJ<"thinlto-prefix-replace=">; def lto_debug_pass_manager: FF<"lto-debug-pass-manager">, HelpText<"Debug new pass manager">; diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 5827e04b5662f3..85ba4fde17418a 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -87,8 +87,7 @@ if (LLDB_ENABLE_PYTHON) endif () if (LLDB_ENABLE_LUA) - find_program(Lua_EXECUTABLE lua5.3) - set(LLDB_LUA_DEFAULT_RELATIVE_PATH "lib/lua/5.3") + set(LLDB_LUA_DEFAULT_RELATIVE_PATH "lib/lua/${LUA_VERSION_MAJOR}.${LUA_VERSION_MINOR}") set(LLDB_LUA_RELATIVE_PATH ${LLDB_LUA_DEFAULT_RELATIVE_PATH} CACHE STRING "Path where Lua modules are installed, relative to install prefix") endif () @@ -138,12 +137,12 @@ endif() if (LLDB_ENABLE_LUA) if(LLDB_BUILD_FRAMEWORK) - set(lldb_lua_target_dir "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Lua") + set(LLDB_LUA_CPATH "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Lua") else() - set(lldb_lua_target_dir "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${LLDB_LUA_RELATIVE_PATH}") + set(LLDB_LUA_CPATH "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/${LLDB_LUA_RELATIVE_PATH}") endif() get_target_property(lldb_lua_bindings_dir swig_wrapper_lua BINARY_DIR) - finish_swig_lua("lldb-lua" "${lldb_lua_bindings_dir}" "${lldb_lua_target_dir}") + finish_swig_lua("lldb-lua" "${lldb_lua_bindings_dir}" "${LLDB_LUA_CPATH}") endif() set(LLDB_INCLUDE_UNITTESTS ON) diff --git a/lldb/cmake/modules/FindLuaAndSwig.cmake b/lldb/cmake/modules/FindLuaAndSwig.cmake index 11548b76f843f0..33fadb2a097407 100644 --- a/lldb/cmake/modules/FindLuaAndSwig.cmake +++ b/lldb/cmake/modules/FindLuaAndSwig.cmake @@ -8,11 +8,21 @@ if(LUA_LIBRARIES AND LUA_INCLUDE_DIR AND LLDB_ENABLE_SWIG) set(LUAANDSWIG_FOUND TRUE) else() if (LLDB_ENABLE_SWIG) - find_package(Lua 5.3 EXACT) + find_package(Lua 5.3) if(LUA_FOUND) + # Find the Lua 
executable. Only required to run a subset of the Lua + # tests. + find_program(LUA_EXECUTABLE + NAMES + "lua" + "lua${LUA_VERSION_MAJOR}.${LUA_VERSION_MINOR}" + ) mark_as_advanced( LUA_LIBRARIES - LUA_INCLUDE_DIR) + LUA_INCLUDE_DIR + LUA_VERSION_MINOR + LUA_VERSION_MAJOR + LUA_EXECUTABLE) endif() else() message(STATUS "SWIG 4 or later is required for Lua support in LLDB but could not be found") @@ -26,5 +36,7 @@ else() REQUIRED_VARS LUA_LIBRARIES LUA_INCLUDE_DIR + LUA_VERSION_MINOR + LUA_VERSION_MAJOR LLDB_ENABLE_SWIG) endif() diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 33b6a6f79def4b..66db84522bff1f 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -64,7 +64,7 @@ CMake configuration error. +-------------------+------------------------------------------------------+--------------------------+ | Python | Python scripting | ``LLDB_ENABLE_PYTHON`` | +-------------------+------------------------------------------------------+--------------------------+ -| Lua | Lua scripting | ``LLDB_ENABLE_LUA`` | +| Lua | Lua scripting. Lua 5.3 and 5.4 are supported. 
| ``LLDB_ENABLE_LUA`` | +-------------------+------------------------------------------------------+--------------------------+ Depending on your platform and package manager, one might run any of the diff --git a/lldb/include/lldb/Interpreter/CommandObject.h b/lldb/include/lldb/Interpreter/CommandObject.h index c5167e5e0ecb6a..e6fea9e022c43a 100644 --- a/lldb/include/lldb/Interpreter/CommandObject.h +++ b/lldb/include/lldb/Interpreter/CommandObject.h @@ -35,8 +35,9 @@ namespace lldb_private { template int AddNamesMatchingPartialString( - const std::map &in_map, llvm::StringRef cmd_str, - StringList &matches, StringList *descriptions = nullptr) { + const std::map> &in_map, + llvm::StringRef cmd_str, StringList &matches, + StringList *descriptions = nullptr) { int number_added = 0; const bool add_all = cmd_str.empty(); @@ -54,7 +55,8 @@ int AddNamesMatchingPartialString( } template -size_t FindLongestCommandWord(std::map &dict) { +size_t +FindLongestCommandWord(std::map> &dict) { auto end = dict.end(); size_t max_len = 0; @@ -107,7 +109,7 @@ class CommandObject : public std::enable_shared_from_this { typedef std::vector CommandArgumentEntry; // Used to build individual command argument lists - typedef std::map CommandMap; + typedef std::map> CommandMap; CommandObject(CommandInterpreter &interpreter, llvm::StringRef name, llvm::StringRef help = "", llvm::StringRef syntax = "", diff --git a/lldb/source/Commands/CommandObjectMultiword.cpp b/lldb/source/Commands/CommandObjectMultiword.cpp index b4cdfea9b1a3ef..c99b75ff29144d 100644 --- a/lldb/source/Commands/CommandObjectMultiword.cpp +++ b/lldb/source/Commands/CommandObjectMultiword.cpp @@ -32,7 +32,7 @@ CommandObjectMultiword::GetSubcommandSPExact(llvm::StringRef sub_cmd) { if (m_subcommand_dict.empty()) return {}; - auto pos = m_subcommand_dict.find(std::string(sub_cmd)); + auto pos = m_subcommand_dict.find(sub_cmd); if (pos == m_subcommand_dict.end()) return {}; @@ -64,7 +64,7 @@ CommandObjectSP 
CommandObjectMultiword::GetSubcommandSP(llvm::StringRef sub_cmd, // function, since I now know I have an exact match... sub_cmd = matches->GetStringAtIndex(0); - pos = m_subcommand_dict.find(std::string(sub_cmd)); + pos = m_subcommand_dict.find(sub_cmd); if (pos != m_subcommand_dict.end()) return_cmd_sp = pos->second; } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 227ed802aa933c..f2712af0a08a73 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -1274,7 +1274,7 @@ CommandObject *CommandInterpreter::GetUserCommandObject( llvm::StringRef cmd, StringList *matches, StringList *descriptions) const { std::string cmd_str(cmd); auto find_exact = [&](const CommandObject::CommandMap &map) { - auto found_elem = map.find(std::string(cmd)); + auto found_elem = map.find(cmd); if (found_elem == map.end()) return (CommandObject *)nullptr; CommandObject *exact_cmd = found_elem->second.get(); @@ -1310,7 +1310,7 @@ CommandObject *CommandInterpreter::GetAliasCommandObject( llvm::StringRef cmd, StringList *matches, StringList *descriptions) const { auto find_exact = [&](const CommandObject::CommandMap &map) -> CommandObject * { - auto found_elem = map.find(cmd.str()); + auto found_elem = map.find(cmd); if (found_elem == map.end()) return (CommandObject *)nullptr; CommandObject *exact_cmd = found_elem->second.get(); @@ -1340,13 +1340,12 @@ CommandObject *CommandInterpreter::GetAliasCommandObject( } bool CommandInterpreter::CommandExists(llvm::StringRef cmd) const { - return m_command_dict.find(std::string(cmd)) != m_command_dict.end(); + return m_command_dict.find(cmd) != m_command_dict.end(); } bool CommandInterpreter::GetAliasFullName(llvm::StringRef cmd, std::string &full_name) const { - bool exact_match = - (m_alias_dict.find(std::string(cmd)) != m_alias_dict.end()); + bool exact_match = (m_alias_dict.find(cmd) != m_alias_dict.end()); if (exact_match) 
{ full_name.assign(std::string(cmd)); return exact_match; @@ -1374,15 +1373,15 @@ bool CommandInterpreter::GetAliasFullName(llvm::StringRef cmd, } bool CommandInterpreter::AliasExists(llvm::StringRef cmd) const { - return m_alias_dict.find(std::string(cmd)) != m_alias_dict.end(); + return m_alias_dict.find(cmd) != m_alias_dict.end(); } bool CommandInterpreter::UserCommandExists(llvm::StringRef cmd) const { - return m_user_dict.find(std::string(cmd)) != m_user_dict.end(); + return m_user_dict.find(cmd) != m_user_dict.end(); } bool CommandInterpreter::UserMultiwordCommandExists(llvm::StringRef cmd) const { - return m_user_mw_dict.find(std::string(cmd)) != m_user_mw_dict.end(); + return m_user_mw_dict.find(cmd) != m_user_mw_dict.end(); } CommandAlias * @@ -1406,7 +1405,7 @@ CommandInterpreter::AddAlias(llvm::StringRef alias_name, } bool CommandInterpreter::RemoveAlias(llvm::StringRef alias_name) { - auto pos = m_alias_dict.find(std::string(alias_name)); + auto pos = m_alias_dict.find(alias_name); if (pos != m_alias_dict.end()) { m_alias_dict.erase(pos); return true; @@ -1415,7 +1414,7 @@ bool CommandInterpreter::RemoveAlias(llvm::StringRef alias_name) { } bool CommandInterpreter::RemoveCommand(llvm::StringRef cmd, bool force) { - auto pos = m_command_dict.find(std::string(cmd)); + auto pos = m_command_dict.find(cmd); if (pos != m_command_dict.end()) { if (force || pos->second->IsRemovable()) { // Only regular expression objects or python commands are removable under @@ -1428,8 +1427,7 @@ bool CommandInterpreter::RemoveCommand(llvm::StringRef cmd, bool force) { } bool CommandInterpreter::RemoveUser(llvm::StringRef user_name) { - CommandObject::CommandMap::iterator pos = - m_user_dict.find(std::string(user_name)); + CommandObject::CommandMap::iterator pos = m_user_dict.find(user_name); if (pos != m_user_dict.end()) { m_user_dict.erase(pos); return true; @@ -1438,8 +1436,7 @@ bool CommandInterpreter::RemoveUser(llvm::StringRef user_name) { } bool 
CommandInterpreter::RemoveUserMultiword(llvm::StringRef multi_name) { - CommandObject::CommandMap::iterator pos = - m_user_mw_dict.find(std::string(multi_name)); + CommandObject::CommandMap::iterator pos = m_user_mw_dict.find(multi_name); if (pos != m_user_mw_dict.end()) { m_user_mw_dict.erase(pos); return true; @@ -2213,7 +2210,7 @@ const CommandAlias * CommandInterpreter::GetAlias(llvm::StringRef alias_name) const { OptionArgVectorSP ret_val; - auto pos = m_alias_dict.find(std::string(alias_name)); + auto pos = m_alias_dict.find(alias_name); if (pos != m_alias_dict.end()) return (CommandAlias *)pos->second.get(); diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp index 31edd8d46c444e..08264d837f9c23 100644 --- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp +++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp @@ -583,7 +583,6 @@ class InstructionLLVMC : public lldb_private::Instruction { lldb::addr_t pc = m_address.GetFileAddress(); m_using_file_addr = true; - const bool data_from_file = disasm->m_data_from_file; bool use_hex_immediates = true; Disassembler::HexImmediateStyle hex_style = Disassembler::eHexStyleC; @@ -593,12 +592,10 @@ class InstructionLLVMC : public lldb_private::Instruction { use_hex_immediates = target->GetUseHexImmediates(); hex_style = target->GetHexImmediateStyle(); - if (!data_from_file) { - const lldb::addr_t load_addr = m_address.GetLoadAddress(target); - if (load_addr != LLDB_INVALID_ADDRESS) { - pc = load_addr; - m_using_file_addr = false; - } + const lldb::addr_t load_addr = m_address.GetLoadAddress(target); + if (load_addr != LLDB_INVALID_ADDRESS) { + pc = load_addr; + m_using_file_addr = false; } } } diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp index 
39969520b74556..54c9f328b5b78b 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp @@ -394,7 +394,7 @@ bool ClassDescriptorV2::relative_list_entry_t::Read(Process *process, lldb::offset_t cursor = 0; uint64_t raw_entry = extractor.GetU64_unchecked(&cursor); m_image_index = raw_entry & 0xFFFF; - m_list_offset = (int64_t)(raw_entry >> 16); + m_list_offset = llvm::SignExtend64<48>(raw_entry >> 16); return true; } diff --git a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py index c79cf19f36aafd..9de7eb2e4a6e3a 100644 --- a/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py +++ b/lldb/test/API/lang/cpp/const_static_integral_member/TestConstStaticIntegralMember.py @@ -146,7 +146,7 @@ def check_inline_static_members(self, flags): @skipIfWindows # On linux this passes due to the manual index @expectedFailureDarwin(debug_info=no_match(["dsym"])) - @expectedFailureAll(debug_info=["dsym"], compiler=["clang"], compiler_version=["<", "19.0"]) + @skipIf(debug_info=["dsym"], compiler=["clang"], compiler_version=["<", "19.0"]) def test_inline_static_members_dwarf5(self): self.check_inline_static_members("-gdwarf-5") @@ -200,7 +200,7 @@ def check_shadowed_static_inline_members(self, flags): @skipIfWindows # On linux this passes due to the manual index @expectedFailureDarwin(debug_info=no_match(["dsym"])) - @expectedFailureAll(debug_info=["dsym"], compiler=["clang"], compiler_version=["<", "19.0"]) + @skipIf(debug_info=["dsym"], compiler=["clang"], compiler_version=["<", "19.0"]) def test_shadowed_static_inline_members_dwarf5(self): self.check_shadowed_static_inline_members("-gdwarf-5") diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 
6ef09f36a1907e..06c685ebc3f5a5 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -334,3 +334,9 @@ def delete_module_cache(path): # Propagate XDG_CACHE_HOME if "XDG_CACHE_HOME" in os.environ: config.environment["XDG_CACHE_HOME"] = os.environ["XDG_CACHE_HOME"] + +# Transfer some environment variables into the tests on Windows build host. +if platform.system() == "Windows": + for v in ["SystemDrive"]: + if v in os.environ: + config.environment[v] = os.environ[v] diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 7dd8ffd2f5cb4c..ecebc447748593 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -20,7 +20,8 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@" config.python_executable = "@Python3_EXECUTABLE@" -config.lua_executable = "@Lua_EXECUTABLE@" +config.lua_executable = "@LUA_EXECUTABLE@" +config.lldb_lua_cpath = "@LLDB_LUA_CPATH@" config.lua_test_entry = "TestLuaAPI.py" config.dotest_common_args_str = lit_config.substitute("@LLDB_TEST_COMMON_ARGS@") config.dotest_user_args_str = lit_config.substitute("@LLDB_TEST_USER_ARGS@") diff --git a/lldb/test/API/lldbtest.py b/lldb/test/API/lldbtest.py index c888fb574f4b7f..d6b79ebc2c4342 100644 --- a/lldb/test/API/lldbtest.py +++ b/lldb/test/API/lldbtest.py @@ -56,8 +56,8 @@ def execute(self, test, litConfig): cmd = [executable] + self.dotest_cmd + [testPath, "-p", testFile] if isLuaTest: - luaExecutable = test.config.lua_executable - cmd.extend(["--env", "LUA_EXECUTABLE=%s" % luaExecutable]) + cmd.extend(["--env", "LUA_EXECUTABLE=%s" % test.config.lua_executable]) + cmd.extend(["--env", "LLDB_LUA_CPATH=%s" % test.config.lldb_lua_cpath]) timeoutInfo = None try: diff --git a/lldb/test/API/lua_api/TestLuaAPI.py b/lldb/test/API/lua_api/TestLuaAPI.py index 065de61ad6577b..4ac795d6964253 100644 --- 
a/lldb/test/API/lua_api/TestLuaAPI.py +++ b/lldb/test/API/lua_api/TestLuaAPI.py @@ -162,15 +162,14 @@ def test_lua_api(self): self.skipTest("Lua API tests could not find Lua executable.") return lua_executable = os.environ["LUA_EXECUTABLE"] + lldb_lua_cpath = os.environ["LLDB_LUA_CPATH"] self.build() test_exe = self.getBuildArtifact("a.out") test_output = self.getBuildArtifact("output") test_input = self.getBuildArtifact("input") - lua_lldb_cpath = "%s/lua/5.3/?.so" % configuration.lldb_libs_dir - - lua_prelude = "package.cpath = '%s;' .. package.cpath" % lua_lldb_cpath + lua_prelude = "package.cpath = '%s/?.so;' .. package.cpath" % lldb_lua_cpath lua_env = { "TEST_EXE": os.path.join(self.getBuildDir(), test_exe), diff --git a/lldb/test/API/python_api/process/cancel_attach/TestCancelAttach.py b/lldb/test/API/python_api/process/cancel_attach/TestCancelAttach.py index 9f643d50e58fc0..3be0a85d595002 100644 --- a/lldb/test/API/python_api/process/cancel_attach/TestCancelAttach.py +++ b/lldb/test/API/python_api/process/cancel_attach/TestCancelAttach.py @@ -14,6 +14,11 @@ class AttachCancelTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True + @skipIf( + remote=True, + hostoslist=["windows"], + bugnumber="https://github.com/llvm/llvm-project/issues/115618", + ) def test_scripted_implementation(self): """Test that cancelling a stuck "attach waitfor" works.""" # First make an empty target for the attach: diff --git a/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s b/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s index 191b55c545d762..685d0a84ec2896 100644 --- a/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s +++ b/lldb/test/Shell/Commands/command-disassemble-aarch64-extensions.s @@ -55,7 +55,7 @@ fn: mrs x2, ssbs // AEK_SSBS abs z31.h, p7/m, z31.h // AEK_SVE sqdmlslbt z0.d, z1.s, z31.s // AEK_SVE2 - aesd z0.b, z0.b, z31.b // AEK_SVEAES + aesd z0.b, z0.b, z31.b // AEK_SVE2AES bdep z0.b, z1.b, z31.b // AEK_SVE2BITPERM rax1 
z0.d, z0.d, z0.d // AEK_SVE2SHA3 sm4e z0.s, z0.s, z0.s // AEK_SVE2SM4 diff --git a/lldb/test/Shell/Commands/command-disassemble-process.yaml b/lldb/test/Shell/Commands/command-disassemble-process.yaml index 75be1a42fb196d..ce1b37bc8aea7a 100644 --- a/lldb/test/Shell/Commands/command-disassemble-process.yaml +++ b/lldb/test/Shell/Commands/command-disassemble-process.yaml @@ -20,7 +20,7 @@ # CHECK: (lldb) disassemble # CHECK-NEXT: command-disassemble-process.exe`main: -# CHECK-NEXT: 0x4002 <+0>: addb %al, (%rcx) +# CHECK-NEXT: 0x4002 <+0>: jmp 0x4004 ; <+2> # CHECK-NEXT: -> 0x4004 <+2>: addb %al, (%rdx) # CHECK-NEXT: 0x4006 <+4>: addb %al, (%rbx) # CHECK-NEXT: 0x4008 <+6>: addb %al, (%rsi) @@ -32,7 +32,7 @@ # CHECK-NEXT: 0x400a: addb %al, (%rdi) # CHECK-NEXT: (lldb) disassemble --frame # CHECK-NEXT: command-disassemble-process.exe`main: -# CHECK-NEXT: 0x4002 <+0>: addb %al, (%rcx) +# CHECK-NEXT: 0x4002 <+0>: jmp 0x4004 ; <+2> # CHECK-NEXT: -> 0x4004 <+2>: addb %al, (%rdx) # CHECK-NEXT: 0x4006 <+4>: addb %al, (%rbx) # CHECK-NEXT: 0x4008 <+6>: addb %al, (%rsi) @@ -44,13 +44,13 @@ # CHECK-NEXT: 0x400a: addb %al, (%rdi) # CHECK-NEXT: (lldb) disassemble --address 0x4004 # CHECK-NEXT: command-disassemble-process.exe`main: -# CHECK-NEXT: 0x4002 <+0>: addb %al, (%rcx) +# CHECK-NEXT: 0x4002 <+0>: jmp 0x4004 ; <+2> # CHECK-NEXT: -> 0x4004 <+2>: addb %al, (%rdx) # CHECK-NEXT: 0x4006 <+4>: addb %al, (%rbx) # CHECK-NEXT: 0x4008 <+6>: addb %al, (%rsi) # CHECK-NEXT: (lldb) disassemble --count 7 # CHECK-NEXT: command-disassemble-process.exe`main: -# CHECK-NEXT: 0x4002 <+0>: addb %al, (%rcx) +# CHECK-NEXT: 0x4002 <+0>: jmp 0x4004 ; <+2> # CHECK-NEXT: -> 0x4004 <+2>: addb %al, (%rdx) # CHECK-NEXT: 0x4006 <+4>: addb %al, (%rbx) # CHECK-NEXT: 0x4008 <+6>: addb %al, (%rsi) @@ -81,32 +81,32 @@ Sections: - Name: .text Type: SHT_PROGBITS Flags: [ SHF_ALLOC, SHF_EXECINSTR ] - Address: 0x0000000000004000 + Address: 0x0000000000000000 AddressAlign: 0x0000000000001000 - Content: 
00000001000200030006000700080009000A000B000E000F00100011001200130016001700180019001A001B001E001F00200021002200230026002700280029002A002B002E002F + Content: 0000EB00000200030006000700080009000A000B000E000F00100011001200130016001700180019001A001B001E001F00200021002200230026002700280029002A002B002E002F Size: 0x10000 - Name: .note.gnu.build-id Type: SHT_NOTE Flags: [ SHF_ALLOC ] - Address: 0x0000000000005000 + Address: 0x0000000000001000 AddressAlign: 0x0000000000001000 Content: 040000000800000003000000474E5500DEADBEEFBAADF00D Symbols: - Name: main Type: STT_FUNC Section: .text - Value: 0x0000000000004002 + Value: 0x0000000000000002 Size: [[MAIN_SIZE]] ProgramHeaders: - Type: PT_LOAD Flags: [ PF_X, PF_R ] - VAddr: 0x4000 + VAddr: 0x0000 Align: 0x1000 FirstSec: .text LastSec: .text - Type: PT_LOAD Flags: [ PF_W, PF_R ] - VAddr: 0x5000 + VAddr: 0x1000 Align: 0x1000 FirstSec: .note.gnu.build-id LastSec: .note.gnu.build-id diff --git a/lldb/test/Shell/Commands/command-disassemble.s b/lldb/test/Shell/Commands/command-disassemble.s index 10ce8354025ac5..1625f80468eb17 100644 --- a/lldb/test/Shell/Commands/command-disassemble.s +++ b/lldb/test/Shell/Commands/command-disassemble.s @@ -15,7 +15,7 @@ # CHECK-NEXT: error: Cannot disassemble around the current PC without a selected frame: no currently running process. # CHECK-NEXT: (lldb) disassemble --start-address 0x0 # CHECK-NEXT: command-disassemble.s.tmp`foo: -# CHECK-NEXT: command-disassemble.s.tmp[0x0] <+0>: int $0x10 +# CHECK-NEXT: command-disassemble.s.tmp[0x0] <+0>: jmp 0x2 ; <+2> # CHECK-NEXT: command-disassemble.s.tmp[0x2] <+2>: int $0x11 # CHECK-NEXT: command-disassemble.s.tmp[0x4] <+4>: int $0x12 # CHECK-NEXT: command-disassemble.s.tmp[0x6] <+6>: int $0x13 @@ -41,7 +41,7 @@ # CHECK-NEXT: error: End address before start address. 
# CHECK-NEXT: (lldb) disassemble --address 0x0 # CHECK-NEXT: command-disassemble.s.tmp`foo: -# CHECK-NEXT: command-disassemble.s.tmp[0x0] <+0>: int $0x10 +# CHECK-NEXT: command-disassemble.s.tmp[0x0] <+0>: jmp 0x2 ; <+2> # CHECK-NEXT: command-disassemble.s.tmp[0x2] <+2>: int $0x11 # CHECK-NEXT: command-disassemble.s.tmp[0x4] <+4>: int $0x12 # CHECK-NEXT: command-disassemble.s.tmp[0x6] <+6>: int $0x13 @@ -63,7 +63,7 @@ # CHECK: command-disassemble.s.tmp[0x203e] <+8190>: int $0x2a # CHECK-NEXT: (lldb) disassemble --start-address 0x0 --count 7 # CHECK-NEXT: command-disassemble.s.tmp`foo: -# CHECK-NEXT: command-disassemble.s.tmp[0x0] <+0>: int $0x10 +# CHECK-NEXT: command-disassemble.s.tmp[0x0] <+0>: jmp 0x2 ; <+2> # CHECK-NEXT: command-disassemble.s.tmp[0x2] <+2>: int $0x11 # CHECK-NEXT: command-disassemble.s.tmp[0x4] <+4>: int $0x12 # CHECK-NEXT: command-disassemble.s.tmp[0x6] <+6>: int $0x13 @@ -101,8 +101,8 @@ .text foo: - int $0x10 - int $0x11 + jmp 1f +1: int $0x11 int $0x12 int $0x13 int $0x14 diff --git a/lldb/tools/lldb-dap/Breakpoint.cpp b/lldb/tools/lldb-dap/Breakpoint.cpp index 9ea7a42ca85a1e..b3bfa61595a82c 100644 --- a/lldb/tools/lldb-dap/Breakpoint.cpp +++ b/lldb/tools/lldb-dap/Breakpoint.cpp @@ -7,10 +7,15 @@ //===----------------------------------------------------------------------===// #include "Breakpoint.h" -#include "DAP.h" #include "JSONUtils.h" +#include "lldb/API/SBAddress.h" #include "lldb/API/SBBreakpointLocation.h" +#include "lldb/API/SBLineEntry.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/JSON.h" +#include +#include +#include using namespace lldb_dap; @@ -51,7 +56,7 @@ void Breakpoint::CreateJsonObject(llvm::json::Object &object) { if (bp_addr.IsValid()) { std::string formatted_addr = - "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(g_dap.target)); + "0x" + llvm::utohexstr(bp_addr.GetLoadAddress(bp.GetTarget())); object.try_emplace("instructionReference", formatted_addr); auto line_entry = bp_addr.GetLineEntry(); const auto 
line = line_entry.GetLine(); diff --git a/lldb/tools/lldb-dap/Breakpoint.h b/lldb/tools/lldb-dap/Breakpoint.h index ee9d3736d6190f..a726f27e59ee00 100644 --- a/lldb/tools/lldb-dap/Breakpoint.h +++ b/lldb/tools/lldb-dap/Breakpoint.h @@ -10,6 +10,7 @@ #define LLDB_TOOLS_LLDB_DAP_BREAKPOINT_H #include "BreakpointBase.h" +#include "DAPForward.h" #include "lldb/API/SBBreakpoint.h" namespace lldb_dap { @@ -18,9 +19,8 @@ struct Breakpoint : public BreakpointBase { // The LLDB breakpoint associated wit this source breakpoint lldb::SBBreakpoint bp; - Breakpoint() = default; - Breakpoint(const llvm::json::Object &obj) : BreakpointBase(obj){}; - Breakpoint(lldb::SBBreakpoint bp) : bp(bp) {} + Breakpoint(DAP &d, const llvm::json::Object &obj) : BreakpointBase(d, obj) {} + Breakpoint(DAP &d, lldb::SBBreakpoint bp) : BreakpointBase(d), bp(bp) {} void SetCondition() override; void SetHitCondition() override; diff --git a/lldb/tools/lldb-dap/BreakpointBase.cpp b/lldb/tools/lldb-dap/BreakpointBase.cpp index f3cb06a3562d48..1e28c29082a9fc 100644 --- a/lldb/tools/lldb-dap/BreakpointBase.cpp +++ b/lldb/tools/lldb-dap/BreakpointBase.cpp @@ -8,11 +8,12 @@ #include "BreakpointBase.h" #include "JSONUtils.h" +#include "llvm/ADT/StringRef.h" using namespace lldb_dap; -BreakpointBase::BreakpointBase(const llvm::json::Object &obj) - : condition(std::string(GetString(obj, "condition"))), +BreakpointBase::BreakpointBase(DAP &d, const llvm::json::Object &obj) + : dap(d), condition(std::string(GetString(obj, "condition"))), hitCondition(std::string(GetString(obj, "hitCondition"))) {} void BreakpointBase::UpdateBreakpoint(const BreakpointBase &request_bp) { diff --git a/lldb/tools/lldb-dap/BreakpointBase.h b/lldb/tools/lldb-dap/BreakpointBase.h index 79301480e0e588..3c248dd1736d07 100644 --- a/lldb/tools/lldb-dap/BreakpointBase.h +++ b/lldb/tools/lldb-dap/BreakpointBase.h @@ -9,12 +9,14 @@ #ifndef LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H #define LLDB_TOOLS_LLDB_DAP_BREAKPOINTBASE_H -#include 
"llvm/Support/JSON.h" +#include "DAPForward.h" #include namespace lldb_dap { struct BreakpointBase { + // Associated DAP session. + DAP &dap; // An optional expression for conditional breakpoints. std::string condition; @@ -22,8 +24,8 @@ struct BreakpointBase { // ignored. The backend is expected to interpret the expression as needed std::string hitCondition; - BreakpointBase() = default; - BreakpointBase(const llvm::json::Object &obj); + explicit BreakpointBase(DAP &d) : dap(d) {} + BreakpointBase(DAP &d, const llvm::json::Object &obj); virtual ~BreakpointBase() = default; virtual void SetCondition() = 0; diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 283392270ba26c..e45f9bf359e5bf 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -74,21 +74,21 @@ void DAP::PopulateExceptionBreakpoints() { exception_breakpoints = std::vector{}; if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeC_plus_plus)) { - exception_breakpoints->emplace_back("cpp_catch", "C++ Catch", + exception_breakpoints->emplace_back(*this, "cpp_catch", "C++ Catch", lldb::eLanguageTypeC_plus_plus); - exception_breakpoints->emplace_back("cpp_throw", "C++ Throw", + exception_breakpoints->emplace_back(*this, "cpp_throw", "C++ Throw", lldb::eLanguageTypeC_plus_plus); } if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeObjC)) { - exception_breakpoints->emplace_back("objc_catch", "Objective-C Catch", - lldb::eLanguageTypeObjC); - exception_breakpoints->emplace_back("objc_throw", "Objective-C Throw", - lldb::eLanguageTypeObjC); + exception_breakpoints->emplace_back( + *this, "objc_catch", "Objective-C Catch", lldb::eLanguageTypeObjC); + exception_breakpoints->emplace_back( + *this, "objc_throw", "Objective-C Throw", lldb::eLanguageTypeObjC); } if (lldb::SBDebugger::SupportsLanguage(lldb::eLanguageTypeSwift)) { - exception_breakpoints->emplace_back("swift_catch", "Swift Catch", + exception_breakpoints->emplace_back(*this, "swift_catch", 
"Swift Catch", lldb::eLanguageTypeSwift); - exception_breakpoints->emplace_back("swift_throw", "Swift Throw", + exception_breakpoints->emplace_back(*this, "swift_throw", "Swift Throw", lldb::eLanguageTypeSwift); } // Besides handling the hardcoded list of languages from above, we try to @@ -119,7 +119,7 @@ void DAP::PopulateExceptionBreakpoints() { raw_throw_keyword ? raw_throw_keyword : "throw"; exception_breakpoints->emplace_back( - raw_lang_name + "_" + throw_keyword, + *this, raw_lang_name + "_" + throw_keyword, capitalized_lang_name + " " + capitalize(throw_keyword), lang); } @@ -130,7 +130,7 @@ void DAP::PopulateExceptionBreakpoints() { raw_catch_keyword ? raw_catch_keyword : "catch"; exception_breakpoints->emplace_back( - raw_lang_name + "_" + catch_keyword, + *this, raw_lang_name + "_" + catch_keyword, capitalized_lang_name + " " + capitalize(catch_keyword), lang); } } @@ -692,7 +692,7 @@ bool DAP::HandleObject(const llvm::json::Object &object) { const auto packet_type = GetString(object, "type"); if (packet_type == "request") { const auto command = GetString(object, "command"); - auto handler_pos = request_handlers.find(std::string(command)); + auto handler_pos = request_handlers.find(command); if (handler_pos != request_handlers.end()) { handler_pos->second(object); return true; // Success @@ -1060,7 +1060,7 @@ void DAP::SetThreadFormat(llvm::StringRef format) { InstructionBreakpoint * DAP::GetInstructionBreakpoint(const lldb::break_id_t bp_id) { for (auto &bp : instruction_breakpoints) { - if (bp.second.id == bp_id) + if (bp.second.bp.GetID() == bp_id) return &bp.second; } return nullptr; diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h index dab4ce44ab202c..1641a58c7dbd06 100644 --- a/lldb/tools/lldb-dap/DAP.h +++ b/lldb/tools/lldb-dap/DAP.h @@ -171,7 +171,7 @@ struct DAP { // the old process here so we can detect this case and keep running. 
lldb::pid_t restarting_process_id; bool configuration_done_sent; - std::map request_handlers; + std::map> request_handlers; bool waiting_for_run_in_terminal; ProgressEventReporter progress_event_reporter; // Keep track of the last stop thread index IDs as threads won't go away diff --git a/lldb/tools/lldb-dap/DAPForward.h b/lldb/tools/lldb-dap/DAPForward.h index 159d999a63c820..0196d83dcd6a91 100644 --- a/lldb/tools/lldb-dap/DAPForward.h +++ b/lldb/tools/lldb-dap/DAPForward.h @@ -9,6 +9,8 @@ #ifndef LLDB_TOOLS_LLDB_DAP_DAPFORWARD_H #define LLDB_TOOLS_LLDB_DAP_DAPFORWARD_H +// IWYU pragma: begin_exports + namespace lldb_dap { struct BreakpointBase; struct ExceptionBreakpoint; @@ -16,6 +18,7 @@ struct FunctionBreakpoint; struct SourceBreakpoint; struct Watchpoint; struct InstructionBreakpoint; +struct DAP; } // namespace lldb_dap namespace lldb { @@ -35,6 +38,7 @@ class SBLanguageRuntime; class SBLaunchInfo; class SBLineEntry; class SBListener; +class SBModule; class SBProcess; class SBStream; class SBStringList; @@ -44,4 +48,12 @@ class SBValue; class SBWatchpoint; } // namespace lldb +namespace llvm { +namespace json { +class Object; +} // namespace json +} // namespace llvm + +// IWYU pragma: end_exports + #endif diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp index 130c237e65441d..0fb865c19e574f 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.cpp @@ -9,6 +9,7 @@ #include "ExceptionBreakpoint.h" #include "BreakpointBase.h" #include "DAP.h" +#include "lldb/API/SBTarget.h" namespace lldb_dap { @@ -17,8 +18,8 @@ void ExceptionBreakpoint::SetBreakpoint() { return; bool catch_value = filter.find("_catch") != std::string::npos; bool throw_value = filter.find("_throw") != std::string::npos; - bp = g_dap.target.BreakpointCreateForException(language, catch_value, - throw_value); + bp = dap.target.BreakpointCreateForException(language, catch_value, + throw_value); 
// See comments in BreakpointBase::GetBreakpointLabel() for details of why // we add a label to our breakpoints. bp.AddName(BreakpointBase::GetBreakpointLabel()); @@ -27,7 +28,7 @@ void ExceptionBreakpoint::SetBreakpoint() { void ExceptionBreakpoint::ClearBreakpoint() { if (!bp.IsValid()) return; - g_dap.target.BreakpointDelete(bp.GetID()); + dap.target.BreakpointDelete(bp.GetID()); bp = lldb::SBBreakpoint(); } diff --git a/lldb/tools/lldb-dap/ExceptionBreakpoint.h b/lldb/tools/lldb-dap/ExceptionBreakpoint.h index 7b81d845cb26be..b83c5ef7773525 100644 --- a/lldb/tools/lldb-dap/ExceptionBreakpoint.h +++ b/lldb/tools/lldb-dap/ExceptionBreakpoint.h @@ -9,21 +9,25 @@ #ifndef LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H #define LLDB_TOOLS_LLDB_DAP_EXCEPTIONBREAKPOINT_H -#include - +#include "DAPForward.h" #include "lldb/API/SBBreakpoint.h" +#include "lldb/lldb-enumerations.h" +#include +#include namespace lldb_dap { struct ExceptionBreakpoint { + DAP &dap; std::string filter; std::string label; lldb::LanguageType language; - bool default_value; + bool default_value = false; lldb::SBBreakpoint bp; - ExceptionBreakpoint(std::string f, std::string l, lldb::LanguageType lang) - : filter(std::move(f)), label(std::move(l)), language(lang), - default_value(false), bp() {} + ExceptionBreakpoint(DAP &d, std::string f, std::string l, + lldb::LanguageType lang) + : dap(d), filter(std::move(f)), label(std::move(l)), language(lang), + bp() {} void SetBreakpoint(); void ClearBreakpoint(); diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp index 216c685f633da8..f266d751833c79 100644 --- a/lldb/tools/lldb-dap/FunctionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/FunctionBreakpoint.cpp @@ -12,13 +12,13 @@ namespace lldb_dap { -FunctionBreakpoint::FunctionBreakpoint(const llvm::json::Object &obj) - : Breakpoint(obj), functionName(std::string(GetString(obj, "name"))) {} +FunctionBreakpoint::FunctionBreakpoint(DAP &d, const llvm::json::Object &obj) 
+ : Breakpoint(d, obj), functionName(std::string(GetString(obj, "name"))) {} void FunctionBreakpoint::SetBreakpoint() { if (functionName.empty()) return; - bp = g_dap.target.BreakpointCreateByName(functionName.c_str()); + bp = dap.target.BreakpointCreateByName(functionName.c_str()); Breakpoint::SetBreakpoint(); } diff --git a/lldb/tools/lldb-dap/FunctionBreakpoint.h b/lldb/tools/lldb-dap/FunctionBreakpoint.h index b15ff1931a6b22..93f0b93b35291d 100644 --- a/lldb/tools/lldb-dap/FunctionBreakpoint.h +++ b/lldb/tools/lldb-dap/FunctionBreakpoint.h @@ -10,14 +10,14 @@ #define LLDB_TOOLS_LLDB_DAP_FUNCTIONBREAKPOINT_H #include "Breakpoint.h" +#include "DAPForward.h" namespace lldb_dap { struct FunctionBreakpoint : public Breakpoint { std::string functionName; - FunctionBreakpoint() = default; - FunctionBreakpoint(const llvm::json::Object &obj); + FunctionBreakpoint(DAP &dap, const llvm::json::Object &obj); // Set this breakpoint in LLDB as a new breakpoint void SetBreakpoint(); diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp index e3a8460bb7b301..37daa8f0bdd5f2 100644 --- a/lldb/tools/lldb-dap/InstructionBreakpoint.cpp +++ b/lldb/tools/lldb-dap/InstructionBreakpoint.cpp @@ -10,20 +10,25 @@ #include "InstructionBreakpoint.h" #include "DAP.h" #include "JSONUtils.h" +#include "lldb/API/SBBreakpoint.h" +#include "lldb/API/SBTarget.h" +#include "llvm/ADT/StringRef.h" namespace lldb_dap { // Instruction Breakpoint -InstructionBreakpoint::InstructionBreakpoint(const llvm::json::Object &obj) - : Breakpoint(obj), instructionAddressReference(LLDB_INVALID_ADDRESS), id(0), +InstructionBreakpoint::InstructionBreakpoint(DAP &d, + const llvm::json::Object &obj) + : Breakpoint(d, obj), instructionAddressReference(LLDB_INVALID_ADDRESS), offset(GetSigned(obj, "offset", 0)) { GetString(obj, "instructionReference") .getAsInteger(0, instructionAddressReference); instructionAddressReference += offset; } -void 
InstructionBreakpoint::SetInstructionBreakpoint() { - bp = g_dap.target.BreakpointCreateByAddress(instructionAddressReference); - id = bp.GetID(); +void InstructionBreakpoint::SetBreakpoint() { + bp = dap.target.BreakpointCreateByAddress(instructionAddressReference); + Breakpoint::SetBreakpoint(); } + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/InstructionBreakpoint.h b/lldb/tools/lldb-dap/InstructionBreakpoint.h index 53912af46ca148..b2e66a9db9e200 100644 --- a/lldb/tools/lldb-dap/InstructionBreakpoint.h +++ b/lldb/tools/lldb-dap/InstructionBreakpoint.h @@ -11,6 +11,9 @@ #define LLDB_TOOLS_LLDB_DAP_INSTRUCTIONBREAKPOINT_H #include "Breakpoint.h" +#include "DAPForward.h" +#include "lldb/lldb-types.h" +#include namespace lldb_dap { @@ -18,16 +21,12 @@ namespace lldb_dap { struct InstructionBreakpoint : public Breakpoint { lldb::addr_t instructionAddressReference; - int32_t id; int32_t offset; - InstructionBreakpoint() - : Breakpoint(), instructionAddressReference(LLDB_INVALID_ADDRESS), id(0), - offset(0) {} - InstructionBreakpoint(const llvm::json::Object &obj); + InstructionBreakpoint(DAP &d, const llvm::json::Object &obj); // Set instruction breakpoint in LLDB as a new breakpoint - void SetInstructionBreakpoint(); + void SetBreakpoint(); }; } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 97fe6b4f9f05db..a7300abae0eac8 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -6,27 +6,53 @@ // //===----------------------------------------------------------------------===// -#include -#include -#include -#include - -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/ScopedPrinter.h" +#include "JSONUtils.h" +#include "BreakpointBase.h" +#include "DAP.h" +#include "ExceptionBreakpoint.h" +#include "LLDBUtils.h" +#include "lldb/API/SBAddress.h" +#include "lldb/API/SBCompileUnit.h" 
#include "lldb/API/SBDeclaration.h" +#include "lldb/API/SBEnvironment.h" +#include "lldb/API/SBError.h" +#include "lldb/API/SBFileSpec.h" +#include "lldb/API/SBFrame.h" +#include "lldb/API/SBFunction.h" +#include "lldb/API/SBLineEntry.h" +#include "lldb/API/SBModule.h" +#include "lldb/API/SBQueue.h" +#include "lldb/API/SBSection.h" #include "lldb/API/SBStream.h" #include "lldb/API/SBStringList.h" #include "lldb/API/SBStructuredData.h" +#include "lldb/API/SBTarget.h" +#include "lldb/API/SBThread.h" +#include "lldb/API/SBType.h" #include "lldb/API/SBValue.h" -#include "lldb/Host/PosixApi.h" - -#include "DAP.h" -#include "ExceptionBreakpoint.h" -#include "JSONUtils.h" -#include "LLDBUtils.h" +#include "lldb/Host/PosixApi.h" // IWYU pragma: keep +#include "lldb/lldb-defines.h" +#include "lldb/lldb-enumerations.h" +#include "lldb/lldb-types.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace lldb_dap { @@ -831,70 +857,6 @@ llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread) { {"presentationHint", "label"}}); } -// Response to `setInstructionBreakpoints` request. -// "Breakpoint": { -// "type": "object", -// "description": "Response to `setInstructionBreakpoints` request.", -// "properties": { -// "id": { -// "type": "number", -// "description": "The identifier for the breakpoint. It is needed if -// breakpoint events are used to update or remove breakpoints." -// }, -// "verified": { -// "type": "boolean", -// "description": "If true, the breakpoint could be set (but not -// necessarily at the desired location." 
-// }, -// "message": { -// "type": "string", -// "description": "A message about the state of the breakpoint. -// This is shown to the user and can be used to explain why a breakpoint -// could not be verified." -// }, -// "source": { -// "type": "Source", -// "description": "The source where the breakpoint is located." -// }, -// "line": { -// "type": "number", -// "description": "The start line of the actual range covered by the -// breakpoint." -// }, -// "column": { -// "type": "number", -// "description": "The start column of the actual range covered by the -// breakpoint." -// }, -// "endLine": { -// "type": "number", -// "description": "The end line of the actual range covered by the -// breakpoint." -// }, -// "endColumn": { -// "type": "number", -// "description": "The end column of the actual range covered by the -// breakpoint. If no end line is given, then the end column is assumed to -// be in the start line." -// }, -// "instructionReference": { -// "type": "string", -// "description": "A memory reference to where the breakpoint is set." -// }, -// "offset": { -// "type": "number", -// "description": "The offset from the instruction reference. -// This can be negative." 
-// }, -// }, -// "required": [ "id", "verified", "line"] -// } -llvm::json::Value CreateInstructionBreakpoint(BreakpointBase *ibp) { - llvm::json::Object object; - ibp->CreateJsonObject(object); - return llvm::json::Value(std::move(object)); -} - // "Thread": { // "type": "object", // "description": "A Thread", @@ -1523,7 +1485,7 @@ void FilterAndGetValueForKey(const lldb::SBStructuredData data, const char *key, llvm::json::Object &out) { lldb::SBStructuredData value = data.GetValueForKey(key); std::string key_utf8 = llvm::json::fixUTF8(key); - if (strcmp(key, "modules") == 0) + if (llvm::StringRef(key) == "modules") return; switch (value.GetType()) { case lldb::eStructuredDataTypeFloat: diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 54fc4323475723..2e13459c45556f 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -9,14 +9,21 @@ #ifndef LLDB_TOOLS_LLDB_DAP_JSONUTILS_H #define LLDB_TOOLS_LLDB_DAP_JSONUTILS_H -#include "BreakpointBase.h" #include "DAPForward.h" -#include "lldb/API/SBModule.h" +#include "lldb/API/SBCompileUnit.h" +#include "lldb/API/SBFileSpec.h" +#include "lldb/API/SBLineEntry.h" +#include "lldb/API/SBType.h" +#include "lldb/API/SBValue.h" +#include "lldb/lldb-types.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/JSON.h" #include #include +#include #include +#include +#include namespace lldb_dap { @@ -380,17 +387,6 @@ llvm::json::Value CreateStackFrame(lldb::SBFrame &frame); /// definition outlined by Microsoft. llvm::json::Value CreateExtendedStackFrameLabel(lldb::SBThread &thread); -/// Create a "instruction" object for a LLDB disassemble object as described in -/// the Visual Studio Code debug adaptor definition. -/// -/// \param[in] bp -/// The LLDB instruction object used to populate the disassembly -/// instruction. -/// \return -/// A "Scope" JSON object with that follows the formal JSON -/// definition outlined by Microsoft. 
-llvm::json::Value CreateInstructionBreakpoint(BreakpointBase *ibp); - /// Create a "Thread" object for a LLDB thread object. /// /// This function will fill in the following keys in the returned diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.cpp b/lldb/tools/lldb-dap/SourceBreakpoint.cpp index d1a3a5bedb0ae2..418e205312c9f8 100644 --- a/lldb/tools/lldb-dap/SourceBreakpoint.cpp +++ b/lldb/tools/lldb-dap/SourceBreakpoint.cpp @@ -7,20 +7,33 @@ //===----------------------------------------------------------------------===// #include "SourceBreakpoint.h" +#include "BreakpointBase.h" #include "DAP.h" #include "JSONUtils.h" +#include "lldb/API/SBBreakpoint.h" +#include "lldb/API/SBFileSpecList.h" +#include "lldb/API/SBFrame.h" +#include "lldb/API/SBTarget.h" +#include "lldb/API/SBThread.h" +#include "lldb/API/SBValue.h" +#include "lldb/lldb-enumerations.h" +#include +#include +#include +#include namespace lldb_dap { -SourceBreakpoint::SourceBreakpoint(const llvm::json::Object &obj) - : Breakpoint(obj), logMessage(std::string(GetString(obj, "logMessage"))), +SourceBreakpoint::SourceBreakpoint(DAP &dap, const llvm::json::Object &obj) + : Breakpoint(dap, obj), + logMessage(std::string(GetString(obj, "logMessage"))), line(GetUnsigned(obj, "line", 0)), column(GetUnsigned(obj, "column", 0)) { } void SourceBreakpoint::SetBreakpoint(const llvm::StringRef source_path) { lldb::SBFileSpecList module_list; - bp = g_dap.target.BreakpointCreateByLocation(source_path.str().c_str(), line, - column, 0, module_list); + bp = dap.target.BreakpointCreateByLocation(source_path.str().c_str(), line, + column, 0, module_list); if (!logMessage.empty()) SetLogMessage(); Breakpoint::SetBreakpoint(); @@ -136,7 +149,7 @@ lldb::SBError SourceBreakpoint::FormatLogText(llvm::StringRef text, return error; } // hex number in the text - if (isxdigit(text[0])) { + if (std::isxdigit(text[0])) { // Make a string that can hold onto two hex chars plus a // NULL terminator char hex_str[3] = {0, 0, 0}; @@ 
-144,7 +157,7 @@ lldb::SBError SourceBreakpoint::FormatLogText(llvm::StringRef text, text = text.drop_front(); - if (!text.empty() && isxdigit(text[0])) { + if (!text.empty() && std::isxdigit(text[0])) { hex_str[1] = text[0]; text = text.drop_front(); } @@ -279,7 +292,7 @@ void SourceBreakpoint::SetLogMessage() { void SourceBreakpoint::NotifyLogMessageError(llvm::StringRef error) { std::string message = "Log message has error: "; message += error; - g_dap.SendOutput(OutputType::Console, message); + dap.SendOutput(OutputType::Console, message); } /*static*/ @@ -304,14 +317,16 @@ bool SourceBreakpoint::BreakpointHitCallback( frame.GetValueForVariablePath(expr, lldb::eDynamicDontRunTarget); if (value.GetError().Fail()) value = frame.EvaluateExpression(expr); - output += VariableDescription(value).display_value; + output += + VariableDescription(value, bp->dap.enable_auto_variable_summaries) + .display_value; } else { output += messagePart.text; } } if (!output.empty() && output.back() != '\n') output.push_back('\n'); // Ensure log message has line break. - g_dap.SendOutput(OutputType::Console, output.c_str()); + bp->dap.SendOutput(OutputType::Console, output.c_str()); // Do not stop. 
return false; diff --git a/lldb/tools/lldb-dap/SourceBreakpoint.h b/lldb/tools/lldb-dap/SourceBreakpoint.h index aa3fbe6d0f96d2..064bd29d9fc796 100644 --- a/lldb/tools/lldb-dap/SourceBreakpoint.h +++ b/lldb/tools/lldb-dap/SourceBreakpoint.h @@ -10,7 +10,12 @@ #define LLDB_TOOLS_LLDB_DAP_SOURCEBREAKPOINT_H #include "Breakpoint.h" +#include "DAPForward.h" +#include "lldb/API/SBError.h" #include "llvm/ADT/StringRef.h" +#include +#include +#include namespace lldb_dap { @@ -31,8 +36,7 @@ struct SourceBreakpoint : public Breakpoint { uint32_t line; ///< The source line of the breakpoint or logpoint uint32_t column; ///< An optional source column of the breakpoint - SourceBreakpoint() : Breakpoint(), line(0), column(0) {} - SourceBreakpoint(const llvm::json::Object &obj); + SourceBreakpoint(DAP &d, const llvm::json::Object &obj); // Set this breakpoint in LLDB as a new breakpoint void SetBreakpoint(const llvm::StringRef source_path); diff --git a/lldb/tools/lldb-dap/Watchpoint.cpp b/lldb/tools/lldb-dap/Watchpoint.cpp index 21765509449140..0e68a35877c663 100644 --- a/lldb/tools/lldb-dap/Watchpoint.cpp +++ b/lldb/tools/lldb-dap/Watchpoint.cpp @@ -9,10 +9,17 @@ #include "Watchpoint.h" #include "DAP.h" #include "JSONUtils.h" +#include "lldb/API/SBTarget.h" +#include "lldb/lldb-enumerations.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/JSON.h" +#include +#include namespace lldb_dap { -Watchpoint::Watchpoint(const llvm::json::Object &obj) : BreakpointBase(obj) { +Watchpoint::Watchpoint(DAP &d, const llvm::json::Object &obj) + : BreakpointBase(d, obj) { llvm::StringRef dataId = GetString(obj, "dataId"); std::string accessType = GetString(obj, "accessType").str(); auto [addr_str, size_str] = dataId.split('/'); @@ -42,7 +49,7 @@ void Watchpoint::CreateJsonObject(llvm::json::Object &object) { } void Watchpoint::SetWatchpoint() { - wp = g_dap.target.WatchpointCreateByAddress(addr, size, options, error); + wp = 
dap.target.WatchpointCreateByAddress(addr, size, options, error); if (!condition.empty()) SetCondition(); if (!hitCondition.empty()) diff --git a/lldb/tools/lldb-dap/Watchpoint.h b/lldb/tools/lldb-dap/Watchpoint.h index 4d2e58ed753360..77cea67bb97810 100644 --- a/lldb/tools/lldb-dap/Watchpoint.h +++ b/lldb/tools/lldb-dap/Watchpoint.h @@ -10,9 +10,12 @@ #define LLDB_TOOLS_LLDB_DAP_WATCHPOINT_H #include "BreakpointBase.h" +#include "DAPForward.h" #include "lldb/API/SBError.h" #include "lldb/API/SBWatchpoint.h" #include "lldb/API/SBWatchpointOptions.h" +#include "lldb/lldb-types.h" +#include namespace lldb_dap { @@ -24,9 +27,8 @@ struct Watchpoint : public BreakpointBase { lldb::SBWatchpoint wp; lldb::SBError error; - Watchpoint() = default; - Watchpoint(const llvm::json::Object &obj); - Watchpoint(lldb::SBWatchpoint wp) : wp(wp) {} + Watchpoint(DAP &d, const llvm::json::Object &obj); + Watchpoint(DAP &d, lldb::SBWatchpoint wp) : BreakpointBase(d), wp(wp) {} void SetCondition() override; void SetHitCondition() override; diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index a2f7be2b214e4a..94184b78a697f1 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -8,24 +8,52 @@ #include "DAP.h" #include "FifoFiles.h" +#include "JSONUtils.h" +#include "LLDBUtils.h" +#include "OutputRedirector.h" #include "RunInTerminal.h" #include "Watchpoint.h" #include "lldb/API/SBDeclaration.h" #include "lldb/API/SBInstruction.h" #include "lldb/API/SBListener.h" #include "lldb/API/SBMemoryRegionInfo.h" +#include "lldb/API/SBStream.h" #include "lldb/API/SBStringList.h" +#include "lldb/Host/Config.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/OptTable.h" +#include "llvm/Option/Option.h" 
#include "llvm/Support/Base64.h" - +#include "llvm/Support/Errno.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/raw_ostream.h" +#include +#include #include #include #include #include #include #include +#include +#include #include +#include #include #include +#include +#include + #if defined(_WIN32) // We need to #define NOMINMAX in order to skip `min()` and `max()` macro // definitions that conflict with other system headers. @@ -46,35 +74,6 @@ #include #endif -#include -#include -#include -#include -#include -#include -#include - -#include "lldb/API/SBStream.h" -#include "lldb/Host/Config.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/ScopeExit.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Option/Arg.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Option/OptTable.h" -#include "llvm/Option/Option.h" -#include "llvm/Support/Errno.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/PrettyStackTrace.h" -#include "llvm/Support/raw_ostream.h" - -#include "JSONUtils.h" -#include "LLDBUtils.h" -#include "OutputRedirector.h" - #if defined(_WIN32) #ifndef PATH_MAX #define PATH_MAX MAX_PATH @@ -526,8 +525,8 @@ void EventThreadFunction() { if (event_mask & lldb::SBTarget::eBroadcastBitBreakpointChanged) { auto event_type = lldb::SBBreakpoint::GetBreakpointEventTypeFromEvent(event); - auto bp = - Breakpoint(lldb::SBBreakpoint::GetBreakpointFromEvent(event)); + auto bp = Breakpoint( + g_dap, lldb::SBBreakpoint::GetBreakpointFromEvent(event)); // If the breakpoint was originated from the IDE, it will have the // BreakpointBase::GetBreakpointLabel() label attached. 
Regardless // of wether the locations were added or removed, the breakpoint @@ -2689,10 +2688,10 @@ void request_setBreakpoints(const llvm::json::Object &request) { llvm::json::Object response; lldb::SBError error; FillResponse(request, response); - auto arguments = request.getObject("arguments"); - auto source = arguments->getObject("source"); + const auto *arguments = request.getObject("arguments"); + const auto *source = arguments->getObject("source"); const auto path = GetString(source, "path"); - auto breakpoints = arguments->getArray("breakpoints"); + const auto *breakpoints = arguments->getArray("breakpoints"); llvm::json::Array response_breakpoints; // Decode the source breakpoint infos for this "setBreakpoints" request @@ -2701,28 +2700,19 @@ void request_setBreakpoints(const llvm::json::Object &request) { // to an empty array. if (breakpoints) { for (const auto &bp : *breakpoints) { - auto bp_obj = bp.getAsObject(); + const auto *bp_obj = bp.getAsObject(); if (bp_obj) { - SourceBreakpoint src_bp(*bp_obj); - request_bps[src_bp.line] = src_bp; - + SourceBreakpoint src_bp(g_dap, *bp_obj); + request_bps.try_emplace(src_bp.line, src_bp); + const auto [iv, inserted] = + g_dap.source_breakpoints[path].try_emplace(src_bp.line, src_bp); // We check if this breakpoint already exists to update it - auto existing_source_bps = g_dap.source_breakpoints.find(path); - if (existing_source_bps != g_dap.source_breakpoints.end()) { - const auto &existing_bp = - existing_source_bps->second.find(src_bp.line); - if (existing_bp != existing_source_bps->second.end()) { - existing_bp->second.UpdateBreakpoint(src_bp); - AppendBreakpoint(&existing_bp->second, response_breakpoints, path, - src_bp.line); - continue; - } - } - // At this point the breakpoint is new - g_dap.source_breakpoints[path][src_bp.line] = src_bp; - SourceBreakpoint &new_bp = g_dap.source_breakpoints[path][src_bp.line]; - new_bp.SetBreakpoint(path.data()); - AppendBreakpoint(&new_bp, response_breakpoints, path, 
new_bp.line); + if (inserted) + iv->getSecond().SetBreakpoint(path.data()); + else + iv->getSecond().UpdateBreakpoint(src_bp); + AppendBreakpoint(&iv->getSecond(), response_breakpoints, path, + src_bp.line); } } } @@ -2799,8 +2789,8 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) { llvm::json::Object response; lldb::SBError error; FillResponse(request, response); - auto arguments = request.getObject("arguments"); - auto filters = arguments->getArray("filters"); + const auto *arguments = request.getObject("arguments"); + const auto *filters = arguments->getArray("filters"); // Keep a list of any exception breakpoint filter names that weren't set // so we can clear any exception breakpoints if needed. std::set unset_filters; @@ -2809,14 +2799,14 @@ void request_setExceptionBreakpoints(const llvm::json::Object &request) { for (const auto &value : *filters) { const auto filter = GetAsString(value); - auto exc_bp = g_dap.GetExceptionBreakpoint(std::string(filter)); + auto *exc_bp = g_dap.GetExceptionBreakpoint(std::string(filter)); if (exc_bp) { exc_bp->SetBreakpoint(); unset_filters.erase(std::string(filter)); } } for (const auto &filter : unset_filters) { - auto exc_bp = g_dap.GetExceptionBreakpoint(filter); + auto *exc_bp = g_dap.GetExceptionBreakpoint(filter); if (exc_bp) exc_bp->ClearBreakpoint(); } @@ -2905,51 +2895,38 @@ void request_setFunctionBreakpoints(const llvm::json::Object &request) { llvm::json::Object response; lldb::SBError error; FillResponse(request, response); - auto arguments = request.getObject("arguments"); - auto breakpoints = arguments->getArray("breakpoints"); - FunctionBreakpointMap request_bps; + const auto *arguments = request.getObject("arguments"); + const auto *breakpoints = arguments->getArray("breakpoints"); llvm::json::Array response_breakpoints; - for (const auto &value : *breakpoints) { - auto bp_obj = value.getAsObject(); - if (bp_obj == nullptr) - continue; - FunctionBreakpoint func_bp(*bp_obj); - 
request_bps[func_bp.functionName] = std::move(func_bp); - } - std::vector remove_names; - // Disable any function breakpoints that aren't in the request_bps. + // Disable any function breakpoints that aren't in this request. // There is no call to remove function breakpoints other than calling this // function with a smaller or empty "breakpoints" list. - for (auto &pair : g_dap.function_breakpoints) { - auto request_pos = request_bps.find(pair.first()); - if (request_pos == request_bps.end()) { - // This function breakpoint no longer exists delete it from LLDB - g_dap.target.BreakpointDelete(pair.second.bp.GetID()); - remove_names.push_back(pair.first()); - } else { - // Update the existing breakpoint as any setting withing the function - // breakpoint might have changed. - pair.second.UpdateBreakpoint(request_pos->second); - // Remove this breakpoint from the request breakpoints since we have - // handled it here and we don't need to set a new breakpoint below. - request_bps.erase(request_pos); - // Add this breakpoint info to the response - AppendBreakpoint(&pair.second, response_breakpoints); - } + const auto name_iter = g_dap.function_breakpoints.keys(); + llvm::DenseSet seen(name_iter.begin(), name_iter.end()); + for (const auto &value : *breakpoints) { + const auto *bp_obj = value.getAsObject(); + if (!bp_obj) + continue; + FunctionBreakpoint fn_bp(g_dap, *bp_obj); + const auto [it, inserted] = g_dap.function_breakpoints.try_emplace( + fn_bp.functionName, g_dap, *bp_obj); + if (inserted) + it->second.SetBreakpoint(); + else + it->second.UpdateBreakpoint(fn_bp); + + AppendBreakpoint(&it->second, response_breakpoints); + seen.erase(fn_bp.functionName); } + // Remove any breakpoints that are no longer in our list - for (const auto &name : remove_names) + for (const auto &name : seen) { + auto fn_bp = g_dap.function_breakpoints.find(name); + if (fn_bp == g_dap.function_breakpoints.end()) + continue; + g_dap.target.BreakpointDelete(fn_bp->second.bp.GetID()); 
g_dap.function_breakpoints.erase(name); - - // Any breakpoints that are left in "request_bps" are breakpoints that - // need to be set. - for (auto &pair : request_bps) { - // Add this breakpoint info to the response - g_dap.function_breakpoints[pair.first()] = std::move(pair.second); - FunctionBreakpoint &new_bp = g_dap.function_breakpoints[pair.first()]; - new_bp.SetBreakpoint(); - AppendBreakpoint(&new_bp, response_breakpoints); } llvm::json::Object body; @@ -3201,10 +3178,8 @@ void request_setDataBreakpoints(const llvm::json::Object &request) { if (breakpoints) { for (const auto &bp : *breakpoints) { const auto *bp_obj = bp.getAsObject(); - if (bp_obj) { - Watchpoint wp(*bp_obj); - watchpoints.push_back(wp); - } + if (bp_obj) + watchpoints.emplace_back(g_dap, *bp_obj); } } // If two watchpoints start at the same address, the latter overwrite the @@ -4551,7 +4526,7 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) { FillResponse(request, response); llvm::json::Array response_breakpoints; for (uint32_t i = 0; g_dap.target.GetBreakpointAtIndex(i).IsValid(); ++i) { - auto bp = Breakpoint(g_dap.target.GetBreakpointAtIndex(i)); + auto bp = Breakpoint(g_dap, g_dap.target.GetBreakpointAtIndex(i)); AppendBreakpoint(&bp, response_breakpoints); } llvm::json::Object body; @@ -4560,10 +4535,11 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) { g_dap.SendJSON(llvm::json::Value(std::move(response))); } -// "SetInstructionBreakpointsRequest" : { -// "allOf" : [ -// {"$ref" : "#/definitions/Request"}, { -// "type" : "object", +// "SetInstructionBreakpointsRequest": { +// "allOf": [ +// {"$ref": "#/definitions/Request"}, +// { +// "type": "object", // "description" : // "Replaces all existing instruction breakpoints. Typically, " // "instruction breakpoints would be set from a disassembly window. 
" @@ -4572,235 +4548,218 @@ void request__testGetTargetBreakpoints(const llvm::json::Object &request) { // "(with reason `instruction breakpoint`) is generated.\nClients " // "should only call this request if the corresponding capability " // "`supportsInstructionBreakpoints` is true.", -// "properties" : { -// "command" : {"type" : "string", "enum" : -// ["setInstructionBreakpoints"]}, "arguments" : -// {"$ref" : "#/definitions/SetInstructionBreakpointsArguments"} +// "properties": { +// "command": { "type": "string", "enum": ["setInstructionBreakpoints"] +// }, "arguments": {"$ref": +// "#/definitions/SetInstructionBreakpointsArguments"} // }, -// "required" : [ "command", "arguments" ] +// "required": [ "command", "arguments" ] // } // ] // }, -// "SetInstructionBreakpointsArguments" -// : { -// "type" : "object", -// "description" : "Arguments for `setInstructionBreakpoints` request", -// "properties" : { -// "breakpoints" : { -// "type" : "array", -// "items" : {"$ref" : "#/definitions/InstructionBreakpoint"}, -// "description" : "The instruction references of the breakpoints" -// } -// }, -// "required" : ["breakpoints"] -// }, -// "SetInstructionBreakpointsResponse" -// : { -// "allOf" : [ -// {"$ref" : "#/definitions/Response"}, { -// "type" : "object", -// "description" : "Response to `setInstructionBreakpoints` request", -// "properties" : { -// "body" : { -// "type" : "object", -// "properties" : { -// "breakpoints" : { -// "type" : "array", -// "items" : {"$ref" : "#/definitions/Breakpoint"}, -// "description" : -// "Information about the breakpoints. The array elements -// " "correspond to the elements of the `breakpoints` -// array." 
-// } -// }, -// "required" : ["breakpoints"] +// "SetInstructionBreakpointsArguments": { +// "type": "object", +// "description": "Arguments for `setInstructionBreakpoints` request", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": {"$ref": "#/definitions/InstructionBreakpoint"}, +// "description": "The instruction references of the breakpoints" +// } +// }, +// "required": ["breakpoints"] +// }, +// "SetInstructionBreakpointsResponse": { +// "allOf": [ +// {"$ref": "#/definitions/Response"}, +// { +// "type": "object", +// "description": "Response to `setInstructionBreakpoints` request", +// "properties": { +// "body": { +// "type": "object", +// "properties": { +// "breakpoints": { +// "type": "array", +// "items": {"$ref": "#/definitions/Breakpoint"}, +// "description": +// "Information about the breakpoints. The array elements +// " "correspond to the elements of the `breakpoints` +// array." // } // }, -// "required" : ["body"] +// "required": ["breakpoints"] // } -// ] -// }, -// "InstructionBreakpoint" : { -// "type" : "object", -// "description" : "Properties of a breakpoint passed to the " +// }, +// "required": ["body"] +// } +// ] +// }, +// "InstructionBreakpoint": { +// "type": "object", +// "description": "Properties of a breakpoint passed to the " // "`setInstructionBreakpoints` request", -// "properties" : { -// "instructionReference" : { -// "type" : "string", +// "properties": { +// "instructionReference": { +// "type": "string", // "description" : // "The instruction reference of the breakpoint.\nThis should be a " // "memory or instruction pointer reference from an // `EvaluateResponse`, " // "`Variable`, `StackFrame`, `GotoTarget`, or `Breakpoint`." // }, -// "offset" : { -// "type" : "integer", -// "description" : "The offset from the instruction reference in " +// "offset": { +// "type": "integer", +// "description": "The offset from the instruction reference in " // "bytes.\nThis can be negative." 
// }, -// "condition" : { -// "type" : "string", -// "description" : "An expression for conditional breakpoints.\nIt is only +// "condition": { +// "type": "string", +// "description": "An expression for conditional breakpoints.\nIt is only // " // "honored by a debug adapter if the corresponding " // "capability `supportsConditionalBreakpoints` is true." // }, -// "hitCondition" : { -// "type" : "string", -// "description" : "An expression that controls how many hits of the " +// "hitCondition": { +// "type": "string", +// "description": "An expression that controls how many hits of the " // "breakpoint are ignored.\nThe debug adapter is expected // " "to interpret the expression as needed.\nThe // attribute " "is only honored by a debug adapter if the // corresponding " "capability // `supportsHitConditionalBreakpoints` is true." // }, -// "mode" : { -// "type" : "string", -// "description" : "The mode of this breakpoint. If defined, this must be +// "mode": { +// "type": "string", +// "description": "The mode of this breakpoint. If defined, this must be // " // "one of the `breakpointModes` the debug adapter " // "advertised in its `Capabilities`." // } // }, -// "required" : ["instructionReference"] +// "required": ["instructionReference"] // }, -// "Breakpoint" -// : { -// "type" : "object", +// "Breakpoint": { +// "type": "object", +// "description" : +// "Information about a breakpoint created in `setBreakpoints`, " +// "`setFunctionBreakpoints`, `setInstructionBreakpoints`, or " +// "`setDataBreakpoints` requests.", +// "properties": { +// "id": { +// "type": "integer", // "description" : -// "Information about a breakpoint created in `setBreakpoints`, " -// "`setFunctionBreakpoints`, `setInstructionBreakpoints`, or " -// "`setDataBreakpoints` requests.", -// "properties" : { -// "id" : { -// "type" : "integer", -// "description" : -// "The identifier for the breakpoint. It is needed if breakpoint -// " "events are used to update or remove breakpoints." 
-// }, -// "verified" : { -// "type" : "boolean", -// "description" : "If true, the breakpoint could be set (but not " -// "necessarily at the desired location)." -// }, -// "message" : { -// "type" : "string", -// "description" : "A message about the state of the breakpoint.\nThis -// " -// "is shown to the user and can be used to explain -// why " "a breakpoint could not be verified." -// }, -// "source" : { -// "$ref" : "#/definitions/Source", -// "description" : "The source where the breakpoint is located." -// }, -// "line" : { -// "type" : "integer", -// "description" : -// "The start line of the actual range covered by the breakpoint." -// }, -// "column" : { -// "type" : "integer", -// "description" : -// "Start position of the source range covered by the breakpoint. -// " "It is measured in UTF-16 code units and the client -// capability " -// "`columnsStartAt1` determines whether it is 0- or 1-based." -// }, -// "endLine" : { -// "type" : "integer", -// "description" : -// "The end line of the actual range covered by the breakpoint." -// }, -// "endColumn" : { -// "type" : "integer", -// "description" : -// "End position of the source range covered by the breakpoint. It -// " "is measured in UTF-16 code units and the client capability " -// "`columnsStartAt1` determines whether it is 0- or 1-based.\nIf -// " "no end line is given, then the end column is assumed to be -// in " "the start line." -// }, -// "instructionReference" : { -// "type" : "string", -// "description" : "A memory reference to where the breakpoint is -// set." -// }, -// "offset" : { -// "type" : "integer", -// "description" : "The offset from the instruction reference.\nThis " -// "can be negative." -// }, -// "reason" : { -// "type" : "string", -// "description" : -// "A machine-readable explanation of why a breakpoint may not be -// " "verified. If a breakpoint is verified or a specific reason -// is " "not known, the adapter should omit this property. 
-// Possible " "values include:\n\n- `pending`: Indicates a -// breakpoint might be " "verified in the future, but the adapter -// cannot verify it in the " "current state.\n - `failed`: -// Indicates a breakpoint was not " "able to be verified, and the -// adapter does not believe it can be " "verified without -// intervention.", -// "enum" : [ "pending", "failed" ] -// } -// }, -// "required" : ["verified"] +// "The identifier for the breakpoint. It is needed if breakpoint +// " "events are used to update or remove breakpoints." // }, - +// "verified": { +// "type": "boolean", +// "description": "If true, the breakpoint could be set (but not " +// "necessarily at the desired location)." +// }, +// "message": { +// "type": "string", +// "description": "A message about the state of the breakpoint.\nThis +// " +// "is shown to the user and can be used to explain +// why " "a breakpoint could not be verified." +// }, +// "source": { +// "$ref": "#/definitions/Source", +// "description": "The source where the breakpoint is located." +// }, +// "line": { +// "type": "integer", +// "description" : +// "The start line of the actual range covered by the breakpoint." +// }, +// "column": { +// "type": "integer", +// "description" : +// "Start position of the source range covered by the breakpoint. +// " "It is measured in UTF-16 code units and the client +// capability " +// "`columnsStartAt1` determines whether it is 0- or 1-based." +// }, +// "endLine": { +// "type": "integer", +// "description" : +// "The end line of the actual range covered by the breakpoint." +// }, +// "endColumn": { +// "type": "integer", +// "description" : +// "End position of the source range covered by the breakpoint. It +// " "is measured in UTF-16 code units and the client capability " +// "`columnsStartAt1` determines whether it is 0- or 1-based.\nIf +// " "no end line is given, then the end column is assumed to be +// in " "the start line." 
+// }, +// "instructionReference": { +// "type": "string", +// "description": "A memory reference to where the breakpoint is +// set." +// }, +// "offset": { +// "type": "integer", +// "description": "The offset from the instruction reference.\nThis " +// "can be negative." +// }, +// "reason": { +// "type": "string", +// "description" : +// "A machine-readable explanation of why a breakpoint may not be +// " "verified. If a breakpoint is verified or a specific reason +// is " "not known, the adapter should omit this property. +// Possible " "values include:\n\n- `pending`: Indicates a +// breakpoint might be " "verified in the future, but the adapter +// cannot verify it in the " "current state.\n - `failed`: +// Indicates a breakpoint was not " "able to be verified, and the +// adapter does not believe it can be " "verified without +// intervention.", +// "enum": [ "pending", "failed" ] +// } +// }, +// "required": ["verified"] +// }, void request_setInstructionBreakpoints(const llvm::json::Object &request) { llvm::json::Object response; llvm::json::Array response_breakpoints; llvm::json::Object body; FillResponse(request, response); - auto arguments = request.getObject("arguments"); - auto breakpoints = arguments->getArray("breakpoints"); + const auto *arguments = request.getObject("arguments"); + const auto *breakpoints = arguments->getArray("breakpoints"); - // It holds active instruction breakpoint list received from DAP. - InstructionBreakpointMap request_ibp; - if (breakpoints) { - for (const auto &bp : *breakpoints) { - auto bp_obj = bp.getAsObject(); - if (bp_obj) { - // Read instruction breakpoint request. - InstructionBreakpoint inst_bp(*bp_obj); - // Store them into map for reference. - request_ibp[inst_bp.instructionAddressReference] = std::move(inst_bp); - } - } + // Disable any instruction breakpoints that aren't in this request. 
+  // There is no call to remove instruction breakpoints other than calling this
+  // function with a smaller or empty "breakpoints" list.
+  llvm::DenseSet<lldb::addr_t> seen;
+  for (const auto &addr : g_dap.instruction_breakpoints)
+    seen.insert(addr.first);
-  // Iterate previous active instruction breakpoint list.
-  for (auto &prev_ibp : g_dap.instruction_breakpoints) {
-    // Find previous instruction breakpoint reference address in newly
-    // received instruction breakpoint list.
-    auto inst_reference = request_ibp.find(prev_ibp.first);
-    // Request for remove and delete the breakpoint, if the prev instruction
-    // breakpoint ID is not available in active instrcation breakpoint list.
-    // Means delete removed breakpoint instance.
-    if (inst_reference == request_ibp.end()) {
-      g_dap.target.BreakpointDelete(prev_ibp.second.id);
-      // Update Prev instruction breakpoint list.
-      g_dap.instruction_breakpoints.erase(prev_ibp.first);
-    } else {
-      // Instead of recreating breakpoint instance, update the breakpoint if
-      // there are any conditional changes.
-      prev_ibp.second.UpdateBreakpoint(inst_reference->second);
-      request_ibp.erase(inst_reference);
-      response_breakpoints.emplace_back(
-          CreateInstructionBreakpoint(&prev_ibp.second));
-    }
-  }
+  for (const auto &bp : *breakpoints) {
+    const auto *bp_obj = bp.getAsObject();
+    if (!bp_obj)
+      continue;
+    // Read instruction breakpoint request.
+ InstructionBreakpoint inst_bp(g_dap, *bp_obj); + const auto [iv, inserted] = g_dap.instruction_breakpoints.try_emplace( + inst_bp.instructionAddressReference, g_dap, *bp_obj); + if (inserted) + iv->second.SetBreakpoint(); + else + iv->second.UpdateBreakpoint(inst_bp); + AppendBreakpoint(&iv->second, response_breakpoints); + seen.erase(inst_bp.instructionAddressReference); + } - for (auto &req_bpi : request_ibp) { - // Add this breakpoint info to the response - g_dap.instruction_breakpoints[req_bpi.first] = std::move(req_bpi.second); - InstructionBreakpoint &new_bp = - g_dap.instruction_breakpoints[req_bpi.first]; - new_bp.SetInstructionBreakpoint(); - response_breakpoints.emplace_back(CreateInstructionBreakpoint(&new_bp)); - } + for (const auto &addr : seen) { + auto inst_bp = g_dap.instruction_breakpoints.find(addr); + if (inst_bp == g_dap.instruction_breakpoints.end()) + continue; + g_dap.target.BreakpointDelete(inst_bp->second.bp.GetID()); + g_dap.instruction_breakpoints.erase(addr); } body.try_emplace("breakpoints", std::move(response_breakpoints)); diff --git a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst index 5b3200a4b78235..60fa024db5e991 100644 --- a/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst +++ b/llvm/docs/CommandGuide/llvm-debuginfo-analyzer.rst @@ -1742,7 +1742,7 @@ DWARF - Clang (Linux) [003] 3 {Variable} 'Var_1' -> 'int' [002] 11 {Function} extern not_inlined 'test' -> 'int' [003] 12 {Variable} 'A' -> 'int' - [003] 14 {InlinedFunction} inlined 'InlineFunction' -> 'int' + [003] 13 {InlinedFunction} inlined 'InlineFunction' -> 'int' [004] {Block} [005] {Variable} 'Var_2' -> 'int' [004] {Parameter} 'Param' -> 'int' diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 8920530dc3f1a1..076dc7fa93e565 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -753,6 +753,24 @@ The type of the operand 
must be equal to or larger than the vector element type. If the operand is larger than the vector element type, the scalar is implicitly truncated to the vector element type. +G_STEP_VECTOR +^^^^^^^^^^^^^ + +Create a scalable vector where all lanes are linear sequences starting at 0 +with a given unsigned step. + +The type of the operand must be equal to the vector element type. Arithmetic +is performed modulo the bitwidth of the element. The step must be > 0. +Otherwise the vector is zero. + +.. code-block:: + + %0:_() = G_STEP_VECTOR i64 4 + + %1:_() = G_STEP_VECTOR i32 4 + + 0, 1*Step, 2*Step, 3*Step, 4*Step, ... + G_VECTOR_COMPRESS ^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e426140f328315..ef38c5ab33b926 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20392,8 +20392,8 @@ Example: .. code-block:: text - %a = call i8 @llvm.fptoui.sat.i8.f32(float 123.9) ; yields i8: 123 - %b = call i8 @llvm.fptoui.sat.i8.f32(float -5.7) ; yields i8: 0 + %a = call i8 @llvm.fptoui.sat.i8.f32(float 123.875) ; yields i8: 123 + %b = call i8 @llvm.fptoui.sat.i8.f32(float -5.75) ; yields i8: 0 %c = call i8 @llvm.fptoui.sat.i8.f32(float 377.0) ; yields i8: 255 %d = call i8 @llvm.fptoui.sat.i8.f32(float 0xFFF8000000000000) ; yields i8: 0 @@ -20445,8 +20445,8 @@ Example: .. code-block:: text - %a = call i8 @llvm.fptosi.sat.i8.f32(float 23.9) ; yields i8: 23 - %b = call i8 @llvm.fptosi.sat.i8.f32(float -130.8) ; yields i8: -128 + %a = call i8 @llvm.fptosi.sat.i8.f32(float 23.875) ; yields i8: 23 + %b = call i8 @llvm.fptosi.sat.i8.f32(float -130.75) ; yields i8: -128 %c = call i8 @llvm.fptosi.sat.i8.f32(float 999.0) ; yields i8: 127 %d = call i8 @llvm.fptosi.sat.i8.f32(float 0xFFF8000000000000) ; yields i8: 0 @@ -23002,7 +23002,7 @@ This is an overloaded intrinsic. 
:: - declare float @llvm.vp.reduce.fmax.v4f32(float , <4 x float> , <4 x i1> , float ) + declare float @llvm.vp.reduce.fmax.v4f32(float , <4 x float> , <4 x i1> , i32 ) declare double @llvm.vp.reduce.fmax.nxv8f64(double , , , i32 ) Overview: @@ -23072,7 +23072,7 @@ This is an overloaded intrinsic. :: - declare float @llvm.vp.reduce.fmin.v4f32(float , <4 x float> , <4 x i1> , float ) + declare float @llvm.vp.reduce.fmin.v4f32(float , <4 x float> , <4 x i1> , i32 ) declare double @llvm.vp.reduce.fmin.nxv8f64(double , , , i32 ) Overview: @@ -23142,7 +23142,7 @@ This is an overloaded intrinsic. :: - declare float @llvm.vp.reduce.fmaximum.v4f32(float , <4 x float> , <4 x i1> , float ) + declare float @llvm.vp.reduce.fmaximum.v4f32(float , <4 x float> , <4 x i1> , i32 ) declare double @llvm.vp.reduce.fmaximum.nxv8f64(double , , , i32 ) Overview: @@ -23212,7 +23212,7 @@ This is an overloaded intrinsic. :: - declare float @llvm.vp.reduce.fminimum.v4f32(float , <4 x float> , <4 x i1> , float ) + declare float @llvm.vp.reduce.fminimum.v4f32(float , <4 x float> , <4 x i1> , i32 ) declare double @llvm.vp.reduce.fminimum.nxv8f64(double , , , i32 ) Overview: diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index dca8fd9a0bca0b..2152de9709dc6e 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above. For more information, refer PTX ISA ``_. +'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...) 
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + +Overview: +""""""""" + +The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics +correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set +of PTX instructions. These instructions initiate an asynchronous prefetch +of tensor data from global memory to the L2 cache. In tile mode, the +multi-dimensional layout of the source tensor is preserved at the destination. +The dimension of the tensor data ranges from 1d to 5d with the coordinates +specified by the ``i32 %d0 ... i32 %d4`` arguments. + +* The last argument to these intrinsics is a boolean flag + indicating support for cache_hint. This flag argument must + be a compile-time constant. When set, it indicates a valid + cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` + variant of the PTX instruction. + +For more information, refer PTX ISA +``_. + +'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...) + +Overview: +""""""""" + +The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics +correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set +of PTX instructions. 
These instructions initiate an asynchronous prefetch +of tensor data from global memory to the L2 cache. In im2col mode, some +dimensions of the source tensor are unrolled into a single dimensional +column at the destination. In this mode, the tensor has to be at least +three-dimensional. Along with the tensor coordinates, im2col offsets are +also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number +of im2col offsets is two less than the number of dimensions of the tensor +operation. The last argument to these intrinsics is a boolean flag, with +the same functionality as described in the ``tile`` mode intrinsics above. + +For more information, refer PTX ISA +``_. + Other Intrinsics ---------------- diff --git a/llvm/examples/OrcV2Examples/LLJITWithInitializers/LLJITWithInitializers.cpp b/llvm/examples/OrcV2Examples/LLJITWithInitializers/LLJITWithInitializers.cpp index 32b51c31485962..704fd9c1483307 100644 --- a/llvm/examples/OrcV2Examples/LLJITWithInitializers/LLJITWithInitializers.cpp +++ b/llvm/examples/OrcV2Examples/LLJITWithInitializers/LLJITWithInitializers.cpp @@ -19,6 +19,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/StringMap.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/TargetSelect.h" diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index 43c9b80edff78e..ace5f60b572d75 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -416,7 +416,8 @@ static constexpr bool HasFreeFunctionRBegin = } // namespace detail // Returns an iterator_range over the given container which iterates in reverse. -template auto reverse(ContainerTy &&C) { +// Does not mutate the container. 
+template [[nodiscard]] auto reverse(ContainerTy &&C) { if constexpr (detail::HasFreeFunctionRBegin) return make_range(adl_rbegin(C), adl_rend(C)); else @@ -1182,11 +1183,13 @@ template class concat_range { } // end namespace detail -/// Concatenated range across two or more ranges. +/// Returns a concatenated range across two or more ranges. Does not modify the +/// ranges. /// /// The desired value type must be explicitly specified. template -detail::concat_range concat(RangeTs &&... Ranges) { +[[nodiscard]] detail::concat_range +concat(RangeTs &&...Ranges) { static_assert(sizeof...(RangeTs) > 1, "Need more than one range to concatenate!"); return detail::concat_range( diff --git a/llvm/include/llvm/ADT/STLFunctionalExtras.h b/llvm/include/llvm/ADT/STLFunctionalExtras.h index 6f172504b3c167..3b9d40959d7142 100644 --- a/llvm/include/llvm/ADT/STLFunctionalExtras.h +++ b/llvm/include/llvm/ADT/STLFunctionalExtras.h @@ -16,6 +16,7 @@ #define LLVM_ADT_STLFUNCTIONALEXTRAS_H #include "llvm/ADT/STLForwardCompat.h" +#include "llvm/Support/Compiler.h" #include #include @@ -52,7 +53,7 @@ class function_ref { template function_ref( - Callable &&callable, + Callable &&callable LLVM_LIFETIME_BOUND, // This is not the copy-constructor. 
std::enable_if_t, function_ref>::value> * = nullptr, diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.h b/llvm/include/llvm/Analysis/TargetLibraryInfo.h index 5347c64e43e718..325c9cd9900b36 100644 --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.h +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.h @@ -410,6 +410,7 @@ class TargetLibraryInfo { // clang-format off case LibFunc_acos: case LibFunc_acosf: case LibFunc_acosl: case LibFunc_asin: case LibFunc_asinf: case LibFunc_asinl: + case LibFunc_atan2: case LibFunc_atan2f: case LibFunc_atan2l: case LibFunc_atan: case LibFunc_atanf: case LibFunc_atanl: case LibFunc_ceil: case LibFunc_ceilf: case LibFunc_ceill: case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl: diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 317c13917c0cfc..224dfbb9f54b6c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -174,6 +174,7 @@ class TargetTransformInfoImplBase { Name == "asin" || Name == "asinf" || Name == "asinl" || Name == "acos" || Name == "acosf" || Name == "acosl" || Name == "atan" || Name == "atanf" || Name == "atanl" || + Name == "atan2" || Name == "atan2f" || Name == "atan2l"|| Name == "sinh" || Name == "sinhf" || Name == "sinhl" || Name == "cosh" || Name == "coshf" || Name == "coshl" || Name == "tanh" || Name == "tanhf" || Name == "tanhl" || diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index 71ad3a35eb3f5e..7036993134fe47 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -56,6 +56,8 @@ TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.acos.f32", "vacosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.atan.f32", "vatanf", FIXED(4), 
"_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("atan2f", "vatan2f", FIXED(4), "_ZGV_LLVM_N4vv") +TLI_DEFINE_VECFUNC("llvm.atan2.f32", "vatan2f", FIXED(4), "_ZGV_LLVM_N4vv") // Hyperbolic Functions TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v") @@ -289,7 +291,9 @@ TLI_DEFINE_VECFUNC("acosf", "__acosf4", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atan", "__atand2", FIXED(2), "_ZGV_LLVM_N2v") TLI_DEFINE_VECFUNC("atanf", "__atanf4", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atan2", "__atan2d2", FIXED(2), "_ZGV_LLVM_N2vv") +TLI_DEFINE_VECFUNC("llvm.atan2.f64", "__atan2d2", FIXED(2), "_ZGV_LLVM_N2vv") TLI_DEFINE_VECFUNC("atan2f", "__atan2f4", FIXED(4), "_ZGV_LLVM_N4vv") +TLI_DEFINE_VECFUNC("llvm.atan2.f32", "__atan2f4", FIXED(4), "_ZGV_LLVM_N4vv") // Hyperbolic Functions TLI_DEFINE_VECFUNC("sinh", "__sinhd2", FIXED(2), "_ZGV_LLVM_N2v") diff --git a/llvm/include/llvm/CodeGen/FinalizeISel.h b/llvm/include/llvm/CodeGen/FinalizeISel.h index 117140417e2c2c..b2c28e330126fc 100644 --- a/llvm/include/llvm/CodeGen/FinalizeISel.h +++ b/llvm/include/llvm/CodeGen/FinalizeISel.h @@ -16,6 +16,7 @@ namespace llvm { class FinalizeISelPass : public PassInfoMixin { public: PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 72573facf1a7fe..cd2022e88a0df1 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -321,6 +321,9 @@ class CombinerHelper { bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); void applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); + // Transform a G_SUB with constant on the RHS to G_ADD. 
+ bool matchCombineSubToAdd(MachineInstr &MI, BuildFnTy &MatchInfo); + // Transform a G_SHL with an extended source into a narrower shift if // possible. bool matchCombineShlOfExtend(MachineInstr &MI, RegisterImmPair &MatchData); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index cd7ebcf54c9e1e..4de14dee190fb3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -906,6 +906,18 @@ class GVScale : public GenericMachineInstr { }; }; +/// Represents a step vector. +class GStepVector : public GenericMachineInstr { +public: + uint64_t getStep() const { + return getOperand(1).getCImm()->getValue().getZExtValue(); + } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_STEP_VECTOR; + }; +}; + /// Represents an integer subtraction. class GSub : public GIntBinOp { public: diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h index 471a7f70dd546c..9dea4c1b412dbb 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h @@ -1218,8 +1218,14 @@ class LegalizationArtifactCombiner { } else { LLT MergeSrcTy = MRI.getType(MergeI->getOperand(1).getReg()); - if (!ConvertOp && DestTy != MergeSrcTy) - ConvertOp = TargetOpcode::G_BITCAST; + if (!ConvertOp && DestTy != MergeSrcTy) { + if (DestTy.isPointer()) + ConvertOp = TargetOpcode::G_INTTOPTR; + else if (MergeSrcTy.isPointer()) + ConvertOp = TargetOpcode::G_PTRTOINT; + else + ConvertOp = TargetOpcode::G_BITCAST; + } if (ConvertOp) { Builder.setInstr(MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 343a0172ff39ed..b737917b8442da 100644 --- 
a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -1102,6 +1102,13 @@ class LegalizeRuleSet { return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy); } + LegalizeRuleSet &clampScalar(bool Pred, unsigned TypeIdx, const LLT MinTy, + const LLT MaxTy) { + if (!Pred) + return *this; + return clampScalar(TypeIdx, MinTy, MaxTy); + } + /// Limit the range of scalar sizes to MinTy and MaxTy. LegalizeRuleSet &clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy) { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index a38dd34a17097a..3516065f9b6cb3 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1172,6 +1172,17 @@ class MachineIRBuilder { MachineInstrBuilder buildInsert(const DstOp &Res, const SrcOp &Src, const SrcOp &Op, unsigned Index); + /// Build and insert \p Res = G_STEP_VECTOR \p Step + /// + /// G_STEP_VECTOR returns a scalable vector of linear sequence of step \p Step + /// into \p Res. + /// + /// \pre setBasicBlock or setMI must have been called. + /// \pre \p Res must be a generic virtual register with scalable vector type. + /// + /// \return a MachineInstrBuilder for the newly created instruction. 
+ MachineInstrBuilder buildStepVector(const DstOp &Res, unsigned Step); + /// Build and insert \p Res = G_VSCALE \p MinElts /// /// G_VSCALE puts the value of the runtime vscale multiplied by \p MinElts diff --git a/llvm/include/llvm/CodeGen/LocalStackSlotAllocation.h b/llvm/include/llvm/CodeGen/LocalStackSlotAllocation.h index bf5225d3e99a54..b02667d5c6699d 100644 --- a/llvm/include/llvm/CodeGen/LocalStackSlotAllocation.h +++ b/llvm/include/llvm/CodeGen/LocalStackSlotAllocation.h @@ -17,6 +17,7 @@ class LocalStackSlotAllocationPass : public PassInfoMixin { public: PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/MIRPrinter.h b/llvm/include/llvm/CodeGen/MIRPrinter.h index 85bd674c56a60c..37d9f8ff502db0 100644 --- a/llvm/include/llvm/CodeGen/MIRPrinter.h +++ b/llvm/include/llvm/CodeGen/MIRPrinter.h @@ -31,6 +31,7 @@ class PrintMIRPreparePass : public PassInfoMixin { public: PrintMIRPreparePass(raw_ostream &OS = errs()) : OS(OS) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &MFAM); + static bool isRequired() { return true; } }; class PrintMIRPass : public PassInfoMixin { @@ -40,6 +41,7 @@ class PrintMIRPass : public PassInfoMixin { PrintMIRPass(raw_ostream &OS = errs()) : OS(OS) {} PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } }; /// Print LLVM IR using the MIR serialization format to the given output stream. 
diff --git a/llvm/include/llvm/CodeGen/MachineVerifier.h b/llvm/include/llvm/CodeGen/MachineVerifier.h index bfd0681fb79545..9d82b5417c927e 100644 --- a/llvm/include/llvm/CodeGen/MachineVerifier.h +++ b/llvm/include/llvm/CodeGen/MachineVerifier.h @@ -21,6 +21,7 @@ class MachineVerifierPass : public PassInfoMixin { : Banner(Banner) {} PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/PHIElimination.h b/llvm/include/llvm/CodeGen/PHIElimination.h index 3a1a4c5c6133f8..d3c884b8413c7a 100644 --- a/llvm/include/llvm/CodeGen/PHIElimination.h +++ b/llvm/include/llvm/CodeGen/PHIElimination.h @@ -17,6 +17,7 @@ class PHIEliminationPass : public PassInfoMixin { public: PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/RegAllocFast.h b/llvm/include/llvm/CodeGen/RegAllocFast.h index 440264a06ae89e..b2ca9e10bf4647 100644 --- a/llvm/include/llvm/CodeGen/RegAllocFast.h +++ b/llvm/include/llvm/CodeGen/RegAllocFast.h @@ -50,6 +50,8 @@ class RegAllocFastPass : public PassInfoMixin { void printPipeline(raw_ostream &OS, function_ref MapClassName2PassName); + + static bool isRequired() { return true; } }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 32b2ea48179f47..f99ec4651009a0 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -551,6 +551,7 @@ class SelectionDAGISelPass : public PassInfoMixin { public: PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } }; } diff --git a/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h b/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h index 
35ef0a8bc89085..ab20634b0b9384 100644 --- a/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h +++ b/llvm/include/llvm/DebugInfo/GSYM/OutputAggregator.h @@ -60,11 +60,8 @@ class OutputAggregator { // then merge it in here. Note that this is *not* thread safe. It is up to // the caller to ensure that this is only called from one thread at a time. void Merge(const OutputAggregator &other) { - for (auto &&[name, count] : other.Aggregation) { - auto [it, inserted] = Aggregation.emplace(name, count); - if (!inserted) - it->second += count; - } + for (auto &&[name, count] : other.Aggregation) + Aggregation[name] += count; } }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/AbsoluteSymbols.h b/llvm/include/llvm/ExecutionEngine/Orc/AbsoluteSymbols.h new file mode 100644 index 00000000000000..d58fc8b11f5046 --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/AbsoluteSymbols.h @@ -0,0 +1,59 @@ +//===------ AbsoluteSymbols.h - Absolute symbols utilities ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// absoluteSymbols function and related utilities. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_ABSOLUTESYMBOLS_H +#define LLVM_EXECUTIONENGINE_ORC_ABSOLUTESYMBOLS_H + +#include "llvm/ExecutionEngine/Orc/MaterializationUnit.h" + +namespace llvm::orc { + +/// A MaterializationUnit implementation for pre-existing absolute symbols. +/// +/// All symbols will be resolved and marked ready as soon as the unit is +/// materialized. 
+class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit { +public: + AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols); + + StringRef getName() const override; + +private: + void materialize(std::unique_ptr R) override; + void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; + static MaterializationUnit::Interface extractFlags(const SymbolMap &Symbols); + + SymbolMap Symbols; +}; + +/// Create an AbsoluteSymbolsMaterializationUnit with the given symbols. +/// Useful for inserting absolute symbols into a JITDylib. E.g.: +/// \code{.cpp} +/// JITDylib &JD = ...; +/// SymbolStringPtr Foo = ...; +/// ExecutorSymbolDef FooSym = ...; +/// if (auto Err = JD.define(absoluteSymbols({ +/// { Foo, FooSym }, +/// { Bar, BarSym } +/// }))) +/// return Err; +/// \endcode +/// +inline std::unique_ptr +absoluteSymbols(SymbolMap Symbols) { + return std::make_unique( + std::move(Symbols)); +} + +} // namespace llvm::orc + +#endif // LLVM_EXECUTIONENGINE_ORC_ABSOLUTESYMBOLS_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index f578455905f210..e892005c53d8ec 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -19,7 +19,9 @@ #include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/ExecutionEngine/JITLink/JITLinkDylib.h" #include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/CoreContainers.h" #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h" +#include "llvm/ExecutionEngine/Orc/MaterializationUnit.h" #include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" #include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" #include "llvm/ExecutionEngine/Orc/Shared/WrapperFunctionUtils.h" @@ -39,7 +41,6 @@ namespace orc { // Forward declare some classes. 
class AsynchronousSymbolQuery; class ExecutionSession; -class MaterializationUnit; class MaterializationResponsibility; class JITDylib; class ResourceTracker; @@ -109,23 +110,6 @@ class ResourceManager { ResourceKey SrcK) = 0; }; -/// A set of symbol names (represented by SymbolStringPtrs for -// efficiency). -using SymbolNameSet = DenseSet; - -/// A vector of symbol names. -using SymbolNameVector = std::vector; - -/// A map from symbol names (as SymbolStringPtrs) to JITSymbols -/// (address/flags pairs). -using SymbolMap = DenseMap; - -/// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags. -using SymbolFlagsMap = DenseMap; - -/// A map from JITDylibs to sets of symbols. -using SymbolDependenceMap = DenseMap; - /// Lookup flags that apply to each dylib in the search order for a lookup. /// /// If MatchHiddenSymbolsOnly is used (the default) for a given dylib, then @@ -682,117 +666,6 @@ class MaterializationResponsibility { SymbolStringPtr InitSymbol; }; -/// A MaterializationUnit represents a set of symbol definitions that can -/// be materialized as a group, or individually discarded (when -/// overriding definitions are encountered). -/// -/// MaterializationUnits are used when providing lazy definitions of symbols to -/// JITDylibs. The JITDylib will call materialize when the address of a symbol -/// is requested via the lookup method. The JITDylib will call discard if a -/// stronger definition is added or already present. 
-class MaterializationUnit { - friend class ExecutionSession; - friend class JITDylib; - -public: - static char ID; - - struct Interface { - Interface() = default; - Interface(SymbolFlagsMap InitalSymbolFlags, SymbolStringPtr InitSymbol) - : SymbolFlags(std::move(InitalSymbolFlags)), - InitSymbol(std::move(InitSymbol)) { - assert((!this->InitSymbol || this->SymbolFlags.count(this->InitSymbol)) && - "If set, InitSymbol should appear in InitialSymbolFlags map"); - } - - SymbolFlagsMap SymbolFlags; - SymbolStringPtr InitSymbol; - }; - - MaterializationUnit(Interface I) - : SymbolFlags(std::move(I.SymbolFlags)), - InitSymbol(std::move(I.InitSymbol)) {} - virtual ~MaterializationUnit() = default; - - /// Return the name of this materialization unit. Useful for debugging - /// output. - virtual StringRef getName() const = 0; - - /// Return the set of symbols that this source provides. - const SymbolFlagsMap &getSymbols() const { return SymbolFlags; } - - /// Returns the initialization symbol for this MaterializationUnit (if any). - const SymbolStringPtr &getInitializerSymbol() const { return InitSymbol; } - - /// Implementations of this method should materialize all symbols - /// in the materialzation unit, except for those that have been - /// previously discarded. - virtual void - materialize(std::unique_ptr R) = 0; - - /// Called by JITDylibs to notify MaterializationUnits that the given symbol - /// has been overridden. - void doDiscard(const JITDylib &JD, const SymbolStringPtr &Name) { - SymbolFlags.erase(Name); - if (InitSymbol == Name) { - DEBUG_WITH_TYPE("orc", { - dbgs() << "In " << getName() << ": discarding init symbol \"" - << *Name << "\"\n"; - }); - InitSymbol = nullptr; - } - discard(JD, std::move(Name)); - } - -protected: - SymbolFlagsMap SymbolFlags; - SymbolStringPtr InitSymbol; - -private: - virtual void anchor(); - - /// Implementations of this method should discard the given symbol - /// from the source (e.g. 
if the source is an LLVM IR Module and the - /// symbol is a function, delete the function body or mark it available - /// externally). - virtual void discard(const JITDylib &JD, const SymbolStringPtr &Name) = 0; -}; - -/// A MaterializationUnit implementation for pre-existing absolute symbols. -/// -/// All symbols will be resolved and marked ready as soon as the unit is -/// materialized. -class AbsoluteSymbolsMaterializationUnit : public MaterializationUnit { -public: - AbsoluteSymbolsMaterializationUnit(SymbolMap Symbols); - - StringRef getName() const override; - -private: - void materialize(std::unique_ptr R) override; - void discard(const JITDylib &JD, const SymbolStringPtr &Name) override; - static MaterializationUnit::Interface extractFlags(const SymbolMap &Symbols); - - SymbolMap Symbols; -}; - -/// Create an AbsoluteSymbolsMaterializationUnit with the given symbols. -/// Useful for inserting absolute symbols into a JITDylib. E.g.: -/// \code{.cpp} -/// JITDylib &JD = ...; -/// SymbolStringPtr Foo = ...; -/// ExecutorSymbolDef FooSym = ...; -/// if (auto Err = JD.define(absoluteSymbols({{Foo, FooSym}}))) -/// return Err; -/// \endcode -/// -inline std::unique_ptr -absoluteSymbols(SymbolMap Symbols) { - return std::make_unique( - std::move(Symbols)); -} - /// A materialization unit for symbol aliases. Allows existing symbols to be /// aliased with alternate flags. class ReExportsMaterializationUnit : public MaterializationUnit { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/CoreContainers.h b/llvm/include/llvm/ExecutionEngine/Orc/CoreContainers.h new file mode 100644 index 00000000000000..06d5aef0119f9b --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/CoreContainers.h @@ -0,0 +1,47 @@ +//===---- CoreContainers.h - Symbol Containers for Core APIs ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Symbol container types for core ORC APIs. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_CORECONTAINERS_H +#define LLVM_EXECUTIONENGINE_ORC_CORECONTAINERS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorSymbolDef.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" + +#include + +namespace llvm::orc { + +class JITDylib; + +/// A set of symbol names (represented by SymbolStringPtrs for +// efficiency). +using SymbolNameSet = DenseSet; + +/// A vector of symbol names. +using SymbolNameVector = std::vector; + +/// A map from symbol names (as SymbolStringPtrs) to JITSymbols +/// (address/flags pairs). +using SymbolMap = DenseMap; + +/// A map from symbol names (as SymbolStringPtrs) to JITSymbolFlags. +using SymbolFlagsMap = DenseMap; + +/// A map from JITDylibs to sets of symbols. 
+using SymbolDependenceMap = DenseMap; + +} // End namespace llvm::orc + +#endif // LLVM_EXECUTIONENGINE_ORC_CORECONTAINERS_H diff --git a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h index a2364b4515f01b..ebff2106e9d72b 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/LLJIT.h @@ -14,6 +14,7 @@ #define LLVM_EXECUTIONENGINE_ORC_LLJIT_H #include "llvm/ADT/SmallSet.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MaterializationUnit.h b/llvm/include/llvm/ExecutionEngine/Orc/MaterializationUnit.h new file mode 100644 index 00000000000000..4ac8f6b6ba05ad --- /dev/null +++ b/llvm/include/llvm/ExecutionEngine/Orc/MaterializationUnit.h @@ -0,0 +1,103 @@ +//===---- MaterializationUnit.h -- Materialization Black Box ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// MaterializationUnit class and related types and operations. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_EXECUTIONENGINE_ORC_MATERIALIZATIONUNIT_H +#define LLVM_EXECUTIONENGINE_ORC_MATERIALIZATIONUNIT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/CoreContainers.h" +#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" + +namespace llvm::orc { + +class MaterializationResponsibility; + +/// A MaterializationUnit represents a set of symbol definitions that can +/// be materialized as a group, or individually discarded (when +/// overriding definitions are encountered). +/// +/// MaterializationUnits are used when providing lazy definitions of symbols to +/// JITDylibs. The JITDylib will call materialize when the address of a symbol +/// is requested via the lookup method. The JITDylib will call discard if a +/// stronger definition is added or already present. +class MaterializationUnit { + friend class ExecutionSession; + friend class JITDylib; + +public: + static char ID; + + struct Interface { + Interface() = default; + Interface(SymbolFlagsMap InitalSymbolFlags, SymbolStringPtr InitSymbol) + : SymbolFlags(std::move(InitalSymbolFlags)), + InitSymbol(std::move(InitSymbol)) { + assert((!this->InitSymbol || this->SymbolFlags.count(this->InitSymbol)) && + "If set, InitSymbol should appear in InitialSymbolFlags map"); + } + + SymbolFlagsMap SymbolFlags; + SymbolStringPtr InitSymbol; + }; + + MaterializationUnit(Interface I) + : SymbolFlags(std::move(I.SymbolFlags)), + InitSymbol(std::move(I.InitSymbol)) {} + virtual ~MaterializationUnit() = default; + + /// Return the name of this materialization unit. Useful for debugging + /// output. + virtual StringRef getName() const = 0; + + /// Return the set of symbols that this source provides. + const SymbolFlagsMap &getSymbols() const { return SymbolFlags; } + + /// Returns the initialization symbol for this MaterializationUnit (if any). 
+ const SymbolStringPtr &getInitializerSymbol() const { return InitSymbol; } + + /// Implementations of this method should materialize all symbols + /// in the materialization unit, except for those that have been + /// previously discarded. + virtual void + materialize(std::unique_ptr R) = 0; + + /// Called by JITDylibs to notify MaterializationUnits that the given symbol + /// has been overridden. + void doDiscard(const JITDylib &JD, const SymbolStringPtr &Name) { + SymbolFlags.erase(Name); + if (InitSymbol == Name) { + DEBUG_WITH_TYPE("orc", { + dbgs() << "In " << getName() << ": discarding init symbol \"" + << *Name << "\"\n"; + }); + InitSymbol = nullptr; + } + discard(JD, std::move(Name)); + } + +protected: + SymbolFlagsMap SymbolFlags; + SymbolStringPtr InitSymbol; + +private: + virtual void anchor(); + + /// Implementations of this method should discard the given symbol + /// from the source (e.g. if the source is an LLVM IR Module and the + /// symbol is a function, delete the function body or mark it available + /// externally). + virtual void discard(const JITDylib &JD, const SymbolStringPtr &Name) = 0; +}; + +} // namespace llvm::orc + +#endif // LLVM_EXECUTIONENGINE_ORC_MATERIALIZATIONUNIT_H diff --git a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h index 8ff15b51f1abdf..f4e089db0080ea 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ClauseT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ClauseT.h @@ -238,8 +238,9 @@ struct MapperT { // When used as arguments for other clauses, e.g. `fail`. ENUM(MemoryOrder, AcqRel, Acquire, Relaxed, Release, SeqCst); ENUM(MotionExpectation, Present); +// Union of `dependence-type` and `task-dependence-type`.
// V5.2: [15.9.1] `task-dependence-type` modifier -ENUM(TaskDependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink, +ENUM(DependenceType, Depobj, In, Inout, Inoutset, Mutexinoutset, Out, Sink, Source); template // @@ -502,17 +503,17 @@ template // struct DependT { using Iterator = type::IteratorT; using LocatorList = ObjectListT; - using TaskDependenceType = tomp::type::TaskDependenceType; + using DependenceType = tomp::type::DependenceType; - struct DepType { // The form with task dependence type. + struct TaskDep { // The form with task dependence type. using TupleTrait = std::true_type; // Empty LocatorList means "omp_all_memory". - std::tuple t; + std::tuple t; }; using Doacross = DoacrossT; using UnionTrait = std::true_type; - std::variant u; // Doacross form is legacy + std::variant u; // Doacross form is legacy }; // V5.2: [3.5] `destroy` clause @@ -562,7 +563,7 @@ struct DistScheduleT { template // struct DoacrossT { using Vector = ListT>; - ENUM(DependenceType, Source, Sink); + using DependenceType = tomp::type::DependenceType; using TupleTrait = std::true_type; // Empty Vector means "omp_cur_iteration" std::tuple t; @@ -1162,9 +1163,9 @@ struct UntiedT { // V5.2: [15.9.3] `update` clause template // struct UpdateT { - using TaskDependenceType = tomp::type::TaskDependenceType; + using DependenceType = tomp::type::DependenceType; using WrapperTrait = std::true_type; - OPT(TaskDependenceType) v; + OPT(DependenceType) v; }; // V5.2: [14.1.3] `use` clause diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td index d1cc753b7daf02..fa96c3f367f0ce 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.td +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td @@ -163,6 +163,7 @@ def OMPC_DistSchedule : Clause<"dist_schedule"> { } def OMPC_Doacross : Clause<"doacross"> { let clangClass = "OMPDoacrossClause"; + let flangClass = "OmpDoacrossClause"; } def OMPC_DynamicAllocators : Clause<"dynamic_allocators"> { let 
clangClass = "OMPDynamicAllocatorsClause"; diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index a6605052ba83d3..e979d8840cbaf8 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -371,29 +371,29 @@ class DbgVariableRecord : public DbgRecord, protected DebugValueUser { return I == RHS.I; } const Value *operator*() const { - ValueAsMetadata *VAM = I.is() - ? I.get() - : *I.get(); + ValueAsMetadata *VAM = isa(I) + ? cast(I) + : *cast(I); return VAM->getValue(); }; Value *operator*() { - ValueAsMetadata *VAM = I.is() - ? I.get() - : *I.get(); + ValueAsMetadata *VAM = isa(I) + ? cast(I) + : *cast(I); return VAM->getValue(); } location_op_iterator &operator++() { - if (I.is()) - I = I.get() + 1; + if (auto *VAM = dyn_cast(I)) + I = VAM + 1; else - I = I.get() + 1; + I = cast(I) + 1; return *this; } location_op_iterator &operator--() { - if (I.is()) - I = I.get() - 1; + if (auto *VAM = dyn_cast(I)) + I = VAM - 1; else - I = I.get() - 1; + I = cast(I) - 1; return *this; } }; diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 049d843015d5ae..115fcee0b04f22 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -613,6 +613,28 @@ class CP_ASYNC_BULK_TENSOR_S2G_INTR { ImmArg>]; } +class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR { + string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d"; + + bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0); + int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0); + list Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets); + list TensorDimsTy = !listsplat(llvm_i32_ty, dim); + list ArgsTy = !listconcat( + [llvm_ptr_ty], // tensormap_ptr + TensorDimsTy, // actual tensor dims + Im2ColOffsetsTy, // im2col offsets + [llvm_i64_ty, // cache_hint + llvm_i1_ty] // Flag for cache_hint + ); + + int TempFlagsStartIdx 
= !add(dim, 2); + int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets); + list IntrProp = [IntrConvergent, + ReadOnly>, NoCapture>, + ImmArg>]; +} + let TargetPrefix = "nvvm" in { def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], @@ -4902,6 +4924,8 @@ foreach dim = [1, 2, 3, 4, 5] in { def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>; foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR] in def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>; + foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR] in + def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>; } } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 3003f9887e239c..b2d6f44b7927a9 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -6101,6 +6101,25 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, llvm_i32_ty], []>; + + def int_x86_tmmultf32ps : ClangBuiltin<"__builtin_ia32_tmmultf32ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>, + ImmArg>]>; + def int_x86_ttmmultf32ps : ClangBuiltin<"__builtin_ia32_ttmmultf32ps">, + Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty], + [ImmArg>, ImmArg>, + ImmArg>]>; + def int_x86_tmmultf32ps_internal : + ClangBuiltin<"__builtin_ia32_tmmultf32ps_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, + llvm_x86amx_ty, llvm_x86amx_ty], []>; + def int_x86_ttmmultf32ps_internal : + ClangBuiltin<"__builtin_ia32_ttmmultf32ps_internal">, + Intrinsic<[llvm_x86amx_ty], + [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty, llvm_x86amx_ty, + llvm_x86amx_ty, llvm_x86amx_ty], []>; } //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def 
b/llvm/include/llvm/IR/RuntimeLibcalls.def index 4aab658a86690c..9c56cf098ff774 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -304,6 +304,16 @@ HANDLE_LIBCALL(FMAX_F64, "fmax") HANDLE_LIBCALL(FMAX_F80, "fmaxl") HANDLE_LIBCALL(FMAX_F128, "fmaxl") HANDLE_LIBCALL(FMAX_PPCF128, "fmaxl") +HANDLE_LIBCALL(FMINIMUM_F32, "fminimumf") +HANDLE_LIBCALL(FMINIMUM_F64, "fminimum") +HANDLE_LIBCALL(FMINIMUM_F80, "fminimuml") +HANDLE_LIBCALL(FMINIMUM_F128, "fminimuml") +HANDLE_LIBCALL(FMINIMUM_PPCF128, "fminimuml") +HANDLE_LIBCALL(FMAXIMUM_F32, "fmaximumf") +HANDLE_LIBCALL(FMAXIMUM_F64, "fmaximum") +HANDLE_LIBCALL(FMAXIMUM_F80, "fmaximuml") +HANDLE_LIBCALL(FMAXIMUM_F128, "fmaximuml") +HANDLE_LIBCALL(FMAXIMUM_PPCF128, "fmaximuml") HANDLE_LIBCALL(FMINIMUMNUM_F32, "fminimum_numf") HANDLE_LIBCALL(FMINIMUMNUM_F64, "fminimum_num") HANDLE_LIBCALL(FMINIMUMNUM_F80, "fminimum_numl") diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h index 4bfae9eba1a0aa..fd1f0557895446 100644 --- a/llvm/include/llvm/MC/MCPseudoProbe.h +++ b/llvm/include/llvm/MC/MCPseudoProbe.h @@ -431,7 +431,10 @@ class MCPseudoProbeDecoder { using Uint64Map = DenseMap; // Decode pseudo_probe_desc section to build GUID to PseudoProbeFuncDesc map. - bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size); + // If pseudo_probe_desc section is mapped to memory and \p IsMMapped is true, + // uses StringRefs pointing to the section. + bool buildGUID2FuncDescMap(const uint8_t *Start, std::size_t Size, + bool IsMMapped = false); // Decode pseudo_probe section to count the number of probes and inlined // function records for each function record.
diff --git a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h index 3b0cbcdd49c254..36a996632b71e3 100644 --- a/llvm/include/llvm/ProfileData/CtxInstrContextNode.h +++ b/llvm/include/llvm/ProfileData/CtxInstrContextNode.h @@ -8,9 +8,9 @@ //============================================================================== // // NOTE! -// llvm/lib/ProfileData/CtxInstrContextNode.h and +// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and // compiler-rt/lib/ctx_profile/CtxInstrContextNode.h -// must be exact copies of each other +// must be exact copies of each other. // // compiler-rt creates these objects as part of the instrumentation runtime for // contextual profiling. LLVM only consumes them to convert a contextual tree @@ -114,4 +114,4 @@ class ContextNode final { }; } // namespace ctx_profile } // namespace llvm -#endif \ No newline at end of file +#endif diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 0c4c6ccd5c568e..17987935ed3cf4 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -776,6 +776,9 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR) /// Generic splatvector. HANDLE_TARGET_OPCODE(G_SPLAT_VECTOR) +/// Generic stepvector. +HANDLE_TARGET_OPCODE(G_STEP_VECTOR) + /// Generic masked compress. 
HANDLE_TARGET_OPCODE(G_VECTOR_COMPRESS) diff --git a/llvm/include/llvm/Support/YAMLTraits.h b/llvm/include/llvm/Support/YAMLTraits.h index 1d04783753d5cd..403584e52fed3b 100644 --- a/llvm/include/llvm/Support/YAMLTraits.h +++ b/llvm/include/llvm/Support/YAMLTraits.h @@ -2079,6 +2079,15 @@ template struct StdMapStringCustomMappingTraitsImpl { LLVM_YAML_IS_SEQUENCE_VECTOR_IMPL(type, true) #define LLVM_YAML_DECLARE_MAPPING_TRAITS(Type) \ + namespace llvm { \ + namespace yaml { \ + template <> struct LLVM_ABI MappingTraits { \ + static void mapping(IO &IO, Type &Obj); \ + }; \ + } \ + } + +#define LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(Type) \ namespace llvm { \ namespace yaml { \ template <> struct MappingTraits { \ @@ -2090,7 +2099,7 @@ template struct StdMapStringCustomMappingTraitsImpl { #define LLVM_YAML_DECLARE_ENUM_TRAITS(Type) \ namespace llvm { \ namespace yaml { \ - template <> struct ScalarEnumerationTraits { \ + template <> struct LLVM_ABI ScalarEnumerationTraits { \ static void enumeration(IO &io, Type &Value); \ }; \ } \ @@ -2099,7 +2108,7 @@ template struct StdMapStringCustomMappingTraitsImpl { #define LLVM_YAML_DECLARE_BITSET_TRAITS(Type) \ namespace llvm { \ namespace yaml { \ - template <> struct ScalarBitSetTraits { \ + template <> struct LLVM_ABI ScalarBitSetTraits { \ static void bitset(IO &IO, Type &Options); \ }; \ } \ @@ -2108,7 +2117,7 @@ template struct StdMapStringCustomMappingTraitsImpl { #define LLVM_YAML_DECLARE_SCALAR_TRAITS(Type, MustQuote) \ namespace llvm { \ namespace yaml { \ - template <> struct ScalarTraits { \ + template <> struct LLVM_ABI ScalarTraits { \ static void output(const Type &Value, void *ctx, raw_ostream &Out); \ static StringRef input(StringRef Scalar, void *ctxt, Type &Value); \ static QuotingType mustQuote(StringRef) { return MustQuote; } \ diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 62bb9789afe5d2..60606db078b374 100644 --- 
a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1590,6 +1590,13 @@ def G_SPLAT_VECTOR: GenericInstruction { let hasSideEffects = false; } +// Generic stepvector. +def G_STEP_VECTOR: GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins unknown:$step); + let hasSideEffects = false; +} + // Generic masked compress. def G_VECTOR_COMPRESS: GenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 5928b369913916..6da089b719c06c 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -335,6 +335,14 @@ def mul_to_shl : GICombineRule< [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]), (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>; +// (sub x, C) -> (add x, -C) +def sub_to_add : GICombineRule< + (defs root:$d, build_fn_matchinfo:$matchinfo), + (match (G_CONSTANT $c, $imm), + (G_SUB $d, $op1, $c):$mi, + [{ return Helper.matchCombineSubToAdd(*${mi}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnNoErase(*${mi}, ${matchinfo}); }])>; + // shl ([asz]ext x), y => zext (shl x, y), if shift does not overflow int def reduce_shl_of_extend_matchdata : GIDefMatchData<"RegisterImmPair">; def reduce_shl_of_extend : GICombineRule< @@ -1912,8 +1920,9 @@ def bitreverse_shift : GICombineGroup<[bitreverse_shl, bitreverse_lshr]>; def select_combines : GICombineGroup<[select_undef_cmp, select_constant_cmp, select_to_iminmax, match_selects]>; -def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, add_p2i_to_ptradd, - mul_by_neg_one, idempotent_prop]>; +def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl, sub_to_add, + add_p2i_to_ptradd, mul_by_neg_one, + idempotent_prop]>; def fma_combines : GICombineGroup<[combine_fadd_fmul_to_fmad_or_fma, combine_fadd_fpext_fmul_to_fmad_or_fma, 
combine_fadd_fma_fmul_to_fmad_or_fma, diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 79c07bc2fc9204..2148f5be4c41aa 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -48,6 +48,7 @@ class GINodeEquiv { // These are defined in the same order as the G_* instructions. def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index 815556e374bef5..3b643563775688 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -267,6 +267,7 @@ X86_FEATURE (ZU, "zu") X86_FEATURE (AMX_FP8, "amx-fp8") X86_FEATURE (AMX_TRANSPOSE, "amx-transpose") X86_FEATURE (AMX_AVX512, "amx-avx512") +X86_FEATURE (AMX_TF32, "amx-tf32") // These features aren't really CPU features, but the frontend can set them. 
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk") X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches") diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfile.h b/llvm/include/llvm/Transforms/IPO/SampleProfile.h index e94f6ba55cd0dd..6f4f9701bae930 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfile.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfile.h @@ -41,7 +41,9 @@ class SampleProfileLoaderPass : public PassInfoMixin { SampleProfileLoaderPass( std::string File = "", std::string RemappingFile = "", ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None, - IntrusiveRefCntPtr FS = nullptr); + IntrusiveRefCntPtr FS = nullptr, + bool DisableSampleProfileInlining = false, + bool UseFlattenedProfile = false); PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); @@ -50,6 +52,8 @@ class SampleProfileLoaderPass : public PassInfoMixin { std::string ProfileRemappingFileName; const ThinOrFullLTOPhase LTOPhase; IntrusiveRefCntPtr FS; + bool DisableSampleProfileInlining; + bool UseFlattenedProfile; }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 18e34bcec81b06..02cd7650ad8a5a 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -25,10 +25,12 @@ namespace llvm::sandboxir { class BottomUpVec final : public FunctionPass { bool Change = false; std::unique_ptr Legality; + SmallVector DeadInstrCandidates; /// Creates and returns a vector instruction that replaces the instructions in /// \p Bndl. \p Operands are the already vectorized operands. 
Value *createVectorInstr(ArrayRef Bndl, ArrayRef Operands); + void tryEraseDeadInstrs(); Value *vectorizeRec(ArrayRef Bndl); bool tryVectorize(ArrayRef Seeds); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f178e3a8acc259..c48068afc04816 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4238,6 +4238,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB, case LibFunc_atanf: case LibFunc_atanl: return Intrinsic::atan; + case LibFunc_atan2: + case LibFunc_atan2f: + case LibFunc_atan2l: + return Intrinsic::atan2; case LibFunc_sinh: case LibFunc_sinhf: case LibFunc_sinhl: diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index cd5cf0443541fc..15e325a0fffca5 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -69,6 +69,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::asin: case Intrinsic::acos: case Intrinsic::atan: + case Intrinsic::atan2: case Intrinsic::sin: case Intrinsic::cos: case Intrinsic::tan: diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 59257fd6aadd52..a465f52bfd5936 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -1051,10 +1051,10 @@ class AssignmentTrackingLowering { OS << ", s="; if (Source.isNull()) OS << "null"; - else if (isa(Source)) - OS << Source.get(); + else if (const auto *DAI = dyn_cast(Source)) + OS << DAI; else - OS << Source.get(); + OS << cast(Source); OS << ")"; } diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 3e73995846176e..b95516f616e0f1 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -1119,9 +1119,6 @@ bool EarlyIfConverter::run(MachineFunction &MF) { PreservedAnalyses 
EarlyIfConverterPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - if (MF.getFunction().hasOptNone()) - return PreservedAnalyses::all(); - MachineDominatorTree &MDT = MFAM.getResult(MF); MachineLoopInfo &LI = MFAM.getResult(MF); MachineTraceMetrics &MTM = MFAM.getResult(MF); diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp index 08ee6169ecee84..5126aba3658c14 100644 --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -21,7 +21,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DataLayout.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 3b648a7e3f4472..32afbeaeaa249e 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -2044,6 +2044,31 @@ void CombinerHelper::applyCombineMulToShl(MachineInstr &MI, Observer.changedInstr(MI); } +bool CombinerHelper::matchCombineSubToAdd(MachineInstr &MI, + BuildFnTy &MatchInfo) { + GSub &Sub = cast(MI); + + LLT Ty = MRI.getType(Sub.getReg(0)); + + if (!isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, {Ty}})) + return false; + + if (!isConstantLegalOrBeforeLegalizer(Ty)) + return false; + + APInt Imm = getIConstantFromReg(Sub.getRHSReg(), MRI); + + MatchInfo = [=, &MI](MachineIRBuilder &B) { + auto NegCst = B.buildConstant(Ty, -Imm); + Observer.changingInstr(MI); + MI.setDesc(B.getTII().get(TargetOpcode::G_ADD)); + MI.getOperand(2).setReg(NegCst.getReg(0)); + MI.clearFlag(MachineInstr::MIFlag::NoUWrap); + Observer.changedInstr(MI); + }; + return true; +} + // shl ([sza]ext x), y => zext (shl x, y), if shift does not overflow source bool CombinerHelper::matchCombineShlOfExtend(MachineInstr 
&MI, RegisterImmPair &MatchData) { diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCompares.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCompares.cpp index 7a4cfd4b1a7bb5..872b5fed11c6e6 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperCompares.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperCompares.cpp @@ -15,14 +15,11 @@ #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" -#include "llvm/CodeGen/LowLevelTypeUtils.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/Instructions.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/ErrorHandling.h" #include #define DEBUG_TYPE "gi-combiner" diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index 2c98b129a1a892..fefa8f2ea85942 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/IR/ConstantRange.h" -#include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "gisel-known-bits" diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 8cd3fa5f432b6e..056f4f41ffca79 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -38,7 +38,6 @@ #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/CodeGen/SwitchLoweringUtils.h" #include "llvm/CodeGen/TargetFrameLowering.h" @@ -2167,7 +2166,8 @@ bool 
IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: { // No stack colouring in O0, discard region information. - if (MF->getTarget().getOptLevel() == CodeGenOptLevel::None) + if (MF->getTarget().getOptLevel() == CodeGenOptLevel::None || + MF->getFunction().hasOptNone()) return true; unsigned Op = ID == Intrinsic::lifetime_start ? TargetOpcode::LIFETIME_START diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp index 9444ff518ca9cb..9185a7d1eca913 100644 --- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp +++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp @@ -31,7 +31,6 @@ #include "llvm/IR/Function.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/CodeGenCoverage.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugCounter.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp index 6d75258c1041b1..ef1c54e6cef138 100644 --- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp index 0d0c093648ebaa..e411e73dbe7340 100644 --- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp @@ -36,7 +36,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/MathExtras.h" #include #define DEBUG_TYPE "loadstore-opt" diff --git 
a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 02dbe781babdba..c5e5c926160e2c 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -809,6 +809,17 @@ MachineInstrBuilder MachineIRBuilder::buildInsert(const DstOp &Res, return buildInstr(TargetOpcode::G_INSERT, Res, {Src, Op, uint64_t(Index)}); } +MachineInstrBuilder MachineIRBuilder::buildStepVector(const DstOp &Res, + unsigned Step) { + ConstantInt *CI = + ConstantInt::get(getMF().getFunction().getContext(), APInt(64, Step)); + auto StepVector = buildInstr(TargetOpcode::G_STEP_VECTOR); + StepVector->setDebugLoc(DebugLoc()); + Res.addDefToMIB(*getMRI(), StepVector); + StepVector.addCImm(CI); + return StepVector; +} + MachineInstrBuilder MachineIRBuilder::buildVScale(const DstOp &Res, unsigned MinElts) { diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index 5cee07461d7e22..45807a6818ee5e 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -214,8 +214,8 @@ bool llvm::canReplaceReg(Register DstReg, Register SrcReg, // Otherwise match if the Src is already a regclass that is covered by the Dst // RegBank. 
- return DstRBC.is() && MRI.getRegClassOrNull(SrcReg) && - DstRBC.get()->covers( + return isa(DstRBC) && MRI.getRegClassOrNull(SrcReg) && + cast(DstRBC)->covers( *MRI.getRegClassOrNull(SrcReg)); } diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp index f4bf47c3ae158f..6a0b918d5e6f67 100644 --- a/llvm/lib/CodeGen/LiveRangeShrink.cpp +++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp @@ -246,7 +246,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) { if (MI.getOperand(0).isReg()) for (; EndIter != MBB.end() && EndIter->isDebugValue() && EndIter->hasDebugOperandForReg(MI.getOperand(0).getReg()); - ++EndIter, ++Next) + ++EndIter) IOM[&*EndIter] = NewOrder; MBB.splice(I, &MBB, MI.getIterator(), EndIter); } diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp index 8e9fcccff77645..0a547050e91a8a 100644 --- a/llvm/lib/CodeGen/MachineCSE.cpp +++ b/llvm/lib/CodeGen/MachineCSE.cpp @@ -957,9 +957,6 @@ PreservedAnalyses MachineCSEPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { MFPropsModifier _(*this, MF); - if (MF.getFunction().hasOptNone()) - return PreservedAnalyses::all(); - MachineDominatorTree &MDT = MFAM.getResult(MF); MachineBlockFrequencyInfo &MBFI = MFAM.getResult(MF); diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp index da095c692e6d5d..853d455bec5f98 100644 --- a/llvm/lib/CodeGen/MachineLICM.cpp +++ b/llvm/lib/CodeGen/MachineLICM.cpp @@ -1764,9 +1764,6 @@ bool MachineLICMImpl::isTgtHotterThanSrc(MachineBasicBlock *SrcBlock, template PreservedAnalyses MachineLICMBasePass::run( MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - if (MF.getFunction().hasOptNone()) - return PreservedAnalyses::all(); - bool Changed = MachineLICMImpl(PreRegAlloc, nullptr, &MFAM).run(MF); if (!Changed) return PreservedAnalyses::all(); diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 
e2c09fe25d55cd..3910046a1652b1 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1487,7 +1487,9 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { LLT SrcTy = MRI->getType(MI->getOperand(NumDsts).getReg()); if (DstTy.isVector()) { // This case is the converse of G_CONCAT_VECTORS. - if (!SrcTy.isVector() || SrcTy.getScalarType() != DstTy.getScalarType() || + if (!SrcTy.isVector() || + (SrcTy.getScalarType() != DstTy.getScalarType() && + !SrcTy.isPointerVector()) || SrcTy.isScalableVector() != DstTy.isScalableVector() || SrcTy.getSizeInBits() != NumDsts * DstTy.getSizeInBits()) report("G_UNMERGE_VALUES source operand does not match vector " @@ -1729,6 +1731,36 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { } break; } + case TargetOpcode::G_STEP_VECTOR: { + if (!MI->getOperand(1).isCImm()) { + report("operand must be cimm", MI); + break; + } + + if (!MI->getOperand(1).getCImm()->getValue().isStrictlyPositive()) { + report("step must be > 0", MI); + break; + } + + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + if (!DstTy.isScalableVector()) { + report("Destination type must be a scalable vector", MI); + break; + } + + // + if (!DstTy.getElementType().isScalar()) { + report("Destination element type must be scalar", MI); + break; + } + + if (MI->getOperand(1).getCImm()->getBitWidth() != + DstTy.getElementType().getScalarSizeInBits()) { + report("step bitwidth differs from result type element bitwidth", MI); + break; + } + break; + } case TargetOpcode::G_INSERT_SUBVECTOR: { const MachineOperand &Src0Op = MI->getOperand(1); if (!Src0Op.isReg()) { diff --git a/llvm/lib/CodeGen/OptimizePHIs.cpp b/llvm/lib/CodeGen/OptimizePHIs.cpp index cccc368e56e40d..569b6b2f769c00 100644 --- a/llvm/lib/CodeGen/OptimizePHIs.cpp +++ b/llvm/lib/CodeGen/OptimizePHIs.cpp @@ -81,9 +81,6 @@ INITIALIZE_PASS(OptimizePHIsLegacy, DEBUG_TYPE, PreservedAnalyses 
OptimizePHIsPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - if (MF.getFunction().hasOptNone()) - return PreservedAnalyses::all(); - OptimizePHIs OP; if (!OP.run(MF)) return PreservedAnalyses::all(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index fa2731ff7dbda7..5abd7cb97bda57 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -76,6 +76,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break; case ISD::FMINIMUMNUM: R = SoftenFloatRes_FMINIMUMNUM(N); break; case ISD::FMAXIMUMNUM: R = SoftenFloatRes_FMAXIMUMNUM(N); break; + case ISD::FMINIMUM: R = SoftenFloatRes_FMINIMUM(N); break; + case ISD::FMAXIMUM: R = SoftenFloatRes_FMAXIMUM(N); break; case ISD::STRICT_FADD: case ISD::FADD: R = SoftenFloatRes_FADD(N); break; case ISD::STRICT_FACOS: @@ -342,6 +344,20 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXIMUMNUM(SDNode *N) { RTLIB::FMAXIMUMNUM_F128, RTLIB::FMAXIMUMNUM_PPCF128)); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FMINIMUM(SDNode *N) { + return SoftenFloatRes_Binary( + N, GetFPLibCall(N->getValueType(0), RTLIB::FMINIMUM_F32, + RTLIB::FMINIMUM_F64, RTLIB::FMINIMUM_F80, + RTLIB::FMINIMUM_F128, RTLIB::FMINIMUM_PPCF128)); +} + +SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXIMUM(SDNode *N) { + return SoftenFloatRes_Binary( + N, GetFPLibCall(N->getValueType(0), RTLIB::FMAXIMUM_F32, + RTLIB::FMAXIMUM_F64, RTLIB::FMAXIMUM_F80, + RTLIB::FMAXIMUM_F128, RTLIB::FMAXIMUM_PPCF128)); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) { return SoftenFloatRes_Binary(N, GetFPLibCall(N->getValueType(0), RTLIB::ADD_F32, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 8d3458aaab9f86..a56cd5423e00bd 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -572,6 +572,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FMINIMUMNUM(SDNode *N); SDValue SoftenFloatRes_FMAXIMUMNUM(SDNode *N); + SDValue SoftenFloatRes_FMINIMUM(SDNode *N); + SDValue SoftenFloatRes_FMAXIMUM(SDNode *N); SDValue SoftenFloatRes_FADD(SDNode *N); SDValue SoftenFloatRes_FCBRT(SDNode *N); SDValue SoftenFloatRes_FCEIL(SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 203e14f6cde3e3..901e63c47fac17 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3920,6 +3920,19 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known.Zero.setBitsFrom(1); break; } + case ISD::MGATHER: + case ISD::MLOAD: { + ISD::LoadExtType ETy = + (Opcode == ISD::MGATHER) + ? cast(Op)->getExtensionType() + : cast(Op)->getExtensionType(); + if (ETy == ISD::ZEXTLOAD) { + EVT MemVT = cast(Op)->getMemoryVT(); + KnownBits Known0(MemVT.getScalarSizeInBits()); + return Known0.zext(BitWidth); + } + break; + } case ISD::LOAD: { LoadSDNode *LD = cast(Op); const Constant *Cst = TLI->getTargetConstantFromLoad(LD); diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index 0be31d5db11ae2..0305bdce26f731 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -1193,9 +1193,6 @@ bool StackColoringLegacy::runOnMachineFunction(MachineFunction &MF) { PreservedAnalyses StackColoringPass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { - if (MF.getFunction().hasOptNone()) - return PreservedAnalyses::all(); - StackColoring SC(&MFAM.getResult(MF)); if (SC.run(MF)) return PreservedAnalyses::none(); diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp index b698ca675b65e2..a50cdcfa7cc60e 
100644 --- a/llvm/lib/CodeGen/TailDuplication.cpp +++ b/llvm/lib/CodeGen/TailDuplication.cpp @@ -110,9 +110,6 @@ PreservedAnalyses TailDuplicatePassBase::run( MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { MFPropsModifier _(static_cast(*this), MF); - if (MF.getFunction().hasOptNone()) - return PreservedAnalyses::all(); - auto *MBPI = &MFAM.getResult(MF); auto *PSI = MFAM.getResult(MF) .getCachedResult( diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVLocation.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVLocation.cpp index 17b32a5f67b49b..3c078d8ee74b80 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVLocation.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVLocation.cpp @@ -156,7 +156,7 @@ std::string LVOperation::getOperandsDWARFInfo() { Stream << "push_object_address"; break; case dwarf::DW_OP_form_tls_address: - Stream << "form_tls_address " << hexString(Operands[0]); + Stream << "form_tls_address"; break; case dwarf::DW_OP_call_frame_cfa: Stream << "call_frame_cfa"; @@ -308,7 +308,7 @@ std::string LVOperation::getOperandsDWARFInfo() { PrintRegisterInfo(dwarf::DW_OP_reg0); break; case dwarf::DW_OP_GNU_push_tls_address: - Stream << "gnu_push_tls_address " << hexString(Operands[0]); + Stream << "gnu_push_tls_address"; break; case dwarf::DW_OP_GNU_addr_index: Stream << "gnu_addr_index " << unsigned(Operands[0]); diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index 1c523c01314977..ce1d5619e1fa80 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -304,12 +304,12 @@ void LVDWARFReader::processOneAttribute(const DWARFDie &Die, CurrentElement->setBitSize(*FormValue.getAsUnsignedConstant()); break; case dwarf::DW_AT_call_file: - CurrentElement->setCallFilenameIndex(GetAsUnsignedConstant()); + CurrentElement->setCallFilenameIndex(IncrementFileIndex + ? 
GetAsUnsignedConstant() + 1 + : GetAsUnsignedConstant()); break; case dwarf::DW_AT_call_line: - CurrentElement->setCallLineNumber(IncrementFileIndex - ? GetAsUnsignedConstant() + 1 - : GetAsUnsignedConstant()); + CurrentElement->setCallLineNumber(GetAsUnsignedConstant()); break; case dwarf::DW_AT_comp_dir: CompileUnit->setCompilationDirectory(dwarf::toStringRef(FormValue)); diff --git a/llvm/lib/ExecutionEngine/Orc/AbsoluteSymbols.cpp b/llvm/lib/ExecutionEngine/Orc/AbsoluteSymbols.cpp new file mode 100644 index 00000000000000..d37dad8925e9a6 --- /dev/null +++ b/llvm/lib/ExecutionEngine/Orc/AbsoluteSymbols.cpp @@ -0,0 +1,57 @@ +//===---------- AbsoluteSymbols.cpp - Absolute symbols utilities ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" +#include "llvm/ExecutionEngine/Orc/Core.h" + +#define DEBUG_TYPE "orc" + +namespace llvm::orc { + +AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit( + SymbolMap Symbols) + : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {} + +StringRef AbsoluteSymbolsMaterializationUnit::getName() const { + return ""; +} + +void AbsoluteSymbolsMaterializationUnit::materialize( + std::unique_ptr R) { + // Even though these are just absolute symbols we need to check for failure + // to resolve/emit: the tracker for these symbols may have been removed while + // the materialization was in flight (e.g. due to a failure in some action + // triggered by the queries attached to the resolution/emission of these + // symbols). 
+ if (auto Err = R->notifyResolved(Symbols)) { + R->getExecutionSession().reportError(std::move(Err)); + R->failMaterialization(); + return; + } + if (auto Err = R->notifyEmitted({})) { + R->getExecutionSession().reportError(std::move(Err)); + R->failMaterialization(); + return; + } +} + +void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, + const SymbolStringPtr &Name) { + assert(Symbols.count(Name) && "Symbol is not part of this MU"); + Symbols.erase(Name); +} + +MaterializationUnit::Interface +AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) { + SymbolFlagsMap Flags; + for (const auto &[Name, Def] : Symbols) + Flags[Name] = Def.getFlags(); + return MaterializationUnit::Interface(std::move(Flags), nullptr); +} + +} // namespace llvm::orc diff --git a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt index 008875118fdeff..7a73ab56a5d97c 100644 --- a/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt +++ b/llvm/lib/ExecutionEngine/Orc/CMakeLists.txt @@ -7,6 +7,7 @@ if( CMAKE_HOST_UNIX AND HAVE_LIBRT ) endif() add_llvm_component_library(LLVMOrcJIT + AbsoluteSymbols.cpp COFFVCRuntimeSupport.cpp COFFPlatform.cpp CompileOnDemandLayer.cpp diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index f46cb906bb7556..2176acc5bba807 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/COFFPlatform.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h" #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h" diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index 226216f781fe9e..78041993648834 100644 --- 
a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -276,47 +276,6 @@ void AsynchronousSymbolQuery::detach() { QueryRegistrations.clear(); } -AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit( - SymbolMap Symbols) - : MaterializationUnit(extractFlags(Symbols)), Symbols(std::move(Symbols)) {} - -StringRef AbsoluteSymbolsMaterializationUnit::getName() const { - return ""; -} - -void AbsoluteSymbolsMaterializationUnit::materialize( - std::unique_ptr R) { - // Even though these are just absolute symbols we need to check for failure - // to resolve/emit: the tracker for these symbols may have been removed while - // the materialization was in flight (e.g. due to a failure in some action - // triggered by the queries attached to the resolution/emission of these - // symbols). - if (auto Err = R->notifyResolved(Symbols)) { - R->getExecutionSession().reportError(std::move(Err)); - R->failMaterialization(); - return; - } - if (auto Err = R->notifyEmitted({})) { - R->getExecutionSession().reportError(std::move(Err)); - R->failMaterialization(); - return; - } -} - -void AbsoluteSymbolsMaterializationUnit::discard(const JITDylib &JD, - const SymbolStringPtr &Name) { - assert(Symbols.count(Name) && "Symbol is not part of this MU"); - Symbols.erase(Name); -} - -MaterializationUnit::Interface -AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) { - SymbolFlagsMap Flags; - for (const auto &[Name, Def] : Symbols) - Flags[Name] = Def.getFlags(); - return MaterializationUnit::Interface(std::move(Flags), nullptr); -} - ReExportsMaterializationUnit::ReExportsMaterializationUnit( JITDylib *SourceJD, JITDylibLookupFlags SourceJDLookupFlags, SymbolAliasMap Aliases) diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 6cea9845a3403b..3874f25751b1a2 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ 
b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -14,6 +14,7 @@ #include "llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/ppc64.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h" diff --git a/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp b/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp index 8490eee22aea56..2a93fcbf6c8c83 100644 --- a/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp +++ b/llvm/lib/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" + +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/Support/Error.h" diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp index 1dcf91443d55db..efaed1b82d0eb2 100644 --- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/Layer.h" #include "llvm/ExecutionEngine/Orc/LoadLinkableFile.h" #include "llvm/ExecutionEngine/Orc/MachO.h" diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index e5609053c74d7b..822316c4bf996e 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -12,6 +12,7 @@ #include "llvm/ExecutionEngine/JITLink/MachO.h" #include 
"llvm/ExecutionEngine/JITLink/aarch64.h" #include "llvm/ExecutionEngine/JITLink/x86_64.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h" diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp index 453b8f86868adb..ae76cb08765e46 100644 --- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp +++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp @@ -11,6 +11,7 @@ #include "llvm-c/OrcEE.h" #include "llvm-c/TargetMachine.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" diff --git a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp index 70b536d2feda0f..74b9eb29bdccf3 100644 --- a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/Speculation.h" + +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 665eda28c7d871..3d8e12e95b774f 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1033,8 +1033,8 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(Value *V, DbgInstPtr DVI = insertDbgValueIntrinsic( V, VarInfo, Expr, DL, InsertBefore ? 
InsertBefore->getParent() : nullptr, InsertBefore); - if (DVI.is()) - cast(DVI.get())->setTailCall(); + if (auto *Inst = dyn_cast(DVI)) + cast(Inst)->setTailCall(); return DVI; } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index e20a0f053481ed..e5b45e0082a823 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2099,10 +2099,10 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, AddrExpr, VarRec.DL); (void)Assign; LLVM_DEBUG(if (!Assign.isNull()) { - if (Assign.is()) - errs() << " > INSERT: " << *Assign.get() << "\n"; + if (const auto *Record = dyn_cast(Assign)) + errs() << " > INSERT: " << *Record << "\n"; else - errs() << " > INSERT: " << *Assign.get() << "\n"; + errs() << " > INSERT: " << *cast(Assign) << "\n"; }); } diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index 28f2ca550f5ec9..98cfbd11fde58f 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -274,7 +274,7 @@ ReplaceableMetadataImpl::getAllDbgVariableRecordUsers() { OwnerTy Owner = Pair.second.first; if (Owner.isNull()) continue; - if (!Owner.is()) + if (!isa(Owner)) continue; DVRUsersWithID.push_back(&UseMap[Pair.first]); } @@ -288,7 +288,7 @@ ReplaceableMetadataImpl::getAllDbgVariableRecordUsers() { }); SmallVector DVRUsers; for (auto UserWithID : DVRUsersWithID) - DVRUsers.push_back(UserWithID->first.get()->getUser()); + DVRUsers.push_back(cast(UserWithID->first)->getUser()); return DVRUsers; } @@ -396,8 +396,8 @@ void ReplaceableMetadataImpl::replaceAllUsesWith(Metadata *MD) { continue; } - if (Owner.is()) { - Owner.get()->handleChangedValue(Pair.first, MD); + if (auto *DVU = dyn_cast(Owner)) { + DVU->handleChangedValue(Pair.first, MD); continue; } @@ -436,7 +436,7 @@ void ReplaceableMetadataImpl::resolveAllUses(bool ResolveUsers) { auto Owner = Pair.second.first; if (!Owner) continue; - if (!Owner.is()) + if (!isa(Owner)) continue; // Resolve MDNodes that point at this. 
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index 90d7588407068a..2a3761b2cfe718 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -375,7 +375,8 @@ ErrorOr MCPseudoProbeDecoder::readString(uint32_t Size) { } bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, - std::size_t Size) { + std::size_t Size, + bool IsMMapped) { // The pseudo_probe_desc section has a format like: // .section .pseudo_probe_desc,"",@progbits // .quad -5182264717993193164 // GUID @@ -422,7 +423,8 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start, StringRef Name = cantFail(errorOrToExpected(readString(NameSize))); // Initialize PseudoProbeFuncDesc and populate it into GUID2FuncDescMap - GUID2FuncDescMap.emplace_back(GUID, Hash, Name.copy(FuncNameAllocator)); + GUID2FuncDescMap.emplace_back( + GUID, Hash, IsMMapped ? Name : Name.copy(FuncNameAllocator)); } assert(Data == End && "Have unprocessed data in pseudo_probe_desc section"); assert(GUID2FuncDescMap.size() == FuncDescCount && diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 785aa9bca88191..f0169877134733 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -2162,6 +2162,19 @@ PassBuilder::buildO0DefaultPipeline(OptimizationLevel Level, if (PGOOpt && PGOOpt->DebugInfoForProfiling) MPM.addPass(createModuleToFunctionPassAdaptor(AddDiscriminatorsPass())); + if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse) { + // Explicitly disable sample loader inlining and use flattened profile in O0 + // pipeline. 
+ MPM.addPass(SampleProfileLoaderPass(PGOOpt->ProfileFile, + PGOOpt->ProfileRemappingFile, + ThinOrFullLTOPhase::None, nullptr, + /*DisableSampleProfileInlining=*/true, + /*UseFlattenedProfile=*/true)); + // Cache ProfileSummaryAnalysis once to avoid the potential need to insert + // RequireAnalysisPass for PSI before subsequent non-module passes. + MPM.addPass(RequireAnalysisPass()); + } + invokePipelineEarlySimplificationEPCallbacks(MPM, Level, Phase); // Build a minimal pipeline based on the semantics required by LLVM, diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index d4866a025c1b48..5a3bf8884cd542 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1048,14 +1048,16 @@ void OptNoneInstrumentation::registerCallbacks( } bool OptNoneInstrumentation::shouldRun(StringRef PassID, Any IR) { - const auto *F = unwrapIR(IR); - if (!F) { - if (const auto *L = unwrapIR(IR)) - F = L->getHeader()->getParent(); - } - bool ShouldRun = !(F && F->hasOptNone()); + bool ShouldRun = true; + if (const auto *F = unwrapIR(IR)) + ShouldRun = !F->hasOptNone(); + else if (const auto *L = unwrapIR(IR)) + ShouldRun = !L->getHeader()->getParent()->hasOptNone(); + else if (const auto *MF = unwrapIR(IR)) + ShouldRun = !MF->getFunction().hasOptNone(); + if (!ShouldRun && DebugLogging) { - errs() << "Skipping pass " << PassID << " on " << F->getName() + errs() << "Skipping pass " << PassID << " on " << getIRName(IR) << " due to optnone attribute\n"; } return ShouldRun; diff --git a/llvm/lib/SandboxIR/Instruction.cpp b/llvm/lib/SandboxIR/Instruction.cpp index 096b827541eeaf..df941b2fa81efe 100644 --- a/llvm/lib/SandboxIR/Instruction.cpp +++ b/llvm/lib/SandboxIR/Instruction.cpp @@ -124,10 +124,6 @@ void Instruction::moveBefore(BasicBlock &BB, const BBIterator &WhereIt) { void Instruction::insertBefore(Instruction *BeforeI) { llvm::Instruction *BeforeTopI = 
BeforeI->getTopmostLLVMInstruction(); - // TODO: Move this to the verifier of sandboxir::Instruction. - assert(is_sorted(getLLVMInstrs(), - [](auto *I1, auto *I2) { return I1->comesBefore(I2); }) && - "Expected program order!"); Ctx.getTracker().emplaceIfTracking(this); diff --git a/llvm/lib/SandboxIR/Tracker.cpp b/llvm/lib/SandboxIR/Tracker.cpp index abcad39330094d..e4f84dee07e4da 100644 --- a/llvm/lib/SandboxIR/Tracker.cpp +++ b/llvm/lib/SandboxIR/Tracker.cpp @@ -111,10 +111,10 @@ void EraseFromParent::accept() { void EraseFromParent::revert(Tracker &Tracker) { // Place the bottom-most instruction first. auto [Operands, BotLLVMI] = InstrData[0]; - if (auto *NextLLVMI = NextLLVMIOrBB.dyn_cast()) { + if (auto *NextLLVMI = dyn_cast(NextLLVMIOrBB)) { BotLLVMI->insertBefore(NextLLVMI); } else { - auto *LLVMBB = NextLLVMIOrBB.get(); + auto *LLVMBB = cast(NextLLVMIOrBB); BotLLVMI->insertInto(LLVMBB, LLVMBB->end()); } for (auto [OpNum, Op] : enumerate(Operands)) @@ -145,10 +145,10 @@ RemoveFromParent::RemoveFromParent(Instruction *RemovedI) : RemovedI(RemovedI) { } void RemoveFromParent::revert(Tracker &Tracker) { - if (auto *NextI = NextInstrOrBB.dyn_cast()) { + if (auto *NextI = dyn_cast(NextInstrOrBB)) { RemovedI->insertBefore(NextI); } else { - auto *BB = NextInstrOrBB.get(); + auto *BB = cast(NextInstrOrBB); RemovedI->insertInto(BB, BB->end()); } } @@ -199,10 +199,10 @@ MoveInstr::MoveInstr(Instruction *MovedI) : MovedI(MovedI) { } void MoveInstr::revert(Tracker &Tracker) { - if (auto *NextI = NextInstrOrBB.dyn_cast()) { + if (auto *NextI = dyn_cast(NextInstrOrBB)) { MovedI->moveBefore(NextI); } else { - auto *BB = NextInstrOrBB.get(); + auto *BB = cast(NextInstrOrBB); MovedI->moveBefore(*BB, BB->end()); } } diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 0d69bbeb50260f..6854cccaafa1d7 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -63,7 +63,7 @@ def SVE2p1Unsupported : 
AArch64Unsupported; def SVE2Unsupported : AArch64Unsupported { let F = !listconcat([HasSVE2, HasSVE2orSME, HasSVE2orSME2, HasSSVE_FP8FMA, HasSMEF8F16, - HasSMEF8F32, HasSVEAES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm, + HasSMEF8F32, HasSVE2AES, HasSVE2SHA3, HasSVE2SM4, HasSVE2BitPerm, HasSVEB16B16], SVE2p1Unsupported.F); } diff --git a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp index 37a65b64a885b2..b3a7c737097f00 100644 --- a/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp +++ b/llvm/lib/Target/AArch64/AArch64A53Fix835769.cpp @@ -21,7 +21,6 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index e79457f925db66..40f92efcd2a979 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -61,7 +61,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Triple.h" #include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h" -#include #include #include #include diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp index 1eb34e7ca4ddc6..fa04ccfba30f06 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp @@ -17,7 +17,6 @@ #include "AArch64Subtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/IR/CallingConv.h" using namespace llvm; static const MCPhysReg XRegList[] = {AArch64::X0, AArch64::X1, AArch64::X2, diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp 
b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 154ae43b29d574..10661b64146125 100644 --- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -22,7 +22,6 @@ // //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunction.h" diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index ff569e3dce2e92..e8a4d73c671c9b 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -98,7 +98,6 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" @@ -109,7 +108,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetMachine.h" using namespace llvm; #define DEBUG_TYPE "aarch64-collect-loh" diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp index 7d14d2d20bad33..ecab42b89ec30f 100644 --- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -19,9 +19,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" -#include "llvm/MC/MCContext.h" #include "llvm/Support/Alignment.h" -#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index 
161cf24dd4037f..37222bf34426b5 100644 --- a/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -14,7 +14,6 @@ #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 9b7fc228d5de80..055cb3cefcedf9 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -32,13 +32,11 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" -#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include "llvm/TargetParser/Triple.h" #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td index f6dfbe2500168d..61cb4f7ac33792 100644 --- a/llvm/lib/Target/AArch64/AArch64FMV.td +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -78,7 +78,7 @@ def : FMVExtension<"sme2", "FEAT_SME2", "+sme2,+sme,+bf16", 580>; def : FMVExtension<"ssbs", "FEAT_SSBS2", "+ssbs", 490>; def : FMVExtension<"sve", "FEAT_SVE", "+sve,+fullfp16,+fp-armv8,+neon", 310>; def : FMVExtension<"sve2", "FEAT_SVE2", "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370>; -def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", "+sve2,+sve,+aes,+sve-aes,+fullfp16,+fp-armv8,+neon", 380>; +def : FMVExtension<"sve2-aes", "FEAT_SVE_PMULL128", "+sve2,+sve,+aes,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380>; def : FMVExtension<"sve2-bitperm", "FEAT_SVE_BITPERM", "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400>; def : FMVExtension<"sve2-sha3", "FEAT_SVE_SHA3", "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410>; def 
: FMVExtension<"sve2-sm4", "FEAT_SVE_SM4", "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420>; diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index d1e5d10829d557..9f0f23b6e6a658 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -36,7 +36,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/CodeGenTypes/MachineValueType.h" #include "llvm/IR/Argument.h" @@ -62,7 +61,6 @@ #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index f681f86029490e..a2349079889175 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -358,19 +358,13 @@ def FeatureTHE : ExtensionWithMArch<"the", "THE", "FEAT_THE", // Armv9.0 Architecture Extensions //===----------------------------------------------------------------------===// -def FeatureUseScalarIncVL : SubtargetFeature<"use-scalar-inc-vl", - "UseScalarIncVL", "true", "Prefer inc/dec over add+cnt">; - def FeatureSVE2 : ExtensionWithMArch<"sve2", "SVE2", "FEAT_SVE2", "Enable Scalable Vector Extension 2 (SVE2) instructions", - [FeatureSVE, FeatureUseScalarIncVL]>; + [FeatureSVE]>; -def FeatureSVEAES : ExtensionWithMArch<"sve-aes", "SVEAES", +def FeatureSVE2AES : ExtensionWithMArch<"sve2-aes", "SVE2AES", "FEAT_SVE_AES, FEAT_SVE_PMULL128", - "Enable SVE AES and quadword SVE polynomial multiply instructions", [FeatureAES]>; - -def AliasSVE2AES : ExtensionWithMArch<"sve2-aes", "ALIAS_SVE2AES", "", - "An alias of 
+sve2+sve-aes", [FeatureSVE2, FeatureSVEAES]>; + "Enable AES SVE2 instructions", [FeatureSVE2, FeatureAES]>; def FeatureSVE2SM4 : ExtensionWithMArch<"sve2-sm4", "SVE2SM4", "FEAT_SVE_SM4", "Enable SM4 SVE2 instructions", [FeatureSVE2, FeatureSM4]>; @@ -406,7 +400,7 @@ def FeatureRME : Extension<"rme", "RME", "FEAT_RME", "Enable Realm Management Extension">; def FeatureSME : ExtensionWithMArch<"sme", "SME", "FEAT_SME", - "Enable Scalable Matrix Extension (SME)", [FeatureBF16, FeatureUseScalarIncVL]>; + "Enable Scalable Matrix Extension (SME)", [FeatureBF16]>; def FeatureSMEF64F64 : ExtensionWithMArch<"sme-f64f64", "SMEF64F64", "FEAT_SME_F64F64", "Enable Scalable Matrix Extension (SME) F64F64 instructions", [FeatureSME]>; @@ -541,13 +535,13 @@ def FeatureSME2p2: ExtensionWithMArch<"sme2p2", "SME2p2", "FEAT_SME2p2", "Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions", [FeatureSME2p1]>; def FeatureSSVE_AES : ExtensionWithMArch<"ssve-aes", "SSVE_AES", "FEAT_SSVE_AES", - "Enable Armv9.6-A SVE AES support in streaming SVE mode", [FeatureSME2, FeatureSVEAES]>; + "Enable Armv9.6-A SVE2 AES support in streaming SVE mode", [FeatureSME2, FeatureSVE2AES]>; def FeatureSVE2p2 : ExtensionWithMArch<"sve2p2", "SVE2p2", "FEAT_SVE2p2", "Enable Armv9.6-A Scalable Vector Extension 2.2 instructions", [FeatureSVE2p1]>; def FeatureSVEAES2: ExtensionWithMArch<"sve-aes2", "SVE_AES2", "FEAT_SVE_AES2", - "Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions">; + "Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions">; def FeatureSVEBFSCALE: ExtensionWithMArch<"sve-bfscale", "SVE_BFSCALE", "FEAT_SVE_BFSCALE", "Enable Armv9.6-A SVE BFloat16 scaling instructions">; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 9af6429c5caee0..216244950ba9ee 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp 
@@ -209,7 +209,6 @@ #include "AArch64MachineFunctionInfo.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "Utils/AArch64SMEAttributes.h" diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0814380b188485..069aab274d3126 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -40,7 +40,6 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ComplexDeinterleavingPass.h" -#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -50,7 +49,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/RuntimeLibcallUtil.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" @@ -77,7 +75,6 @@ #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index b5f6388ea00285..15d4e93b915c14 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -2579,8 +2579,7 @@ class BaseAddSubCarry : BaseBaseAddSubCarry { + [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]> { let Defs = [NZCV]; } @@ -5912,34 +5911,34 @@ multiclass FPComparison { let Defs = [NZCV] in { def Hrr : BaseTwoOperandFPComparison { + [(OpNode (f16 
FPR16:$Rn), (f16 FPR16:$Rm))]> { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Hri : BaseOneOperandFPComparison { + [(OpNode (f16 FPR16:$Rn), fpimm0)]> { let Inst{23-22} = 0b11; let Predicates = [HasFullFP16]; } def Srr : BaseTwoOperandFPComparison { + [(OpNode FPR32:$Rn, (f32 FPR32:$Rm))]> { let Inst{23-22} = 0b00; } def Sri : BaseOneOperandFPComparison { + [(OpNode (f32 FPR32:$Rn), fpimm0)]> { let Inst{23-22} = 0b00; } def Drr : BaseTwoOperandFPComparison { + [(OpNode FPR64:$Rn, (f64 FPR64:$Rm))]> { let Inst{23-22} = 0b01; } def Dri : BaseOneOperandFPComparison { + [(OpNode (f64 FPR64:$Rn), fpimm0)]> { let Inst{23-22} = 0b01; } } // Defs = [NZCV] diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 32bc0e7d0d6475..e84db3588e8673 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -12,7 +12,6 @@ #include "AArch64InstrInfo.h" #include "AArch64ExpandImm.h" -#include "AArch64FrameLowering.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PointerAuth.h" #include "AArch64Subtarget.h" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 57a8c36a2fe00b..a31fd6c4b86a8a 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -149,8 +149,8 @@ def HasSVE2 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasS AssemblerPredicateWithAll<(all_of FeatureSVE2), "sve2">; def HasSVE2p1 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()">, AssemblerPredicateWithAll<(all_of FeatureSVE2p1), "sve2p1">; -def HasSVEAES : Predicate<"Subtarget->hasSVEAES()">, - AssemblerPredicateWithAll<(all_of FeatureSVEAES), "sve-aes">; +def HasSVE2AES : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2AES()">, + AssemblerPredicateWithAll<(all_of FeatureSVE2AES), "sve2-aes">; def HasSVE2SM4 : 
Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SM4()">, AssemblerPredicateWithAll<(all_of FeatureSVE2SM4), "sve2-sm4">; def HasSVE2SHA3 : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE2SHA3()">, diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index 1a9e5899892a1b..ab00da51cf4fa4 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -37,7 +37,6 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCDwarf.h" -#include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp index 6715382eefebef..1aa55864dbff49 100644 --- a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp +++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp @@ -13,10 +13,8 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64InstPrinter.h" -#include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -26,7 +24,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp index f07afe7089aa69..d1a943d91c0644 100644 --- a/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp +++ b/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp @@ -15,13 +15,11 @@ 
//===----------------------------------------------------------------------===// #include "AArch64PBQPRegAlloc.h" -#include "AArch64.h" #include "AArch64InstrInfo.h" #include "AArch64RegisterInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegAllocPBQP.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp index f702147476dc85..c399de0c56e349 100644 --- a/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64PostCoalescerPass.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -#include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineRegisterInfo.h" diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index 9044c94bc4fe5b..0e0b23ea41639d 100644 --- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -31,7 +31,6 @@ #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InlineAsm.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -44,7 +43,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include #include #include diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 18290dd5f32df9..380f37df0bc2b9 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ 
b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -28,7 +28,6 @@ #include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/TargetParser/Triple.h" diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index c244b8e81224d2..c10653e05841cd 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -3906,7 +3906,7 @@ let Predicates = [HasSVE2orSME] in { defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; } // End HasSVE2orSME -let Predicates = [HasSVE2, HasSVEAES] in { +let Predicates = [HasSVE2AES] in { // SVE2 crypto destructive binary operations defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>; defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>; diff --git a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp index a991d645eb6f40..9aa8102aeab211 100644 --- a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp @@ -90,7 +90,6 @@ // could be done for some indirect branches, such as switch jump tables. 
//===----------------------------------------------------------------------===// -#include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/BitVector.h" @@ -101,11 +100,9 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" -#include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" #include diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index a6535a532fff3f..694ee17f8e5f11 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -8,9 +8,7 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" @@ -24,21 +22,13 @@ #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/BinaryFormat/Dwarf.h" -#include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" 
#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" @@ -47,7 +37,6 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PassManager.h" -#include "llvm/IR/ValueHandle.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" @@ -56,7 +45,6 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/MemoryTaggingSupport.h" #include -#include #include #include diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp index 090c63633df905..f8195403918018 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp @@ -6,18 +6,15 @@ // //===----------------------------------------------------------------------===// - #include "AArch64.h" -#include "AArch64MachineFunctionInfo.h" #include "AArch64InstrInfo.h" +#include "AArch64MachineFunctionInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/Passes.h" diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 736d57e6ae2fd9..bc11b2e06cf1c1 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -93,6 +93,10 @@ static cl::opt cl::init(false), cl::Hidden, cl::desc("Enable subreg liveness tracking")); +static cl::opt + UseScalarIncVL("sve-use-scalar-inc-vl", cl::init(false), cl::Hidden, + cl::desc("Prefer add+cnt over addvl/inc/dec")); + unsigned 
AArch64Subtarget::getVectorInsertExtractBaseCost() const { if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0) return OverrideVectorInsertExtractBaseCost; @@ -575,6 +579,14 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const { bool AArch64Subtarget::useAA() const { return UseAA; } +bool AArch64Subtarget::useScalarIncVL() const { + // If SVE2 or SME is present (we are not SVE-1 only) and UseScalarIncVL + // is not otherwise set, enable it by default. + if (UseScalarIncVL.getNumOccurrences()) + return UseScalarIncVL; + return hasSVE2() || hasSME(); +} + // If return address signing is enabled, tail calls are emitted as follows: // // ``` diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index f3dcce3f3994ba..7f5883289c6ddd 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -417,6 +417,10 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { return DefaultSVETFOpts; } + /// Returns true to use the addvl/inc/dec instructions, as opposed to separate + /// add + cnt instructions. 
+ bool useScalarIncVL() const; + const char* getChkStkName() const { if (isWindowsArm64EC()) return "#__chkstk_arm64ec"; diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index e24a874e74970f..e0e559b457f6d6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -19,10 +19,8 @@ #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "TargetInfo/AArch64TargetInfo.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/CodeGen/CFIFixup.h" #include "llvm/CodeGen/CSEConfigBase.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index d916f644de9b50..54de42a094f340 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -9,7 +9,6 @@ #include "AArch64TargetObjectFile.h" #include "AArch64TargetMachine.h" #include "MCTargetDesc/AArch64MCExpr.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/IR/Mangler.h" diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 5359e31a3435da..712f6de52941c9 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3737,9 +3737,8 @@ static const struct Extension { {"rng", {AArch64::FeatureRandGen}}, {"sve", {AArch64::FeatureSVE}}, {"sve-b16b16", {AArch64::FeatureSVEB16B16}}, - {"sve-aes", {AArch64::FeatureSVEAES}}, {"sve2", {AArch64::FeatureSVE2}}, - {"sve2-aes", {AArch64::AliasSVE2AES}}, + {"sve2-aes", 
{AArch64::FeatureSVE2AES}}, {"sve2-sm4", {AArch64::FeatureSVE2SM4}}, {"sve2-sha3", {AArch64::FeatureSVE2SHA3}}, {"sve2-bitperm", {AArch64::FeatureSVE2BitPerm}}, diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 5b58f8c58a6e46..8b1c16d319b2c7 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -15,7 +15,6 @@ #include "MCTargetDesc/AArch64MCTargetDesc.h" #include "TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm-c/Disassembler.h" #include "llvm/MC/MCDecoderOps.h" #include "llvm/MC/MCDisassembler/MCRelocationInfo.h" #include "llvm/MC/MCInst.h" @@ -25,8 +24,6 @@ #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include #include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp index 11964b2075e5e7..09d706f0a303b8 100644 --- a/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp +++ b/llvm/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "AArch64ExternalSymbolizer.h" -#include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 065858c4289447..15f1c99e87246b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -47,7 +47,6 @@ #include #include #include -#include #define DEBUG_TYPE "aarch64-call-lowering" diff --git 
a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp index a2b909205ea84b..0b798509c26da5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64GlobalISelUtils.cpp @@ -9,11 +9,9 @@ /// GlobalISel pipeline. //===----------------------------------------------------------------------===// #include "AArch64GlobalISelUtils.h" -#include "AArch64InstrInfo.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/InstrTypes.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 663117c6b85bf7..5000078928a1d2 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -42,7 +42,6 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAArch64.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -995,9 +994,9 @@ static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI, LLT Ty = MRI.getType(Reg); const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); const TargetRegisterClass *RC = - RegClassOrBank.dyn_cast(); + dyn_cast(RegClassOrBank); if (!RC) { - const RegisterBank &RB = *RegClassOrBank.get(); + const RegisterBank &RB = *cast(RegClassOrBank); RC = getRegClassForTypeOnBank(Ty, RB); if (!RC) { LLVM_DEBUG( @@ -2590,14 +2589,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(DefReg); - const TargetRegisterClass *DefRC - = RegClassOrBank.dyn_cast(); + const TargetRegisterClass *DefRC = + dyn_cast(RegClassOrBank); if (!DefRC) { if (!DefTy.isValid()) { 
LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); return false; } - const RegisterBank &RB = *RegClassOrBank.get(); + const RegisterBank &RB = *cast(RegClassOrBank); DefRC = getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); @@ -4677,7 +4676,7 @@ AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1, // If we used a register class, then this won't necessarily have an LLT. // Compute the size based off whether or not we have a class or bank. unsigned Size; - if (const auto *RC = RegClassOrBank.dyn_cast()) + if (const auto *RC = dyn_cast(RegClassOrBank)) Size = TRI.getRegSizeInBits(*RC); else Size = MRI.getType(Dst).getSizeInBits(); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 3677cfdaba3b21..d42ecc1c72dce9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "AArch64LegalizerInfo.h" -#include "AArch64RegisterBankInfo.h" #include "AArch64Subtarget.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" @@ -24,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsAArch64.h" diff --git a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp index 0ba3a543d114ac..13dd934543a709 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64O0PreLegalizerCombiner.cpp @@ -23,10 +23,8 @@ #include "llvm/CodeGen/MachineDominators.h" #include 
"llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/Debug.h" #define GET_GICOMBINER_DEPS #include "AArch64GenO0PreLegalizeGICombiner.inc" diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 2bcfdc1b46873b..56d70ffdece713 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -23,10 +23,8 @@ #include "AArch64GlobalISelUtils.h" #include "AArch64PerfectShuffle.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "GISel/AArch64LegalizerInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" -#include "TargetInfo/AArch64TargetInfo.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -46,7 +44,6 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/InstrTypes.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include @@ -90,7 +87,7 @@ std::optional> getExtMask(ArrayRef M, // Use APInt to handle overflow when calculating expected element. unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, false, true); // The following shuffle indices must be the successive elements after the // first real element. 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 6e689d743804ac..80459827c30f3f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Instructions.h" -#include "llvm/Support/Debug.h" #define GET_GICOMBINER_DEPS #include "AArch64GenPreLegalizeGICombiner.inc" diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 8d63c36eb015f3..d9c558819db3d4 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -32,7 +32,6 @@ #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Threading.h" -#include #include #define GET_TARGET_REGBANK_IMPL diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 1fdd2b08c904cc..3ba0f2a2682855 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -14,13 +14,10 @@ #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" -#include "llvm/MC/MCDirectives.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 
490efb650d5038..5bae846824548b 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -15,7 +15,6 @@ #include "AArch64ELFStreamer.h" #include "AArch64MCTargetDesc.h" #include "AArch64TargetStreamer.h" -#include "AArch64WinCOFFStreamer.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -37,7 +36,6 @@ #include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/TargetParser/Triple.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 2ee2ee5a6fa500..ae84bc953f359a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -13,7 +13,6 @@ #include "AArch64InstPrinter.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCAsmInfo.h" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp index 0dc2e6a589aeee..552477ebca26c7 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp @@ -13,7 +13,6 @@ #include "MCTargetDesc/AArch64AddressingModes.h" #include "MCTargetDesc/AArch64FixupKinds.h" #include "MCTargetDesc/AArch64MCExpr.h" -#include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/BinaryFormat/ELF.h" @@ -25,10 +24,8 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Endian.h" #include "llvm/Support/EndianStream.h" #include 
"llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include #include diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 05c7d76f0af351..47228025ce2597 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -21,7 +21,6 @@ #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 2ee16a873e33b8..bb885d86392fe6 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -13,18 +13,14 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "Utils/AArch64BaseInfo.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/StringRef.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; diff --git a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp index ba737afadaf943..4a0312d5b276f3 100644 --- a/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp +++ b/llvm/lib/Target/AArch64/SMEPeepholeOpt.cpp @@ -13,7 +13,6 @@ #include "AArch64InstrInfo.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64Subtarget.h" -#include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" diff --git 
a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 5cfcc01afd20f3..6de6aed3b2a816 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -59,6 +59,57 @@ class SVEType { !eq(VT, nxv8f16): nxv2f16, !eq(VT, nxv8bf16): nxv2bf16, true : untyped); + + // The 64-bit vector subreg of VT. + ValueType DSub = !cond( + !eq(VT, nxv16i8): v8i8, + !eq(VT, nxv8i16): v4i16, + !eq(VT, nxv4i32): v2i32, + !eq(VT, nxv2i64): v1i64, + !eq(VT, nxv2f16): v4f16, + !eq(VT, nxv4f16): v4f16, + !eq(VT, nxv8f16): v4f16, + !eq(VT, nxv2f32): v2f32, + !eq(VT, nxv4f32): v2f32, + !eq(VT, nxv2f64): v1f64, + !eq(VT, nxv2bf16): v4bf16, + !eq(VT, nxv4bf16): v4bf16, + !eq(VT, nxv8bf16): v4bf16, + true : untyped); + + // The 128-bit vector subreg of VT. + ValueType ZSub = !cond( + !eq(VT, nxv16i8): v16i8, + !eq(VT, nxv8i16): v8i16, + !eq(VT, nxv4i32): v4i32, + !eq(VT, nxv2i64): v2i64, + !eq(VT, nxv2f16): v8f16, + !eq(VT, nxv4f16): v8f16, + !eq(VT, nxv8f16): v8f16, + !eq(VT, nxv2f32): v4f32, + !eq(VT, nxv4f32): v4f32, + !eq(VT, nxv2f64): v2f64, + !eq(VT, nxv2bf16): v8bf16, + !eq(VT, nxv4bf16): v8bf16, + !eq(VT, nxv8bf16): v8bf16, + true : untyped); + + // The legal scalar used to hold a vector element. + ValueType EltAsScalar = !cond( + !eq(VT, nxv16i8): i32, + !eq(VT, nxv8i16): i32, + !eq(VT, nxv4i32): i32, + !eq(VT, nxv2i64): i64, + !eq(VT, nxv2f16): f16, + !eq(VT, nxv4f16): f16, + !eq(VT, nxv8f16): f16, + !eq(VT, nxv2f32): f32, + !eq(VT, nxv4f32): f32, + !eq(VT, nxv2f64): f64, + !eq(VT, nxv2bf16): bf16, + !eq(VT, nxv4bf16): bf16, + !eq(VT, nxv8bf16): bf16, + true : untyped); } def SDT_AArch64Setcc : SDTypeProfile<1, 4, [ @@ -1402,29 +1453,61 @@ multiclass sve_int_perm_dup_i { def : InstAlias<"mov $Zd, $Qn", (!cast(NAME # _Q) ZPR128:$Zd, FPR128asZPR:$Qn, 0), 2>; - // Duplicate extracted element of vector into all vector elements + // Duplicate an extracted vector element across a vector. 
+ def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)))), (!cast(NAME # _B) ZPR:$vec, sve_elm_idx_extdup_b:$index)>; - def : Pat<(nxv8i16 (splat_vector (i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), - (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv4i32 (splat_vector (i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), - (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2i64 (splat_vector (i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), - (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv8f16 (splat_vector (f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), - (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv8bf16 (splat_vector (bf16 (vector_extract (nxv8bf16 ZPR:$vec), sve_elm_idx_extdup_h:$index)))), - (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; - def : Pat<(nxv4f16 (splat_vector (f16 (vector_extract (nxv4f16 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), - (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2f16 (splat_vector (f16 (vector_extract (nxv2f16 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), - (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv4f32 (splat_vector (f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)))), - (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; - def : Pat<(nxv2f32 (splat_vector (f32 (vector_extract (nxv2f32 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), - (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; - def : Pat<(nxv2f64 (splat_vector (f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)))), - (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (v16i8 V128:$vec), sve_elm_idx_extdup_b:$index)))), + (!cast(NAME # _B) 
(SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_b:$index)>; + def : Pat<(nxv16i8 (splat_vector (i32 (vector_extract (v8i8 V64:$vec), sve_elm_idx_extdup_b:$index)))), + (!cast(NAME # _B) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_b:$index)>; + + foreach VT = [nxv8i16, nxv2f16, nxv4f16, nxv8f16, nxv2bf16, nxv4bf16, nxv8bf16] in { + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.Packed ZPR:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast(NAME # _H) ZPR:$vec, sve_elm_idx_extdup_h:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.ZSub V128:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast(NAME # _H) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_h:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.DSub V64:$vec), sve_elm_idx_extdup_h:$index)))), + (!cast(NAME # _H) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_h:$index)>; + } + + foreach VT = [nxv4i32, nxv2f32, nxv4f32 ] in { + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.Packed ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.ZSub V128:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_s:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.DSub V64:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_s:$index)>; + } + + foreach VT = [nxv2i64, nxv2f64] in { + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (VT ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.ZSub V128:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # 
_D) (SUBREG_TO_REG (i64 0), $vec, zsub), sve_elm_idx_extdup_d:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (SVEType.DSub V64:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) (SUBREG_TO_REG (i64 0), $vec, dsub), sve_elm_idx_extdup_d:$index)>; + } + + // When extracting from an unpacked vector the index must be scaled to account + // for the "holes" in the underlying packed vector type. We get the scaling + // for free by "promoting" the element type to one whose underlying vector + // type is packed. This is only valid when extracting from a vector whose + // length is the same or bigger than the result of the splat. + + foreach VT = [nxv4f16, nxv4bf16] in { + def : Pat<(SVEType.HalfLength (splat_vector (SVEType.EltAsScalar (vector_extract (VT ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (VT ZPR:$vec), sve_elm_idx_extdup_s:$index)))), + (!cast(NAME # _S) ZPR:$vec, sve_elm_idx_extdup_s:$index)>; + } + + foreach VT = [nxv2f16, nxv2f32, nxv2bf16] in { + def : Pat<(VT (splat_vector (SVEType.EltAsScalar (vector_extract (VT ZPR:$vec), sve_elm_idx_extdup_d:$index)))), + (!cast(NAME # _D) ZPR:$vec, sve_elm_idx_extdup_d:$index)>; + } + + // Duplicate an indexed 128-bit segment across a vector. 
def : Pat<(nxv16i8 (AArch64duplane128 nxv16i8:$Op1, i64:$imm)), (!cast(NAME # _Q) $Op1, $imm)>; diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp index f90a6d3c1a2a1e..9cb790b99f1de0 100644 --- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp +++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp @@ -34,7 +34,6 @@ #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" #include using namespace llvm; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp index 59937a7d2a1f68..d83c22e7179505 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// #include "AArch64BaseInfo.h" -#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/Regex.h" diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 5ebe4a069569c2..d801f2b1591275 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -563,7 +563,7 @@ const MCExpr *AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties( KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; } - if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) { + if (UserSGPRInfo.hasQueuePtr()) { KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; } @@ -1575,7 +1575,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(AMDGPUMCKernelCodeT &Out, if (UserSGPRInfo.hasDispatchPtr()) Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; - if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) + if (UserSGPRInfo.hasQueuePtr()) Out.code_properties |= 
AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR; if (UserSGPRInfo.hasKernargSegmentPtr()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1873251ea358b1..d51d136ba4200c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -81,7 +81,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); const TargetRegisterClass *RC = - RegClassOrBank.dyn_cast(); + dyn_cast(RegClassOrBank); if (RC) { const LLT Ty = MRI.getType(Reg); if (!Ty.isValid() || Ty.getSizeInBits() != 1) @@ -91,7 +91,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg, RC->hasSuperClassEq(TRI.getBoolRC()); } - const RegisterBank *RB = RegClassOrBank.get(); + const RegisterBank *RB = cast(RegClassOrBank); return RB->getID() == AMDGPU::VCCRegBankID; } @@ -233,15 +233,15 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { const RegClassOrRegBank &RegClassOrBank = MRI->getRegClassOrRegBank(DefReg); - const TargetRegisterClass *DefRC - = RegClassOrBank.dyn_cast(); + const TargetRegisterClass *DefRC = + dyn_cast(RegClassOrBank); if (!DefRC) { if (!DefTy.isValid()) { LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); return false; } - const RegisterBank &RB = *RegClassOrBank.get(); + const RegisterBank &RB = *cast(RegClassOrBank); DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); @@ -2395,11 +2395,11 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank( Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const { const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg); - if (auto *RB = RegClassOrBank.dyn_cast()) + if (auto *RB = dyn_cast(RegClassOrBank)) return RB; // Ignore the type, since we don't use vcc in artifacts. 
- if (auto *RC = RegClassOrBank.dyn_cast()) + if (auto *RC = dyn_cast(RegClassOrBank)) return &RBI.getRegBankFromRegClass(*RC, LLT()); return nullptr; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 0658e030ffa5d6..25df5dabdc6aa1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2175,8 +2175,11 @@ foreach vt = [i32, p3, p5, p6, p2] in { >; } +// FIXME: The register bank of the frame index should depend on the +// users, and transitive users of the add. We may require an +// unnecessary copy from SGPR to VGPR. def : GCNPat < - (p5 frameindex:$fi), + (VGPRImm<(p5 frameindex)>:$fi), (V_MOV_B32_e32 (p5 (frameindex_to_targetframeindex $fi))) >; @@ -3554,7 +3557,7 @@ def : AMDGPUPat < >; def uint5Bits : PatLeaf<(i32 VGPR_32:$width), [{ - return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxTrailingOnes() <= 5; + return CurDAG->computeKnownBits(SDValue(N, 0)).countMaxActiveBits() <= 5; }]>; // x << (bitwidth - y) >> (bitwidth - y) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 707468892d1779..f76d1266f495cf 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3682,10 +3682,10 @@ const TargetRegisterClass * SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const { const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg()); - if (const RegisterBank *RB = RCOrRB.dyn_cast()) + if (const RegisterBank *RB = dyn_cast(RCOrRB)) return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB); - if (const auto *RC = RCOrRB.dyn_cast()) + if (const auto *RC = dyn_cast(RCOrRB)) return getAllocatableClass(RC); return nullptr; diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td index 6cfbf9c83dc329..e912878e9b23cc 100644 --- a/llvm/lib/Target/AVR/AVRInstrInfo.td 
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td @@ -365,7 +365,7 @@ let isCommutable = 1, Constraints = "$src = $rd", Defs = [SREG] in { // Adds two 8-bit registers. def ADDRdRr : FRdRr<0b0000, 0b11, (outs GPR8:$rd),(ins GPR8:$src, GPR8:$rr), "add\t$rd, $rr", - [(set i8:$rd, (add i8:$src, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (add i8:$src, i8:$rr))]>; // ADDW Rd+1:Rd, Rr+1:Rr // Pseudo instruction to add four 8-bit registers as two 16-bit values. @@ -375,15 +375,14 @@ let isCommutable = 1, Constraints = "$src = $rd", Defs = [SREG] in { // adc Rd+1, Rr+1 def ADDWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr), "addw\t$rd, $rr", - [(set i16:$rd, (add i16:$src, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (add i16:$src, i16:$rr))]>; // ADC Rd, Rr // Adds two 8-bit registers with carry. let Uses = [SREG] in def ADCRdRr : FRdRr<0b0001, 0b11, (outs GPR8:$rd), (ins GPR8:$src, GPR8:$rr), "adc\t$rd, $rr", - [(set i8:$rd, (adde i8:$src, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (adde i8:$src, i8:$rr))]>; // ADCW Rd+1:Rd, Rr+1:Rr // Pseudo instruction to add four 8-bit registers as two 16-bit values with @@ -395,15 +394,13 @@ let isCommutable = 1, Constraints = "$src = $rd", Defs = [SREG] in { let Uses = [SREG] in def ADCWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr), "adcw\t$rd, $rr", - [(set i16:$rd, (adde i16:$src, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (adde i16:$src, i16:$rr))]>; // AIDW Rd, k // Adds an immediate 6-bit value K to Rd, placing the result in Rd. def ADIWRdK : FWRdK<0b0, (outs IWREGS:$rd), (ins IWREGS :$src, imm_arith6:$k), "adiw\t$rd, $k", - [(set i16:$rd, (add i16:$src, uimm6:$k)), - (implicit SREG)]>, + [(set i16:$rd, (add i16:$src, uimm6:$k))]>, Requires<[HasADDSUBIW]>; } @@ -415,7 +412,7 @@ let Constraints = "$rs = $rd", Defs = [SREG] in { // Subtracts the 8-bit value of Rr from Rd and places the value in Rd. 
def SUBRdRr : FRdRr<0b0001, 0b10, (outs GPR8:$rd), (ins GPR8:$rs, GPR8:$rr), "sub\t$rd, $rr", - [(set i8:$rd, (sub i8:$rs, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (sub i8:$rs, i8:$rr))]>; // SUBW Rd+1:Rd, Rr+1:Rr // Subtracts two 16-bit values and places the result into Rd. @@ -425,12 +422,11 @@ let Constraints = "$rs = $rd", Defs = [SREG] in { // sbc Rd+1, Rr+1 def SUBWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$rs, DREGS:$rr), "subw\t$rd, $rr", - [(set i16:$rd, (sub i16:$rs, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (sub i16:$rs, i16:$rr))]>; def SUBIRdK : FRdK<0b0101, (outs LD8:$rd), (ins LD8:$rs, imm_ldi8:$k), "subi\t$rd, $k", - [(set i8:$rd, (sub i8:$rs, imm:$k)), (implicit SREG)]>; + [(set i8:$rd, (sub i8:$rs, imm:$k))]>; // SUBIW Rd+1:Rd, K+1:K // @@ -439,20 +435,18 @@ let Constraints = "$rs = $rd", Defs = [SREG] in { // sbci Rd+1, K+1 def SUBIWRdK : Pseudo<(outs DLDREGS:$rd), (ins DLDREGS:$rs, i16imm:$rr), "subiw\t$rd, $rr", - [(set i16:$rd, (sub i16:$rs, imm:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (sub i16:$rs, imm:$rr))]>; def SBIWRdK : FWRdK<0b1, (outs IWREGS:$rd), (ins IWREGS:$rs, imm_arith6:$k), "sbiw\t$rd, $k", - [(set i16:$rd, (sub i16:$rs, uimm6:$k)), - (implicit SREG)]>, + [(set i16:$rd, (sub i16:$rs, uimm6:$k))]>, Requires<[HasADDSUBIW]>; // Subtract with carry operations which must read the carry flag in SREG. 
let Uses = [SREG] in { def SBCRdRr : FRdRr<0b0000, 0b10, (outs GPR8:$rd), (ins GPR8:$rs, GPR8:$rr), "sbc\t$rd, $rr", - [(set i8:$rd, (sube i8:$rs, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (sube i8:$rs, i8:$rr))]>; // SBCW Rd+1:Rd, Rr+1:Rr // @@ -461,20 +455,18 @@ let Constraints = "$rs = $rd", Defs = [SREG] in { // sbc Rd+1, Rr+1 def SBCWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$rs, DREGS:$rr), "sbcw\t$rd, $rr", - [(set i16:$rd, (sube i16:$rs, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (sube i16:$rs, i16:$rr))]>; def SBCIRdK : FRdK<0b0100, (outs LD8:$rd), (ins LD8:$rs, imm_ldi8:$k), "sbci\t$rd, $k", - [(set i8:$rd, (sube i8:$rs, imm:$k)), (implicit SREG)]>; + [(set i8:$rd, (sube i8:$rs, imm:$k))]>; // SBCIW Rd+1:Rd, K+1:K // sbci Rd, K // sbci Rd+1, K+1 def SBCIWRdK : Pseudo<(outs DLDREGS:$rd), (ins DLDREGS:$rs, i16imm:$rr), "sbciw\t$rd, $rr", - [(set i16:$rd, (sube i16:$rs, imm:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (sube i16:$rs, imm:$rr))]>; } } @@ -484,11 +476,11 @@ let Constraints = "$rs = $rd", Defs = [SREG] in { let Constraints = "$src = $rd", Defs = [SREG] in { def INCRd : FRd<0b1001, 0b0100011, (outs GPR8:$rd), (ins GPR8:$src), "inc\t$rd", - [(set i8:$rd, (add i8:$src, 1)), (implicit SREG)]>; + [(set i8:$rd, (add i8:$src, 1))]>; def DECRd : FRd<0b1001, 0b0101010, (outs GPR8:$rd), (ins GPR8:$src), "dec\t$rd", - [(set i8:$rd, (add i8:$src, -1)), (implicit SREG)]>; + [(set i8:$rd, (add i8:$src, -1))]>; } //===----------------------------------------------------------------------===// @@ -538,7 +530,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { let isCommutable = 1 in { def ANDRdRr : FRdRr<0b0010, 0b00, (outs GPR8:$rd), (ins GPR8:$src, GPR8:$rr), "and\t$rd, $rr", - [(set i8:$rd, (and i8:$src, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (and i8:$src, i8:$rr))]>; // ANDW Rd+1:Rd, Rr+1:Rr // @@ -547,12 +539,11 @@ let Constraints = "$src = $rd", Defs = [SREG] in { // and Rd+1, Rr+1 def ANDWRdRr : Pseudo<(outs DREGS:$rd), (ins 
DREGS:$src, DREGS:$rr), "andw\t$rd, $rr", - [(set i16:$rd, (and i16:$src, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (and i16:$src, i16:$rr))]>; def ORRdRr : FRdRr<0b0010, 0b10, (outs GPR8:$rd), (ins GPR8:$src, GPR8:$rr), "or\t$rd, $rr", - [(set i8:$rd, (or i8:$src, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (or i8:$src, i8:$rr))]>; // ORW Rd+1:Rd, Rr+1:Rr // @@ -561,12 +552,11 @@ let Constraints = "$src = $rd", Defs = [SREG] in { // or Rd+1, Rr+1 def ORWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr), "orw\t$rd, $rr", - [(set i16:$rd, (or i16:$src, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (or i16:$src, i16:$rr))]>; def EORRdRr : FRdRr<0b0010, 0b01, (outs GPR8:$rd), (ins GPR8:$src, GPR8:$rr), "eor\t$rd, $rr", - [(set i8:$rd, (xor i8:$src, i8:$rr)), (implicit SREG)]>; + [(set i8:$rd, (xor i8:$src, i8:$rr))]>; // EORW Rd+1:Rd, Rr+1:Rr // @@ -575,13 +565,12 @@ let Constraints = "$src = $rd", Defs = [SREG] in { // eor Rd+1, Rr+1 def EORWRdRr : Pseudo<(outs DREGS:$rd), (ins DREGS:$src, DREGS:$rr), "eorw\t$rd, $rr", - [(set i16:$rd, (xor i16:$src, i16:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (xor i16:$src, i16:$rr))]>; } def ANDIRdK : FRdK<0b0111, (outs LD8:$rd), (ins LD8:$src, imm_ldi8:$k), "andi\t$rd, $k", - [(set i8:$rd, (and i8:$src, imm:$k)), (implicit SREG)]>; + [(set i8:$rd, (and i8:$src, imm:$k))]>; // ANDI Rd+1:Rd, K+1:K // @@ -590,12 +579,11 @@ let Constraints = "$src = $rd", Defs = [SREG] in { // andi Rd+1, K+1 def ANDIWRdK : Pseudo<(outs DLDREGS:$rd), (ins DLDREGS:$src, i16imm:$k), "andiw\t$rd, $k", - [(set i16:$rd, (and i16:$src, imm:$k)), - (implicit SREG)]>; + [(set i16:$rd, (and i16:$src, imm:$k))]>; def ORIRdK : FRdK<0b0110, (outs LD8:$rd), (ins LD8:$src, imm_ldi8:$k), "ori\t$rd, $k", - [(set i8:$rd, (or i8:$src, imm:$k)), (implicit SREG)]>; + [(set i8:$rd, (or i8:$src, imm:$k))]>; // ORIW Rd+1:Rd, K+1,K // @@ -604,8 +592,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { // ori Rd+1, K+1 def ORIWRdK : Pseudo<(outs 
DLDREGS:$rd), (ins DLDREGS:$src, i16imm:$rr), "oriw\t$rd, $rr", - [(set i16:$rd, (or i16:$src, imm:$rr)), - (implicit SREG)]>; + [(set i16:$rd, (or i16:$src, imm:$rr))]>; } //===----------------------------------------------------------------------===// @@ -613,7 +600,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { //===----------------------------------------------------------------------===// let Constraints = "$src = $rd", Defs = [SREG] in { def COMRd : FRd<0b1001, 0b0100000, (outs GPR8:$rd), (ins GPR8:$src), - "com\t$rd", [(set i8:$rd, (not i8:$src)), (implicit SREG)]>; + "com\t$rd", [(set i8:$rd, (not i8:$src))]>; // COMW Rd+1:Rd // @@ -621,10 +608,10 @@ let Constraints = "$src = $rd", Defs = [SREG] in { // com Rd // com Rd+1 def COMWRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "comw\t$rd", - [(set i16:$rd, (not i16:$src)), (implicit SREG)]>; + [(set i16:$rd, (not i16:$src))]>; def NEGRd : FRd<0b1001, 0b0100001, (outs GPR8:$rd), (ins GPR8:$src), - "neg\t$rd", [(set i8:$rd, (ineg i8:$src)), (implicit SREG)]>; + "neg\t$rd", [(set i8:$rd, (ineg i8:$src))]>; // NEGW Rd+1:Rd // @@ -722,7 +709,7 @@ let Defs = [SREG] in { def CPRdRr : FRdRr<0b0001, 0b01, (outs), (ins GPR8:$rd, GPR8:$rr), "cp\t$rd, $rr", - [(AVRcmp i8:$rd, i8:$rr), (implicit SREG)]>; + [(AVRcmp i8:$rd, i8:$rr)]>; // CPW Rd+1:Rd, Rr+1:Rr // @@ -731,12 +718,12 @@ let Defs = [SREG] in { // cpc Rd+1, Rr+1 def CPWRdRr : Pseudo<(outs), (ins DREGS:$src, DREGS:$src2), "cpw\t$src, $src2", - [(AVRcmp i16:$src, i16:$src2), (implicit SREG)]>; + [(AVRcmp i16:$src, i16:$src2)]>; let Uses = [SREG] in def CPCRdRr : FRdRr<0b0000, 0b01, (outs), (ins GPR8:$rd, GPR8:$rr), "cpc\t$rd, $rr", - [(AVRcmpc i8:$rd, i8:$rr), (implicit SREG)]>; + [(AVRcmpc i8:$rd, i8:$rr)]>; // CPCW Rd+1:Rd. 
Rr+1:Rr // @@ -746,12 +733,12 @@ let Defs = [SREG] in { let Uses = [SREG] in def CPCWRdRr : Pseudo<(outs), (ins DREGS:$src, DREGS:$src2), "cpcw\t$src, $src2", - [(AVRcmpc i16:$src, i16:$src2), (implicit SREG)]>; + [(AVRcmpc i16:$src, i16:$src2)]>; // CPI Rd, K // Compares a register with an 8 bit immediate. def CPIRdK : FRdK<0b0011, (outs), (ins LD8:$rd, imm_ldi8:$k), "cpi\t$rd, $k", - [(AVRcmp i8:$rd, imm:$k), (implicit SREG)]>; + [(AVRcmp i8:$rd, imm:$k)]>; } //===----------------------------------------------------------------------===// @@ -1386,11 +1373,10 @@ let Constraints = "$src = $rd", Defs = [SREG] in { "lslw\t$rd", [(set i16 : $rd, (AVRlsl i16 - : $src)), - (implicit SREG)]>; + : $src))]>; def LSLWHiRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lslwhi\t$rd", - [(set i16:$rd, (AVRlslhi i16:$src)), (implicit SREG)]>; + [(set i16:$rd, (AVRlslhi i16:$src))]>; def LSLWNRd : Pseudo<(outs DLDREGS : $rd), @@ -1401,8 +1387,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { (set i16 : $rd, (AVRlslwn i16 : $src, imm - : $bits)), - (implicit SREG) + : $bits)) ]>; def LSLBNRd : Pseudo<(outs LD8 @@ -1414,8 +1399,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { (set i8 : $rd, (AVRlslbn i8 : $src, imm - : $bits)), - (implicit SREG) + : $bits)) ]>; def LSRRd @@ -1426,8 +1410,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { : $src), "lsr\t$rd", [(set i8 : $rd, (AVRlsr i8 - : $src)), - (implicit SREG)]>; + : $src))]>; def LSRWRd : Pseudo<(outs DREGS : $rd), @@ -1436,11 +1419,10 @@ let Constraints = "$src = $rd", Defs = [SREG] in { "lsrw\t$rd", [(set i16 : $rd, (AVRlsr i16 - : $src)), - (implicit SREG)]>; + : $src))]>; def LSRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lsrwlo\t$rd", - [(set i16:$rd, (AVRlsrlo i16:$src)), (implicit SREG)]>; + [(set i16:$rd, (AVRlsrlo i16:$src))]>; def LSRWNRd : Pseudo<(outs DLDREGS : $rd), @@ -1451,8 +1433,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { (set i16 : $rd, (AVRlsrwn i16 : $src, imm - : 
$bits)), - (implicit SREG) + : $bits)) ]>; def LSRBNRd : Pseudo<(outs LD8 @@ -1464,8 +1445,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { (set i8 : $rd, (AVRlsrbn i8 : $src, imm - : $bits)), - (implicit SREG) + : $bits)) ]>; def ASRRd @@ -1476,8 +1456,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { : $src), "asr\t$rd", [(set i8 : $rd, (AVRasr i8 - : $src)), - (implicit SREG)]>; + : $src))]>; def ASRWNRd : Pseudo<(outs DREGS : $rd), @@ -1488,8 +1467,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { (set i16 : $rd, (AVRasrwn i16 : $src, imm - : $bits)), - (implicit SREG) + : $bits)) ]>; def ASRBNRd : Pseudo<(outs LD8 @@ -1501,8 +1479,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { (set i8 : $rd, (AVRasrbn i8 : $src, imm - : $bits)), - (implicit SREG) + : $bits)) ]>; def ASRWRd : Pseudo<(outs DREGS @@ -1512,25 +1489,22 @@ let Constraints = "$src = $rd", Defs = [SREG] in { "asrw\t$rd", [(set i16 : $rd, (AVRasr i16 - : $src)), - (implicit SREG)]>; + : $src))]>; def ASRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrwlo\t$rd", - [(set i16:$rd, (AVRasrlo i16:$src)), (implicit SREG)]>; + [(set i16:$rd, (AVRasrlo i16:$src))]>; let Uses = [R1] in def ROLBRdR1 : Pseudo<(outs GPR8:$rd), (ins GPR8:$src), "rolb\t$rd", - [(set i8:$rd, (AVRrol i8:$src)), - (implicit SREG)]>, + [(set i8:$rd, (AVRrol i8:$src))]>, Requires<[HasNonTinyEncoding]>; let Uses = [R17] in def ROLBRdR17 : Pseudo<(outs GPR8:$rd), (ins GPR8:$src), "rolb\t$rd", - [(set i8:$rd, (AVRrol i8:$src)), - (implicit SREG)]>, + [(set i8:$rd, (AVRrol i8:$src))]>, Requires<[HasTinyEncoding]>; def RORBRd : Pseudo<(outs GPR8 @@ -1540,8 +1514,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { "rorb\t$rd", [(set i8 : $rd, (AVRror i8 - : $src)), - (implicit SREG)]>; + : $src))]>; // Bit rotate operations. 
let Uses = [SREG] in { @@ -1554,8 +1527,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { "rolw\t$rd", [(set i16 : $rd, (AVRrol i16 - : $src)), - (implicit SREG)]>; + : $src))]>; def RORRd : FRd<0b1001, 0b0100111, (outs GPR8 @@ -1572,8 +1544,7 @@ let Constraints = "$src = $rd", Defs = [SREG] in { "rorw\t$rd", [(set i16 : $rd, (AVRror i16 - : $src)), - (implicit SREG)]>; + : $src))]>; } } @@ -1740,8 +1711,7 @@ def SEXT "sext\t$dst, $src", [(set i16 : $dst, (sext i8 - : $src)), - (implicit SREG)]>; + : $src))]>; def ZEXT : ExtensionPseudo<(outs DREGS @@ -1751,8 +1721,7 @@ def ZEXT "zext\t$dst, $src", [(set i16 : $dst, (zext i8 - : $src)), - (implicit SREG)]>; + : $src))]>; // This pseudo gets expanded into a movw+adiw thus it clobbers SREG. let Defs = [SREG], diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 39f38259a193b4..df948e4407c6fc 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -303,7 +303,7 @@ static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { if (auto *Element = dyn_cast_or_null(Elements[I])) if (Element->getTag() == dwarf::DW_TAG_subrange_type) { const DISubrange *SR = cast(Element); - auto *CI = SR->getCount().dyn_cast(); + auto *CI = dyn_cast(SR->getCount()); DimSize *= CI->getSExtValue(); } } diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp index a14e9db5f7500d..1a9ee3128e20d2 100644 --- a/llvm/lib/Target/BPF/BTFDebug.cpp +++ b/llvm/lib/Target/BPF/BTFDebug.cpp @@ -715,7 +715,7 @@ void BTFDebug::visitArrayType(const DICompositeType *CTy, uint32_t &TypeId) { if (auto *Element = dyn_cast_or_null(Elements[I])) if (Element->getTag() == dwarf::DW_TAG_subrange_type) { const DISubrange *SR = cast(Element); - auto *CI = SR->getCount().dyn_cast(); + auto *CI = dyn_cast(SR->getCount()); int64_t Count = CI->getSExtValue(); // For struct s { int b; char c[]; }, the 
c[] will be represented diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td index ccdd5165728231..3de20d6e599dbf 100644 --- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td @@ -1469,12 +1469,12 @@ def PseudoBR : Pseudo<(outs), (ins simm26_b:$imm26), [(br bb:$imm26)]>, PseudoInstExpansion<(B simm26_b:$imm26)>; let isBarrier = 1, isBranch = 1, isIndirectBranch = 1, isTerminator = 1 in -def PseudoBRIND : Pseudo<(outs), (ins GPR:$rj, simm16_lsl2:$imm16)>, +def PseudoBRIND : Pseudo<(outs), (ins GPRJR:$rj, simm16_lsl2:$imm16)>, PseudoInstExpansion<(JIRL R0, GPR:$rj, simm16_lsl2:$imm16)>; -def : Pat<(brind GPR:$rj), (PseudoBRIND GPR:$rj, 0)>; -def : Pat<(brind (add GPR:$rj, simm16_lsl2:$imm16)), - (PseudoBRIND GPR:$rj, simm16_lsl2:$imm16)>; +def : Pat<(brind GPRJR:$rj), (PseudoBRIND GPRJR:$rj, 0)>; +def : Pat<(brind (add GPRJR:$rj, simm16_lsl2:$imm16)), + (PseudoBRIND GPRJR:$rj, simm16_lsl2:$imm16)>; // Function call with 'Small' code model. let isCall = 1, Defs = [R1] in diff --git a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td index 2d3a7c364f0bf4..a8419980868ee1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchRegisterInfo.td @@ -121,6 +121,12 @@ def GPR : GPRRegisterClass<(add // Argument registers (a0...a7) def GPRT : GPRRegisterClass<(add // a0...a7, t0...t8 (sequence "R%u", 4, 20))>; +// Don't use R1 for JR since that micro-architecture unconditionally treats a +// "jr $ra" as "return from subroutine", hence doing "jr $ra" would interfere +// with both subroutine return prediction and the more general indirect branch +// prediction. 
+def GPRJR : GPRRegisterClass<(sub GPR, R1)>; + // Floating point registers let RegAltNameIndices = [RegAliasName] in { diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp index 62e900dc65babb..1d4b73d1b7f27a 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.cpp @@ -55,7 +55,7 @@ static MCInstrInfo *createLoongArchMCInstrInfo() { static MCSubtargetInfo * createLoongArchMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { if (CPU.empty() || CPU == "generic") - CPU = TT.isArch64Bit() ? "la464" : "generic-la32"; + CPU = TT.isArch64Bit() ? "generic-la64" : "generic-la32"; return createLoongArchMCSubtargetInfoImpl(TT, CPU, /*TuneCPU*/ CPU, FS); } diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp index 4245061f0ae749..721395027b512e 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp +++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp @@ -33,6 +33,8 @@ using namespace llvm; +#define DEBUG_TYPE "m68k-frame" + M68kFrameLowering::M68kFrameLowering(const M68kSubtarget &STI, Align Alignment) : TargetFrameLowering(StackGrowsDown, Alignment, -4), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) { @@ -231,8 +233,8 @@ MachineBasicBlock::iterator M68kFrameLowering::eliminateCallFramePseudoInstr( unsigned Opcode = I->getOpcode(); bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode(); DebugLoc DL = I->getDebugLoc(); - uint64_t Amount = !ReserveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t InternalAmt = (IsDestroy && Amount) ? I->getOperand(1).getImm() : 0; + uint64_t Amount = I->getOperand(0).getImm(); + uint64_t InternalAmt = (IsDestroy || Amount) ? 
I->getOperand(1).getImm() : 0; I = MBB.erase(I); if (!ReserveCallFrame) { diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.td b/llvm/lib/Target/MSP430/MSP430InstrInfo.td index 714a5d4f511655..546ba6fd8e3b49 100644 --- a/llvm/lib/Target/MSP430/MSP430InstrInfo.td +++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.td @@ -451,21 +451,17 @@ multiclass Arith opcode, string asmstring, SDPatternOperator node, let isCommutable = commutes in { def 8rr : I8rr; + [(set GR8:$rd, (node GR8:$src2, GR8:$rs))]>; def 16rr : I16rr; + [(set GR16:$rd, (node GR16:$src2, GR16:$rs))]>; } def 8rm : I8rm; + [(set GR8:$rd, (node GR8:$src2, (load addr:$src)))]>; def 16rm : I16rm; + [(set GR16:$rd, (node GR16:$src2, (load addr:$src)))]>; def 8rn : I8rn; def 16rn : I16rn opcode, string asmstring, SDPatternOperator node, } def 8rc : I8rc; + [(set GR8:$rd, (node GR8:$src2, cg8imm:$imm))]>; def 16rc : I16rc; + [(set GR16:$rd, (node GR16:$src2, cg16imm:$imm))]>; def 8ri : I8ri; + [(set GR8:$rd, (node GR8:$src2, imm:$imm))]>; def 16ri : I16ri; + [(set GR16:$rd, (node GR16:$src2, imm:$imm))]>; } def 8mr : I8mr; + [(store (node (load addr:$dst), GR8:$rs), addr:$dst)]>; def 16mr : I16mr; + [(store (node (load addr:$dst), GR16:$rs), addr:$dst)]>; def 8mc : I8mc; + [(store (node (load addr:$dst), (i8 cg8imm:$imm)), addr:$dst)]>; def 16mc : I16mc; + [(store (node (load addr:$dst), (i16 cg16imm:$imm)), addr:$dst)]>; def 8mi : I8mi; + [(store (node (load addr:$dst), (i8 imm:$imm)), addr:$dst)]>; def 16mi : I16mi; + [(store (node (load addr:$dst), (i16 imm:$imm)), addr:$dst)]>; def 8mm : I8mm; + (i8 (load addr:$src))), addr:$dst)]>; def 16mm : I16mm; + (i16 (load addr:$src))), addr:$dst)]>; def 8mn : I8mn; def 16mn : I16mn; + [(set GR8:$rd, (MSP430rra GR8:$rs))]>; def RRA16r : II16r<0b010, (outs GR16:$rd), (ins GR16:$rs), "rra\t$rd", - [(set GR16:$rd, (MSP430rra GR16:$rs)), - (implicit SR)]>; + [(set GR16:$rd, (MSP430rra GR16:$rs))]>; let Uses = [SR] in { def RRC8r : II8r<0b000, (outs GR8:$rd), (ins GR8:$rs), 
"rrc.b\t$rd", - [(set GR8:$rd, (MSP430rrc GR8:$rs)), - (implicit SR)]>; + [(set GR8:$rd, (MSP430rrc GR8:$rs))]>; def RRC16r : II16r<0b000, (outs GR16:$rd), (ins GR16:$rs), "rrc\t$rd", - [(set GR16:$rd, (MSP430rrc GR16:$rs)), - (implicit SR)]>; + [(set GR16:$rd, (MSP430rrc GR16:$rs))]>; } // Uses = [SR] def SEXT16r : II16r<0b011, (outs GR16:$rd), (ins GR16:$rs), "sxt\t$rd", - [(set GR16:$rd, (sext_inreg GR16:$rs, i8)), - (implicit SR)]>; + [(set GR16:$rd, (sext_inreg GR16:$rs, i8))]>; } // Defs = [SR] @@ -678,13 +657,11 @@ let Defs = [SR] in { def RRA8m : II8m<0b010, (outs), (ins memsrc:$src), "rra.b\t$src", - [(store (MSP430rra (i8 (load addr:$src))), addr:$src), - (implicit SR)]>; + [(store (MSP430rra (i8 (load addr:$src))), addr:$src)]>; def RRA16m : II16m<0b010, (outs), (ins memsrc:$src), "rra\t$src", - [(store (MSP430rra (i16 (load addr:$src))), addr:$src), - (implicit SR)]>; + [(store (MSP430rra (i16 (load addr:$src))), addr:$src)]>; def RRA8n : II8n<0b010, (outs), (ins indreg:$rs), "rra.b\t$rs", []>; def RRA16n : II16n<0b010, (outs), (ins indreg:$rs), "rra\t$rs", []>; @@ -695,13 +672,11 @@ let Uses = [SR] in { def RRC8m : II8m<0b000, (outs), (ins memsrc:$src), "rrc.b\t$src", - [(store (MSP430rrc (i8 (load addr:$src))), addr:$src), - (implicit SR)]>; + [(store (MSP430rrc (i8 (load addr:$src))), addr:$src)]>; def RRC16m : II16m<0b000, (outs), (ins memsrc:$src), "rrc\t$src", - [(store (MSP430rrc (i16 (load addr:$src))), addr:$src), - (implicit SR)]>; + [(store (MSP430rrc (i16 (load addr:$src))), addr:$src)]>; def RRC8n : II8n<0b000, (outs), (ins indreg:$rs), "rrc.b\t$rs", []>; def RRC16n : II16n<0b000, (outs), (ins indreg:$rs), "rrc\t$rs", []>; @@ -714,8 +689,7 @@ def SEXT16m : II16m<0b011, (outs), (ins memsrc:$src), "sxt\t$src", [(store (sext_inreg (extloadi16i8 addr:$src), i8), - addr:$src), - (implicit SR)]>; + addr:$src)]>; def SEXT16n : II16n<0b011, (outs), (ins indreg:$rs), "sxt\t$rs", []>; def SEXT16p : II16p<0b011, (outs), (ins postreg:$rs), "sxt\t$rs", 
[]>; @@ -733,62 +707,58 @@ let Defs = [SR] in { def CMP8rr : I8rr<0b1001, (outs), (ins GR8:$rd, GR8:$rs), "cmp.b\t$rs, $rd", - [(MSP430cmp GR8:$rd, GR8:$rs), (implicit SR)]>; + [(MSP430cmp GR8:$rd, GR8:$rs)]>; def CMP16rr : I16rr<0b1001, (outs), (ins GR16:$rd, GR16:$rs), "cmp\t$rs, $rd", - [(MSP430cmp GR16:$rd, GR16:$rs), (implicit SR)]>; + [(MSP430cmp GR16:$rd, GR16:$rs)]>; def CMP8rc : I8rc<0b1001, (outs), (ins GR8:$rd, cg8imm:$imm), "cmp.b\t$imm, $rd", - [(MSP430cmp GR8:$rd, cg8imm:$imm), (implicit SR)]>; + [(MSP430cmp GR8:$rd, cg8imm:$imm)]>; def CMP16rc : I16rc<0b1001, (outs), (ins GR16:$rd, cg16imm:$imm), "cmp\t$imm, $rd", - [(MSP430cmp GR16:$rd, cg16imm:$imm), (implicit SR)]>; + [(MSP430cmp GR16:$rd, cg16imm:$imm)]>; def CMP8ri : I8ri<0b1001, (outs), (ins GR8:$rd, i8imm:$imm), "cmp.b\t$imm, $rd", - [(MSP430cmp GR8:$rd, imm:$imm), (implicit SR)]>; + [(MSP430cmp GR8:$rd, imm:$imm)]>; def CMP16ri : I16ri<0b1001, (outs), (ins GR16:$rd, i16imm:$imm), "cmp\t$imm, $rd", - [(MSP430cmp GR16:$rd, imm:$imm), (implicit SR)]>; + [(MSP430cmp GR16:$rd, imm:$imm)]>; def CMP8mc : I8mc<0b1001, (outs), (ins memsrc:$dst, cg8imm:$imm), "cmp.b\t$imm, $dst", - [(MSP430cmp (load addr:$dst), (i8 cg8imm:$imm)), - (implicit SR)]>; + [(MSP430cmp (load addr:$dst), (i8 cg8imm:$imm))]>; def CMP16mc : I16mc<0b1001, (outs), (ins memsrc:$dst, cg16imm:$imm), "cmp\t$imm, $dst", - [(MSP430cmp (load addr:$dst), (i16 cg16imm:$imm)), - (implicit SR)]>; + [(MSP430cmp (load addr:$dst), (i16 cg16imm:$imm))]>; def CMP8mi : I8mi<0b1001, (outs), (ins memsrc:$dst, i8imm:$imm), "cmp.b\t$imm, $dst", [(MSP430cmp (load addr:$dst), - (i8 imm:$imm)), (implicit SR)]>; + (i8 imm:$imm))]>; def CMP16mi : I16mi<0b1001, (outs), (ins memsrc:$dst, i16imm:$imm), "cmp\t$imm, $dst", [(MSP430cmp (load addr:$dst), - (i16 imm:$imm)), (implicit SR)]>; + (i16 imm:$imm))]>; def CMP8rm : I8rm<0b1001, (outs), (ins GR8:$rd, memsrc:$src), "cmp.b\t$src, $rd", - [(MSP430cmp GR8:$rd, (load addr:$src)), - (implicit SR)]>; + 
[(MSP430cmp GR8:$rd, (load addr:$src))]>; def CMP16rm : I16rm<0b1001, (outs), (ins GR16:$rd, memsrc:$src), "cmp\t$src, $rd", - [(MSP430cmp GR16:$rd, (load addr:$src)), - (implicit SR)]>; + [(MSP430cmp GR16:$rd, (load addr:$src))]>; def CMP8rn : I8rn<0b1001, (outs), (ins GR8:$rd, indreg:$rs), "cmp.b\t$rs, $rd", []>; @@ -803,22 +773,18 @@ def CMP16rp : I16rp<0b1001, def CMP8mr : I8mr<0b1001, (outs), (ins memsrc:$dst, GR8:$rs), "cmp.b\t$rs, $dst", - [(MSP430cmp (load addr:$dst), GR8:$rs), - (implicit SR)]>; + [(MSP430cmp (load addr:$dst), GR8:$rs)]>; def CMP16mr : I16mr<0b1001, (outs), (ins memsrc:$dst, GR16:$rs), "cmp\t$rs, $dst", - [(MSP430cmp (load addr:$dst), GR16:$rs), - (implicit SR)]>; + [(MSP430cmp (load addr:$dst), GR16:$rs)]>; def CMP8mm : I8mm<0b1001, (outs), (ins memdst:$dst, memsrc:$src), "cmp.b\t$src, $dst", - [(MSP430cmp (load addr:$dst), (i8 (load addr:$src))), - (implicit SR)]>; + [(MSP430cmp (load addr:$dst), (i8 (load addr:$src)))]>; def CMP16mm : I16mm<0b1001, (outs), (ins memdst:$dst, memsrc:$src), "cmp\t$src, $dst", - [(MSP430cmp (load addr:$dst), (i16 (load addr:$src))), - (implicit SR)]>; + [(MSP430cmp (load addr:$dst), (i16 (load addr:$src)))]>; def CMP8mn : I8mn<0b1001, (outs), (ins memsrc:$dst, indreg:$rs), "cmp.b\t$rs, $dst", []>; @@ -836,46 +802,38 @@ let isCommutable = 1 in { def BIT8rr : I8rr<0b1011, (outs), (ins GR8:$rd, GR8:$rs), "bit.b\t$rs, $rd", - [(MSP430cmp (and_su GR8:$rd, GR8:$rs), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR8:$rd, GR8:$rs), 0)]>; def BIT16rr : I16rr<0b1011, (outs), (ins GR16:$rd, GR16:$rs), "bit\t$rs, $rd", - [(MSP430cmp (and_su GR16:$rd, GR16:$rs), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR16:$rd, GR16:$rs), 0)]>; } def BIT8rc : I8rc<0b1011, (outs), (ins GR8:$rd, cg8imm:$imm), "bit.b\t$imm, $rd", - [(MSP430cmp (and_su GR8:$rd, cg8imm:$imm), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR8:$rd, cg8imm:$imm), 0)]>; def BIT16rc : I16rc<0b1011, (outs), (ins GR16:$rd, cg16imm:$imm), "bit\t$imm, $rd", - 
[(MSP430cmp (and_su GR16:$rd, cg16imm:$imm), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR16:$rd, cg16imm:$imm), 0)]>; def BIT8ri : I8ri<0b1011, (outs), (ins GR8:$rd, i8imm:$imm), "bit.b\t$imm, $rd", - [(MSP430cmp (and_su GR8:$rd, imm:$imm), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR8:$rd, imm:$imm), 0)]>; def BIT16ri : I16ri<0b1011, (outs), (ins GR16:$rd, i16imm:$imm), "bit\t$imm, $rd", - [(MSP430cmp (and_su GR16:$rd, imm:$imm), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR16:$rd, imm:$imm), 0)]>; def BIT8rm : I8rm<0b1011, (outs), (ins GR8:$rd, memdst:$src), "bit.b\t$src, $rd", - [(MSP430cmp (and_su GR8:$rd, (load addr:$src)), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR8:$rd, (load addr:$src)), 0)]>; def BIT16rm : I16rm<0b1011, (outs), (ins GR16:$rd, memdst:$src), "bit\t$src, $rd", - [(MSP430cmp (and_su GR16:$rd, (load addr:$src)), 0), - (implicit SR)]>; + [(MSP430cmp (and_su GR16:$rd, (load addr:$src)), 0)]>; def BIT8rn : I8rn<0b1011, (outs), (ins GR8:$rd, indreg:$rs), "bit.b\t$rs, $rd", []>; @@ -890,50 +848,42 @@ def BIT16rp : I16rp<0b1011, (outs), (ins GR16:$rd, postreg:$rs), def BIT8mr : I8mr<0b1011, (outs), (ins memsrc:$dst, GR8:$rs), "bit.b\t$rs, $dst", - [(MSP430cmp (and_su (load addr:$dst), GR8:$rs), 0), - (implicit SR)]>; + [(MSP430cmp (and_su (load addr:$dst), GR8:$rs), 0)]>; def BIT16mr : I16mr<0b1011, (outs), (ins memsrc:$dst, GR16:$rs), "bit\t$rs, $dst", - [(MSP430cmp (and_su (load addr:$dst), GR16:$rs), 0), - (implicit SR)]>; + [(MSP430cmp (and_su (load addr:$dst), GR16:$rs), 0)]>; def BIT8mc : I8mc<0b1011, (outs), (ins memsrc:$dst, cg8imm:$imm), "bit.b\t$imm, $dst", - [(MSP430cmp (and_su (load addr:$dst), (i8 cg8imm:$imm)), 0), - (implicit SR)]>; + [(MSP430cmp (and_su (load addr:$dst), (i8 cg8imm:$imm)), 0)]>; def BIT16mc : I16mc<0b1011, (outs), (ins memdst:$dst, cg16imm:$imm), "bit\t$imm, $dst", - [(MSP430cmp (and_su (load addr:$dst), (i16 cg16imm:$imm)), 0), - (implicit SR)]>; + [(MSP430cmp (and_su (load addr:$dst), (i16 
cg16imm:$imm)), 0)]>; def BIT8mi : I8mi<0b1011, (outs), (ins memsrc:$dst, i8imm:$imm), "bit.b\t$imm, $dst", - [(MSP430cmp (and_su (load addr:$dst), (i8 imm:$imm)), 0), - (implicit SR)]>; + [(MSP430cmp (and_su (load addr:$dst), (i8 imm:$imm)), 0)]>; def BIT16mi : I16mi<0b1011, (outs), (ins memsrc:$dst, i16imm:$imm), "bit\t$imm, $dst", - [(MSP430cmp (and_su (load addr:$dst), (i16 imm:$imm)), 0), - (implicit SR)]>; + [(MSP430cmp (and_su (load addr:$dst), (i16 imm:$imm)), 0)]>; def BIT8mm : I8mm<0b1011, (outs), (ins memsrc:$dst, memsrc:$src), "bit.b\t$src, $dst", [(MSP430cmp (and_su (i8 (load addr:$dst)), (load addr:$src)), - 0), - (implicit SR)]>; + 0)]>; def BIT16mm : I16mm<0b1011, (outs), (ins memsrc:$dst, memsrc:$src), "bit\t$src, $dst", [(MSP430cmp (and_su (i16 (load addr:$dst)), (load addr:$src)), - 0), - (implicit SR)]>; + 0)]>; def BIT8mn : I8mn<0b1011, (outs), (ins memsrc:$dst, indreg:$rs), "bit.b\t$rs, $dst", []>; def BIT16mn : I16mn<0b1011, (outs), (ins memsrc:$dst, indreg:$rs), diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0c472c456bd5dd..2e7cf10d48cb62 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -4175,6 +4175,10 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; } return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, ); \ }() +#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode) \ + (IsCacheHint ? 
NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \ + : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode) + static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32, bool IsCacheHint, bool IsIm2Col) { if (IsIm2Col) { @@ -4242,6 +4246,55 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32, } } +static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint, + bool IsIm2Col) { + if (IsIm2Col) { + switch (Dim) { + case 3: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL); + case 4: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL); + case 5: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL); + default: + llvm_unreachable("Invalid Dimension in im2col mode for " + "GetCpAsyncBulkTensorPrefetchOpcode."); + } + } else { + switch (Dim) { + case 1: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE); + case 2: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE); + case 3: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE); + case 4: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE); + case 5: + return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE); + default: + llvm_unreachable("Invalid Dimension in tile mode for " + "GetCpAsyncBulkTensorPrefetchOpcode."); + } + } +} + +static size_t GetDimsFromIntrinsic(unsigned IID) { + switch (IID) { + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: + return 3; + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: + return 4; + case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: + return 5; + default: + llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic."); + } +} + void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col) { // We have {Chain, 
Intrinsic-ID} followed by the actual intrisic args: @@ -4250,21 +4303,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, // multicast_flag, cache_hint_flag} // NumOperands = {Chain, IID} + {Actual intrinsic args} // = {2} + {7 + dims + im2col_offsets} - auto getDimsFromIntrinsic = [](unsigned IID) { - switch (IID) { - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: - return 3; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: - return 4; - case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: - return 5; - default: - llvm_unreachable( - "Invalid im2col intrinsic in SelectCpAsyncBulkTensorG2SCommon."); - } - }; size_t NumOps = N->getNumOperands(); - size_t NumDims = IsIm2Col ? getDimsFromIntrinsic(N->getConstantOperandVal(1)) + size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) : (NumOps - 9); // Offsets is always 'NumDims - 2' and only for im2col mode size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0; @@ -4316,6 +4356,30 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } +void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, + bool IsIm2Col) { + // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: + // {src, dims{d0...dN}, im2col_offsets{dims-2} + // cache_hint, cache_hint_flag} + // NumOperands = {Chain, IID} + {Actual intrinsic args} + // = {2} + {3 + dims + im2col_offsets} + size_t NumOps = N->getNumOperands(); + size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) + : (NumOps - 5); + // Offsets is always 'NumDims - 2' and only for im2col mode + size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0; + bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; + size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 
2 : 1); + + SDLoc DL(N); + SmallVector Ops(N->ops().slice(2, NumArgs)); + Ops.push_back(N->getOperand(0)); // Chain operand + + unsigned Opcode = + GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col); + ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); +} + bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { unsigned IID = N->getConstantOperandVal(1); switch (IID) { @@ -4345,5 +4409,17 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true); return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d: + SelectCpAsyncBulkTensorPrefetchCommon(N); + return true; + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d: + case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d: + SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true); + return true; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 6aa4e9f615a481..d6c80a31b7463d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -94,6 +94,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { void SelectI128toV2I64(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); + void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false); inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } diff --git 
a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 536be22510703d..5878940812f62b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -605,6 +605,52 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +// TMA Prefetch from Global memory to L2 cache +class PREFETCH_STRINGS { + string prefix = "cp.async.bulk.prefetch.tensor"; + string dir = "L2.global"; + string inst_name = prefix + # "." # dim # "d" + # "." # dir + # "." # mode + # !if(ch, ".L2::cache_hint", ""); + string intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_" + # dim # "D" + # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); +} + +multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR { + defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i)); + defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + defvar asm_str_default = " [$tmap, {{" # dims_str # "}}]"; + + defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); + defvar im2col_dag = !if(!eq(mode, "im2col"), + !dag(ins, !listsplat(Int16Regs, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)), + (ins)); + defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", "); + defvar im2col_asm_str = ", {{" # im2col_str # "}}"; + + defvar asm_str = !if(!eq(mode, "im2col"), + !strconcat(asm_str_default, im2col_asm_str), asm_str_default); + + def "": NVPTXInst<(outs), + !con((ins Int64Regs:$tmap), dims_dag, im2col_dag), + !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ";"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; + def _CH: NVPTXInst<(outs), + !con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), + !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ", $ch;"), []>, + Requires<[hasPTX<80>, hasSM<90>]>; +} + +foreach dim = [1, 2, 3, 4, 5] in { + foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { + defm PREFETCH_STRINGS.intr_name : + CP_ASYNC_BULK_TENSOR_PREFETCH_INTR; + } +} + 
//----------------------------------- // MBarrier Functions //----------------------------------- diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 1dd5c5e04d2132..a4d818028c89d5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -6176,7 +6176,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SDValue GA = N->getOperand(0); SDValue TOCbase = N->getOperand(1); - EVT VT = isPPC64 ? MVT::i64 : MVT::i32; + EVT VT = Subtarget->getScalarIntVT(); SDNode *Tmp = CurDAG->getMachineNode( isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA); @@ -6309,7 +6309,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { SDValue ZeroReg = CurDAG->getRegister(Subtarget->isPPC64() ? PPC::ZERO8 : PPC::ZERO, - Subtarget->isPPC64() ? MVT::i64 : MVT::i32); + Subtarget->getScalarIntVT()); unsigned LIOpcode = Subtarget->isPPC64() ? PPC::LI8 : PPC::LI; // v16i8 LD_SPLAT addr // ======> diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index ec4f8f4be425ed..e4582e85e11e79 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -183,6 +183,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, // arguments are at least 4/8 bytes aligned. bool isPPC64 = Subtarget.isPPC64(); setMinStackArgumentAlignment(isPPC64 ? Align(8) : Align(4)); + const MVT RegVT = Subtarget.getScalarIntVT(); // Set up the register classes. addRegisterClass(MVT::i32, &PPC::GPRCRegClass); @@ -198,7 +199,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } } - setOperationAction(ISD::UADDO, isPPC64 ? MVT::i64 : MVT::i32, Custom); + setOperationAction(ISD::UADDO, RegVT, Custom); // Match BITREVERSE to customized fast code sequence in the td file. 
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); @@ -268,32 +269,24 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, if (isPPC64 || Subtarget.hasFPCVT()) { setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Promote); - AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::STRICT_SINT_TO_FP, MVT::i1, RegVT); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Promote); - AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::STRICT_UINT_TO_FP, MVT::i1, RegVT); setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote); - AddPromotedToType (ISD::SINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::SINT_TO_FP, MVT::i1, RegVT); setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote); - AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::UINT_TO_FP, MVT::i1, RegVT); setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i1, Promote); - AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::i1, RegVT); setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i1, Promote); - AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::i1, RegVT); setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote); - AddPromotedToType(ISD::FP_TO_SINT, MVT::i1, - isPPC64 ? MVT::i64 : MVT::i32); + AddPromotedToType(ISD::FP_TO_SINT, MVT::i1, RegVT); setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote); - AddPromotedToType(ISD::FP_TO_UINT, MVT::i1, - isPPC64 ? 
MVT::i64 : MVT::i32); + AddPromotedToType(ISD::FP_TO_UINT, MVT::i1, RegVT); } else { setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i1, Custom); setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i1, Custom); @@ -482,9 +475,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::BSWAP, MVT::i64, Legal); } else { setOperationAction(ISD::BSWAP, MVT::i32, Expand); - setOperationAction( - ISD::BSWAP, MVT::i64, - (Subtarget.hasP9Vector() && Subtarget.isPPC64()) ? Custom : Expand); + setOperationAction(ISD::BSWAP, MVT::i64, + (Subtarget.hasP9Vector() && isPPC64) ? Custom : Expand); } // CTPOP or CTTZ were introduced in P8/P9 respectively @@ -709,7 +701,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); - if (Subtarget.hasLFIWAX() || Subtarget.isPPC64()) { + if (Subtarget.hasLFIWAX() || isPPC64) { setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); } @@ -3191,12 +3183,11 @@ static void setUsesTOCBasePtr(SelectionDAG &DAG) { SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl, SDValue GA) const { - const bool Is64Bit = Subtarget.isPPC64(); - EVT VT = Is64Bit ? MVT::i64 : MVT::i32; - SDValue Reg = Is64Bit ? DAG.getRegister(PPC::X2, VT) - : Subtarget.isAIXABI() - ? DAG.getRegister(PPC::R2, VT) - : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); + EVT VT = Subtarget.getScalarIntVT(); + SDValue Reg = Subtarget.isPPC64() ? DAG.getRegister(PPC::X2, VT) + : Subtarget.isAIXABI() + ? DAG.getRegister(PPC::R2, VT) + : DAG.getNode(PPCISD::GlobalBaseReg, dl, VT); SDValue Ops[] = { GA, Reg }; return DAG.getMemIntrinsicNode( PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT, @@ -4008,8 +3999,8 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, Entry.Node = Trmp; Args.push_back(Entry); // TrampSize == (isPPC64 ? 
48 : 40); - Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40, dl, - isPPC64 ? MVT::i64 : MVT::i32); + Entry.Node = + DAG.getConstant(isPPC64 ? 48 : 40, dl, Subtarget.getScalarIntVT()); Args.push_back(Entry); Entry.Node = FPtr; Args.push_back(Entry); @@ -5237,13 +5228,12 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, MachineFunction &MF = DAG.getMachineFunction(); const PPCSubtarget &Subtarget = MF.getSubtarget(); const PPCFrameLowering *FL = Subtarget.getFrameLowering(); - bool isPPC64 = Subtarget.isPPC64(); - int SlotSize = isPPC64 ? 8 : 4; + int SlotSize = Subtarget.isPPC64() ? 8 : 4; int NewRetAddrLoc = SPDiff + FL->getReturnSaveOffset(); int NewRetAddr = MF.getFrameInfo().CreateFixedObject(SlotSize, NewRetAddrLoc, true); - EVT VT = isPPC64 ? MVT::i64 : MVT::i32; - SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT); + SDValue NewRetAddrFrIdx = + DAG.getFrameIndex(NewRetAddr, Subtarget.getScalarIntVT()); Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(MF, NewRetAddr)); } @@ -5252,14 +5242,14 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain, /// CalculateTailCallArgDest - Remember Argument for later processing. Calculate /// the position of the argument. -static void -CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64, - SDValue Arg, int SPDiff, unsigned ArgOffset, - SmallVectorImpl& TailCallArguments) { +static void CalculateTailCallArgDest( + SelectionDAG &DAG, MachineFunction &MF, bool IsPPC64, SDValue Arg, + int SPDiff, unsigned ArgOffset, + SmallVectorImpl &TailCallArguments) { int Offset = ArgOffset + SPDiff; uint32_t OpSize = (Arg.getValueSizeInBits() + 7) / 8; int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); - EVT VT = isPPC64 ? MVT::i64 : MVT::i32; + EVT VT = IsPPC64 ? 
MVT::i64 : MVT::i32; SDValue FIN = DAG.getFrameIndex(FI, VT); TailCallArgumentInfo Info; Info.Arg = Arg; @@ -5276,9 +5266,9 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr( SDValue &FPOpOut, const SDLoc &dl) const { if (SPDiff) { // Load the LR and FP stack slot for later adjusting. - EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; LROpOut = getReturnAddrFrameIndex(DAG); - LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo()); + LROpOut = DAG.getLoad(Subtarget.getScalarIntVT(), dl, Chain, LROpOut, + MachinePointerInfo()); Chain = SDValue(LROpOut.getNode(), 1); } return Chain; @@ -5320,8 +5310,9 @@ static void LowerMemOpCallTo( MemOpChains.push_back( DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo())); // Calculate and remember argument location. - } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, - TailCallArguments); + } else + CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset, + TailCallArguments); } static void @@ -5672,7 +5663,7 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee, const unsigned TOCAnchorOffset = Subtarget.descriptorTOCAnchorOffset(); const unsigned EnvPtrOffset = Subtarget.descriptorEnvironmentPointerOffset(); - const MVT RegVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + const MVT RegVT = Subtarget.getScalarIntVT(); const Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4); // One load for the functions entry point address. @@ -5724,7 +5715,7 @@ buildCallOperands(SmallVectorImpl &Ops, const PPCSubtarget &Subtarget) { const bool IsPPC64 = Subtarget.isPPC64(); // MVT for a general purpose register. - const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + const MVT RegVT = Subtarget.getScalarIntVT(); // First operand is always the chain. Ops.push_back(Chain); @@ -6867,7 +6858,7 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT, const unsigned PtrSize = IsPPC64 ? 
8 : 4; const Align PtrAlign(PtrSize); const Align StackAlign(16); - const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32; + const MVT RegVT = Subtarget.getScalarIntVT(); if (ValVT == MVT::f128) report_fatal_error("f128 is unimplemented on AIX."); @@ -7818,7 +7809,7 @@ SDValue PPCTargetLowering::LowerCall_AIX( assert(!CFlags.IsTailCall && "Indirect tail-calls not supported."); const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister(); const MCRegister StackPtrReg = Subtarget.getStackPointerRegister(); - const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + const MVT PtrVT = Subtarget.getScalarIntVT(); const unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset(); @@ -8383,7 +8374,7 @@ static SDValue convertFPToInt(SDValue Op, SelectionDAG &DAG, Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src); } if ((DestTy == MVT::i8 || DestTy == MVT::i16) && Subtarget.hasP9Vector()) - DestTy = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; + DestTy = Subtarget.getScalarIntVT(); unsigned Opc = ISD::DELETED_NODE; switch (DestTy.SimpleTy) { default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!"); @@ -11319,11 +11310,11 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Val); } unsigned Opcode = Subtarget.isPPC64() ? PPC::CFENCE8 : PPC::CFENCE; - EVT FTy = Subtarget.isPPC64() ? MVT::i64 : MVT::i32; return SDValue( - DAG.getMachineNode(Opcode, DL, MVT::Other, - DAG.getNode(ISD::ANY_EXTEND, DL, FTy, Val), - Op.getOperand(0)), + DAG.getMachineNode( + Opcode, DL, MVT::Other, + DAG.getNode(ISD::ANY_EXTEND, DL, Subtarget.getScalarIntVT(), Val), + Op.getOperand(0)), 0); } default: @@ -15687,16 +15678,20 @@ static SDValue isScalarToVec(SDValue Op) { // On little endian, that's just the corresponding element in the other // half of the vector. On big endian, it is in the same half but right // justified rather than left justified in that half. 
-static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl &ShuffV, - int LHSMaxIdx, int RHSMinIdx, - int RHSMaxIdx, int HalfVec, - unsigned ValidLaneWidth, - const PPCSubtarget &Subtarget) { - for (int i = 0, e = ShuffV.size(); i < e; i++) { - int Idx = ShuffV[i]; - if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx)) - ShuffV[i] += - Subtarget.isLittleEndian() ? HalfVec : HalfVec - ValidLaneWidth; +static void fixupShuffleMaskForPermutedSToV( + SmallVectorImpl &ShuffV, int LHSFirstElt, int LHSLastElt, + int RHSFirstElt, int RHSLastElt, int HalfVec, unsigned LHSNumValidElts, + unsigned RHSNumValidElts, const PPCSubtarget &Subtarget) { + int LHSEltFixup = + Subtarget.isLittleEndian() ? HalfVec : HalfVec - LHSNumValidElts; + int RHSEltFixup = + Subtarget.isLittleEndian() ? HalfVec : HalfVec - RHSNumValidElts; + for (int I = 0, E = ShuffV.size(); I < E; ++I) { + int Idx = ShuffV[I]; + if (Idx >= LHSFirstElt && Idx <= LHSLastElt) + ShuffV[I] += LHSEltFixup; + else if (Idx >= RHSFirstElt && Idx <= RHSLastElt) + ShuffV[I] += RHSEltFixup; } } @@ -15735,6 +15730,51 @@ static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG, OrigSToV.getOperand(0)); } +static bool isShuffleMaskInRange(const SmallVectorImpl &ShuffV, + int HalfVec, int LHSLastElementDefined, + int RHSLastElementDefined) { + for (int Index : ShuffV) { + if (Index < 0) // Skip explicitly undefined mask indices. + continue; + // Handle first input vector of the vector_shuffle. + if ((LHSLastElementDefined >= 0) && (Index < HalfVec) && + (Index > LHSLastElementDefined)) + return false; + // Handle second input vector of the vector_shuffle. 
+ if ((RHSLastElementDefined >= 0) && + (Index > HalfVec + RHSLastElementDefined)) + return false; + } + return true; +} + +static SDValue generateSToVPermutedForVecShuffle( + int ScalarSize, uint64_t ShuffleEltWidth, unsigned &NumValidElts, + int FirstElt, int &LastElt, SDValue VecShuffOperand, SDValue SToVNode, + SelectionDAG &DAG, const PPCSubtarget &Subtarget) { + EVT VecShuffOperandType = VecShuffOperand.getValueType(); + // Set up the values for the shuffle vector fixup. + NumValidElts = ScalarSize / VecShuffOperandType.getScalarSizeInBits(); + // The last element depends on if the input comes from the LHS or RHS. + // + // For example: + // (shuff (s_to_v i32), (bitcast (s_to_v i64), v4i32), ...) + // + // For the LHS: The last element that comes from the LHS is actually 0, not 3 + // because elements 1 and higher of a scalar_to_vector are undefined. + // For the RHS: The last element that comes from the RHS is actually 5, not 7 + // because elements 1 and higher of a scalar_to_vector are undefined. + // It is also not 4 because the original scalar_to_vector is wider and + // actually contains two i32 elements. + LastElt = (uint64_t)ScalarSize > ShuffleEltWidth + ? ScalarSize / ShuffleEltWidth - 1 + FirstElt + : FirstElt; + SDValue SToVPermuted = getSToVPermuted(SToVNode, DAG, Subtarget); + if (SToVPermuted.getValueType() != VecShuffOperandType) + SToVPermuted = DAG.getBitcast(VecShuffOperandType, SToVPermuted); + return SToVPermuted; +} + // On little endian subtargets, combine shuffles such as: // vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, , %b // into: @@ -15782,36 +15822,25 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, SDValue SToVLHS = isScalarToVec(LHS); SDValue SToVRHS = isScalarToVec(RHS); if (SToVLHS || SToVRHS) { - // FIXME: If both LHS and RHS are SCALAR_TO_VECTOR, but are not the - // same type and have differing element sizes, then do not perform - // the following transformation. 
The current transformation for - // SCALAR_TO_VECTOR assumes that both input vectors have the same - // element size. This will be updated in the future to account for - // differing sizes of the LHS and RHS. - if (SToVLHS && SToVRHS && - (SToVLHS.getValueType().getScalarSizeInBits() != - SToVRHS.getValueType().getScalarSizeInBits())) - return Res; - - int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements() - : SToVRHS.getValueType().getVectorNumElements(); - int NumEltsOut = ShuffV.size(); + EVT VT = SVN->getValueType(0); + uint64_t ShuffleEltWidth = VT.getVectorElementType().getSizeInBits(); + int ShuffleNumElts = ShuffV.size(); + int HalfVec = ShuffleNumElts / 2; // The width of the "valid lane" (i.e. the lane that contains the value that // is vectorized) needs to be expressed in terms of the number of elements // of the shuffle. It is thereby the ratio of the values before and after - // any bitcast. - unsigned ValidLaneWidth = - SToVLHS ? SToVLHS.getValueType().getScalarSizeInBits() / - LHS.getValueType().getScalarSizeInBits() - : SToVRHS.getValueType().getScalarSizeInBits() / - RHS.getValueType().getScalarSizeInBits(); + // any bitcast, which will be set later on if the LHS or RHS are + // SCALAR_TO_VECTOR nodes. + unsigned LHSNumValidElts = HalfVec; + unsigned RHSNumValidElts = HalfVec; // Initially assume that neither input is permuted. These will be adjusted - // accordingly if either input is. - int LHSMaxIdx = -1; - int RHSMinIdx = -1; - int RHSMaxIdx = -1; - int HalfVec = LHS.getValueType().getVectorNumElements() / 2; + // accordingly if either input is. Note, that -1 means that all elements + // are undefined. + int LHSFirstElt = 0; + int RHSFirstElt = ShuffleNumElts; + int LHSLastElt = -1; + int RHSLastElt = -1; // Get the permuted scalar to vector nodes for the source(s) that come from // ISD::SCALAR_TO_VECTOR. 
@@ -15819,34 +15848,38 @@ SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN, // than 64 bits since for 64-bit elements, all instructions already put // the value into element zero. Since scalar size of LHS and RHS may differ // after isScalarToVec, this should be checked using their own sizes. + int LHSScalarSize = 0; + int RHSScalarSize = 0; if (SToVLHS) { - if (!IsLittleEndian && SToVLHS.getValueType().getScalarSizeInBits() >= 64) + LHSScalarSize = SToVLHS.getValueType().getScalarSizeInBits(); + if (!IsLittleEndian && LHSScalarSize >= 64) return Res; - // Set up the values for the shuffle vector fixup. - LHSMaxIdx = NumEltsOut / NumEltsIn; - SToVLHS = getSToVPermuted(SToVLHS, DAG, Subtarget); - if (SToVLHS.getValueType() != LHS.getValueType()) - SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS); - LHS = SToVLHS; } if (SToVRHS) { - if (!IsLittleEndian && SToVRHS.getValueType().getScalarSizeInBits() >= 64) + RHSScalarSize = SToVRHS.getValueType().getScalarSizeInBits(); + if (!IsLittleEndian && RHSScalarSize >= 64) return Res; - RHSMinIdx = NumEltsOut; - RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx; - SToVRHS = getSToVPermuted(SToVRHS, DAG, Subtarget); - if (SToVRHS.getValueType() != RHS.getValueType()) - SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS); - RHS = SToVRHS; } + if (LHSScalarSize != 0) + LHS = generateSToVPermutedForVecShuffle( + LHSScalarSize, ShuffleEltWidth, LHSNumValidElts, LHSFirstElt, + LHSLastElt, LHS, SToVLHS, DAG, Subtarget); + if (RHSScalarSize != 0) + RHS = generateSToVPermutedForVecShuffle( + RHSScalarSize, ShuffleEltWidth, RHSNumValidElts, RHSFirstElt, + RHSLastElt, RHS, SToVRHS, DAG, Subtarget); + + if (!isShuffleMaskInRange(ShuffV, HalfVec, LHSLastElt, RHSLastElt)) + return Res; // Fix up the shuffle mask to reflect where the desired element actually is. 
// The minimum and maximum indices that correspond to element zero for both // the LHS and RHS are computed and will control which shuffle mask entries // are to be changed. For example, if the RHS is permuted, any shuffle mask - // entries in the range [RHSMinIdx,RHSMaxIdx) will be adjusted. - fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx, - HalfVec, ValidLaneWidth, Subtarget); + // entries in the range [RHSFirstElt,RHSLastElt] will be adjusted. + fixupShuffleMaskForPermutedSToV( + ShuffV, LHSFirstElt, LHSLastElt, RHSFirstElt, RHSLastElt, HalfVec, + LHSNumValidElts, RHSNumValidElts, Subtarget); Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV); // We may have simplified away the shuffle. We won't be able to do anything @@ -17319,7 +17352,6 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, // the stack. PPCFunctionInfo *FuncInfo = MF.getInfo(); FuncInfo->setLRStoreRequired(); - bool isPPC64 = Subtarget.isPPC64(); auto PtrVT = getPointerTy(MF.getDataLayout()); if (Depth > 0) { @@ -17331,7 +17363,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, LowerFRAMEADDR(Op, DAG), MachinePointerInfo()); SDValue Offset = DAG.getConstant(Subtarget.getFrameLowering()->getReturnSaveOffset(), dl, - isPPC64 ? 
MVT::i64 : MVT::i32); + Subtarget.getScalarIntVT()); return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo()); diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h index 2079dc0acc3cf7..f6ace4daa336bf 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.h +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h @@ -216,8 +216,8 @@ class PPCSubtarget : public PPCGenSubtargetInfo { bool isSVR4ABI() const { return !isAIXABI(); } bool isELFv2ABI() const; - bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); } - bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); } + bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); } + bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); } bool isUsingPCRelativeCalls() const; /// Originally, this function return hasISEL(). Now we always enable it, @@ -246,6 +246,8 @@ class PPCSubtarget : public PPCGenSubtargetInfo { /// True if the GV will be accessed via an indirect symbol. bool isGVIndirectSymbol(const GlobalValue *GV) const; + MVT getScalarIntVT() const { return isPPC64() ? MVT::i64 : MVT::i32; } + /// Calculates the effective code model for argument GV. 
CodeModel::Model getCodeModel(const TargetMachine &TM, const GlobalValue *GV) const; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 27f15e07e47b8a..1a042375d0720e 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -76,7 +76,6 @@ class RISCVInstructionSelector : public InstructionSelector { bool materializeImm(Register Reg, int64_t Imm, MachineIRBuilder &MIB) const; bool selectAddr(MachineInstr &MI, MachineIRBuilder &MIB, bool IsLocal = true, bool IsExternWeak = false) const; - bool selectSExtInreg(MachineInstr &MI, MachineIRBuilder &MIB) const; bool selectSelect(MachineInstr &MI, MachineIRBuilder &MIB) const; bool selectFPCompare(MachineInstr &MI, MachineIRBuilder &MIB) const; void emitFence(AtomicOrdering FenceOrdering, SyncScope::ID FenceSSID, @@ -599,14 +598,14 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { MRI->getRegClassOrRegBank(DefReg); const TargetRegisterClass *DefRC = - RegClassOrBank.dyn_cast(); + dyn_cast(RegClassOrBank); if (!DefRC) { if (!DefTy.isValid()) { LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n"); return false; } - const RegisterBank &RB = *RegClassOrBank.get(); + const RegisterBank &RB = *cast(RegClassOrBank); DefRC = getRegClassForTypeOnBank(DefTy, RB); if (!DefRC) { LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n"); @@ -761,8 +760,6 @@ bool RISCVInstructionSelector::select(MachineInstr &MI) { MI.setDesc(TII.get(RISCV::PseudoBRIND)); MI.addOperand(MachineOperand::CreateImm(0)); return constrainSelectedInstRegOperands(MI, TII, TRI, RBI); - case TargetOpcode::G_SEXT_INREG: - return selectSExtInreg(MI, MIB); case TargetOpcode::G_FRAME_INDEX: { // TODO: We may want to replace this code with the SelectionDAG patterns, // which fail to get imported because it uses FrameAddrRegImm, which is a @@ -1160,31 +1157,6 @@ bool 
RISCVInstructionSelector::selectAddr(MachineInstr &MI, return false; } -bool RISCVInstructionSelector::selectSExtInreg(MachineInstr &MI, - MachineIRBuilder &MIB) const { - Register DstReg = MI.getOperand(0).getReg(); - Register SrcReg = MI.getOperand(1).getReg(); - unsigned SrcSize = MI.getOperand(2).getImm(); - - MachineInstr *NewMI; - if (SrcSize == 32) { - assert(Subtarget->is64Bit() && "Unexpected extend"); - // addiw rd, rs, 0 (i.e. sext.w rd, rs) - NewMI = MIB.buildInstr(RISCV::ADDIW, {DstReg}, {SrcReg}).addImm(0U); - } else { - assert(Subtarget->hasStdExtZbb() && "Unexpected extension"); - assert((SrcSize == 8 || SrcSize == 16) && "Unexpected size"); - unsigned Opc = SrcSize == 16 ? RISCV::SEXT_H : RISCV::SEXT_B; - NewMI = MIB.buildInstr(Opc, {DstReg}, {SrcReg}); - } - - if (!constrainSelectedInstRegOperands(*NewMI, TII, TRI, RBI)) - return false; - - MI.eraseFromParent(); - return true; -} - bool RISCVInstructionSelector::selectSelect(MachineInstr &MI, MachineIRBuilder &MIB) const { auto &SelectMI = cast(MI); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index ba4442fe613c70..2643a1a708dd25 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -30,18 +30,6 @@ using namespace llvm; using namespace LegalityPredicates; using namespace LegalizeMutations; -// Is this type supported by scalar FP arithmetic operations given the current -// subtarget. 
-static LegalityPredicate typeIsScalarFPArith(unsigned TypeIdx, - const RISCVSubtarget &ST) { - return [=, &ST](const LegalityQuery &Query) { - return Query.Types[TypeIdx].isScalar() && - ((ST.hasStdExtZfh() && Query.Types[TypeIdx].getSizeInBits() == 16) || - (ST.hasStdExtF() && Query.Types[TypeIdx].getSizeInBits() == 32) || - (ST.hasStdExtD() && Query.Types[TypeIdx].getSizeInBits() == 64)); - }; -} - static LegalityPredicate typeIsLegalIntOrFPVec(unsigned TypeIdx, std::initializer_list IntOrFPVecTys, @@ -158,7 +146,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}) .lower(); - getActionDefinitionsBuilder({G_ASHR, G_LSHR, G_SHL}) + getActionDefinitionsBuilder({G_SHL, G_ASHR, G_LSHR}) .legalFor({{s32, s32}, {sXLen, sXLen}}) .widenScalarToNextPow2(0) .clampScalar(1, s32, sXLen) @@ -202,7 +190,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder({G_FSHL, G_FSHR}).lower(); - getActionDefinitionsBuilder({G_ROTL, G_ROTR}) + getActionDefinitionsBuilder({G_ROTR, G_ROTL}) .legalFor(ST.hasStdExtZbb() || ST.hasStdExtZbkb(), {{sXLen, sXLen}}) .customFor(ST.is64Bit() && (ST.hasStdExtZbb() || ST.hasStdExtZbkb()), {{s32, s32}}) @@ -456,11 +444,17 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) } if (ST.hasStdExtM()) { - getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM, G_SREM}) - .legalFor({s32, sXLen}) + getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_UREM}) + .legalFor({sXLen}) + .customFor({s32}) .libcallFor({sDoubleXLen}) .clampScalar(0, s32, sDoubleXLen) .widenScalarToNextPow2(0); + getActionDefinitionsBuilder(G_SREM) + .legalFor({sXLen}) + .libcallFor({sDoubleXLen}) + .clampScalar(0, sXLen, sDoubleXLen) + .widenScalarToNextPow2(0); } else { getActionDefinitionsBuilder({G_UDIV, G_SDIV, G_UREM, G_SREM}) .libcallFor({sXLen, sDoubleXLen}) @@ -469,7 +463,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget 
&ST) } // TODO: Use libcall for sDoubleXLen. - getActionDefinitionsBuilder({G_UDIVREM, G_SDIVREM}).lower(); + getActionDefinitionsBuilder({G_SDIVREM, G_UDIVREM}).lower(); getActionDefinitionsBuilder(G_ABS) .customFor(ST.hasStdExtZbb(), {sXLen}) @@ -492,7 +486,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FNEG, G_FABS, G_FSQRT, G_FMAXNUM, G_FMINNUM}) - .legalIf(typeIsScalarFPArith(0, ST)); + .legalFor(ST.hasStdExtF(), {s32}) + .legalFor(ST.hasStdExtD(), {s64}) + .legalFor(ST.hasStdExtZfh(), {s16}); getActionDefinitionsBuilder(G_FREM) .libcallFor({s32, s64}) @@ -500,51 +496,55 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) .scalarize(0); getActionDefinitionsBuilder(G_FCOPYSIGN) - .legalIf(all(typeIsScalarFPArith(0, ST), typeIsScalarFPArith(1, ST))); + .legalFor(ST.hasStdExtF(), {{s32, s32}}) + .legalFor(ST.hasStdExtD(), {{s64, s64}, {s32, s64}, {s64, s32}}) + .legalFor(ST.hasStdExtZfh(), {{s16, s16}, {s16, s32}, {s32, s16}}) + .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s16, s64}, {s64, s16}}); // FIXME: Use Zfhmin. 
- getActionDefinitionsBuilder(G_FPTRUNC).legalIf( - [=, &ST](const LegalityQuery &Query) -> bool { - return (ST.hasStdExtD() && typeIs(0, s32)(Query) && - typeIs(1, s64)(Query)) || - (ST.hasStdExtZfh() && typeIs(0, s16)(Query) && - typeIs(1, s32)(Query)) || - (ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s16)(Query) && - typeIs(1, s64)(Query)); - }); - getActionDefinitionsBuilder(G_FPEXT).legalIf( - [=, &ST](const LegalityQuery &Query) -> bool { - return (ST.hasStdExtD() && typeIs(0, s64)(Query) && - typeIs(1, s32)(Query)) || - (ST.hasStdExtZfh() && typeIs(0, s32)(Query) && - typeIs(1, s16)(Query)) || - (ST.hasStdExtZfh() && ST.hasStdExtD() && typeIs(0, s64)(Query) && - typeIs(1, s16)(Query)); - }); + getActionDefinitionsBuilder(G_FPTRUNC) + .legalFor(ST.hasStdExtD(), {{s32, s64}}) + .legalFor(ST.hasStdExtZfh(), {{s16, s32}}) + .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s16, s64}}); + getActionDefinitionsBuilder(G_FPEXT) + .legalFor(ST.hasStdExtD(), {{s64, s32}}) + .legalFor(ST.hasStdExtZfh(), {{s32, s16}}) + .legalFor(ST.hasStdExtZfh() && ST.hasStdExtD(), {{s64, s16}}); getActionDefinitionsBuilder(G_FCMP) - .legalIf(all(typeIs(0, sXLen), typeIsScalarFPArith(1, ST))) - .clampScalar(0, sXLen, sXLen); + .legalFor(ST.hasStdExtF(), {{sXLen, s32}}) + .legalFor(ST.hasStdExtD(), {{sXLen, s64}}) + .legalFor(ST.hasStdExtZfh(), {{sXLen, s16}}) + .clampScalar(ST.hasStdExtF(), 0, sXLen, sXLen); // TODO: Support vector version of G_IS_FPCLASS. 
getActionDefinitionsBuilder(G_IS_FPCLASS) - .customIf(all(typeIs(0, s1), typeIsScalarFPArith(1, ST))); + .customFor(ST.hasStdExtF(), {{s1, s32}}) + .customFor(ST.hasStdExtD(), {{s1, s64}}) + .customFor(ST.hasStdExtZfh(), {{s1, s16}}); getActionDefinitionsBuilder(G_FCONSTANT) - .legalIf(typeIsScalarFPArith(0, ST)) + .legalFor(ST.hasStdExtF(), {s32}) + .legalFor(ST.hasStdExtD(), {s64}) + .legalFor(ST.hasStdExtZfh(), {s16}) .lowerFor({s32, s64}); - auto &FPToIActions = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}); - FPToIActions.legalIf(all(typeInSet(0, {sXLen}), typeIsScalarFPArith(1, ST))); - if (ST.is64Bit()) - FPToIActions.customIf(all(typeInSet(0, {s32}), typeIsScalarFPArith(1, ST))); - FPToIActions.widenScalarToNextPow2(0) + getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) + .legalFor(ST.hasStdExtF(), {{sXLen, s32}}) + .legalFor(ST.hasStdExtD(), {{sXLen, s64}}) + .legalFor(ST.hasStdExtZfh(), {{sXLen, s16}}) + .customFor(ST.is64Bit() && ST.hasStdExtF(), {{s32, s32}}) + .customFor(ST.is64Bit() && ST.hasStdExtD(), {{s32, s64}}) + .customFor(ST.is64Bit() && ST.hasStdExtZfh(), {{s32, s16}}) + .widenScalarToNextPow2(0) .minScalar(0, s32) .libcallFor({{s32, s32}, {s64, s32}, {s32, s64}, {s64, s64}}) .libcallFor(ST.is64Bit(), {{s128, s32}, {s128, s64}}); getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) - .legalIf(all(typeIsScalarFPArith(0, ST), typeInSet(1, {sXLen}))) + .legalFor(ST.hasStdExtF(), {{s32, sXLen}}) + .legalFor(ST.hasStdExtD(), {{s64, sXLen}}) + .legalFor(ST.hasStdExtZfh(), {{s16, sXLen}}) .widenScalarToNextPow2(1) .minScalar(1, sXLen) .libcallFor({{s32, s32}, {s64, s32}, {s32, s64}, {s64, s64}}) @@ -626,6 +626,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST))); getLegacyLegalizerInfo().computeTables(); + verify(*ST.getInstrInfo()); } bool RISCVLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, @@ -1165,6 +1166,12 @@ static unsigned getRISCVWOpcode(unsigned Opcode) { switch 
(Opcode) { default: llvm_unreachable("Unexpected opcode"); + case TargetOpcode::G_SDIV: + return RISCV::G_DIVW; + case TargetOpcode::G_UDIV: + return RISCV::G_DIVUW; + case TargetOpcode::G_UREM: + return RISCV::G_REMUW; case TargetOpcode::G_ROTL: return RISCV::G_ROLW; case TargetOpcode::G_ROTR: @@ -1216,6 +1223,9 @@ bool RISCVLegalizerInfo::legalizeCustom( return Helper.lower(MI, 0, /* Unused hint type */ LLT()) == LegalizerHelper::Legalized; } + case TargetOpcode::G_SDIV: + case TargetOpcode::G_UDIV: + case TargetOpcode::G_UREM: case TargetOpcode::G_ROTL: case TargetOpcode::G_ROTR: { Helper.Observer.changingInstr(MI); diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td index 60d942957c8861..22a447c2649b29 100644 --- a/llvm/lib/Target/RISCV/RISCVCombine.td +++ b/llvm/lib/Target/RISCV/RISCVCombine.td @@ -23,6 +23,7 @@ def RISCVO0PreLegalizerCombiner: GICombiner< // TODO: Add more combines. def RISCVPostLegalizerCombiner : GICombiner<"RISCVPostLegalizerCombinerImpl", - [combines_for_extload, redundant_and, identity_combines, - commute_constant_to_rhs, constant_fold_cast_op]> { + [sub_to_add, combines_for_extload, redundant_and, + identity_combines, shift_immed_chain, commute_constant_to_rhs, + constant_fold_cast_op]> { } diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 10906aebf1bf84..83018f28176564 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -106,15 +106,6 @@ def gi_zexti16 : GIComplexOperandMatcher">, def gi_zexti8 : GIComplexOperandMatcher">, GIComplexPatternEquiv; -// FIXME: Canonicalize (sub X, C) -> (add X, -C) earlier. 
-def : Pat<(XLenVT (sub GPR:$rs1, simm12Plus1:$imm)), - (ADDI GPR:$rs1, (NegImm simm12Plus1:$imm))>; - -let Predicates = [IsRV64] in { -def : Pat<(i32 (sub GPR:$rs1, simm12Plus1i32:$imm)), - (ADDIW GPR:$rs1, (i64 (NegImm $imm)))>; -} - // Ptr type used in patterns with GlobalISelEmitter def PtrVT : PtrValueTypeByHwMode; @@ -255,13 +246,6 @@ let Predicates = [HasStdExtZmmul, IsRV64] in { def : PatGprGpr; } -let Predicates = [HasStdExtM, IsRV64] in { -def : PatGprGpr; -def : PatGprGpr; -def : PatGprGpr; -def : PatGprGpr; -} - //===----------------------------------------------------------------------===// // Zb* RV64 i32 patterns not used by SelectionDAG. //===----------------------------------------------------------------------===// @@ -280,7 +264,7 @@ def : Pat<(i32 (xor GPR:$rs1, (not GPR:$rs2))), (XNOR GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasStdExtZbbOrZbkb, IsRV64] let Predicates = [HasStdExtZba, IsRV64] in { -def : Pat<(shl (i64 (zext i32:$rs1)), uimm5:$shamt), +def : Pat<(shl (i64 (zext GPR:$rs1)), uimm5:$shamt), (SLLI_UW GPR:$rs1, uimm5:$shamt)>; def : Pat<(i64 (add_like_non_imm12 (zext GPR:$rs1), GPR:$rs2)), diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8b5a79990d7c61..831b0b30d47fcc 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2516,7 +2516,9 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const { case MVT::i64: return Subtarget.hasVInstructionsI64(); case MVT::f16: - return Subtarget.hasVInstructionsF16(); + return Subtarget.hasVInstructionsF16Minimal(); + case MVT::bf16: + return Subtarget.hasVInstructionsBF16Minimal(); case MVT::f32: return Subtarget.hasVInstructionsF32(); case MVT::f64: @@ -21519,12 +21521,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType( if (!isTypeLegal(VT)) return false; - // TODO: Move bf16/f16 support into isLegalElementTypeForRVV - if 
(!(isLegalElementTypeForRVV(VT.getScalarType()) || - (VT.getScalarType() == MVT::bf16 && - Subtarget.hasVInstructionsBF16Minimal()) || - (VT.getScalarType() == MVT::f16 && - Subtarget.hasVInstructionsF16Minimal())) || + if (!isLegalElementTypeForRVV(VT.getScalarType()) || !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace, Alignment)) return false; @@ -21564,10 +21561,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - // TODO: Move bf16/f16 support into isLegalElementTypeForRVV - if (!(isLegalElementTypeForRVV(ScalarType) || - (ScalarType == MVT::bf16 && Subtarget.hasVInstructionsBF16Minimal()) || - (ScalarType == MVT::f16 && Subtarget.hasVInstructionsF16Minimal()))) + if (!isLegalElementTypeForRVV(ScalarType)) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td index 32e63977b51e68..bf2f8663cfa156 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td +++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td @@ -17,6 +17,30 @@ class RISCVGenericInstruction : GenericInstruction { let Namespace = "RISCV"; } +// Pseudo equivalent to a RISCVISD::DIVW. +def G_DIVW : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = false; +} +def : GINodeEquiv; + +// Pseudo equivalent to a RISCVISD::DIVUW. +def G_DIVUW : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = false; +} +def : GINodeEquiv; + +// Pseudo equivalent to a RISCVISD::REMUW. +def G_REMUW : RISCVGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = false; +} +def : GINodeEquiv; + // Pseudo equivalent to a RISCVISD::RORW. 
def G_RORW : RISCVGenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index 0df0187d40889b..021c4b3b724b02 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -25,6 +25,7 @@ def rvv_vnot : PatFrag<(ops node:$in), (xor node:$in, (riscv_vmset_vl (XLenVT srcvalue)))>; multiclass VPatUSLoadStoreSDNode; // Store - def : Pat<(store type:$rs2, (XLenVT GPR:$rs1)), + def : Pat<(store (type regclass:$rs2), (XLenVT GPR:$rs1)), (store_instr reg_class:$rs2, GPR:$rs1, avl, log2sew)>; } @@ -49,7 +50,7 @@ multiclass VPatUSLoadStoreMaskSDNode { (load_instr (m.Mask (IMPLICIT_DEF)), GPR:$rs1, m.AVL, m.Log2SEW, TA_MA)>; // Store - def : Pat<(store m.Mask:$rs2, GPR:$rs1), + def : Pat<(store (m.Mask VR:$rs2), GPR:$rs1), (store_instr VR:$rs2, GPR:$rs1, m.AVL, m.Log2SEW)>; } @@ -884,7 +885,7 @@ multiclass VPatAVGADD_VV_VX_RM { foreach vti = AllVectors in let Predicates = !if(!eq(vti.Scalar, f16), [HasVInstructionsF16Minimal], GetVTypePredicates.Predicates) in - defm : VPatUSLoadStoreSDNode; foreach mti = AllMasks in let Predicates = [HasVInstructions] in diff --git a/llvm/lib/Target/RISCV/RISCVProfiles.td b/llvm/lib/Target/RISCV/RISCVProfiles.td index cbf2a2eddf38ed..bcb776e682aea7 100644 --- a/llvm/lib/Target/RISCV/RISCVProfiles.td +++ b/llvm/lib/Target/RISCV/RISCVProfiles.td @@ -45,9 +45,7 @@ defvar RVA22U64Features = !listconcat(RVA20U64BaseFeatures, [FeatureStdExtZa64rs, FeatureStdExtZihpm, FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, + FeatureStdExtB, FeatureStdExtZic64b, FeatureStdExtZicbom, FeatureStdExtZicbop, @@ -92,9 +90,7 @@ defvar RVB23U64Features = !listconcat(RVA20U64BaseFeatures, [FeatureStdExtZihpm, FeatureStdExtZa64rs, FeatureStdExtZihintpause, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, + FeatureStdExtB, 
FeatureStdExtZic64b, FeatureStdExtZicbom, FeatureStdExtZicbop, @@ -128,9 +124,7 @@ defvar RVB23S64Features = !listconcat(RVB23U64Features, defvar RVM23U32Features = [Feature32Bit, FeatureStdExtI, FeatureStdExtM, - FeatureStdExtZba, - FeatureStdExtZbb, - FeatureStdExtZbs, + FeatureStdExtB, FeatureStdExtZicond, FeatureStdExtZihintpause, FeatureStdExtZihintntl, diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index a5afbcfd79710f..6a97755c279a29 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -53,6 +53,13 @@ static cl::opt EnableGlobalMerge("riscv-enable-global-merge", cl::Hidden, cl::desc("Enable the global merge pass")); +static cl::opt ForceEnableGlobalMergeExternalGlobals( + "riscv-force-enable-global-merge-external-globals", cl::Hidden, + cl::init(false), + cl::desc( + "If the global merge pass is enabled, force enable global merging of " + "external globals (overriding any logic that might disable it)")); + static cl::opt EnableMachineCombiner("riscv-enable-machine-combiner", cl::desc("Enable the machine combiner pass"), @@ -472,7 +479,8 @@ bool RISCVPassConfig::addPreISel() { if (EnableGlobalMerge == cl::BOU_TRUE) { addPass(createGlobalMergePass(TM, /* MaxOffset */ 2047, /* OnlyOptimizeForSize */ false, - /* MergeExternalByDefault */ true)); + /* MergeExternalByDefault */ + ForceEnableGlobalMergeExternalGlobals)); } return false; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 4c01c1679cd818..498f48353dc0c7 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -239,12 +239,7 @@ class RISCVTTIImpl : public BasicTTIImplBase { if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - // TODO: Move bf16/f16 support into isLegalElementTypeForRVV - return 
TLI->isLegalElementTypeForRVV(ElemType) || - (DataTypeVT.getVectorElementType() == MVT::bf16 && - ST->hasVInstructionsBF16Minimal()) || - (DataTypeVT.getVectorElementType() == MVT::f16 && - ST->hasVInstructionsF16Minimal()); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedLoad(Type *DataType, Align Alignment) { @@ -274,12 +269,7 @@ class RISCVTTIImpl : public BasicTTIImplBase { if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; - // TODO: Move bf16/f16 support into isLegalElementTypeForRVV - return TLI->isLegalElementTypeForRVV(ElemType) || - (DataTypeVT.getVectorElementType() == MVT::bf16 && - ST->hasVInstructionsBF16Minimal()) || - (DataTypeVT.getVectorElementType() == MVT::f16 && - ST->hasVInstructionsF16Minimal()); + return TLI->isLegalElementTypeForRVV(ElemType); } bool isLegalMaskedGather(Type *DataType, Align Alignment) { @@ -341,6 +331,12 @@ class RISCVTTIImpl : public BasicTTIImplBase { if (!TLI->isLegalElementTypeForRVV(TLI->getValueType(DL, Ty))) return false; + // We can't promote f16/bf16 fadd reductions and scalable vectors can't be + // expanded. 
+ // TODO: Promote f16/bf16 fmin/fmax reductions + if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16())) + return false; + switch (RdxDesc.getRecurrenceKind()) { case RecurKind::Add: case RecurKind::FAdd: diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index 3a5dda946adfba..7bffeefc788a2f 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -141,8 +141,8 @@ yaml::WebAssemblyFunctionInfo::WebAssemblyFunctionInfo( for (const auto &MBB : MF) MBBs.insert(&MBB); for (auto KV : EHInfo->SrcToUnwindDest) { - auto *SrcBB = KV.first.get(); - auto *DestBB = KV.second.get(); + auto *SrcBB = cast(KV.first); + auto *DestBB = cast(KV.second); if (MBBs.count(SrcBB) && MBBs.count(DestBB)) SrcToUnwindDest[SrcBB->getNumber()] = DestBB->getNumber(); } diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index ea9571f23e6fa5..41a646621c7ead 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -13,7 +13,6 @@ #include "MCTargetDesc/X86MCTargetDesc.h" #include "MCTargetDesc/X86TargetStreamer.h" #include "TargetInfo/X86TargetInfo.h" -#include "X86AsmParserCommon.h" #include "X86Operand.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" diff --git a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp index 3487efbbbec097..e84ee879a8323b 100644 --- a/llvm/lib/Target/X86/GISel/X86CallLowering.cpp +++ b/llvm/lib/Target/X86/GISel/X86CallLowering.cpp @@ -43,7 +43,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/Value.h" -#include "llvm/MC/MCRegisterInfo.h" #include #include diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp 
b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp index d2ee0f1bac6831..d425a0d507524a 100644 --- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp +++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp @@ -38,7 +38,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/IntrinsicsX86.h" -#include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp index c811d621e60eb7..0f76808f55bc7d 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp @@ -19,7 +19,6 @@ #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 2f6b55b0d6023e..8be8f0b6d735c4 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -23,11 +23,9 @@ #include "llvm/MC/MCFixupKindInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectStreamer.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/TargetRegistry.h" diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp index 8e7dae229275bb..c2b284ad924d00 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp +++ 
b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include #include diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 511df758f59681..1c4d68d5448d6f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -25,7 +25,6 @@ #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MachineLocation.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Host.h" diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 59780ba5b99fcf..35bbffdb20942d 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -280,6 +280,9 @@ def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512", "HasAMXAVX512", "true", "Support AMX-AVX512 instructions", [FeatureAMXTILE]>; +def FeatureAMXTF32 : SubtargetFeature<"amx-tf32", "HasAMXTF32", "true", + "Support AMX-TF32 instructions", + [FeatureAMXTILE]>; def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true", "Support CMPCCXADD instructions">; def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true", diff --git a/llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp b/llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp index 7ce1960b57a450..19c4751e50f719 100644 --- a/llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp +++ b/llvm/lib/Target/X86/X86ArgumentStackSlotRebase.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" @@ -23,7 +22,6 @@ 
#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/Attributes.h" diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp index a72eeb53915d65..1919923f41bb20 100644 --- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -44,7 +44,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 154cb1399880bc..b85d9d9a7e535b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -15,7 +15,6 @@ #include "X86Subtarget.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" -#include "llvm/IR/CallingConv.h" #include "llvm/IR/Module.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index a909440f983173..6fb480c37e1ff8 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -37,7 +37,6 @@ //===----------------------------------------------------------------------===// #include "MCTargetDesc/X86BaseInfo.h" -#include "MCTargetDesc/X86InstComments.h" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp index becd221e1e86ac..b5717081361ed1 100644 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ 
b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp @@ -12,9 +12,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" -#include "X86InstrInfo.h" -#include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -22,7 +19,6 @@ #include "llvm/ProfileData/SampleProf.h" #include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/Debug.h" -#include "llvm/Transforms/IPO/SampleProfile.h" #include using namespace llvm; diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp index 9c667f5036dd56..a3547f802976bf 100644 --- a/llvm/lib/Target/X86/X86DomainReassignment.cpp +++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp @@ -15,7 +15,6 @@ #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp index fc48055b2fac73..1e88824c688790 100644 --- a/llvm/lib/Target/X86/X86DynAllocaExpander.cpp +++ b/llvm/lib/Target/X86/X86DynAllocaExpander.cpp @@ -15,7 +15,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -27,7 +26,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Function.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 9511a82f0e97d2..4f045d78f75fb2 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -14,7 +14,6 @@ #include "X86.h" 
#include "X86FrameLowering.h" -#include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" @@ -756,7 +755,9 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUSDV: case X86::PTDPBUUDV: case X86::PTDPBF16PSV: - case X86::PTDPFP16PSV: { + case X86::PTDPFP16PSV: + case X86::PTMMULTF32PSV: + case X86::PTTMMULTF32PSV: { MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.removeOperand(i); @@ -770,6 +771,13 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB, case X86::PTDPBUUDV: Opc = X86::TDPBUUD; break; case X86::PTDPBF16PSV: Opc = X86::TDPBF16PS; break; case X86::PTDPFP16PSV: Opc = X86::TDPFP16PS; break; + case X86::PTMMULTF32PSV: + Opc = X86::TMMULTF32PS; + break; + case X86::PTTMMULTF32PSV: + Opc = X86::TTMMULTF32PS; + break; + default: llvm_unreachable("Unexpected Opcode"); } diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 9e4e5547c642cc..039b8929c93a8e 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -30,7 +30,6 @@ #include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GetElementPtrTypeIterator.h" -#include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" diff --git a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp index 62d0f6ca794348..3812ea0ebd3f39 100644 --- a/llvm/lib/Target/X86/X86FastPreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastPreTileConfig.cpp @@ -29,7 +29,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp index 72264dd6a5c38f..c2305b24d6e5cb 100644 --- 
a/llvm/lib/Target/X86/X86FastTileConfig.cpp +++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp @@ -20,7 +20,6 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" -#include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -29,7 +28,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" -#include "llvm/InitializePasses.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp index 8ffd971515a66f..8c10a078046868 100644 --- a/llvm/lib/Target/X86/X86FixupInstTuning.cpp +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -27,7 +27,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index ab8b3dc3dd6d58..ea1b6e97aa32db 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -21,7 +21,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/DepthFirstIterator.h" @@ -49,7 +48,6 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Pass.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index a35b04606e595d..4d40c23eb5617a 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -32,7 +32,6 @@ #include "llvm/MC/MCAsmInfo.h" #include 
"llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/Support/Debug.h" #include "llvm/Support/LEB128.h" #include "llvm/Target/TargetOptions.h" #include diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index aea86c280e2f99..8ad8641d1de485 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -14,7 +14,6 @@ #include "X86ISelDAGToDAG.h" #include "X86.h" #include "X86MachineFunctionInfo.h" -#include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/Statistic.h" @@ -3585,11 +3584,10 @@ static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, // be transferred from a node in the pattern to the result node, probably with // a new keyword. For example, we have this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", -// [(store (add (loadi64 addr:$dst), -1), addr:$dst), -// (implicit EFLAGS)]>; +// [(store (add (loadi64 addr:$dst), -1), addr:$dst)]>; // but maybe need something like this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", -// [(store (add (loadi64 addr:$dst), -1), addr:$dst), +// [(store (X86add_flag (loadi64 addr:$dst), -1), addr:$dst), // (transferrable EFLAGS)]>; // // Until then, we manually fold these and instruction select the operation diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 19a85a6d7ec6ce..c08efc9f7271e6 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -14,20 +14,17 @@ #include "X86ISelLowering.h" #include "MCTargetDesc/X86ShuffleDecode.h" #include "X86.h" -#include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86IntrinsicsInfo.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" -#include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include 
"llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" @@ -12640,6 +12637,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); + int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; }); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. @@ -12759,16 +12757,28 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, if (VT == MVT::v4f64 || VT == MVT::v4i64) return SDValue(); - // Only broadcast the zero-element of a 128-bit subvector. - if ((BitOffset % 128) != 0) - return SDValue(); + // If we are broadcasting an element from the lowest 128-bit subvector, try + // to move the element in position. + if (BitOffset < 128 && NumActiveElts > 1 && + V.getScalarValueSizeInBits() == NumEltBits) { + assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && + "Unexpected bit-offset"); + SmallVector ExtractMask(128 / NumEltBits, SM_SentinelUndef); + ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits(); + V = extractSubVector(V, 0, DAG, DL, 128); + V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask); + } else { + // Only broadcast the zero-element of a 128-bit subvector. 
+ if ((BitOffset % 128) != 0) + return SDValue(); - assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && - "Unexpected bit-offset"); - assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && - "Unexpected vector size"); - unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); - V = extract128BitVector(V, ExtractIdx, DAG, DL); + assert((BitOffset % V.getScalarValueSizeInBits()) == 0 && + "Unexpected bit-offset"); + assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && + "Unexpected vector size"); + unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits(); + V = extract128BitVector(V, ExtractIdx, DAG, DL); + } } // On AVX we can use VBROADCAST directly for scalar sources. @@ -14910,6 +14920,7 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( SmallVector InLaneMask(NumElts, SM_SentinelUndef); // CrossLaneMask but one entry == one sublane. SmallVector CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef); + APInt DemandedCrossLane = APInt::getZero(NumElts); for (int i = 0; i != NumElts; ++i) { int M = Mask[i]; @@ -14932,6 +14943,7 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( CrossLaneMaskLarge[DstSublane] = SrcSublane; int DstSublaneOffset = DstSublane * NumEltsPerSublane; InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane; + DemandedCrossLane.setBit(InLaneMask[i]); break; } if (!Found) @@ -14966,6 +14978,12 @@ static SDValue lowerShuffleAsLanePermuteAndPermute( if (CrossLaneMask == Mask || InLaneMask == Mask) return SDValue(); + // Simplify CrossLaneMask based on the actual demanded elements. 
+ if (V1.hasOneUse()) + for (int i = 0; i != NumElts; ++i) + if (!DemandedCrossLane[i]) + CrossLaneMask[i] = SM_SentinelUndef; + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); @@ -27331,8 +27349,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::x86_t2rpntlvwz0t1_internal: case Intrinsic::x86_t2rpntlvwz1_internal: case Intrinsic::x86_t2rpntlvwz1t1_internal: { - if (!Subtarget.hasAMXTILE()) - break; auto *X86MFI = DAG.getMachineFunction().getInfo(); X86MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); unsigned IntNo = Op.getConstantOperandVal(1); @@ -37473,7 +37489,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBF8PS: case X86::PTDPBHF8PS: case X86::PTDPHBF8PS: - case X86::PTDPHF8PS: { + case X86::PTDPHF8PS: + case X86::PTMMULTF32PS: + case X86::PTTMMULTF32PS: { unsigned Opc; switch (MI.getOpcode()) { // clang-format off @@ -37488,6 +37506,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break; case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break; case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break; + case X86::PTMMULTF32PS: Opc = X86::TMMULTF32PS; break; + case X86::PTTMMULTF32PS: Opc = X86::TTMMULTF32PS; break; // clang-format on } diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 12cd92e2d0d773..084bd479f9a623 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -18,7 +18,6 @@ #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" -#include "X86TargetObjectFile.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ObjCARCUtil.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp 
b/llvm/lib/Target/X86/X86IndirectThunks.cpp index 4f4a8d8bd09d51..c5a5e6e621ffea 100644 --- a/llvm/lib/Target/X86/X86IndirectThunks.cpp +++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp @@ -31,17 +31,11 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp index 5d6b829aea7ec0..953b755a0ca4c4 100644 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp @@ -19,9 +19,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" -#include "X86InstrInfo.h" -#include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineModuleInfo.h" diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index b954c977f8c6c9..04527716e31627 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -516,3 +516,55 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in { TILE:$src3, GR32:$src4))]>; } } + +let Predicates = [HasAMXTF32, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + def TMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "tmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + []>, VEX, VVVV, T8, 
PD; + } + let Constraints = "$src4 = $dst" in { + def PTMMULTF32PSV : PseudoI<(outs TILE:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, + TILE:$src4, TILE:$src5, TILE:$src6), + [(set TILE:$dst, + (int_x86_tmmultf32ps_internal GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6))]>; + } + let usesCustomInserter = 1 in { + def PTMMULTF32PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_tmmultf32ps timm:$src1, timm:$src2, + timm:$src3)]>; + } + } // SchedRW = [WriteSystem] +} // HasAMXTF32 + +let Predicates = [HasAMXTF32, HasAMXTRANSPOSE, In64BitMode] in { + let SchedRW = [WriteSystem] in { + let Constraints = "$src1 = $dst" in { + def TTMMULTF32PS: I<0x48, MRMSrcReg4VOp3, (outs TILE:$dst), + (ins TILE:$src1, TILE:$src2, TILE:$src3), + "ttmmultf32ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", + []>, VEX, VVVV, T8, PS; + } + let Constraints = "$src4 = $dst" in { + def PTTMMULTF32PSV : PseudoI<(outs TILE:$dst), + (ins GR16:$src1, GR16:$src2, GR16:$src3, + TILE:$src4, TILE:$src5, TILE:$src6), + [(set TILE:$dst, + (int_x86_ttmmultf32ps_internal GR16:$src1, + GR16:$src2, GR16:$src3, TILE:$src4, + TILE:$src5, TILE:$src6))]>; + } + let usesCustomInserter = 1 in { + def PTTMMULTF32PS : PseudoI<(outs), + (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), + [(int_x86_ttmmultf32ps timm:$src1, timm:$src2, + timm:$src3)]>; + } + } // SchedRW = [WriteSystem] +} // HasAMXTF32, HasAMXTRANSPOSE diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td index ffa8a105e2d193..16ca2882a84daf 100644 --- a/llvm/lib/Target/X86/X86InstrArithmetic.td +++ b/llvm/lib/Target/X86/X86InstrArithmetic.td @@ -73,7 +73,7 @@ multiclass Mul o, string m, Format RegMRM, Format MemMRM, SDPatternOpera // syntax can be accepted. 
let Defs = [AL, EFLAGS, AX], Uses = [AL] in def 8r : MulDivOpR; + [(set AL, EFLAGS, (node AL, GR8:$src1))]>; let Defs = [AX, DX, EFLAGS], Uses = [AX] in def 16r : MulDivOpR, OpSize16; let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in @@ -82,7 +82,7 @@ multiclass Mul o, string m, Format RegMRM, Format MemMRM, SDPatternOpera def 64r : MulDivOpR; let Defs = [AL, EFLAGS, AX], Uses = [AL] in def 8m : MulDivOpM; + [(set AL, EFLAGS, (node AL, (loadi8 addr:$src1)))]>; let Defs = [AX, DX, EFLAGS], Uses = [AX] in def 16m : MulDivOpM, OpSize16; let Defs = [EAX, EDX, EFLAGS], Uses = [EAX] in @@ -386,12 +386,10 @@ class DecOpR_RF : UnaryOpR_RF<0xFF, MRM1r, "dec", t, class IncOpR_R : UnaryOpR_R<0xFF, MRM0r, "inc", t, null_frag, ndd>; class DecOpR_R : UnaryOpR_R<0xFF, MRM1r, "dec", t, null_frag, ndd>; class IncOpM_MF : UnaryOpM_MF<0xFF, MRM0m, "inc", t, null_frag> { - let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1), - (implicit EFLAGS)]; + let Pattern = [(store (add (t.LoadNode addr:$src1), 1), addr:$src1)]; } class DecOpM_MF : UnaryOpM_MF<0xFF, MRM1m, "dec", t, null_frag> { - let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1), - (implicit EFLAGS)]; + let Pattern = [(store (add (t.LoadNode addr:$src1), -1), addr:$src1)]; } class IncOpM_RF : UnaryOpM_RF<0xFF, MRM0m, "inc", t, null_frag> { let Pattern = [(set t.RegClass:$dst, EFLAGS, (add (t.LoadNode addr:$src1), 1))]; diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 669351fd786eb9..ea0b66c2f55162 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -74,8 +74,7 @@ def VASTART_SAVE_XMM_REGS : I<0, Pseudo, (outs), (ins GR8:$al, i8mem:$regsavefi, variable_ops), "#VASTART_SAVE_XMM_REGS $al, $regsavefi", - [(X86vastart_save_xmm_regs GR8:$al, addr:$regsavefi), - (implicit EFLAGS)]>; + [(X86vastart_save_xmm_regs GR8:$al, addr:$regsavefi)]>; } let usesCustomInserter = 1, Defs = [EFLAGS] in { @@ -87,15 
+86,15 @@ def VAARG_64 : I<0, Pseudo, (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), "#VAARG_64 $dst, $ap, $size, $mode, $align", [(set GR64:$dst, - (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align)), - (implicit EFLAGS)]>, Requires<[In64BitMode, IsLP64]>; + (X86vaarg64 addr:$ap, timm:$size, timm:$mode, timm:$align))]>, + Requires<[In64BitMode, IsLP64]>; def VAARG_X32 : I<0, Pseudo, (outs GR32:$dst), (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), "#VAARG_X32 $dst, $ap, $size, $mode, $align", [(set GR32:$dst, - (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align)), - (implicit EFLAGS)]>, Requires<[In64BitMode, NotLP64]>; + (X86vaargx32 addr:$ap, timm:$size, timm:$mode, timm:$align))]>, + Requires<[In64BitMode, NotLP64]>; } // When using segmented stacks these are lowered into instructions which first diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp index 94de164d5f0785..090ec687d28c4e 100644 --- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp +++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp @@ -13,7 +13,6 @@ #include "X86InstrFMA3Info.h" #include "X86InstrInfo.h" -#include "llvm/Support/Threading.h" #include #include #include diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td index c09522709d2f0d..f6231b78f4c2e8 100644 --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -1450,9 +1450,6 @@ def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2), return N->hasOneUse(); }]>; -// This fragment treats X86cmpm as commutable to help match loads in both -// operands for PCMPEQ. 
-def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 9b002ebd3a93bc..1b95450596314b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -4738,6 +4737,7 @@ static bool isAMXOpcode(unsigned Opc) { case X86::TILELOADD_EVEX: case X86::TILESTORED_EVEX: case X86::PTILEPAIRLOAD: + case X86::PTILEPAIRSTORE: return true; } } diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td index c9ff8abb02efd3..5789678c182c3b 100644 --- a/llvm/lib/Target/X86/X86InstrMisc.td +++ b/llvm/lib/Target/X86/X86InstrMisc.td @@ -1163,14 +1163,12 @@ multiclass Lzcnt o, string m, SDPatternOperator node, X86TypeInfo t, SchedWrite schedrr, SchedWrite schedrm, string suffix = ""> { def rr#suffix : ITy, + [(set t.RegClass:$dst, (node t.RegClass:$src1))]>, TB, Sched<[schedrr]>; let mayLoad = 1 in def rm#suffix : ITy, + [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1)))]>, TB, Sched<[schedrm]>; } diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td index 2eb4e4fb941b29..a9ec5f660ff1d8 100644 --- a/llvm/lib/Target/X86/X86InstrPredicates.td +++ b/llvm/lib/Target/X86/X86InstrPredicates.td @@ -186,6 +186,7 @@ def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">; def HasAMXFP8 : Predicate<"Subtarget->hasAMXFP8()">; def HasAMXTRANSPOSE : Predicate<"Subtarget->hasAMXTRANSPOSE()">; def HasAMXAVX512 : Predicate<"Subtarget->hasAMXAVX512()">; +def HasAMXTF32 : 
Predicate<"Subtarget->hasAMXTF32()">; def HasUINTR : Predicate<"Subtarget->hasUINTR()">; def HasUSERMSR : Predicate<"Subtarget->hasUSERMSR()">; def HasCRC32 : Predicate<"Subtarget->hasCRC32()">; diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td index dc701f1afc915f..eb0b5a43afdf97 100644 --- a/llvm/lib/Target/X86/X86InstrSystem.td +++ b/llvm/lib/Target/X86/X86InstrSystem.td @@ -690,7 +690,7 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", - [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB; + [(set EAX, EDX, (X86rdpkru ECX))]>, TB; let Uses = [EAX, ECX, EDX] in def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", [(X86wrpkru EAX, EDX, ECX)]>, TB; diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index 531268b41da968..ab171ac79fb377 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -193,10 +193,8 @@ class X86TypeInfo>", SDTIntLeaf,[],"<>">; - def Xi8 : X86TypeInfo; def Xi16 : X86TypeInfo o, string m, X86TypeInfo t> // BinOpMR_MF - Instructions that read "[mem], reg" and write "[mem]", EFLAGS. 
class BinOpMR_MF o, string m, X86TypeInfo t, SDPatternOperator node> : BinOpMR, + [(store (node (load addr:$src1), t.RegClass:$src2), addr:$src1)]>, Sched<[WriteALURMW, // base, scale, index, offset, segment ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault, @@ -1168,7 +1165,7 @@ class BinOpMRF_RF o, string m, X86TypeInfo t, SDPatternOperator node> class BinOpMRF_MF o, string m, X86TypeInfo t, SDPatternOperator node> : BinOpMR, + addr:$src1)]>, Sched<[WriteADCRMW, // base, scale, index, offset, segment ReadDefault, ReadDefault, ReadDefault, @@ -1211,7 +1208,7 @@ class BinOpMI_M o, string m, X86TypeInfo t, Format f> class BinOpMI_MF o, string m, X86TypeInfo t, SDPatternOperator node, Format f> : BinOpMI, + t.ImmOperator:$src2), addr:$src1)]>, Sched<[WriteALURMW]>, DefEFLAGS { let mayStore = 1; } @@ -1227,7 +1224,7 @@ class BinOpMIF_RF o, string m, X86TypeInfo t, SDNode node, Format f> class BinOpMIF_MF o, string m, X86TypeInfo t, SDNode node, Format f> : BinOpMI, + t.ImmOperator:$src2, EFLAGS), addr:$src1)]>, Sched<[WriteADCRMW]>, DefEFLAGS, UseEFLAGS { let mayStore = 1; } @@ -1348,8 +1345,8 @@ class UnaryOpR_RF o, Format f, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> : UnaryOpR, DefEFLAGS, NDD; + [(set t.RegClass:$dst, (node t.RegClass:$src1))]>, + DefEFLAGS, NDD; // UnaryOpM - Instructions that read "[mem]". 
class UnaryOpM o, Format f, string m, string args, X86TypeInfo t, @@ -1381,7 +1378,7 @@ class UnaryOpM_M o, Format f, string m, X86TypeInfo t, class UnaryOpM_MF o, Format f, string m, X86TypeInfo t, SDPatternOperator node = null_frag> : UnaryOpM, Sched<[WriteALURMW]>, DefEFLAGS { + [(store (node (t.LoadNode addr:$src1)), addr:$src1)]>, + Sched<[WriteALURMW]>, DefEFLAGS { let mayStore = 1; } diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 7e2445c01ff0da..efab93d61c7c5d 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -19,20 +19,17 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGenTypes/MachineValueType.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include #include #include -#include using namespace llvm; diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 65dd3111f492dc..31a93f9c2a6ef6 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -53,7 +53,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RDFGraph.h" #include "llvm/CodeGen/RDFLiveness.h" #include "llvm/InitializePasses.h" diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index e355a4b9d35b91..0a187ee42e3f8b 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ 
b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -17,10 +17,8 @@ //===----------------------------------------------------------------------===// // #include "X86.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index af6fb04295bdec..0e74cfa75e9606 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -41,8 +41,6 @@ #include "X86.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/Passes.h" @@ -243,7 +241,8 @@ std::pair ShapeCalculator::getShape(IntrinsicInst *II, case Intrinsic::x86_tdpbusd_internal: case Intrinsic::x86_tdpbuud_internal: case Intrinsic::x86_tdpbf16ps_internal: - case Intrinsic::x86_tdpfp16ps_internal: { + case Intrinsic::x86_tdpfp16ps_internal: + case Intrinsic::x86_tmmultf32ps_internal: { switch (OpNo) { case 3: Row = II->getArgOperand(0); @@ -277,6 +276,23 @@ std::pair ShapeCalculator::getShape(IntrinsicInst *II, Col = II->getArgOperand(1); break; } + case Intrinsic::x86_ttmmultf32ps_internal: { + switch (OpNo) { + case 3: + Row = II->getArgOperand(0); + Col = II->getArgOperand(1); + break; + case 4: + Row = getRowFromCol(II, II->getArgOperand(2), 4); + Col = getColFromRow(II, II->getArgOperand(0), 4); + break; + case 5: + Row = getRowFromCol(II, II->getArgOperand(2), 4); + Col = II->getArgOperand(1); + break; + } + break; + } } return std::make_pair(Row, Col); diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp 
b/llvm/lib/Target/X86/X86LowerTileCopy.cpp index 9cf700d6b65b05..fe13ebc6c76754 100644 --- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp +++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp @@ -31,8 +31,6 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/DebugLoc.h" -#include "llvm/InitializePasses.h" -#include "llvm/Support/Debug.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 24db39c4e98b96..7bae16c0667168 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -42,10 +42,8 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSection.h" -#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" #include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index 50d63e196d1d0c..c43fd97a055fcc 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -25,8 +25,6 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/IR/Function.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index e88702caa9a52b..1d0815ee830b9e 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -19,9 +19,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" -#include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include 
"llvm/Support/KnownBits.h" diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td index 38d8d19091e0fd..0c80f1eaadadb8 100644 --- a/llvm/lib/Target/X86/X86PfmCounters.td +++ b/llvm/lib/Target/X86/X86PfmCounters.td @@ -189,13 +189,13 @@ def IceLakePfmCounters : ProcPfmCounters { let CycleCounter = UnhaltedCoreCyclesPfmCounter; let UopsCounter = UopsIssuedPfmCounter; let IssueCounters = [ - PfmIssueCounter<"ICXPort0", "uops_dispatched_port:port_0">, - PfmIssueCounter<"ICXPort1", "uops_dispatched_port:port_1">, - PfmIssueCounter<"ICXPort23", "uops_dispatched_port:port_2_3">, - PfmIssueCounter<"ICXPort49", "uops_dispatched_port:port_4_9">, - PfmIssueCounter<"ICXPort5", "uops_dispatched_port:port_5">, - PfmIssueCounter<"ICXPort6", "uops_dispatched_port:port_6">, - PfmIssueCounter<"ICXPort78", "uops_dispatched_port:port_7_8"> + PfmIssueCounter<"ICXPort0", "uops_dispatched:port_0">, + PfmIssueCounter<"ICXPort1", "uops_dispatched:port_1">, + PfmIssueCounter<"ICXPort23", "uops_dispatched:port_2_3">, + PfmIssueCounter<"ICXPort49", "uops_dispatched:port_4_9">, + PfmIssueCounter<"ICXPort5", "uops_dispatched:port_5">, + PfmIssueCounter<"ICXPort6", "uops_dispatched:port_6">, + PfmIssueCounter<"ICXPort78", "uops_dispatched:port_7_8"> ]; let ValidationCounters = DefaultIntelPfmValidationCounters; } @@ -208,13 +208,16 @@ def AlderLakePfmCounters : ProcPfmCounters { let CycleCounter = UnhaltedCoreCyclesPfmCounter; let UopsCounter = UopsIssuedPfmCounter; let IssueCounters = [ - PfmIssueCounter<"ADLPPort00", "uops_dispatched_port:port_0">, - PfmIssueCounter<"ADLPPort01", "uops_dispatched_port:port_1">, - PfmIssueCounter<"ADLPPort02_03_10", "uops_dispatched_port:port_2_3_10">, - PfmIssueCounter<"ADLPPort04_09", "uops_dispatched_port:port_4_9">, - PfmIssueCounter<"ADLPPort05_11", "uops_dispatched_port:port_5_11">, - PfmIssueCounter<"ADLPPort06", "uops_dispatched_port:port_6">, - PfmIssueCounter<"ADLPPort07_08", "uops_dispatched_port:port_7_8"> + 
PfmIssueCounter<"ADLPPort00", "uops_dispatched:port_0">, + PfmIssueCounter<"ADLPPort01", "uops_dispatched:port_1">, + // The perfmon documentation and thus libpfm seems to incorrectly label + // this performance counter, as ports 2,3, and 11 are actually grouped + // according to most documentation. See #113941 for additional details. + PfmIssueCounter<"ADLPPort02_03_11", "uops_dispatched:port_2_3_10">, + PfmIssueCounter<"ADLPPort04_09", "uops_dispatched:port_4_9">, + PfmIssueCounter<"ADLPPort05_11", "uops_dispatched:port_5_11">, + PfmIssueCounter<"ADLPPort06", "uops_dispatched:port_6">, + PfmIssueCounter<"ADLPPort07_08", "uops_dispatched:port_7_8"> ]; let ValidationCounters = DefaultIntelPfmValidationCounters; } @@ -224,13 +227,16 @@ def SapphireRapidsPfmCounters : ProcPfmCounters { let CycleCounter = UnhaltedCoreCyclesPfmCounter; let UopsCounter = UopsIssuedPfmCounter; let IssueCounters = [ - PfmIssueCounter<"SPRPort00", "uops_dispatched_port:port_0">, - PfmIssueCounter<"SPRPort01", "uops_dispatched_port:port_1">, - PfmIssueCounter<"SPRPort02_03_10", "uops_dispatched_port:port_2_3_10">, - PfmIssueCounter<"SPRPort04_09", "uops_dispatched_port:port_4_9">, - PfmIssueCounter<"SPRPort05_11", "uops_dispatched_port:port_5_11">, - PfmIssueCounter<"SPRPort06", "uops_dispatched_port:port_6">, - PfmIssueCounter<"SPRPort07_08", "uops_dispatched_port:port_7_8">, + PfmIssueCounter<"SPRPort00", "uops_dispatched:port_0">, + PfmIssueCounter<"SPRPort01", "uops_dispatched:port_1">, + // The perfmon documentation and thus libpfm seems to incorrectly label + // this performance counter, as ports 2,3, and 11 are actually grouped + // according to most documentation. See #113941 for additional details. 
+ PfmIssueCounter<"SPRPort02_03_11", "uops_dispatched:port_2_3_10">, + PfmIssueCounter<"SPRPort04_09", "uops_dispatched:port_4_9">, + PfmIssueCounter<"SPRPort05_11", "uops_dispatched:port_5_11">, + PfmIssueCounter<"SPRPort06", "uops_dispatched:port_6">, + PfmIssueCounter<"SPRPort07_08", "uops_dispatched:port_7_8">, ]; let ValidationCounters = DefaultIntelPfmValidationCounters; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 2daaa95b06be0d..09418c9bb74d34 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -22,13 +22,11 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCContext.h" @@ -1078,7 +1076,9 @@ static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM, case X86::PTDPFP16PSV: case X86::PTCMMIMFP16PSV: case X86::PTCMMRLFP16PSV: - case X86::PTTRANSPOSEDV: { + case X86::PTTRANSPOSEDV: + case X86::PTMMULTF32PSV: + case X86::PTTMMULTF32PSV: { MachineOperand &MO1 = MI->getOperand(1); MachineOperand &MO2 = MI->getOperand(2); ShapeT Shape(&MO1, &MO2, MRI); diff --git a/llvm/lib/Target/X86/X86SchedAlderlakeP.td b/llvm/lib/Target/X86/X86SchedAlderlakeP.td index aec6906310d96b..f8c6b32a853be9 100644 --- a/llvm/lib/Target/X86/X86SchedAlderlakeP.td +++ b/llvm/lib/Target/X86/X86SchedAlderlakeP.td @@ -60,7 +60,6 @@ def ADLPPort01_05_10 : ProcResGroup<[ADLPPort01, ADLPPort05, ADLPPort10]>; def ADLPPort02_03 : ProcResGroup<[ADLPPort02, ADLPPort03]>; def ADLPPort02_03_07 : ProcResGroup<[ADLPPort02, ADLPPort03, ADLPPort07]>; def 
ADLPPort02_03_11 : ProcResGroup<[ADLPPort02, ADLPPort03, ADLPPort11]>; -def ADLPPort02_03_10 : ProcResGroup<[ADLPPort02, ADLPPort03, ADLPPort10]>; def ADLPPort05_11 : ProcResGroup<[ADLPPort05, ADLPPort11]>; def ADLPPort07_08 : ProcResGroup<[ADLPPort07, ADLPPort08]>; diff --git a/llvm/lib/Target/X86/X86SchedSapphireRapids.td b/llvm/lib/Target/X86/X86SchedSapphireRapids.td index b0ebe70c31fd44..0545f9b7f4c00e 100644 --- a/llvm/lib/Target/X86/X86SchedSapphireRapids.td +++ b/llvm/lib/Target/X86/X86SchedSapphireRapids.td @@ -59,7 +59,6 @@ def SPRPort01_05 : ProcResGroup<[SPRPort01, SPRPort05]>; def SPRPort01_05_10 : ProcResGroup<[SPRPort01, SPRPort05, SPRPort10]>; def SPRPort02_03 : ProcResGroup<[SPRPort02, SPRPort03]>; def SPRPort02_03_11 : ProcResGroup<[SPRPort02, SPRPort03, SPRPort11]>; -def SPRPort02_03_10 : ProcResGroup<[SPRPort02, SPRPort03, SPRPort10]>; def SPRPort05_11 : ProcResGroup<[SPRPort05, SPRPort11]>; def SPRPort07_08 : ProcResGroup<[SPRPort07, SPRPort08]>; diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp index 3841a8b1a38356..3f88bcf9ce5ec5 100644 --- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -18,7 +18,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/DerivedTypes.h" using namespace llvm; diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 46317cb33776fa..ce5a1bce5b107f 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -20,7 +20,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/ArrayRef.h" @@ -52,7 +51,6 @@ #include "llvm/Support/Debug.h" #include 
"llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include #include #include #include diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index c3a0244c7eddca..9f58404c351c21 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -19,7 +19,6 @@ #include "X86Subtarget.h" #include "X86TargetObjectFile.h" #include "X86TargetTransformInfo.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -35,7 +34,6 @@ #include "llvm/CodeGen/MIRYamlMapping.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" -#include "llvm/CodeGen/RegAllocRegistry.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 7f362f48c2e78f..179e29e40614e7 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -55,7 +55,6 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/Support/Debug.h" #include using namespace llvm; diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index 95a84c2cda5369..75f002f418ba6d 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -20,7 +20,6 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" -#include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFrameInfo.h" diff --git a/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp b/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp index 7101b0bd70312a..5c12af1fee6376 100644 --- 
a/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp +++ b/llvm/lib/Target/X86/X86WinFixupBufferSecurityCheck.cpp @@ -19,9 +19,7 @@ #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Module.h" -#include using namespace llvm; diff --git a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp index 4c440da715fefe..7e00215ef3b971 100644 --- a/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaInstrInfo.cpp @@ -529,8 +529,6 @@ void XtensaInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB, unsigned XtensaInstrInfo::insertConstBranchAtInst( MachineBasicBlock &MBB, MachineInstr *I, int64_t offset, ArrayRef Cond, DebugLoc DL, int *BytesAdded) const { - // Shouldn't be a fall through. - assert(&MBB && "InsertBranch must not be told to insert a fallthrough"); assert(Cond.size() <= 4 && "Xtensa branch conditions have less than four components!"); diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index a973aaaa4806e6..140e565e1686f2 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1880,6 +1880,7 @@ const StringMap sys::getHostCPUFeatures() { !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX); Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave; Features["amx-transpose"] = HasLeaf1E && ((EAX >> 5) & 1) && HasAMXSave; + Features["amx-tf32"] = HasLeaf1E && ((EAX >> 6) & 1) && HasAMXSave; Features["amx-avx512"] = HasLeaf1E && ((EAX >> 7) & 1) && HasAMXSave; bool HasLeaf24 = diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index eb55e6fc9134c8..6b53424833bd47 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -602,6 +602,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = 
FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 = FeatureAMX_TILE | FeatureAVX10_2_512; +constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE; constexpr FeatureBitset ImpliedFeaturesHRESET = {}; constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {}; diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index f176a76164698a..353dc00c9928e1 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -132,6 +132,11 @@ cl::opt EnableMemProfContextDisambiguation( cl::opt SupportsHotColdNew( "supports-hot-cold-new", cl::init(false), cl::Hidden, cl::desc("Linking with hot/cold operator new interfaces")); + +cl::opt MemProfRequireDefinitionForPromotion( + "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden, + cl::desc( + "Require target function definition when promoting indirect calls")); } // namespace llvm extern cl::opt MemProfReportHintedSizes; @@ -4602,7 +4607,13 @@ void MemProfContextDisambiguation::performICP( // target (or version of the code), and we need to be conservative // (similar to what is done in the ICP pass). Function *TargetFunction = Symtab->getFunction(Candidate.Value); - if (TargetFunction == nullptr || TargetFunction->isDeclaration()) { + if (TargetFunction == nullptr || + // Any ThinLTO global dead symbol removal should have already + // occurred, so it should be safe to promote when the target is a + // declaration. + // TODO: Remove internal option once more fully tested. 
+ (MemProfRequireDefinitionForPromotion && + TargetFunction->isDeclaration())) { ORE.emit([&]() { return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB) << "Memprof cannot promote indirect call: target with md5sum " diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp index 3dd06626a2d4c6..b2fa66f2a6d379 100644 --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -469,7 +469,8 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { std::function GetAssumptionCache, std::function GetTargetTransformInfo, std::function GetTLI, - LazyCallGraph &CG) + LazyCallGraph &CG, bool DisableSampleProfileInlining, + bool UseFlattenedProfile) : SampleProfileLoaderBaseImpl(std::string(Name), std::string(RemapName), std::move(FS)), GetAC(std::move(GetAssumptionCache)), @@ -478,7 +479,9 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { AnnotatedPassName(AnnotateSampleProfileInlinePhase ? llvm::AnnotateInlinePassName(InlineContext{ LTOPhase, InlinePass::SampleProfileInliner}) - : CSINLINE_DEBUG) {} + : CSINLINE_DEBUG), + DisableSampleProfileInlining(DisableSampleProfileInlining), + UseFlattenedProfile(UseFlattenedProfile) {} bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -592,6 +595,10 @@ class SampleProfileLoader final : public SampleProfileLoaderBaseImpl { // attribute. bool ProfAccForSymsInList; + bool DisableSampleProfileInlining; + + bool UseFlattenedProfile; + // External inline advisor used to replay inline decision from remarks. std::unique_ptr ExternalInlineAdvisor; @@ -919,7 +926,7 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, uint64_t &Sum, SmallVector *InlinedCallSite) { // Bail out early if sample-loader inliner is disabled. 
- if (DisableSampleLoaderInlining) + if (DisableSampleProfileInlining) return false; // Bail out early if MaxNumPromotions is zero. @@ -1230,7 +1237,7 @@ bool SampleProfileLoader::tryInlineCandidate( InlineCandidate &Candidate, SmallVector *InlinedCallSites) { // Do not attempt to inline a candidate if // --disable-sample-loader-inlining is true. - if (DisableSampleLoaderInlining) + if (DisableSampleProfileInlining) return false; CallBase &CB = *Candidate.CallInstr; @@ -1974,6 +1981,13 @@ bool SampleProfileLoader::doInitialization(Module &M, PSL = Reader->getProfileSymbolList(); + if (DisableSampleLoaderInlining.getNumOccurrences()) + DisableSampleProfileInlining = DisableSampleLoaderInlining; + + if (UseFlattenedProfile) + ProfileConverter::flattenProfile(Reader->getProfiles(), + Reader->profileIsCS()); + // While profile-sample-accurate is on, ignore symbol list. ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate; @@ -2304,9 +2318,12 @@ bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) } SampleProfileLoaderPass::SampleProfileLoaderPass( std::string File, std::string RemappingFile, ThinOrFullLTOPhase LTOPhase, - IntrusiveRefCntPtr FS) + IntrusiveRefCntPtr FS, bool DisableSampleProfileInlining, + bool UseFlattenedProfile) : ProfileFileName(File), ProfileRemappingFileName(RemappingFile), - LTOPhase(LTOPhase), FS(std::move(FS)) {} + LTOPhase(LTOPhase), FS(std::move(FS)), + DisableSampleProfileInlining(DisableSampleProfileInlining), + UseFlattenedProfile(UseFlattenedProfile) {} PreservedAnalyses SampleProfileLoaderPass::run(Module &M, ModuleAnalysisManager &AM) { @@ -2331,7 +2348,8 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M, ProfileFileName.empty() ? SampleProfileFile : ProfileFileName, ProfileRemappingFileName.empty() ? 
SampleProfileRemappingFile : ProfileRemappingFileName, - LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG); + LTOPhase, FS, GetAssumptionCache, GetTTI, GetTLI, CG, + DisableSampleProfileInlining, UseFlattenedProfile); if (!SampleLoader.doInitialization(M, &FAM)) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index f7d5369bd87b74..5a8814dfd6b3d3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -7715,6 +7715,32 @@ Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) { if (Instruction *Res = foldReductionIdiom(I, Builder, DL)) return Res; + { + Value *A; + const APInt *C1, *C2; + ICmpInst::Predicate Pred = I.getPredicate(); + if (ICmpInst::isEquality(Pred)) { + // sext(a) & c1 == c2 --> a & c3 == trunc(c2) + // sext(a) & c1 != c2 --> a & c3 != trunc(c2) + if (match(Op0, m_And(m_SExt(m_Value(A)), m_APInt(C1))) && + match(Op1, m_APInt(C2))) { + Type *InputTy = A->getType(); + unsigned InputBitWidth = InputTy->getScalarSizeInBits(); + // c2 must be non-negative at the bitwidth of a. + if (C2->getActiveBits() < InputBitWidth) { + APInt TruncC1 = C1->trunc(InputBitWidth); + // Check if there are 1s in C1 high bits of size InputBitWidth. + if (C1->uge(APInt::getOneBitSet(C1->getBitWidth(), InputBitWidth))) + TruncC1.setBit(InputBitWidth - 1); + Value *AndInst = Builder.CreateAnd(A, TruncC1); + return new ICmpInst( + Pred, AndInst, + ConstantInt::get(InputTy, C2->trunc(InputBitWidth))); + } + } + } + } + return Changed ? 
&I : nullptr; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 8ca705ae1d364d..5eb807dcb76cef 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -30,6 +30,12 @@ static cl::opt "SimplifyDemandedBits() are consistent"), cl::Hidden, cl::init(false)); +static cl::opt SimplifyDemandedVectorEltsDepthLimit( + "instcombine-simplify-vector-elts-depth", + cl::desc( + "Depth limit when simplifying vector instructions and their operands"), + cl::Hidden, cl::init(10)); + /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the /// constant that are not demanded. If so, shrink the constant and return true. @@ -1432,7 +1438,7 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V, } // Limit search depth. - if (Depth == 10) + if (Depth == SimplifyDemandedVectorEltsDepthLimit) return nullptr; if (!AllowMultipleUsers) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 454fe5a91d375a..ede89b099e8deb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2904,7 +2904,9 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { if (auto *SI = dyn_cast(LHS)) { // We cannot do this fold for elementwise select since ShuffleVector is // not elementwise. 
- if (SI->getCondition()->getType()->isIntegerTy()) { + if (SI->getCondition()->getType()->isIntegerTy() && + (isa(RHS) || + isGuaranteedNotToBePoison(SI->getCondition()))) { if (Instruction *I = FoldOpIntoSelect(SVI, SI)) return I; } diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp index 0b4d3ff201e622..64e850c7d9316d 100644 --- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp @@ -810,10 +810,6 @@ memprof::extractCallsFromIR(Module &M) { for (auto &BB : F) { for (auto &I : BB) { - const DILocation *DIL = I.getDebugLoc(); - if (!DIL) - continue; - if (!isa(&I) || isa(&I)) continue; @@ -824,11 +820,17 @@ memprof::extractCallsFromIR(Module &M) { continue; StringRef CalleeName = CalledFunction->getName(); - uint64_t CallerGUID = - IndexedMemProfRecord::getGUID(DIL->getSubprogramLinkageName()); - uint64_t CalleeGUID = IndexedMemProfRecord::getGUID(CalleeName); - LineLocation Loc = {GetOffset(DIL), DIL->getColumn()}; - Calls[CallerGUID].emplace_back(Loc, CalleeGUID); + for (const DILocation *DIL = I.getDebugLoc(); DIL; + DIL = DIL->getInlinedAt()) { + StringRef CallerName = DIL->getSubprogramLinkageName(); + assert(!CallerName.empty() && + "Be sure to enable -fdebug-info-for-profiling"); + uint64_t CallerGUID = IndexedMemProfRecord::getGUID(CallerName); + uint64_t CalleeGUID = IndexedMemProfRecord::getGUID(CalleeName); + LineLocation Loc = {GetOffset(DIL), DIL->getColumn()}; + Calls[CallerGUID].emplace_back(Loc, CalleeGUID); + CalleeName = CallerName; + } } } } diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 7f838340410b51..8e74b8645fad9a 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -288,24 +288,35 @@ static bool processICmp(ICmpInst *Cmp, LazyValueInfo *LVI) { 
if (!Cmp->getOperand(0)->getType()->isIntOrIntVectorTy()) return false; - if (!Cmp->isSigned()) + if (!Cmp->isSigned() && (!Cmp->isUnsigned() || Cmp->hasSameSign())) return false; - ICmpInst::Predicate UnsignedPred = - ConstantRange::getEquivalentPredWithFlippedSignedness( - Cmp->getPredicate(), - LVI->getConstantRangeAtUse(Cmp->getOperandUse(0), - /*UndefAllowed*/ true), - LVI->getConstantRangeAtUse(Cmp->getOperandUse(1), - /*UndefAllowed*/ true)); + bool Changed = false; - if (UnsignedPred == ICmpInst::Predicate::BAD_ICMP_PREDICATE) - return false; + ConstantRange CR1 = LVI->getConstantRangeAtUse(Cmp->getOperandUse(0), + /*UndefAllowed=*/false), + CR2 = LVI->getConstantRangeAtUse(Cmp->getOperandUse(1), + /*UndefAllowed=*/false); - ++NumSICmps; - Cmp->setPredicate(UnsignedPred); + if (Cmp->isSigned()) { + ICmpInst::Predicate UnsignedPred = + ConstantRange::getEquivalentPredWithFlippedSignedness( + Cmp->getPredicate(), CR1, CR2); - return true; + if (UnsignedPred == ICmpInst::Predicate::BAD_ICMP_PREDICATE) + return false; + + ++NumSICmps; + Cmp->setPredicate(UnsignedPred); + Changed = true; + } + + if (ConstantRange::areInsensitiveToSignednessOfICmpPredicate(CR1, CR2)) { + Cmp->setSameSign(); + Changed = true; + } + + return Changed; } /// See if LazyValueInfo's ability to exploit edge conditions or range diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index e742d2ed12af1a..bc50f23d8eb27b 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -874,6 +874,8 @@ static Value *NegateValue(Value *V, Instruction *BI, // negation. Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI->getIterator(), BI); + // NewNeg is generated to potentially replace BI, so use its DebugLoc. 
+ NewNeg->setDebugLoc(BI->getDebugLoc()); ToRedo.insert(NewNeg); return NewNeg; } diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index ef7ef8ef7911e0..746fdaa340fc7a 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -577,8 +577,15 @@ static Value *findBaseDefiningValue(Value *I, DefiningValueMapTy &Cache, return I; } - assert(!isa(I) && "Xchg handled above, all others are " - "binary ops which don't apply to pointers"); + if (isa(I)) { + assert(cast(I)->getOperation() == AtomicRMWInst::Xchg && + "Only Xchg is allowed for pointer values"); + // A RMW Xchg is a combined atomic load and store, so we can treat the + // loaded value as a base pointer. + Cache[I] = I; + setKnownBase(I, /* IsKnownBase */ true, KnownBases); + return I; + } // The aggregate ops. Aggregates can either be in the heap or on the // stack, but in either case, this is simply a field load. 
As a result, diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 3b01089962e29c..d80af26451ac75 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -5166,11 +5166,9 @@ insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr, DIAssignID::getDistinct(NewAddr->getContext())); } - Instruction *NewAssign = - DIB.insertDbgAssign(NewAddr, Orig->getValue(), Orig->getVariable(), - NewFragmentExpr, NewAddr, NewAddrExpr, - Orig->getDebugLoc()) - .get(); + Instruction *NewAssign = cast(DIB.insertDbgAssign( + NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, + NewAddrExpr, Orig->getDebugLoc())); LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n"); (void)NewAssign; } @@ -5590,12 +5588,11 @@ bool SROA::promoteAllocas(Function &F) { if (PromotableAllocas.empty()) return false; - NumPromoted += PromotableAllocas.size(); - if (SROASkipMem2Reg) { LLVM_DEBUG(dbgs() << "Not promoting allocas with mem2reg!\n"); } else { LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); + NumPromoted += PromotableAllocas.size(); PromoteMemToReg(PromotableAllocas.getArrayRef(), DTU->getDomTree(), AC); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index a27cb4dd219c30..aa5e04d71657a7 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1465,7 +1465,7 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, } } AL = AL.addParamAttributes(Context, I, NewAB); - } else { + } else if (NewInnerCB->getArgOperand(I)->getType()->isPointerTy()) { // Check if the underlying value for the parameter is an argument. 
const Value *UnderlyingV = getUnderlyingObject(InnerCB->getArgOperand(I)); @@ -1473,10 +1473,13 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, if (!Arg) continue; ArgNo = Arg->getArgNo(); + } else { + continue; } // If so, propagate its access attributes. AL = AL.addParamAttributes(Context, I, ValidObjParamAttrs[ArgNo]); + // We can have conflicting attributes from the inner callsite and // to-be-inlined callsite. In that case, choose the most // restrictive. diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 768765b6c1e632..509b6d62265517 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1696,7 +1696,7 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV, if (!UseNewDbgInfoFormat) { auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, (Instruction *)nullptr); - DbgVal.get()->insertBefore(Instr); + cast(DbgVal)->insertBefore(Instr); } else { // RemoveDIs: if we're using the new debug-info format, allocate a // DbgVariableRecord directly instead of a dbg.value intrinsic. @@ -1713,7 +1713,7 @@ static void insertDbgValueOrDbgVariableRecordAfter( if (!UseNewDbgInfoFormat) { auto DbgVal = Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, (Instruction *)nullptr); - DbgVal.get()->insertAfter(&*Instr); + cast(DbgVal)->insertAfter(&*Instr); } else { // RemoveDIs: if we're using the new debug-info format, allocate a // DbgVariableRecord directly instead of a dbg.value intrinsic. 
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 39da38e4918176..791d528823972d 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1467,8 +1467,8 @@ Value *SCEVExpander::FindValueInExprValueMap( if (!CanonicalMode && SE.containsAddRecurrence(S)) return nullptr; - // If S is a constant, it may be worse to reuse an existing Value. - if (isa(S)) + // If S is a constant or unknown, it may be worse to reuse an existing Value. + if (isa(S) || isa(S)) return nullptr; for (Value *V : SE.getSCEVValues(S)) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c07af8519049c4..1ebc62f9843905 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6858,8 +6858,8 @@ void LoopVectorizationCostModel::collectValuesToIgnore() { if ((SI = dyn_cast(&I)) && Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { ValuesToIgnore.insert(&I); - auto I = DeadInvariantStoreOps.insert({SI->getPointerOperand(), {}}); - I.first->second.push_back(SI->getValueOperand()); + DeadInvariantStoreOps[SI->getPointerOperand()].push_back( + SI->getValueOperand()); } if (VecValuesToIgnore.contains(&I) || ValuesToIgnore.contains(&I)) @@ -7521,6 +7521,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { precomputeCosts(BestPlan, BestFactor.Width, CostCtx); assert((BestFactor.Width == LegacyVF.Width || planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width), + CostCtx, OrigLoop) || + planContainsAdditionalSimplifications(getPlanFor(LegacyVF.Width), CostCtx, OrigLoop)) && " VPlan cost model and legacy cost model disagreed"); assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) && @@ -8082,9 +8084,9 @@ void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) { // ignored - they 
will get there anyhow. if (Dst == DefaultDst) continue; - auto I = Dst2Compares.insert({Dst, {}}); + auto &Compares = Dst2Compares[Dst]; VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue()); - I.first->second.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); + Compares.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V)); } // We need to handle 2 separate cases below for all entries in Dst2Compares, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a6accf0318a30f..da8e0d8cc09a8b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -832,6 +832,7 @@ struct InstructionsState { InstructionsState() = delete; InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} + static InstructionsState invalid() { return {nullptr, nullptr, nullptr}; } }; } // end anonymous namespace @@ -891,20 +892,19 @@ static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, const TargetLibraryInfo &TLI) { - constexpr unsigned BaseIndex = 0; // Make sure these are all Instructions. - if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (!all_of(VL, IsaPred)) + return InstructionsState::invalid(); - bool IsCastOp = isa(VL[BaseIndex]); - bool IsBinOp = isa(VL[BaseIndex]); - bool IsCmpOp = isa(VL[BaseIndex]); + Value *V = VL.front(); + bool IsCastOp = isa(V); + bool IsBinOp = isa(V); + bool IsCmpOp = isa(V); CmpInst::Predicate BasePred = - IsCmpOp ? cast(VL[BaseIndex])->getPredicate() - : CmpInst::BAD_ICMP_PREDICATE; - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + IsCmpOp ? 
cast(V)->getPredicate() : CmpInst::BAD_ICMP_PREDICATE; + unsigned Opcode = cast(V)->getOpcode(); unsigned AltOpcode = Opcode; - unsigned AltIndex = BaseIndex; + unsigned AltIndex = 0; bool SwappedPredsCompatible = [&]() { if (!IsCmpOp) @@ -931,14 +931,14 @@ static InstructionsState getSameOpcode(ArrayRef VL, }(); // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). - auto *IBase = cast(VL[BaseIndex]); + auto *IBase = cast(V); Intrinsic::ID BaseID = 0; SmallVector BaseMappings; if (auto *CallBase = dyn_cast(IBase)) { BaseID = getVectorIntrinsicIDForCall(CallBase, &TLI); BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase); if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty()) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); } for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { auto *I = cast(VL[Cnt]); @@ -970,7 +970,7 @@ static InstructionsState getSameOpcode(ArrayRef VL, } } } else if (auto *Inst = dyn_cast(VL[Cnt]); Inst && IsCmpOp) { - auto *BaseInst = cast(VL[BaseIndex]); + auto *BaseInst = cast(V); Type *Ty0 = BaseInst->getOperand(0)->getType(); Type *Ty1 = Inst->getOperand(0)->getType(); if (Ty0 == Ty1) { @@ -988,7 +988,7 @@ static InstructionsState getSameOpcode(ArrayRef VL, if (isCmpSameOrSwapped(BaseInst, Inst, TLI)) continue; auto *AltInst = cast(VL[AltIndex]); - if (AltIndex != BaseIndex) { + if (AltIndex) { if (isCmpSameOrSwapped(AltInst, Inst, TLI)) continue; } else if (BasePred != CurrentPred) { @@ -1007,27 +1007,28 @@ static InstructionsState getSameOpcode(ArrayRef VL, if (auto *Gep = dyn_cast(I)) { if (Gep->getNumOperands() != 2 || Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType()) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); } else if (auto *EI = dyn_cast(I)) { if (!isVectorLikeInstWithConstOps(EI)) - return InstructionsState(VL[BaseIndex], 
nullptr, nullptr); + return InstructionsState::invalid(); } else if (auto *LI = dyn_cast(I)) { auto *BaseLI = cast(IBase); if (!LI->isSimple() || !BaseLI->isSimple()) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); } else if (auto *Call = dyn_cast(I)) { auto *CallBase = cast(IBase); if (Call->getCalledFunction() != CallBase->getCalledFunction()) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); - if (Call->hasOperandBundles() && (!CallBase->hasOperandBundles() || - !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(), - Call->op_begin() + Call->getBundleOperandsEndIndex(), - CallBase->op_begin() + - CallBase->getBundleOperandsStartIndex()))) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); + if (Call->hasOperandBundles() && + (!CallBase->hasOperandBundles() || + !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(), + Call->op_begin() + Call->getBundleOperandsEndIndex(), + CallBase->op_begin() + + CallBase->getBundleOperandsStartIndex()))) + return InstructionsState::invalid(); Intrinsic::ID ID = getVectorIntrinsicIDForCall(Call, &TLI); if (ID != BaseID) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); if (!ID) { SmallVector Mappings = VFDatabase(*Call).getMappings(*Call); if (Mappings.size() != BaseMappings.size() || @@ -1037,15 +1038,15 @@ static InstructionsState getSameOpcode(ArrayRef VL, Mappings.front().Shape.VF != BaseMappings.front().Shape.VF || Mappings.front().Shape.Parameters != BaseMappings.front().Shape.Parameters) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); } } continue; } - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState::invalid(); } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), + return InstructionsState(V, cast(V), cast(VL[AltIndex])); } @@ -8019,7 
+8020,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, } // Don't handle vectors. - if (!SLPReVec && getValueType(S.OpValue)->isVectorTy()) { + if (!SLPReVec && getValueType(VL.front())->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); return; @@ -8088,7 +8089,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize; bool AreAllSameBlock = S.getOpcode() && allSameBlock(VL); bool AreScatterAllGEPSameBlock = - (IsScatterVectorizeUserTE && S.OpValue->getType()->isPointerTy() && + (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() && VL.size() > 2 && all_of(VL, [&BB](Value *V) { @@ -8104,7 +8105,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, SortedIndices)); bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) || - (isa( + (isa_and_present( S.OpValue) && !all_of(VL, isVectorLikeInstWithConstOps)) || NotProfitableForVectorization(VL)) { @@ -8161,7 +8162,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Special processing for sorted pointers for ScatterVectorize node with // constant indeces only. if (!AreAllSameBlock && AreScatterAllGEPSameBlock) { - assert(S.OpValue->getType()->isPointerTy() && + assert(VL.front()->getType()->isPointerTy() && count_if(VL, IsaPred) >= 2 && "Expected pointers only."); // Reset S to make it GetElementPtr kind of node. @@ -10985,7 +10986,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, // If the selects are the only uses of the compares, they will be // dead and we can adjust the cost by removing their cost. 
if (VI && SelectOnly) { - assert(!Ty->isVectorTy() && "Expected only for scalar type."); + assert((!Ty->isVectorTy() || SLPReVec) && + "Expected only for scalar type."); auto *CI = cast(VI->getOperand(0)); IntrinsicCost -= TTI->getCmpSelInstrCost( CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(), diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 0a930d30aeab58..3617d369776418 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -153,6 +153,17 @@ Value *BottomUpVec::createVectorInstr(ArrayRef Bndl, // TODO: Propagate debug info. } +void BottomUpVec::tryEraseDeadInstrs() { + // Visiting the dead instructions bottom-to-top. + sort(DeadInstrCandidates, + [](Instruction *I1, Instruction *I2) { return I1->comesBefore(I2); }); + for (Instruction *I : reverse(DeadInstrCandidates)) { + if (I->hasNUses(0)) + I->eraseFromParent(); + } + DeadInstrCandidates.clear(); +} + Value *BottomUpVec::vectorizeRec(ArrayRef Bndl) { Value *NewVec = nullptr; const auto &LegalityRes = Legality->canVectorize(Bndl); @@ -182,7 +193,11 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl) { } NewVec = createVectorInstr(Bndl, VecOperands); - // TODO: Collect potentially dead instructions. + // Collect the original scalar instructions as they may be dead. 
+ if (NewVec != nullptr) { + for (Value *V : Bndl) + DeadInstrCandidates.push_back(cast(V)); + } break; } case LegalityResultID::Pack: { @@ -194,7 +209,9 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl) { } bool BottomUpVec::tryVectorize(ArrayRef Bndl) { + DeadInstrCandidates.clear(); vectorizeRec(Bndl); + tryEraseDeadInstrs(); return Change; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 00ba2f49017899..8b1a4aeb88f81f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -58,7 +58,7 @@ static cl::opt PrintVPlansInDotFormat( "vplan-print-in-dot-format", cl::Hidden, cl::desc("Use dot format instead of plain text when dumping VPlans")); -#define DEBUG_TYPE "vplan" +#define DEBUG_TYPE "loop-vectorize" #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) { @@ -552,18 +552,10 @@ VPBasicBlock *VPBasicBlock::splitAt(iterator SplitAt) { "can only split at a position in the same block"); SmallVector Succs(successors()); - // First, disconnect the current block from its successors. - for (VPBlockBase *Succ : Succs) - VPBlockUtils::disconnectBlocks(this, Succ); - // Create new empty block after the block to split. auto *SplitBlock = new VPBasicBlock(getName() + ".split"); VPBlockUtils::insertBlockAfter(SplitBlock, this); - // Add successors for block to split to new block. - for (VPBlockBase *Succ : Succs) - VPBlockUtils::connectBlocks(SplitBlock, Succ); - // Finally, move the recipes starting at SplitAt to new block. 
for (VPRecipeBase &ToMove : make_early_inc_range(make_range(SplitAt, this->end()))) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 18f5f13073aa63..abfe97b4ab55b6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -4113,17 +4113,29 @@ class VPBlockUtils { IfFalse->setParent(BlockPtr->getParent()); } - /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to - /// the successors of \p From and \p From to the predecessors of \p To. Both - /// VPBlockBases must have the same parent, which can be null. Both - /// VPBlockBases can be already connected to other VPBlockBases. - static void connectBlocks(VPBlockBase *From, VPBlockBase *To) { + /// Connect VPBlockBases \p From and \p To bi-directionally. If \p PredIdx is + /// -1, append \p From to the predecessors of \p To, otherwise set \p To's + /// predecessor at \p PredIdx to \p From. If \p SuccIdx is -1, append \p To to + /// the successors of \p From, otherwise set \p From's successor at \p SuccIdx + /// to \p To. Both VPBlockBases must have the same parent, which can be null. + /// Both VPBlockBases can be already connected to other VPBlockBases. 
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To, + unsigned PredIdx = -1u, unsigned SuccIdx = -1u) { assert((From->getParent() == To->getParent()) && "Can't connect two block with different parents"); - assert(From->getNumSuccessors() < 2 && + assert((SuccIdx != -1u || From->getNumSuccessors() < 2) && "Blocks can't have more than two successors."); - From->appendSuccessor(To); - To->appendPredecessor(From); + assert((PredIdx != -1u || To->getNumPredecessors() < 2) && + "Blocks can't have more than two predecessors."); + if (SuccIdx == -1u) + From->appendSuccessor(To); + else + From->getSuccessors()[SuccIdx] = To; + + if (PredIdx == -1u) + To->appendPredecessor(From); + else + To->getPredecessors()[PredIdx] = From; } /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To @@ -4165,6 +4177,24 @@ class VPBlockUtils { return cast(&Block); }); } + + /// Inserts \p BlockPtr on the edge between \p From and \p To. That is, update + /// \p From's successor to \p To to point to \p BlockPtr and \p To's + /// predecessor from \p From to \p BlockPtr. \p From and \p To are added to \p + /// BlockPtr's predecessors and successors respectively. There must be a + /// single edge between \p From and \p To. 
+ static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, + VPBlockBase *BlockPtr) { + auto &Successors = From->getSuccessors(); + auto &Predecessors = To->getPredecessors(); + assert(count(Successors, To) == 1 && count(Predecessors, From) == 1 && + "must have a single edge between From and To"); + unsigned SuccIdx = std::distance(Successors.begin(), find(Successors, To)); + unsigned PredIx = + std::distance(Predecessors.begin(), find(Predecessors, From)); + VPBlockUtils::connectBlocks(From, BlockPtr, -1, SuccIdx); + VPBlockUtils::connectBlocks(BlockPtr, To, PredIx, -1); + } }; class VPInterleavedAccessInfo { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6254ea15191819..ef2ca9af7268d1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1484,6 +1484,8 @@ void VPWidenCastRecipe::execute(VPTransformState &State) { Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy); State.set(this, Cast); State.addMetadata(Cast, cast_or_null(getUnderlyingValue())); + if (auto *CastOp = dyn_cast(Cast)) + setFlags(CastOp); } InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ea8845eaa75d4d..b9ab8a8fe60107 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -360,9 +360,7 @@ static void addReplicateRegions(VPlan &Plan) { // Record predicated instructions for above packing optimizations.
VPBlockBase *Region = createReplicateRegion(RepR, Plan); Region->setParent(CurrentBlock->getParent()); - VPBlockUtils::disconnectBlocks(CurrentBlock, SplitBlock); - VPBlockUtils::connectBlocks(CurrentBlock, Region); - VPBlockUtils::connectBlocks(Region, SplitBlock); + VPBlockUtils::insertOnEdge(CurrentBlock, SplitBlock, Region); } } diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index dd005682203b75..f653269713b30b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -297,12 +297,12 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { if (auto *Red = dyn_cast(&R)) { auto *Phi = cast(R.getOperand(0)); if (Phi->isOrdered()) { - auto Ins = VPV2Parts.insert({Phi, {}}); + auto &Parts = VPV2Parts[Phi]; if (Part == 1) { - Ins.first->second.clear(); - Ins.first->second.push_back(Red); + Parts.clear(); + Parts.push_back(Red); } - Ins.first->second.push_back(Copy->getVPSingleValue()); + Parts.push_back(Copy->getVPSingleValue()); Phi->setOperand(1, Copy->getVPSingleValue()); } } diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 04ea12ef0f1221..b8754b03c2ebc6 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -952,6 +952,12 @@ bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) { if (!IsConst0 && !IsConst1 && Index0 != Index1) return false; + auto *VecTy0 = cast(Ins0->getType()); + auto *VecTy1 = cast(Ins1->getType()); + if (VecTy0->getElementCount().getKnownMinValue() <= Index0 || + VecTy1->getElementCount().getKnownMinValue() <= Index1) + return false; + // Bail for single insertion if it is a load. // TODO: Handle this once getVectorInstrCost can cost for load/stores. 
auto *I0 = dyn_cast_or_null(V0); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index ed7bcff5160f81..7a67cf3fd4c942 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1477,6 +1477,20 @@ define void @test_lifetime_intrin() { ret void } +define void @test_lifetime_intrin_optnone() optnone noinline { +; CHECK-LABEL: name: test_lifetime_intrin_optnone +; CHECK: RET_ReallyLR +; O3-LABEL: name: test_lifetime_intrin_optnone +; O3: {{%[0-9]+}}:_(p0) = G_FRAME_INDEX %stack.0.slot +; O3-NEXT: G_STORE +; O3-NEXT: RET_ReallyLR + %slot = alloca i8, i32 4 + call void @llvm.lifetime.start.p0(i64 0, ptr %slot) + store volatile i8 10, ptr %slot + call void @llvm.lifetime.end.p0(i64 0, ptr %slot) + ret void +} + define void @test_load_store_atomics(ptr %addr) { ; CHECK-LABEL: name: test_load_store_atomics ; CHECK: [[ADDR:%[0-9]+]]:_(p0) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir index 2f10a497fa74cb..5cbff0f0c74cb7 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-integer.mir @@ -308,8 +308,8 @@ body: | ; CHECK: liveins: $w0, $w1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %a:_(s64) = COPY $x0 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 71 - ; CHECK-NEXT: %sub:_(s64) = G_SUB %a, [[C]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -71 + ; CHECK-NEXT: %sub:_(s64) = G_ADD %a, [[C]] ; CHECK-NEXT: $x0 = COPY %sub(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 %a:_(s64) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir index f207e9c149a476..e9d4af7da5d06f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir +++ 
b/llvm/test/CodeGen/AArch64/GlobalISel/combine-narrow-binop.mir @@ -88,8 +88,8 @@ body: | ; CHECK-LABEL: name: test_combine_trunc_sub_i128 ; CHECK: %lhs:_(s128) = COPY $q0 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC %lhs(s128) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 - ; CHECK-NEXT: %small:_(s32) = G_SUB [[TRUNC]], [[C]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -5 + ; CHECK-NEXT: %small:_(s32) = G_ADD [[TRUNC]], [[C]] ; CHECK-NEXT: $w0 = COPY %small(s32) %lhs:_(s128) = COPY $q0 %rhs:_(s128) = G_CONSTANT i128 5 @@ -103,8 +103,8 @@ body: | bb.1: ; CHECK-LABEL: name: test_combine_trunc_sub_i128_multi_use ; CHECK: %lhs:_(s128) = COPY $q0 - ; CHECK-NEXT: %rhs:_(s128) = G_CONSTANT i128 5 - ; CHECK-NEXT: %res:_(s128) = G_SUB %lhs, %rhs + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s128) = G_CONSTANT i128 -5 + ; CHECK-NEXT: %res:_(s128) = G_ADD %lhs, [[C]] ; CHECK-NEXT: %small:_(s32) = G_TRUNC %res(s128) ; CHECK-NEXT: $q0 = COPY %res(s128) ; CHECK-NEXT: $w0 = COPY %small(s32) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir index 5bee62da346341..4f0ca34408012b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-unmerge-values.mir @@ -1,13 +1,21 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=aarch64 -O0 -run-pass=legalizer %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64 -O0 -run-pass=legalizer %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O0 +# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-O2 --- name: test_unmerge_s4_constant body: | bb.0: - ; CHECK-LABEL: name: test_unmerge_s4_constant - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 - ; CHECK: $x0 = COPY [[C]](s64) + ; CHECK-O0-LABEL: name: test_unmerge_s4_constant + ; CHECK-O0: [[C:%[0-9]+]]:_(s64) = G_CONSTANT 
i64 0 + ; CHECK-O0-NEXT: $x0 = COPY [[C]](s64) + ; + ; CHECK-O2-LABEL: name: test_unmerge_s4_constant + ; CHECK-O2: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0 + ; CHECK-O2-NEXT: [[TRUNC:%[0-9]+]]:_(s4) = G_TRUNC [[C]](s8) + ; CHECK-O2-NEXT: [[COPY:%[0-9]+]]:_(s4) = COPY [[TRUNC]](s4) + ; CHECK-O2-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-O2-NEXT: $x0 = COPY [[C1]](s64) %0:_(s8) = G_CONSTANT i8 0 %1:_(s4), %2:_(s4)= G_UNMERGE_VALUES %0 %3:_(s64) = G_ANYEXT %1 @@ -21,15 +29,17 @@ body: | bb.0: liveins: $w0 ; CHECK-LABEL: name: test_unmerge_s4 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 - ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[COPY]](s32) - ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UV]](s8) - ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ZEXT]], [[C]](s64) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV]](s8) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) - ; CHECK: $x0 = COPY [[ANYEXT]](s64) - ; CHECK: $x1 = COPY [[ANYEXT1]](s64) + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[COPY]](s32) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[UV]](s8) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[ZEXT]], [[C]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UV]](s8) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) + ; CHECK-NEXT: $x0 = COPY [[ANYEXT]](s64) + ; CHECK-NEXT: $x1 = COPY [[ANYEXT1]](s64) %0:_(s32) = COPY $w0 %1:_(s8) = G_TRUNC %0 %2:_(s4), %3:_(s4)= G_UNMERGE_VALUES %1 @@ -39,3 +49,70 @@ body: | $x1 = COPY %5 ... 
+ +--- +name: test_unmerge_unmerge_s64 +body: | + bb.0: + liveins: $x0,$d0,$d1,$d2 + ; CHECK-LABEL: name: test_unmerge_unmerge_s64 + ; CHECK: liveins: $x0, $d0, $d1, $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: $x0 = COPY [[COPY]](s64) + ; CHECK-NEXT: $x1 = COPY [[COPY]](s64) + ; CHECK-NEXT: $x2 = COPY [[COPY]](s64) + %0:_(s64) = COPY $x0 + %1:_(<3 x s64>) = G_BUILD_VECTOR %0:_(s64), %0:_(s64), %0:_(s64) + %2:_(s64), %3:_(s64), %4:_(s64) = G_UNMERGE_VALUES %1:_(<3 x s64>) + $x0 = COPY %2:_(s64) + $x1 = COPY %3:_(s64) + $x2 = COPY %3:_(s64) + +... + +--- +name: test_unmerge_unmerge_to_p0 +body: | + bb.0: + liveins: $x0,$d0,$d1,$d2 + ; CHECK-LABEL: name: test_unmerge_unmerge_to_p0 + ; CHECK: liveins: $x0, $d0, $d1, $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK-NEXT: [[INTTOPTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[COPY]](s64) + ; CHECK-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p0) = G_INTTOPTR [[COPY]](s64) + ; CHECK-NEXT: $x0 = COPY [[INTTOPTR]](p0) + ; CHECK-NEXT: $x1 = COPY [[INTTOPTR1]](p0) + ; CHECK-NEXT: $x2 = COPY [[INTTOPTR1]](p0) + %0:_(s64) = COPY $x0 + %1:_(<3 x s64>) = G_BUILD_VECTOR %0:_(s64), %0:_(s64), %0:_(s64) + %2:_(p0), %3:_(p0), %4:_(p0) = G_UNMERGE_VALUES %1:_(<3 x s64>) + $x0 = COPY %2:_(p0) + $x1 = COPY %3:_(p0) + $x2 = COPY %3:_(p0) + +... 
+ +--- +name: test_unmerge_unmerge_from_p0 +body: | + bb.0: + liveins: $x0,$d0,$d1,$d2 + ; CHECK-LABEL: name: test_unmerge_unmerge_from_p0 + ; CHECK: liveins: $x0, $d0, $d1, $d2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK-NEXT: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0) + ; CHECK-NEXT: [[PTRTOINT1:%[0-9]+]]:_(s64) = G_PTRTOINT [[COPY]](p0) + ; CHECK-NEXT: $d0 = COPY [[PTRTOINT]](s64) + ; CHECK-NEXT: $d1 = COPY [[PTRTOINT1]](s64) + ; CHECK-NEXT: $d2 = COPY [[PTRTOINT1]](s64) + %0:_(p0) = COPY $x0 + %1:_(<3 x p0>) = G_BUILD_VECTOR %0:_(p0), %0:_(p0), %0:_(p0) + %2:_(s64), %3:_(s64), %4:_(s64) = G_UNMERGE_VALUES %1:_(<3 x p0>) + $d0 = COPY %2:_(s64) + $d1 = COPY %3:_(s64) + $d2 = COPY %3:_(s64) + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 6be99d0088f1cb..4fea713ee4c32c 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -665,6 +665,9 @@ # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STEP_VECTOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_VECTOR_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir index 04968dab3a37ce..591b6a17928cb1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-extending-loads-cornercases.mir @@ -95,7 +95,7 @@ body: | %11:_(s8) = G_CONSTANT i8 1 ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32) %7:_(s8) = G_SUB %2, %11 - ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}} + ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_ADD [[T3]], {{.*}} G_BR %bb.3.exit bb.3.exit: ; CHECK: bb.3.exit: @@ -197,7 +197,7 @@ body: | %7:_(s8) = G_CONSTANT i8 1 ; CHECK: [[T3:%[0-9]+]]:_(s8) = G_TRUNC [[T0]](s32) %8:_(s8) = G_SUB %2, %7 - ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_SUB [[T3]], {{.*}} + ; CHECK: [[T4:%[0-9]+]]:_(s8) = G_ADD [[T3]], {{.*}} G_BR %bb.3.exit bb.3.exit: ; CHECK: bb.3.exit: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir index 0900dd4267a2e4..4c3faa94039097 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-trivial-arith.mir @@ -289,8 +289,8 @@ body: | ; CHECK: liveins: $w0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %x:_(s32) = COPY $w0 - ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 - ; CHECK-NEXT: %op:_(s32) = G_SUB %x, %cst + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]] ; CHECK-NEXT: $w0 = COPY %op(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 %x:_(s32) = COPY $w0 @@ -488,3 +488,66 @@ body: | RET_ReallyLR implicit $w0 ... 
+--- +name: sub_to_add +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + ; CHECK-LABEL: name: sub_to_add + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]] + ; CHECK-NEXT: $w0 = COPY %op(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %op:_(s32) = G_SUB %x(s32), %cst + $w0 = COPY %op(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: sub_to_add_nuw +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + ; CHECK-LABEL: name: sub_to_add_nuw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = G_ADD %x, [[C]] + ; CHECK-NEXT: $w0 = COPY %op(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %op:_(s32) = nuw G_SUB %x(s32), %cst + $w0 = COPY %op(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: sub_to_add_nsw +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0 + ; CHECK-LABEL: name: sub_to_add_nsw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s32) = COPY $w0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; CHECK-NEXT: %op:_(s32) = nsw G_ADD %x, [[C]] + ; CHECK-NEXT: $w0 = COPY %op(s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %x:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %op:_(s32) = nsw G_SUB %x(s32), %cst + $w0 = COPY %op(s32) + RET_ReallyLR implicit $w0 + +... 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll index a916fb2bf12377..29763f24861926 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/translate-gep.ll @@ -145,3 +145,24 @@ entry: %0 = getelementptr inbounds [8 x i32], ptr @arr, i64 0, <2 x i64> %offs ret <2 x ptr> %0 } + +define <4 x ptr> @vector_gep_v4i32(<4 x ptr> %b, <4 x i32> %off) { + ; CHECK-LABEL: name: vector_gep_v4i32 + ; CHECK: bb.1.entry: + ; CHECK-NEXT: liveins: $q0, $q1, $q2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x p0>) = G_CONCAT_VECTORS [[COPY]](<2 x s64>), [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $q2 + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(<4 x s64>) = G_SEXT [[COPY2]](<4 x s32>) + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(<4 x p0>) = G_PTR_ADD [[CONCAT_VECTORS]], [[SEXT]](<4 x s64>) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<4 x p0>) = COPY [[PTR_ADD]](<4 x p0>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s64>), [[UV1:%[0-9]+]]:_(<2 x s64>) = G_UNMERGE_VALUES [[COPY3]](<4 x p0>) + ; CHECK-NEXT: $q0 = COPY [[UV]](<2 x s64>) + ; CHECK-NEXT: $q1 = COPY [[UV1]](<2 x s64>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 +entry: + %g = getelementptr i8, <4 x ptr> %b, <4 x i32> %off + ret <4 x ptr> %g +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll index a39c2b5d14dddd..0ce92a20fb3a17 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext.ll @@ -15,8 +15,7 @@ define <8 x i16> @dupsext_v8i8_v8i16(i8 %src, <8 x i8> %b) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsl w8, w0, #8 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: asr w8, w8, #8 +; CHECK-GI-NEXT: 
sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: dup v1.8h, w8 ; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h ; CHECK-GI-NEXT: ret @@ -175,9 +174,8 @@ define <2 x i16> @dupsext_v2i8_v2i16(i8 %src, <2 x i8> %b) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsl w8, w0, #8 ; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 -; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 -; CHECK-GI-NEXT: asr w8, w8, #8 ; CHECK-GI-NEXT: dup v1.4h, w8 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s @@ -254,8 +252,7 @@ define <8 x i16> @nonsplat_shuffleinsert(i8 %src, <8 x i8> %b) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsl w8, w0, #8 ; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: asr w8, w8, #8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: mov v1.h[1], w8 ; CHECK-GI-NEXT: ext v1.16b, v1.16b, v1.16b, #4 ; CHECK-GI-NEXT: mul v0.8h, v1.8h, v0.8h diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll index 8c9661730f1f94..888aa9d7f9cdcc 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-extract-scalable.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve | FileCheck %s -define @dup_extract_i8( %data) { -; CHECK-LABEL: dup_extract_i8: +define @dup_extract_nxv16i8_nxv16i8( %data) { +; CHECK-LABEL: dup_extract_nxv16i8_nxv16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.b, z0.b[1] ; CHECK-NEXT: ret @@ -12,8 +12,32 @@ define @dup_extract_i8( %data) { ret %.splat } -define @dup_extract_i16( %data) { -; CHECK-LABEL: dup_extract_i16: +define @dup_extract_nxv16i8_v16i8(<16 x i8> %data) { +; CHECK-LABEL: dup_extract_nxv16i8_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.b, z0.b[1] +; 
CHECK-NEXT: ret + %1 = extractelement <16 x i8> %data, i8 1 + %.splatinsert = insertelement poison, i8 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv16i8_v8i8(<8 x i8> %data) { +; CHECK-LABEL: dup_extract_nxv16i8_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.b, z0.b[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x i8> %data, i8 1 + %.splatinsert = insertelement poison, i8 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8i16_nxv8i16( %data) { +; CHECK-LABEL: dup_extract_nxv8i16_nxv8i16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: ret @@ -23,8 +47,32 @@ define @dup_extract_i16( %data) { ret %.splat } -define @dup_extract_i32( %data) { -; CHECK-LABEL: dup_extract_i32: +define @dup_extract_nxv8i16_v8i16(<8 x i16> %data) { +; CHECK-LABEL: dup_extract_nxv8i16_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x i16> %data, i16 1 + %.splatinsert = insertelement poison, i16 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8i16_v4i16(<4 x i16> %data) { +; CHECK-LABEL: dup_extract_nxv8i16_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x i16> %data, i16 1 + %.splatinsert = insertelement poison, i16 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4i32_nxv4i32( %data) { +; CHECK-LABEL: dup_extract_nxv4i32_nxv4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: ret @@ -34,8 +82,32 @@ define @dup_extract_i32( %data) { ret %.splat } -define @dup_extract_i64( %data) { -; CHECK-LABEL: dup_extract_i64: +define 
@dup_extract_nxv4i32_v4i32(<4 x i32> %data) { +; CHECK-LABEL: dup_extract_nxv4i32_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x i32> %data, i32 1 + %.splatinsert = insertelement poison, i32 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4i32_v2i32(<2 x i32> %data) { +; CHECK-LABEL: dup_extract_nxv4i32_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement <2 x i32> %data, i32 1 + %.splatinsert = insertelement poison, i32 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2i64_nxv2i64( %data) { +; CHECK-LABEL: dup_extract_nxv2i64_nxv2i64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: ret @@ -45,8 +117,31 @@ define @dup_extract_i64( %data) { ret %.splat } -define @dup_extract_f16( %data) { -; CHECK-LABEL: dup_extract_f16: +define @dup_extract_nxv2i64_v2i64(<2 x i64> %data) { +; CHECK-LABEL: dup_extract_nxv2i64_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: ret + %1 = extractelement <2 x i64> %data, i64 1 + %.splatinsert = insertelement poison, i64 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2i64_v1i64(<1 x i64> %data) { +; CHECK-LABEL: dup_extract_nxv2i64_v1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, x8 +; CHECK-NEXT: ret + %1 = extractelement <1 x i64> %data, i64 1 + %.splatinsert = insertelement poison, i64 %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8f16_nxv8f16( %data) { +; CHECK-LABEL: dup_extract_nxv8f16_nxv8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.h, z0.h[1] ; 
CHECK-NEXT: ret @@ -56,19 +151,136 @@ define @dup_extract_f16( %data) { ret %.splat } -define @dup_extract_f16_4( %data) { -; CHECK-LABEL: dup_extract_f16_4: +define @dup_extract_nxv8f16_nxv4f16( %data) { +; CHECK-LABEL: dup_extract_nxv8f16_nxv4f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: ret %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8f16_nxv2f16( %data) { +; CHECK-LABEL: dup_extract_nxv8f16_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8f16_v8f16(<8 x half> %data) { +; CHECK-LABEL: dup_extract_nxv8f16_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x half> %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8f16_v4f16(<4 x half> %data) { +; CHECK-LABEL: dup_extract_nxv8f16_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x half> %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f16_nxv8f16( %data) { +; CHECK-LABEL: dup_extract_nxv4f16_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, 
zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f16_nxv4f16( %data) { +; CHECK-LABEL: dup_extract_nxv4f16_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f16_nxv2f16( %data) { +; CHECK-LABEL: dup_extract_nxv4f16_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f16_v8f16(<8 x half> %data) { +; CHECK-LABEL: dup_extract_nxv4f16_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x half> %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f16_v4f16(<4 x half> %data) { +; CHECK-LABEL: dup_extract_nxv4f16_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x half> %data, i16 1 %.splatinsert = insertelement poison, half %1, i32 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer ret %.splat } -define @dup_extract_f16_2( %data) { -; CHECK-LABEL: dup_extract_f16_2: +define @dup_extract_nxv2f16_nxv8f16( %data) { +; CHECK-LABEL: dup_extract_nxv2f16_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f16_nxv4f16( %data) { 
+; CHECK-LABEL: dup_extract_nxv2f16_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f16_nxv2f16( %data) { +; CHECK-LABEL: dup_extract_nxv2f16_nxv2f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: ret @@ -78,19 +290,32 @@ define @dup_extract_f16_2( %data) { ret %.splat } -define @dup_extract_bf16( %data) #0 { -; CHECK-LABEL: dup_extract_bf16: +define @dup_extract_nxv2f16_v8f16(<8 x half> %data) { +; CHECK-LABEL: dup_extract_nxv2f16_v8f16: ; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov z0.h, z0.h[1] ; CHECK-NEXT: ret - %1 = extractelement %data, i16 1 - %.splatinsert = insertelement poison, bfloat %1, i32 0 - %.splat = shufflevector %.splatinsert, poison, zeroinitializer - ret %.splat + %1 = extractelement <8 x half> %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f16_v4f16(<4 x half> %data) { +; CHECK-LABEL: dup_extract_nxv2f16_v4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x half> %data, i16 1 + %.splatinsert = insertelement poison, half %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat } -define @dup_extract_f32( %data) { -; CHECK-LABEL: dup_extract_f32: +define @dup_extract_nxv4f32_nxv4f32( %data) { +; CHECK-LABEL: dup_extract_nxv4f32_nxv4f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.s, z0.s[1] ; CHECK-NEXT: ret @@ -100,19 +325,90 @@ define @dup_extract_f32( %data) { ret %.splat } -define @dup_extract_f32_2( %data) { -; CHECK-LABEL: dup_extract_f32_2: +define @dup_extract_nxv4f32_nxv2f32( %data) { +; 
CHECK-LABEL: dup_extract_nxv4f32_nxv2f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: ret %1 = extractelement %data, i32 1 + %.splatinsert = insertelement poison, float %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f32_v4f32(<4 x float> %data) { +; CHECK-LABEL: dup_extract_nxv4f32_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x float> %data, i32 1 + %.splatinsert = insertelement poison, float %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4f32_v2f32(<2 x float> %data) { +; CHECK-LABEL: dup_extract_nxv4f32_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement <2 x float> %data, i32 1 + %.splatinsert = insertelement poison, float %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f32_nxv4f32( %data) { +; CHECK-LABEL: dup_extract_nxv2f32_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i32 1 %.splatinsert = insertelement poison, float %1, i32 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer ret %.splat } -define @dup_extract_f64( %data) { -; CHECK-LABEL: dup_extract_f64: +define @dup_extract_nxv2f32_nxv2f32( %data) { +; CHECK-LABEL: dup_extract_nxv2f32_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i32 1 + %.splatinsert = insertelement poison, float %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f32_v4f32(<4 x float> %data) { +; CHECK-LABEL: dup_extract_nxv2f32_v4f32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x float> %data, i32 1 + %.splatinsert = insertelement poison, float %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f32_v2f32(<2 x float> %data) { +; CHECK-LABEL: dup_extract_nxv2f32_v2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement <2 x float> %data, i32 1 + %.splatinsert = insertelement poison, float %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f64_nxv2f64( %data) { +; CHECK-LABEL: dup_extract_nxv2f64_nxv2f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.d, z0.d[1] ; CHECK-NEXT: ret @@ -122,5 +418,201 @@ define @dup_extract_f64( %data) { ret %.splat } -; +bf16 is required for the bfloat version. -attributes #0 = { "target-features"="+sve,+bf16" } +define @dup_extract_nxv2f64_v2f64(<2 x double> %data) { +; CHECK-LABEL: dup_extract_nxv2f64_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: ret + %1 = extractelement <2 x double> %data, i64 1 + %.splatinsert = insertelement poison, double %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2f64_v1f64(<1 x double> %data) { +; CHECK-LABEL: dup_extract_nxv2f64_v1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, d0 +; CHECK-NEXT: ret + %1 = extractelement <1 x double> %data, i64 1 + %.splatinsert = insertelement poison, double %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8bf16_nxv8bf16( %data) { +; CHECK-LABEL: dup_extract_nxv8bf16_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + 
%.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8bf16_nxv4bf16( %data) { +; CHECK-LABEL: dup_extract_nxv8bf16_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8bf16_nxv2bf16( %data) { +; CHECK-LABEL: dup_extract_nxv8bf16_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8bf16_v8bf16(<8 x bfloat> %data) { +; CHECK-LABEL: dup_extract_nxv8bf16_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x bfloat> %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv8bf16_v4bf16(<4 x bfloat> %data) { +; CHECK-LABEL: dup_extract_nxv8bf16_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x bfloat> %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4bf16_nxv8bf16( %data) { +; CHECK-LABEL: dup_extract_nxv4bf16_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = 
shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4bf16_nxv4bf16( %data) { +; CHECK-LABEL: dup_extract_nxv4bf16_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4bf16_nxv2bf16( %data) { +; CHECK-LABEL: dup_extract_nxv4bf16_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: mov z0.h, h0 +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4bf16_v8bf16(<8 x bfloat> %data) { +; CHECK-LABEL: dup_extract_nxv4bf16_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x bfloat> %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv4bf16_v4bf16(<4 x bfloat> %data) { +; CHECK-LABEL: dup_extract_nxv4bf16_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x bfloat> %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2bf16_nxv8bf16( %data) { +; CHECK-LABEL: dup_extract_nxv2bf16_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define 
@dup_extract_nxv2bf16_nxv4bf16( %data) { +; CHECK-LABEL: dup_extract_nxv2bf16_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.s, z0.s[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2bf16_nxv2bf16( %data) { +; CHECK-LABEL: dup_extract_nxv2bf16_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z0.d, z0.d[1] +; CHECK-NEXT: ret + %1 = extractelement %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2bf16_v8bf16(<8 x bfloat> %data) { +; CHECK-LABEL: dup_extract_nxv2bf16_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <8 x bfloat> %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +define @dup_extract_nxv2bf16_v4bf16(<4 x bfloat> %data) { +; CHECK-LABEL: dup_extract_nxv2bf16_v4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: mov z0.h, z0.h[1] +; CHECK-NEXT: ret + %1 = extractelement <4 x bfloat> %data, i16 1 + %.splatinsert = insertelement poison, bfloat %1, i32 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + ret %.splat +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 99dfac807dcd15..2a2f304b23e9b4 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -652,8 +652,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-GI-NEXT: movi v0.2d, 
#0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: add x10, x0, #8 -; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: asr w9, w9, #8 +; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: dup v2.8h, w9 ; CHECK-GI-NEXT: and x9, x8, #0xfffffff0 ; CHECK-GI-NEXT: mov x11, x9 diff --git a/llvm/test/CodeGen/AArch64/arm64-ext.ll b/llvm/test/CodeGen/AArch64/arm64-ext.ll index c38ab076e4ea53..a74972deb5552d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ext.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ext.ll @@ -1,92 +1,100 @@ -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=arm64-eabi -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -global-isel=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -define <8 x i8> @test_vextd(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextd: -;CHECK: {{ext.8b.*#3}} - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = load <8 x i8>, ptr %B - %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> - ret <8 x i8> %tmp3 +define <8 x i8> @test_vextd(<8 x i8> %tmp1, <8 x i8> %tmp2) { +; CHECK-LABEL: test_vextd: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #3 +; CHECK-NEXT: ret + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 } -define <8 x i8> @test_vextRd(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextRd: -;CHECK: {{ext.8b.*#5}} - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = load <8 x i8>, ptr %B - %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> - ret <8 x i8> %tmp3 +define <8 x i8> @test_vextRd(<8 x i8> %tmp1, <8 x i8> %tmp2) { +; CHECK-LABEL: test_vextRd: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.8b, v1.8b, v0.8b, #5 +; CHECK-NEXT: ret + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 } -define <16 x i8> 
@test_vextq(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextq: -;CHECK: {{ext.16b.*3}} - %tmp1 = load <16 x i8>, ptr %A - %tmp2 = load <16 x i8>, ptr %B - %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> - ret <16 x i8> %tmp3 +define <16 x i8> @test_vextq(<16 x i8> %tmp1, <16 x i8> %tmp2) { +; CHECK-LABEL: test_vextq: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #3 +; CHECK-NEXT: ret + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 } -define <16 x i8> @test_vextRq(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextRq: -;CHECK: {{ext.16b.*7}} - %tmp1 = load <16 x i8>, ptr %A - %tmp2 = load <16 x i8>, ptr %B - %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> - ret <16 x i8> %tmp3 +define <16 x i8> @test_vextRq(<16 x i8> %tmp1, <16 x i8> %tmp2) { +; CHECK-LABEL: test_vextRq: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #7 +; CHECK-NEXT: ret + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 } -define <4 x i16> @test_vextd16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextd16: -;CHECK: {{ext.8b.*#6}} - %tmp1 = load <4 x i16>, ptr %A - %tmp2 = load <4 x i16>, ptr %B - %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> - ret <4 x i16> %tmp3 +define <4 x i16> @test_vextd16(<4 x i16> %tmp1, <4 x i16> %tmp2) { +; CHECK-LABEL: test_vextd16: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #6 +; CHECK-NEXT: ret + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 } -define <4 x i32> @test_vextq32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextq32: -;CHECK: {{ext.16b.*12}} - %tmp1 = load <4 x i32>, ptr %A - %tmp2 = load <4 x i32>, ptr %B - %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> - ret <4 x i32> %tmp3 +define <4 x i32> @test_vextq32(<4 x i32> %tmp1, <4 x i32> %tmp2) { +; CHECK-LABEL: test_vextq32: +; CHECK: // %bb.0: +; 
CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #12 +; CHECK-NEXT: ret + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + ret <4 x i32> %tmp3 } ; Undef shuffle indices should not prevent matching to VEXT: -define <8 x i8> @test_vextd_undef(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextd_undef: -;CHECK: {{ext.8b.*}} - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = load <8 x i8>, ptr %B - %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> - ret <8 x i8> %tmp3 +define <8 x i8> @test_vextd_undef(<8 x i8> %tmp1, <8 x i8> %tmp2) { +; CHECK-LABEL: test_vextd_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #3 +; CHECK-NEXT: ret + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 } -define <8 x i8> @test_vextd_undef2(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextd_undef2: -;CHECK: {{ext.8b.*#6}} - %tmp1 = load <8 x i8>, ptr %A - %tmp2 = load <8 x i8>, ptr %B +define <8 x i8> @test_vextd_undef2(<8 x i8> %tmp1, <8 x i8> %tmp2) { +; CHECK-SD-LABEL: test_vextd_undef2: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v0.8b, v0.8b, v0.8b, #6 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vextd_undef2: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ext v0.8b, v1.8b, v0.8b, #6 +; CHECK-GI-NEXT: ret %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> ret <8 x i8> %tmp3 } -define <16 x i8> @test_vextRq_undef(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_vextRq_undef: -;CHECK: {{ext.16b.*#7}} - %tmp1 = load <16 x i8>, ptr %A - %tmp2 = load <16 x i8>, ptr %B - %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> - ret <16 x i8> %tmp3 +define <16 x i8> @test_vextRq_undef(<16 x i8> %tmp1, <16 x i8> %tmp2) { +; CHECK-LABEL: test_vextRq_undef: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #7 +; CHECK-NEXT: ret + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 } -define <8 x i16> @test_vextRq_undef2(ptr %A) nounwind { -;CHECK-LABEL: 
test_vextRq_undef2: -;CHECK: {{ext.16b.*#10}} - %tmp1 = load <8 x i16>, ptr %A +define <8 x i16> @test_vextRq_undef2(<8 x i16> %tmp1) nounwind { +; CHECK-LABEL: test_vextRq_undef2: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #10 +; CHECK-NEXT: ret %vext = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> ret <8 x i16> %vext; } @@ -95,11 +103,31 @@ define <8 x i16> @test_vextRq_undef2(ptr %A) nounwind { ; chosen to reach lowering phase as a BUILD_VECTOR. ; An undef in the shuffle list should still be optimizable -define <4 x i16> @test_undef(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: test_undef: -;CHECK: zip1.4h - %tmp1 = load <8 x i16>, ptr %A - %tmp2 = load <8 x i16>, ptr %B - %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> - ret <4 x i16> %tmp3 +define <4 x i16> @test_undef(<8 x i16> %tmp1, <8 x i16> %tmp2) { +; CHECK-SD-LABEL: test_undef: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_undef: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: adrp x8, .LCPI10_0 +; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI10_0] +; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 +} + +define <2 x i64> @test_v2s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: test_v2s64: +; CHECK: // %bb.0: +; CHECK-NEXT: ext v0.16b, v1.16b, v0.16b, #8 +; CHECK-NEXT: ret + %s = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> + ret <2 x i64> %s } diff --git a/llvm/test/CodeGen/AArch64/arm64-mul.ll b/llvm/test/CodeGen/AArch64/arm64-mul.ll index e4d2ce7ccdabc8..5ae2722693f898 100644 --- a/llvm/test/CodeGen/AArch64/arm64-mul.ll +++ 
b/llvm/test/CodeGen/AArch64/arm64-mul.ll @@ -152,20 +152,12 @@ entry: ; Check the sext_inreg case. define i64 @t11(i64 %a) nounwind { -; CHECK-SD-LABEL: t11: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #29594 // =0x739a -; CHECK-SD-NEXT: movk w8, #65499, lsl #16 -; CHECK-SD-NEXT: smnegl x0, w0, w8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: t11: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: mov x9, #-35942 // =0xffffffffffff739a -; CHECK-GI-NEXT: movk x9, #65499, lsl #16 -; CHECK-GI-NEXT: mneg x0, x8, x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: t11: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #29594 // =0x739a +; CHECK-NEXT: movk w8, #65499, lsl #16 +; CHECK-NEXT: smnegl x0, w0, w8 +; CHECK-NEXT: ret entry: %tmp1 = trunc i64 %a to i32 %tmp2 = sext i32 %tmp1 to i64 @@ -175,20 +167,12 @@ entry: } define i64 @t12(i64 %a, i64 %b) nounwind { -; CHECK-SD-LABEL: t12: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov w8, #35118 // =0x892e -; CHECK-SD-NEXT: movk w8, #65008, lsl #16 -; CHECK-SD-NEXT: smaddl x0, w0, w8, x1 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: t12: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sxtw x8, w0 -; CHECK-GI-NEXT: mov x9, #-30418 // =0xffffffffffff892e -; CHECK-GI-NEXT: movk x9, #65008, lsl #16 -; CHECK-GI-NEXT: madd x0, x8, x9, x1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: t12: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w8, #35118 // =0x892e +; CHECK-NEXT: movk w8, #65008, lsl #16 +; CHECK-NEXT: smaddl x0, w0, w8, x1 +; CHECK-NEXT: ret entry: %tmp1 = trunc i64 %a to i32 %tmp2 = sext i32 %tmp1 to i64 diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll index 3263a5e03c1fdc..427542377bfdae 100644 --- a/llvm/test/CodeGen/AArch64/ptradd.ll +++ b/llvm/test/CodeGen/AArch64/ptradd.ll @@ -1,40 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s 
--check-prefixes=CHECK,CHECK-SD -; RUN: llc < %s -mtriple=aarch64-none-eabi -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -verify-machineinstrs -global-isel=0 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=aarch64 -verify-machineinstrs -global-isel=1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; Note: these tests use stores instead of returns as the return handling for -; vector ptrs is currently sometimes create invalid unmerge values. - -define void @vector_gep_i32(ptr %b, i32 %off, ptr %p) { -; CHECK-LABEL: vector_gep_i32: +define ptr @scalar_gep_i32(ptr %b, i32 %off) { +; CHECK-LABEL: scalar_gep_i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, w1, sxtw -; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: add x0, x0, w1, sxtw ; CHECK-NEXT: ret entry: %g = getelementptr i8, ptr %b, i32 %off - store ptr %g, ptr %p - ret void + ret ptr %g } -define void @vector_gep_i64(ptr %b, i64 %off, ptr %p) { -; CHECK-LABEL: vector_gep_i64: +define ptr @scalar_gep_i64(ptr %b, i64 %off) { +; CHECK-LABEL: scalar_gep_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: add x8, x0, x1 -; CHECK-NEXT: str x8, [x2] +; CHECK-NEXT: add x0, x0, x1 ; CHECK-NEXT: ret entry: %g = getelementptr i8, ptr %b, i64 %off - store ptr %g, ptr %p - ret void + ret ptr %g +} + +define ptr @scalar_gep_c10(ptr %b) { +; CHECK-LABEL: scalar_gep_c10: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x0, x0, #10 +; CHECK-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, i64 10 + ret ptr %g +} + +define ptr @scalar_gep_cm10(ptr %b) { +; CHECK-LABEL: scalar_gep_cm10: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x0, x0, #10 +; CHECK-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, i64 -10 + ret ptr %g } -define void @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off, ptr %p) { +define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) { ; CHECK-SD-LABEL: vector_gep_v1i32: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: shl d1, 
d1, #32 ; CHECK-SD-NEXT: ssra d0, d1, #32 -; CHECK-SD-NEXT: str d0, [x0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v1i32: @@ -42,87 +54,82 @@ define void @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off, ptr %p) { ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: fmov x9, d0 ; CHECK-GI-NEXT: add x8, x9, w8, sxtw -; CHECK-GI-NEXT: str x8, [x0] +; CHECK-GI-NEXT: fmov d0, x8 ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <1 x ptr> %b, <1 x i32> %off - store <1 x ptr> %g, ptr %p - ret void + ret <1 x ptr> %g } -define void @vector_gep_v2i32(<2 x ptr> %b, <2 x i32> %off, ptr %p) { +define <2 x ptr> @vector_gep_v2i32(<2 x ptr> %b, <2 x i32> %off) { ; CHECK-LABEL: vector_gep_v2i32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: saddw v0.2d, v0.2d, v1.2s -; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret entry: %g = getelementptr i8, <2 x ptr> %b, <2 x i32> %off - store <2 x ptr> %g, ptr %p - ret void + ret <2 x ptr> %g } -define void @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off, ptr %p) { +define <3 x ptr> @vector_gep_v3i32(<3 x ptr> %b, <3 x i32> %off) { ; CHECK-SD-LABEL: vector_gep_v3i32: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: ext v4.16b, v3.16b, v3.16b, #8 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: saddw2 v2.2d, v2.2d, v3.4s -; CHECK-SD-NEXT: str d2, [x0, #16] +; CHECK-SD-NEXT: saddw v2.2d, v2.2d, v4.2s ; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v3.2s -; CHECK-SD-NEXT: str q0, [x0] +; CHECK-SD-NEXT: // kill: def $d2 killed $d2 killed $q2 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: smov x9, v3.s[0] ; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: smov x10, v3.s[1] ; 
CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: smov x8, v3.s[1] ; CHECK-GI-NEXT: mov v4.d[0], x9 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: mov v0.d[1], x9 ; CHECK-GI-NEXT: fmov x9, d2 -; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v4.d[1], x8 ; CHECK-GI-NEXT: mov w8, v3.s[2] -; CHECK-GI-NEXT: mov v4.d[1], x10 -; CHECK-GI-NEXT: add x8, x9, w8, sxtw ; CHECK-GI-NEXT: add v0.2d, v0.2d, v4.2d -; CHECK-GI-NEXT: str x8, [x0, #16] -; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: add x8, x9, w8, sxtw +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <3 x ptr> %b, <3 x i32> %off - store <3 x ptr> %g, ptr %p - ret void + ret <3 x ptr> %g } -define void @vector_gep_v4i32(<4 x ptr> %b, <4 x i32> %off, ptr %p) { +define <4 x ptr> @vector_gep_v4i32(<4 x ptr> %b, <4 x i32> %off) { ; CHECK-SD-LABEL: vector_gep_v4i32: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: saddw2 v1.2d, v1.2d, v2.4s ; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v2.2s -; CHECK-SD-NEXT: stp q0, q1, [x0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v4i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: saddw v0.2d, v0.2d, v2.2s ; CHECK-GI-NEXT: saddw2 v1.2d, v1.2d, v2.4s -; CHECK-GI-NEXT: stp q0, q1, [x0] ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <4 x ptr> %b, <4 x i32> %off - store <4 x ptr> %g, ptr %p - ret void + ret <4 x ptr> %g } -define void @vector_gep_v1i64(<1 x ptr> %b, <1 x i64> %off, ptr %p) { +define <1 x ptr> @vector_gep_v1i64(<1 x ptr> %b, <1 x i64> %off) { ; CHECK-SD-LABEL: vector_gep_v1i64: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: add d0, d0, d1 -; CHECK-SD-NEXT: str d0, [x0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v1i64: @@ -130,39 +137,29 @@ define void @vector_gep_v1i64(<1 x ptr> %b, <1 x i64> %off, ptr %p) { ; CHECK-GI-NEXT: fmov x8, d0 ; CHECK-GI-NEXT: fmov x9, d1 ; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: str x8, [x0] +; CHECK-GI-NEXT: fmov 
d0, x8 ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <1 x ptr> %b, <1 x i64> %off - store <1 x ptr> %g, ptr %p - ret void + ret <1 x ptr> %g } -define void @vector_gep_v2i64(<2 x ptr> %b, <2 x i64> %off, ptr %p) { +define <2 x ptr> @vector_gep_v2i64(<2 x ptr> %b, <2 x i64> %off) { ; CHECK-LABEL: vector_gep_v2i64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret entry: %g = getelementptr i8, <2 x ptr> %b, <2 x i64> %off - store <2 x ptr> %g, ptr %p - ret void + ret <2 x ptr> %g } -define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) { +define <3 x ptr> @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off) { ; CHECK-SD-LABEL: vector_gep_v3i64: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: // kill: def $d3 killed $d3 def $q3 -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: // kill: def $d4 killed $d4 def $q4 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-SD-NEXT: mov v3.d[1], v4.d[0] -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: add d1, d2, d5 -; CHECK-SD-NEXT: str d1, [x0, #16] -; CHECK-SD-NEXT: add v0.2d, v0.2d, v3.2d -; CHECK-SD-NEXT: str q0, [x0] +; CHECK-SD-NEXT: add d0, d0, d3 +; CHECK-SD-NEXT: add d1, d1, d4 +; CHECK-SD-NEXT: add d2, d2, d5 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v3i64: @@ -178,42 +175,37 @@ define void @vector_gep_v3i64(<3 x ptr> %b, <3 x i64> %off, ptr %p) { ; CHECK-GI-NEXT: fmov x8, d2 ; CHECK-GI-NEXT: add x8, x8, x9 ; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d -; CHECK-GI-NEXT: str x8, [x0, #16] -; CHECK-GI-NEXT: str q0, [x0] +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <3 x ptr> %b, <3 x i64> %off - store <3 x ptr> %g, ptr %p - ret void + ret <3 x ptr> %g } -define void @vector_gep_v4i64(<4 x ptr> %b, <4 x i64> %off, ptr %p) { +define <4 x ptr> @vector_gep_v4i64(<4 x ptr> %b, <4 x i64> %off) { ; CHECK-SD-LABEL: vector_gep_v4i64: ; 
CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: add v1.2d, v1.2d, v3.2d ; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d -; CHECK-SD-NEXT: stp q0, q1, [x0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v4i64: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d ; CHECK-GI-NEXT: add v1.2d, v1.2d, v3.2d -; CHECK-GI-NEXT: stp q0, q1, [x0] ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <4 x ptr> %b, <4 x i64> %off - store <4 x ptr> %g, ptr %p - ret void + ret <4 x ptr> %g } -define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) { +define <2 x ptr> @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off) { ; CHECK-SD-LABEL: vector_gep_v4i128: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: fmov d1, x0 ; CHECK-SD-NEXT: mov v1.d[1], x2 ; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-SD-NEXT: str q0, [x4] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_gep_v4i128: @@ -221,10 +213,266 @@ define void @vector_gep_v4i128(<2 x ptr> %b, <2 x i128> %off, ptr %p) { ; CHECK-GI-NEXT: mov v1.d[0], x0 ; CHECK-GI-NEXT: mov v1.d[1], x2 ; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-GI-NEXT: str q0, [x4] ; CHECK-GI-NEXT: ret entry: %g = getelementptr i8, <2 x ptr> %b, <2 x i128> %off - store <2 x ptr> %g, ptr %p - ret void + ret <2 x ptr> %g +} + + +define <1 x ptr> @vector_gep_v1i64_base(ptr %b, <1 x i64> %off) { +; CHECK-SD-LABEL: vector_gep_v1i64_base: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: add d0, d1, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v1i64_base: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: add x8, x0, x8 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <1 x i64> %off + ret <1 x ptr> %g +} + +define <2 x ptr> @vector_gep_v2i64_base(ptr %b, <2 x i64> %off) { +; CHECK-LABEL: vector_gep_v2i64_base: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; 
CHECK-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <2 x i64> %off + ret <2 x ptr> %g +} + +define <3 x ptr> @vector_gep_v3i64_base(ptr %b, <3 x i64> %off) { +; CHECK-SD-LABEL: vector_gep_v3i64_base: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: fmov d3, x0 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: dup v1.2d, x0 +; CHECK-SD-NEXT: add d2, d3, d2 +; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v3i64_base: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] +; CHECK-GI-NEXT: dup v1.2d, x0 +; CHECK-GI-NEXT: add x8, x0, x8 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <3 x i64> %off + ret <3 x ptr> %g +} + +define <4 x ptr> @vector_gep_v4i64_base(ptr %b, <4 x i64> %off) { +; CHECK-LABEL: vector_gep_v4i64_base: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: add v1.2d, v2.2d, v1.2d +; CHECK-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <4 x i64> %off + ret <4 x ptr> %g +} + +define <1 x ptr> @vector_gep_v1i64_c10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v1i64_c10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #10 // =0xa +; CHECK-SD-NEXT: fmov d0, x0 +; CHECK-SD-NEXT: fmov d1, x8 +; CHECK-SD-NEXT: add d0, d0, d1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v1i64_c10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: add x8, x0, #10 +; CHECK-GI-NEXT: fmov d0, x8 +; 
CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <1 x i64> + ret <1 x ptr> %g +} + +define <2 x ptr> @vector_gep_v2i64_c10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v2i64_c10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #10 // =0xa +; CHECK-SD-NEXT: dup v0.2d, x0 +; CHECK-SD-NEXT: dup v1.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v2i64_c10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI18_0 +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <2 x i64> + ret <2 x ptr> %g +} + +define <3 x ptr> @vector_gep_v3i64_c10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v3i64_c10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #10 // =0xa +; CHECK-SD-NEXT: dup v0.2d, x0 +; CHECK-SD-NEXT: fmov d3, x0 +; CHECK-SD-NEXT: dup v2.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: add d2, d3, d2 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v3i64_c10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI19_0 +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] +; CHECK-GI-NEXT: add x8, x0, #10 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <3 x i64> + ret <3 x ptr> %g +} + +define <4 x ptr> @vector_gep_v4i64_c10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v4i64_c10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov w8, #10 // =0xa +; CHECK-SD-NEXT: dup v0.2d, x0 +; CHECK-SD-NEXT: dup v1.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: vector_gep_v4i64_c10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI20_0 +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <4 x i64> + ret <4 x ptr> %g +} + +define <1 x ptr> @vector_gep_v1i64_cm10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v1i64_cm10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #-10 // =0xfffffffffffffff6 +; CHECK-SD-NEXT: fmov d1, x0 +; CHECK-SD-NEXT: fmov d0, x8 +; CHECK-SD-NEXT: add d0, d1, d0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v1i64_cm10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub x8, x0, #10 +; CHECK-GI-NEXT: fmov d0, x8 +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <1 x i64> + ret <1 x ptr> %g +} + +define <2 x ptr> @vector_gep_v2i64_cm10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v2i64_cm10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #-10 // =0xfffffffffffffff6 +; CHECK-SD-NEXT: dup v1.2d, x0 +; CHECK-SD-NEXT: dup v0.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v2i64_cm10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI22_0 +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <2 x i64> + ret <2 x ptr> %g +} + +define <3 x ptr> @vector_gep_v3i64_cm10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v3i64_cm10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #-10 // =0xfffffffffffffff6 +; CHECK-SD-NEXT: dup v0.2d, x0 +; CHECK-SD-NEXT: fmov d3, x0 +; CHECK-SD-NEXT: dup v2.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: add d2, d3, d2 +; CHECK-SD-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v3i64_cm10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI23_0 +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] +; CHECK-GI-NEXT: sub x8, x0, #10 +; CHECK-GI-NEXT: fmov d2, x8 +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <3 x i64> + ret <3 x ptr> %g +} + +define <4 x ptr> @vector_gep_v4i64_cm10(ptr %b) { +; CHECK-SD-LABEL: vector_gep_v4i64_cm10: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, #-10 // =0xfffffffffffffff6 +; CHECK-SD-NEXT: dup v1.2d, x0 +; CHECK-SD-NEXT: dup v0.2d, x8 +; CHECK-SD-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: mov v1.16b, v0.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: vector_gep_v4i64_cm10: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: adrp x8, .LCPI24_0 +; CHECK-GI-NEXT: dup v0.2d, x0 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI24_0] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: mov v1.16b, v0.16b +; CHECK-GI-NEXT: ret +entry: + %g = getelementptr i8, ptr %b, <4 x i64> + ret <4 x ptr> %g } diff --git a/llvm/test/CodeGen/AArch64/sadd_sat.ll b/llvm/test/CodeGen/AArch64/sadd_sat.ll index 789fd7b20a7f99..cb52c17e2531c8 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat.ll @@ -71,9 +71,9 @@ define i16 @func16(i16 %x, i16 %y) nounwind { ; CHECK-GI-NEXT: sxth w8, w1 ; CHECK-GI-NEXT: add w8, w8, w0, sxth ; CHECK-GI-NEXT: sxth w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #15 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #15, #1 ; CHECK-GI-NEXT: sub w10, w10, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %tmp = call i16 @llvm.sadd.sat.i16(i16 %x, i16 %y); @@ -98,9 +98,9 @@ define i8 @func8(i8 %x, i8 %y) nounwind { ; CHECK-GI-NEXT: sxtb w8, w1 ; 
CHECK-GI-NEXT: add w8, w8, w0, sxtb ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #7 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #7, #1 ; CHECK-GI-NEXT: sub w10, w10, #128 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %tmp = call i8 @llvm.sadd.sat.i8(i8 %x, i8 %y); diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll index ecc8cbaeeecae4..f6fb4dd5e4b417 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_plus.ll @@ -76,9 +76,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; CHECK-GI-NEXT: sxth w8, w8 ; CHECK-GI-NEXT: add w8, w8, w0, sxth ; CHECK-GI-NEXT: sxth w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #15 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #15, #1 ; CHECK-GI-NEXT: sub w10, w10, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %a = mul i16 %y, %z @@ -106,9 +106,9 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; CHECK-GI-NEXT: sxtb w8, w8 ; CHECK-GI-NEXT: add w8, w8, w0, sxtb ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #7 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #7, #1 ; CHECK-GI-NEXT: sub w10, w10, #128 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %a = mul i8 %y, %z diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 6d331d9413f913..29318bd28c45d4 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -332,9 +332,9 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: ldrsb w9, [x1] ; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #7 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #7, #1 ; CHECK-GI-NEXT: sub w10, w10, #128 +; CHECK-GI-NEXT: cmp w8, w9 
; CHECK-GI-NEXT: csel w8, w10, w8, ne ; CHECK-GI-NEXT: strb w8, [x2] ; CHECK-GI-NEXT: ret @@ -360,9 +360,9 @@ define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: ldrsh w9, [x1] ; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: sxth w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #15 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #15, #1 ; CHECK-GI-NEXT: sub w10, w10, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w8, w10, w8, ne ; CHECK-GI-NEXT: strh w8, [x2] ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 853ed92c91fbcd..3604db33d5c4b3 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -221,14 +221,11 @@ define <3 x i16> @sext_v3i8_v3i16(<3 x i8> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsl w8, w0, #8 ; CHECK-GI-NEXT: lsl w9, w1, #8 -; CHECK-GI-NEXT: lsl w10, w2, #8 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: asr w8, w8, #8 -; CHECK-GI-NEXT: asr w9, w9, #8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 +; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sxth w8, w10 -; CHECK-GI-NEXT: asr w8, w8, #8 +; CHECK-GI-NEXT: lsl w8, w2, #8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 ; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 @@ -386,14 +383,11 @@ define <3 x i16> @sext_v3i10_v3i16(<3 x i10> %a) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: lsl w8, w0, #6 ; CHECK-GI-NEXT: lsl w9, w1, #6 -; CHECK-GI-NEXT: lsl w10, w2, #6 -; CHECK-GI-NEXT: sxth w8, w8 -; CHECK-GI-NEXT: sxth w9, w9 -; CHECK-GI-NEXT: asr w8, w8, #6 -; CHECK-GI-NEXT: asr w9, w9, #6 +; CHECK-GI-NEXT: sbfx w8, w8, #6, #10 +; CHECK-GI-NEXT: sbfx w9, w9, #6, #10 ; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: sxth w8, w10 -; CHECK-GI-NEXT: asr w8, w8, #6 +; CHECK-GI-NEXT: lsl w8, w2, #6 +; CHECK-GI-NEXT: sbfx w8, w8, #6, 
#10 ; CHECK-GI-NEXT: mov v0.h[1], w9 ; CHECK-GI-NEXT: mov v0.h[2], w8 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 diff --git a/llvm/test/CodeGen/AArch64/ssub_sat.ll b/llvm/test/CodeGen/AArch64/ssub_sat.ll index 4d755f480c3fc9..cf201d628b7e1e 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat.ll @@ -71,9 +71,9 @@ define i16 @func16(i16 %x, i16 %y) nounwind { ; CHECK-GI-NEXT: sxth w8, w0 ; CHECK-GI-NEXT: sub w8, w8, w1, sxth ; CHECK-GI-NEXT: sxth w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #15 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #15, #1 ; CHECK-GI-NEXT: sub w10, w10, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %tmp = call i16 @llvm.ssub.sat.i16(i16 %x, i16 %y); @@ -98,9 +98,9 @@ define i8 @func8(i8 %x, i8 %y) nounwind { ; CHECK-GI-NEXT: sxtb w8, w0 ; CHECK-GI-NEXT: sub w8, w8, w1, sxtb ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #7 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #7, #1 ; CHECK-GI-NEXT: sub w10, w10, #128 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %tmp = call i8 @llvm.ssub.sat.i8(i8 %x, i8 %y); diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll index 25d615f6451ba1..cabd580e20d504 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_plus.ll @@ -76,9 +76,9 @@ define i16 @func16(i16 %x, i16 %y, i16 %z) nounwind { ; CHECK-GI-NEXT: sxth w9, w0 ; CHECK-GI-NEXT: sub w8, w9, w8, sxth ; CHECK-GI-NEXT: sxth w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #15 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #15, #1 ; CHECK-GI-NEXT: sub w10, w10, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %a = mul i16 %y, %z @@ -106,9 +106,9 @@ define i8 @func8(i8 %x, i8 %y, i8 %z) nounwind { ; CHECK-GI-NEXT: 
sxtb w9, w0 ; CHECK-GI-NEXT: sub w8, w9, w8, sxtb ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #7 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #7, #1 ; CHECK-GI-NEXT: sub w10, w10, #128 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w0, w10, w8, ne ; CHECK-GI-NEXT: ret %a = mul i8 %y, %z diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index dddda7e9ba64cd..30e2a70ace0722 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -333,9 +333,9 @@ define void @v1i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: ldrsb w9, [x1] ; CHECK-GI-NEXT: sub w8, w8, w9 ; CHECK-GI-NEXT: sxtb w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #7 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #7, #1 ; CHECK-GI-NEXT: sub w10, w10, #128 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w8, w10, w8, ne ; CHECK-GI-NEXT: strb w8, [x2] ; CHECK-GI-NEXT: ret @@ -361,9 +361,9 @@ define void @v1i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-GI-NEXT: ldrsh w9, [x1] ; CHECK-GI-NEXT: sub w8, w8, w9 ; CHECK-GI-NEXT: sxth w9, w8 -; CHECK-GI-NEXT: asr w10, w9, #15 -; CHECK-GI-NEXT: cmp w8, w9 +; CHECK-GI-NEXT: sbfx w10, w8, #15, #1 ; CHECK-GI-NEXT: sub w10, w10, #8, lsl #12 // =32768 +; CHECK-GI-NEXT: cmp w8, w9 ; CHECK-GI-NEXT: csel w8, w10, w8, ne ; CHECK-GI-NEXT: strh w8, [x2] ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll index 857a883d80ea3d..978ee4534e5e1a 100644 --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -1347,10 +1347,8 @@ define void @zext_mload_avgflooru(ptr %p1, ptr %p2, %mask) { ; SVE: // %bb.0: ; SVE-NEXT: ld1b { z0.h }, p0/z, [x0] ; SVE-NEXT: ld1b { z1.h }, p0/z, [x1] -; SVE-NEXT: eor z2.d, z0.d, z1.d -; SVE-NEXT: and z0.d, z0.d, z1.d -; SVE-NEXT: lsr z1.h, z2.h, #1 ; SVE-NEXT: add z0.h, z0.h, z1.h +; SVE-NEXT: lsr z0.h, z0.h, #1 
; SVE-NEXT: st1h { z0.h }, p0, [x0] ; SVE-NEXT: ret ; @@ -1377,11 +1375,11 @@ define void @zext_mload_avgceilu(ptr %p1, ptr %p2, %mask) { ; SVE-LABEL: zext_mload_avgceilu: ; SVE: // %bb.0: ; SVE-NEXT: ld1b { z0.h }, p0/z, [x0] -; SVE-NEXT: ld1b { z1.h }, p0/z, [x1] -; SVE-NEXT: eor z2.d, z0.d, z1.d -; SVE-NEXT: orr z0.d, z0.d, z1.d -; SVE-NEXT: lsr z1.h, z2.h, #1 -; SVE-NEXT: sub z0.h, z0.h, z1.h +; SVE-NEXT: mov z1.h, #-1 // =0xffffffffffffffff +; SVE-NEXT: ld1b { z2.h }, p0/z, [x1] +; SVE-NEXT: eor z0.d, z0.d, z1.d +; SVE-NEXT: sub z0.h, z2.h, z0.h +; SVE-NEXT: lsr z0.h, z0.h, #1 ; SVE-NEXT: st1b { z0.h }, p0, [x0] ; SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll index 5062a43da931f8..02d172fbc9dbfd 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems-i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s -check-prefix=NO_SCALAR_INC -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-scalar-inc-vl < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -sve-use-scalar-inc-vl=true < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s ; INCB diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll index 8f0a9eac87c27c..4e1ff4abd6ffdf 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-counting-elems.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-scalar-inc-vl < %s | FileCheck 
%s -check-prefix=USE_SCALAR_INC +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -sve-use-scalar-inc-vl=true < %s | FileCheck %s -check-prefix=USE_SCALAR_INC ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s -check-prefix=USE_SCALAR_INC ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming < %s | FileCheck %s -check-prefix=USE_SCALAR_INC +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -sve-use-scalar-inc-vl=false < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -sve-use-scalar-inc-vl=false -force-streaming < %s | FileCheck %s ; ; CNTB diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll index de2af590acd1e2..dad357c8a0c132 100644 --- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_SCALAR_INC -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-scalar-inc-vl -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -sve-use-scalar-inc-vl=true -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -sve-use-scalar-inc-vl=false -verify-machineinstrs < %s | FileCheck %s -check-prefix=NO_SCALAR_INC define @inch_vec( %a) { ; NO_SCALAR_INC-LABEL: inch_vec: diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll index fe8271cdf054bf..3bfaf6dddaef8e 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-crypto.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: 
llc -mtriple=aarch64-linux-gnu -mattr=+sve2-aes,+sve2-sha3,+sve2-sm4 < %s | FileCheck %s -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2,+sve-aes,+sve2-sha3,+sve2-sm4 < %s | FileCheck %s ; ; AESD diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll new file mode 100644 index 00000000000000..74422a1962344b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.i1.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=WAVE64 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=WAVE32 %s + +define i32 @s_andn2_i1_vcc(i32 %arg0, i32 %arg1) { +; WAVE64-LABEL: s_andn2_i1_vcc: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; WAVE64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: s_andn2_i1_vcc: +; WAVE32: ; %bb.0: +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; WAVE32-NEXT: v_cmp_ne_u32_e64 s4, 0, v1 +; WAVE32-NEXT: s_and_b32 s4, vcc_lo, s4 +; WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; WAVE32-NEXT: s_setpc_b64 s[30:31] + %src0 = icmp eq i32 %arg0, 0 + %src1 = icmp eq i32 %arg1, 0 + %not.src1 = xor i1 %src1, true + %and = and i1 %src0, %not.src1 + %select = select i1 %and, i32 1, i32 0 + ret i32 %select +} + +define i32 @s_andn2_i1_vcc_commute(i32 %arg0, i32 %arg1) { +; WAVE64-LABEL: s_andn2_i1_vcc_commute: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; WAVE64-NEXT: s_and_b64 
s[4:5], s[4:5], vcc +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: s_andn2_i1_vcc_commute: +; WAVE32: ; %bb.0: +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; WAVE32-NEXT: v_cmp_ne_u32_e64 s4, 0, v1 +; WAVE32-NEXT: s_and_b32 s4, s4, vcc_lo +; WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; WAVE32-NEXT: s_setpc_b64 s[30:31] + %src0 = icmp eq i32 %arg0, 0 + %src1 = icmp eq i32 %arg1, 0 + %not.src1 = xor i1 %src1, true + %and = and i1 %not.src1, %src0 + %select = select i1 %and, i32 1, i32 0 + ret i32 %select +} + +define i32 @s_andn2_i1_vcc_multi_use(i32 %arg0, i32 %arg1) { +; WAVE64-LABEL: s_andn2_i1_vcc_multi_use: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v1 +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; WAVE64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; WAVE64-NEXT: v_cndmask_b32_e64 v0, v0, 1, s[4:5] +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: s_andn2_i1_vcc_multi_use: +; WAVE32: ; %bb.0: +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: v_cmp_ne_u32_e64 s4, 0, v1 +; WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; WAVE32-NEXT: s_and_b32 s4, vcc_lo, s4 +; WAVE32-NEXT: v_cndmask_b32_e64 v0, v0, 1, s4 +; WAVE32-NEXT: s_setpc_b64 s[30:31] + %src0 = icmp eq i32 %arg0, 0 + %src1 = icmp eq i32 %arg1, 0 + %not.src1 = xor i1 %src1, -1 + %user = zext i1 %not.src1 to i32 + %and = and i1 %src0, %not.src1 + %select = select i1 %and, i32 1, i32 %user + ret i32 %select +} + +define <2 x i32> @s_andn2_v2i1_vcc(<2 x i32> %arg0, <2 x i32> %arg1) { +; WAVE64-LABEL: s_andn2_v2i1_vcc: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2 +; WAVE64-NEXT: 
v_cmp_eq_u32_e64 s[6:7], 0, v1 +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 +; WAVE64-NEXT: s_and_b64 s[4:5], vcc, s[4:5] +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; WAVE64-NEXT: s_and_b64 s[4:5], s[6:7], s[8:9] +; WAVE64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: s_andn2_v2i1_vcc: +; WAVE32: ; %bb.0: +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; WAVE32-NEXT: v_cmp_ne_u32_e64 s4, 0, v2 +; WAVE32-NEXT: v_cmp_eq_u32_e64 s5, 0, v1 +; WAVE32-NEXT: v_cmp_ne_u32_e64 s6, 0, v3 +; WAVE32-NEXT: s_and_b32 s4, vcc_lo, s4 +; WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; WAVE32-NEXT: s_and_b32 s4, s5, s6 +; WAVE32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; WAVE32-NEXT: s_setpc_b64 s[30:31] + %src0 = icmp eq <2 x i32> %arg0, zeroinitializer + %src1 = icmp eq <2 x i32> %arg1, zeroinitializer + %not.src1 = xor <2 x i1> %src1, + %and = and <2 x i1> %src0, %not.src1 + %select = select <2 x i1> %and, <2 x i32> , <2 x i32> zeroinitializer + ret <2 x i32> %select +} + +define <2 x i32> @s_andn2_v2i1_vcc_commute(<2 x i32> %arg0, <2 x i32> %arg1) { +; WAVE64-LABEL: s_andn2_v2i1_vcc_commute: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v2 +; WAVE64-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v1 +; WAVE64-NEXT: v_cmp_ne_u32_e64 s[8:9], 0, v3 +; WAVE64-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; WAVE64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; WAVE64-NEXT: s_and_b64 s[4:5], s[8:9], s[6:7] +; WAVE64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; WAVE64-NEXT: s_setpc_b64 s[30:31] +; +; WAVE32-LABEL: s_andn2_v2i1_vcc_commute: +; WAVE32: ; %bb.0: +; WAVE32-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; WAVE32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; WAVE32-NEXT: v_cmp_ne_u32_e64 s4, 0, v2 +; WAVE32-NEXT: v_cmp_eq_u32_e64 s5, 0, v1 +; WAVE32-NEXT: v_cmp_ne_u32_e64 s6, 0, 
v3 +; WAVE32-NEXT: s_and_b32 s4, s4, vcc_lo +; WAVE32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; WAVE32-NEXT: s_and_b32 s4, s6, s5 +; WAVE32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; WAVE32-NEXT: s_setpc_b64 s[30:31] + %src0 = icmp eq <2 x i32> %arg0, zeroinitializer + %src1 = icmp eq <2 x i32> %arg1, zeroinitializer + %not.src1 = xor <2 x i1> %src1, + %and = and <2 x i1> %not.src1, %src0 + %select = select <2 x i1> %and, <2 x i32> , <2 x i32> zeroinitializer + ret <2 x i32> %select +} + diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll index 63f5464371cc62..493e8cef638902 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll @@ -1669,7 +1669,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_ashr_i64 v[10:11], v[4:5], v3 ; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v5 @@ -1692,7 +1692,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_ashrrev_i64 v[10:11], v3, v[4:5] ; GFX8-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v5 @@ -1715,7 +1715,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_ashrrev_i64 v[10:11], v3, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX9-NEXT: v_ashrrev_i32_e32 
v8, 31, v5 @@ -1735,7 +1735,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_bfe_i32 v4, v2, 0, 1 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v4 @@ -1758,7 +1758,7 @@ define i65 @v_ashr_i65(i65 %value, i65 %amount) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_bfe_i32 v4, v2, 0, 1 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll index 168bf16ad68674..b9cd330ee2b5f9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -1438,7 +1438,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) { ; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; SI-NEXT: v_ffbh_i32_e32 v3, 0 ; SI-NEXT: v_add_i32_e32 v2, vcc, 32, v2 -; SI-NEXT: v_subrev_i32_e32 v3, vcc, 1, v3 +; SI-NEXT: v_add_i32_e32 v3, vcc, -1, v3 ; SI-NEXT: v_mov_b32_e32 v1, 0 ; SI-NEXT: v_min_u32_e32 v2, v3, v2 ; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v2 @@ -1456,7 +1456,7 @@ define float @v_test_sitofp_i64_byte_to_f32(i64 %arg0) { ; VI-NEXT: v_ashrrev_i32_e32 v2, 31, v0 ; VI-NEXT: v_ffbh_i32_e32 v3, 0 ; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2 -; VI-NEXT: v_subrev_u32_e32 v3, vcc, 1, v3 +; VI-NEXT: v_add_u32_e32 v3, vcc, -1, v3 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_min_u32_e32 v2, v3, v2 ; VI-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll index 88a7ba7ac98928..f2a4332bcb8ba6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -3039,7 +3039,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -3130,7 +3130,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 -; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 +; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -3222,7 +3222,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 ; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 -; GFX10-NEXT: enable_sgpr_queue_ptr = 0 +; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -3314,7 +3314,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 ; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 -; GFX11-NEXT: enable_sgpr_queue_ptr = 0 +; GFX11-NEXT: enable_sgpr_queue_ptr = 1 ; GFX11-NEXT: 
enable_sgpr_kernarg_segment_ptr = 1 ; GFX11-NEXT: enable_sgpr_dispatch_id = 1 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4054,7 +4054,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4138,7 +4138,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 -; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 +; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4223,7 +4223,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 ; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 -; GFX10-NEXT: enable_sgpr_queue_ptr = 0 +; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4308,7 +4308,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 ; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 -; GFX11-NEXT: enable_sgpr_queue_ptr = 0 +; GFX11-NEXT: enable_sgpr_queue_ptr = 1 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX11-NEXT: enable_sgpr_dispatch_id = 1 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4399,7 +4399,7 @@ define 
amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GPRIDX-NEXT: enable_exception = 0 ; GPRIDX-NEXT: enable_sgpr_private_segment_buffer = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_ptr = 1 -; GPRIDX-NEXT: enable_sgpr_queue_ptr = 0 +; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1 ; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4486,7 +4486,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; MOVREL-NEXT: enable_exception = 0 ; MOVREL-NEXT: enable_sgpr_private_segment_buffer = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_ptr = 1 -; MOVREL-NEXT: enable_sgpr_queue_ptr = 0 +; MOVREL-NEXT: enable_sgpr_queue_ptr = 1 ; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; MOVREL-NEXT: enable_sgpr_dispatch_id = 1 ; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4574,7 +4574,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX10-NEXT: enable_exception = 0 ; GFX10-NEXT: enable_sgpr_private_segment_buffer = 1 ; GFX10-NEXT: enable_sgpr_dispatch_ptr = 1 -; GFX10-NEXT: enable_sgpr_queue_ptr = 0 +; GFX10-NEXT: enable_sgpr_queue_ptr = 1 ; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX10-NEXT: enable_sgpr_dispatch_id = 1 ; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0 @@ -4662,7 +4662,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s ; GFX11-NEXT: enable_exception = 0 ; GFX11-NEXT: enable_sgpr_private_segment_buffer = 0 ; GFX11-NEXT: enable_sgpr_dispatch_ptr = 1 -; GFX11-NEXT: enable_sgpr_queue_ptr = 0 +; GFX11-NEXT: enable_sgpr_queue_ptr = 1 ; GFX11-NEXT: enable_sgpr_kernarg_segment_ptr = 1 ; GFX11-NEXT: enable_sgpr_dispatch_id = 1 ; GFX11-NEXT: enable_sgpr_flat_scratch_init = 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll index 146f344930a4ee..6e55d7fdb5e957 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll @@ -4101,7 +4101,7 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 { ; GFX10-NEXT: v_rcp_f32_e32 v1, 0x3f40e400 ; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 14, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, -14, v0 ; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX10-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4112,10 +4112,9 @@ define float @v_fdiv_f32_constrhs0_dynamic_25ulp(float %x) #0 { ; GFX11-NEXT: v_rcp_f32_e32 v1, 0x3f40e400 ; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v0 ; GFX11-NEXT: v_frexp_exp_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 14, v0 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1 +; GFX11-NEXT: v_dual_mul_f32 v1, v2, v1 :: v_dual_add_nc_u32 v0, -14, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; EG-LABEL: v_fdiv_f32_constrhs0_dynamic_25ulp: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll index 3bd3486ec261d4..5d76b542fad894 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll @@ -20,10 +20,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, 
v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 @@ -51,10 +51,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -82,10 +82,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -113,10 +113,10 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 @@ -150,11 +150,11 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo @@ -189,10 +189,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 @@ -219,10 +219,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -249,10 +249,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: 
v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -279,10 +279,10 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 @@ -315,11 +315,11 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo @@ -1550,16 +1550,16 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX6-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; 
GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 @@ -1580,16 +1580,16 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x170001 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 @@ -1616,10 +1616,10 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: 
v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 @@ -1644,10 +1644,10 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 @@ -1678,11 +1678,11 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1710,16 +1710,16 @@ define i24 @v_fshl_i24(i24 %lhs, 
i24 %rhs, i24 %amt) { ; GFX6-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX6-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 @@ -1740,16 +1740,16 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX8-NEXT: v_bfe_u32 v1, v1, 1, 23 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GFX8-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 @@ -1776,10 +1776,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; 
GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 @@ -1804,10 +1804,10 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 @@ -1838,11 +1838,11 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1887,7 +1887,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, 
i48 i ; GFX6-NEXT: s_lshr_b32 s0, s2, 16 ; GFX6-NEXT: s_lshr_b32 s1, s3, 8 ; GFX6-NEXT: s_bfe_u32 s8, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 ; GFX6-NEXT: s_and_b32 s7, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s8, s8, 8 ; GFX6-NEXT: s_and_b32 s0, s0, 0xff @@ -1906,7 +1906,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_or_b32_e32 v1, s1, v1 ; GFX6-NEXT: s_lshr_b32 s1, s4, 16 ; GFX6-NEXT: s_bfe_u32 s7, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: s_and_b32 s3, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s7, s7, 8 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff @@ -1915,53 +1915,53 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_or_b32 s1, s3, s1 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s1, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s1, v2 ; GFX6-NEXT: s_lshr_b32 s2, s5, 8 ; GFX6-NEXT: s_and_b32 s3, s5, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NEXT: s_and_b32 s2, s2, 0xff -; GFX6-NEXT: v_alignbit_b32 v4, s3, v4, 24 +; GFX6-NEXT: v_alignbit_b32 v5, s3, v5, 24 ; GFX6-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: v_or_b32_e32 v4, s2, v4 -; GFX6-NEXT: v_mul_hi_u32 v2, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 +; GFX6-NEXT: v_or_b32_e32 v5, s2, v5 +; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s1, v4 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX6-NEXT: 
v_mul_lo_u32 v2, v2, 24 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 23, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: s_lshr_b32 s0, s0, 1 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_lshl_b32_e32 v3, s6, v3 -; GFX6-NEXT: v_lshr_b32_e32 v5, s0, v5 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_lshl_b32_e32 v4, s6, v4 +; GFX6-NEXT: v_lshr_b32_e32 v6, s0, v6 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v3, v5 +; GFX6-NEXT: v_or_b32_e32 v4, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v2, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GFX6-NEXT: 
v_and_b32_e32 v1, 0xff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v2, v3, 16, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -2021,7 +2021,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_not_b32_e32 v1, 23 ; GFX8-NEXT: s_or_b32 s3, s8, s3 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s6 @@ -2031,67 +2031,67 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_lshr_b32 s8, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s6, s6, 8 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s4, s4, s6 ; GFX8-NEXT: s_and_b32 s6, s7, 0xff ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s6 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_lshr_b32 s9, s5, 8 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24 ; GFX8-NEXT: s_and_b32 s6, s9, 0xff ; GFX8-NEXT: s_or_b32 s5, s8, s5 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 ; GFX8-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s6, s6, 16 ; GFX8-NEXT: s_or_b32 s5, s5, s6 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s4, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 
24, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 -; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: s_lshr_b32 s0, s2, 1 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 +; GFX8-NEXT: v_lshrrev_b32_e64 v3, v3, s0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 ; GFX8-NEXT: s_lshr_b32 s0, s3, 1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_lshlrev_b32_e64 v0, v0, s1 -; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s0 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -2172,10 +2172,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s5, s5, s6 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; 
GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc @@ -2186,10 +2186,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v1, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: v_sub_u32_e32 v2, 23, v0 @@ -2282,9 +2282,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_lshr_b32 s4, s3, 8 ; GFX10-NEXT: s_and_b32 s5, s9, 0xff ; GFX10-NEXT: s_and_b32 s3, s3, 0xff -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX10-NEXT: s_and_b32 s5, 0xffff, s5 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_and_b32 s4, s4, 0xff @@ -2293,13 +2293,13 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_lshl_b32 s5, s5, 16 ; GFX10-NEXT: s_or_b32 s3, s10, s3 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_or_b32 s2, s2, s5 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v1, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_or_b32 s3, s3, s4 @@ -2399,9 +2399,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_and_b32 s5, s8, 0xff ; GFX11-NEXT: s_lshr_b32 s4, s3, 8 ; GFX11-NEXT: s_and_b32 s5, 0xffff, s5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_lshl_b32 s5, s5, 16 ; GFX11-NEXT: s_lshl_b32 s3, s3, 8 @@ -2410,7 +2410,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_and_b32 s4, s4, 0xff ; GFX11-NEXT: s_or_b32 s2, s2, s5 ; GFX11-NEXT: s_or_b32 s3, s9, s3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: s_and_b32 s4, 0xffff, s4 @@ -2423,7 +2423,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_lshr_b32 s3, s3, 1 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 23, v1 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 @@ -2479,31 +2479,31 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX6-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX6-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX6-NEXT: v_mul_lo_u32 v7, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, 
v6 +; GFX6-NEXT: v_mul_lo_u32 v8, v6, v7 +; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX6-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 23, v4 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v8 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 @@ -2526,31 +2526,31 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX8-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX8-NEXT: v_bfe_u32 v2, v2, 1, 23 -; GFX8-NEXT: v_mul_lo_u32 v7, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 
-; GFX8-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX8-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 23, v4 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, v4, v0 -; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v8 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 @@ -2583,21 +2583,21 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; 
GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX9-NEXT: v_subrev_u32_e32 v7, 24, v5 +; GFX9-NEXT: v_add_u32_e32 v7, 0xffffffe8, v5 ; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v6, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v4, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 @@ -2627,15 +2627,15 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 @@ -2679,34 +2679,32 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_nc_u32 v7, 0xffffffe8, v5 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 -; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: 
v_dual_and_b32 v4, 0xffffff, v4 ; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v7, 0xffffff, v7 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, v4, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v3, v7, v3 +; GFX11-NEXT: v_lshl_or_b32 v1, v1, v5, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) ret <2 x i24> %result @@ -6061,11 +6059,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, 64, v15 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, 0xffffffc0, v15 ; GFX6-NEXT: v_lshr_b64 v[9:10], v[0:1], v9 ; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 ; GFX6-NEXT: v_lshl_b64 v[13:14], v[0:1], v15 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v17 ; GFX6-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX6-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 @@ -6082,8 +6080,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_lshr_b64 v[2:3], v[6:7], 1 ; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX6-NEXT: v_not_b32_e32 v16, 63 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16 ; GFX6-NEXT: v_lshr_b64 
v[4:5], v[0:1], v14 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[2:3], v6 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[2:3], v14 @@ -6109,11 +6108,11 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, 64, v15 -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, 0xffffffc0, v15 ; GFX8-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v9, v9, v11 ; GFX8-NEXT: v_or_b32_e32 v10, v10, v12 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 @@ -6130,8 +6129,9 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[6:7] ; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v4 +; GFX8-NEXT: v_not_b32_e32 v16, 63 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] @@ -6157,7 +6157,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v15, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 v9, 64, v15 -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15 ; GFX9-NEXT: v_lshrrev_b64 v[9:10], v9, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[0:1] @@ -6178,7 +6178,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v4 ; GFX9-NEXT: v_lshl_or_b32 v1, v6, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v6, 64, v14 -; GFX9-NEXT: 
v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[2:3] @@ -6210,7 +6210,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX10-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] @@ -6218,7 +6218,7 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX10-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v19 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] ; GFX10-NEXT: v_or_b32_e32 v11, v11, v9 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v19 @@ -6258,34 +6258,34 @@ define i128 @v_fshl_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v10 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX11-NEXT: v_lshl_or_b32 v5, v6, 31, v5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] ; GFX11-NEXT: v_lshrrev_b64 v[10:11], v11, v[0:1] ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_lshlrev_b64 v[6:7], v18, v[0:1] -; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 +; GFX11-NEXT: v_lshrrev_b64 v[14:15], v19, v[4:5] +; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v19 +; GFX11-NEXT: 
v_cndmask_b32_e32 v7, 0, v7, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v19 ; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[12:13] +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v8, v[12:13] +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo ; GFX11-NEXT: v_or_b32_e32 v14, v14, v16 ; GFX11-NEXT: v_or_b32_e32 v15, v15, v17 +; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[12:13] -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 -; GFX11-NEXT: v_dual_cndmask_b32 v6, 0, v6 :: v_dual_cndmask_b32 v7, 0, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v14, s0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 ; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v15, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, v1, s0 @@ -6307,15 +6307,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[0:1], v1 ; GFX6-NEXT: v_lshl_b64 v[3:4], s[2:3], v7 -; GFX6-NEXT: v_subrev_i32_e32 v8, 
vcc, 64, v7 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0xffffffc0, v7 ; GFX6-NEXT: v_lshl_b64 v[5:6], s[0:1], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v8 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[0:1], v9 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX6-NEXT: v_not_b32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -6324,33 +6324,34 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX6-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX6-NEXT: s_lshl_b32 s9, s6, 31 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc ; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX6-NEXT: s_lshr_b64 s[2:3], s[6:7], 1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v10 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11 +; GFX6-NEXT: v_not_b32_e32 v8, 63 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v11 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[2:3], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[2:3], v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[2:3], v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_mov_b32_e32 v3, s1 
-; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshl_i128_ssv: @@ -6359,15 +6360,15 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0xffffffc0, v7 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[0:1] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GFX8-NEXT: v_not_b32_e32 v0, v0 ; GFX8-NEXT: s_mov_b32 s8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc @@ -6376,33 +6377,34 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 ; GFX8-NEXT: s_lshr_b64 s[0:1], s[4:5], 1 ; GFX8-NEXT: s_lshl_b32 s9, s6, 31 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc ; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9] ; GFX8-NEXT: s_lshr_b64 
s[2:3], s[6:7], 1 -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11 +; GFX8-NEXT: v_not_b32_e32 v8, 63 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[2:3] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[2:3] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[2:3] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshl_i128_ssv: @@ -6411,7 +6413,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[0:1] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 @@ -6436,7 +6438,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 
inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[2:3] @@ -6471,12 +6473,12 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: s_lshr_b64 s[6:7], s[6:7], 1 ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v12, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v13 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX10-NEXT: v_or_b32_e32 v3, v3, v1 @@ -6522,7 +6524,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[2:3] ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v13, s[8:9] ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v3, s[0:1] ; GFX11-NEXT: v_cmp_eq_u32_e64 s4, 0, v12 @@ -6531,7 +6533,7 @@ define amdgpu_ps <4 x float> @v_fshl_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v13 ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v13 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v0, 64, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xffffffc0, v13 ; GFX11-NEXT: v_or_b32_e32 v3, v3, v1 ; GFX11-NEXT: v_or_b32_e32 
v6, v6, v8 ; GFX11-NEXT: v_or_b32_e32 v7, v7, v9 @@ -7677,12 +7679,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX6-NEXT: v_not_b32_e32 v25, 63 ; GFX6-NEXT: v_sub_i32_e32 v17, vcc, 64, v19 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 ; GFX6-NEXT: v_lshr_b64 v[17:18], v[0:1], v17 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 ; GFX6-NEXT: v_lshl_b64 v[23:24], v[0:1], v19 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v25 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], v26 ; GFX6-NEXT: v_or_b32_e32 v17, v17, v21 ; GFX6-NEXT: v_or_b32_e32 v18, v18, v22 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 @@ -7700,7 +7703,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_lshr_b64 v[2:3], v[10:11], 1 ; GFX6-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v23 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v23 +; GFX6-NEXT: v_add_i32_e32 v24, vcc, v23, v25 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[0:1], v23 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[2:3], v10 ; GFX6-NEXT: v_lshr_b64 v[16:17], v[2:3], v23 @@ -7719,7 +7722,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v18 ; GFX6-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[4:5], v18 @@ -7741,7 +7744,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_lshr_b64 v[6:7], v[14:15], 1 ; GFX6-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, 64, v14 -; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, 
v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v25 ; GFX6-NEXT: v_lshr_b64 v[8:9], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v10 ; GFX6-NEXT: v_lshr_b64 v[12:13], v[6:7], v14 @@ -7768,12 +7771,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v16 +; GFX8-NEXT: v_not_b32_e32 v25, 63 ; GFX8-NEXT: v_sub_u32_e32 v17, vcc, 64, v19 -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25 ; GFX8-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] ; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[0:1] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v17, v17, v21 ; GFX8-NEXT: v_or_b32_e32 v18, v18, v22 ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 @@ -7791,7 +7795,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 1, v[10:11] ; GFX8-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v23 -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v23 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v23, v25 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] ; GFX8-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] @@ -7810,7 +7814,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v18 ; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] @@ -7832,7 +7836,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a 
; GFX8-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] ; GFX8-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, 64, v14 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v25 ; GFX8-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] ; GFX8-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] @@ -7860,7 +7864,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v19, 0x7f, v16 ; GFX9-NEXT: v_sub_u32_e32 v17, 64, v19 -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 ; GFX9-NEXT: v_lshrrev_b64 v[17:18], v17, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] ; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[0:1] @@ -7881,7 +7885,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_and_b32_e32 v23, 0x7f, v8 ; GFX9-NEXT: v_lshl_or_b32 v1, v10, 31, v1 ; GFX9-NEXT: v_sub_u32_e32 v10, 64, v23 -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v23 +; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v23 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v23, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] ; GFX9-NEXT: v_lshrrev_b64 v[16:17], v23, v[2:3] @@ -7900,7 +7904,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v18 ; GFX9-NEXT: v_or_b32_e32 v3, v19, v3 -; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[4:5] @@ -7921,7 +7925,7 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshrrev_b64 v[6:7], 1, v[14:15] ; GFX9-NEXT: v_and_b32_e32 v14, 0x7f, v8 ; GFX9-NEXT: v_sub_u32_e32 
v10, 64, v14 -; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GFX9-NEXT: v_lshrrev_b64 v[8:9], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7] ; GFX9-NEXT: v_lshrrev_b64 v[12:13], v14, v[6:7] @@ -7956,13 +7960,13 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshl_or_b32 v9, v10, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[10:11], 1, v[10:11] ; GFX10-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX10-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 +; GFX10-NEXT: v_add_nc_u32_e32 v29, 0xffffffc0, v27 ; GFX10-NEXT: v_sub_nc_u32_e32 v25, 64, v28 ; GFX10-NEXT: v_lshlrev_b64 v[21:22], v27, v[0:1] ; GFX10-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v27 ; GFX10-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX10-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v28 ; GFX10-NEXT: v_lshlrev_b64 v[25:26], v25, v[10:11] ; GFX10-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] ; GFX10-NEXT: v_or_b32_e32 v19, v17, v19 @@ -7999,10 +8003,10 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX10-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v24 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v24 ; GFX10-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v22 ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] ; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] @@ -8049,19 +8053,19 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: v_cndmask_b32_e32 v22, 0, v22, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v17, 64, v27 ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v27, 
v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v29, 64, v27 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v17, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 ; GFX11-NEXT: v_or_b32_e32 v18, v16, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0, v21, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_dual_cndmask_b32 v19, v1, v19 :: v_dual_cndmask_b32 v18, v0, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v29, 0xffffffc0, v27 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_or_b32_e32 v19, v17, v19 +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v29, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_cndmask_b32 v18, v0, v18 :: v_dual_cndmask_b32 v19, v1, v19 ; GFX11-NEXT: v_sub_nc_u32_e32 v25, 64, v28 -; GFX11-NEXT: v_subrev_nc_u32_e32 v16, 64, v28 +; GFX11-NEXT: v_add_nc_u32_e32 v16, 0xffffffc0, v28 ; GFX11-NEXT: v_lshrrev_b64 v[23:24], v28, v[8:9] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v28 ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v28, v[10:11] @@ -8095,26 +8099,26 @@ define <2 x i128> @v_fshl_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: v_lshl_or_b32 v9, v14, 31, v9 ; GFX11-NEXT: v_lshrrev_b64 v[14:15], 1, v[14:15] ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v22 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v24 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v24 ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v22 ; GFX11-NEXT: v_or_b32_e32 v12, v10, v12 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v22 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v22 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v22, v[8:9] 
; GFX11-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX11-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v5, v11, v13 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] ; GFX11-NEXT: v_cndmask_b32_e32 v13, 0, v16, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[14:15] ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v22 ; GFX11-NEXT: v_or_b32_e32 v16, v18, v20 ; GFX11-NEXT: v_or_b32_e32 v18, v19, v21 ; GFX11-NEXT: v_dual_cndmask_b32 v12, v3, v12 :: v_dual_cndmask_b32 v5, v4, v5 ; GFX11-NEXT: v_lshrrev_b64 v[3:4], v22, v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v16, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v24 ; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v18, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v17, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, v6, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v5, v7, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v10, v8, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll index 58304d2072d7f6..dbc8f12c2c25c4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll @@ -21,10 +21,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 7, v0 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, -7, v0 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 6, v0 @@ -51,10 +51,10 @@ 
define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 7, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, -7, v0 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -81,10 +81,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 7, v0 +; GFX9-NEXT: v_add_u32_e32 v1, -7, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u16_e32 v1, 6, v0 @@ -111,10 +111,10 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v1, 6, v0 @@ -147,11 +147,11 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) { ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 7, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, -7, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -186,10 +186,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 7, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, -7, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 6, v2 @@ -216,10 +216,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 7, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -7, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -246,10 +246,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; 
GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 7, v2 +; GFX9-NEXT: v_add_u32_e32 v3, -7, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 7, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u16_e32 v3, 6, v2 @@ -276,10 +276,10 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u16 v3, 6, v2 @@ -312,11 +312,11 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 7 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 7, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, -7, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 7, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1552,16 +1552,16 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffffff -; GFX6-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX6-NEXT: v_mul_hi_u32 
v1, v0, v1 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; GFX6-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX6-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 24, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 23, v0 @@ -1583,16 +1583,16 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: s_and_b32 s1, s1, 0xffffff -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 @@ -1620,10 +1620,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX9-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 
-; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX9-NEXT: v_subrev_u32_e32 v1, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX9-NEXT: v_sub_u32_e32 v1, 23, v0 @@ -1649,10 +1649,10 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX10-NEXT: v_mul_hi_u32 v0, s2, v0 ; GFX10-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 23, v0 @@ -1684,11 +1684,11 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt) ; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 24, v0 +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1717,16 +1717,16 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX6-NEXT: v_mul_f32_e32 v3, 
0x4f7ffffe, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX6-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX6-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; GFX6-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX6-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v4 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 @@ -1748,16 +1748,16 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX8-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_mul_lo_u32 v4, v3, v4 -; GFX8-NEXT: v_mul_hi_u32 v4, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v3, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v3, v5 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX8-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v4 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 @@ -1785,10 +1785,10 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX9-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 ; 
GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX9-NEXT: v_subrev_u32_e32 v3, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v3, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX9-NEXT: v_sub_u32_e32 v3, 23, v2 @@ -1814,10 +1814,10 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX10-NEXT: v_mul_hi_u32 v3, v2, v3 ; GFX10-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, v2, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX10-NEXT: v_sub_nc_u32_e32 v3, 23, v2 @@ -1849,11 +1849,11 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) { ; GFX11-NEXT: v_mul_lo_u32 v3, v3, 24 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v2 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v2 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v2 ; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) @@ -1888,7 +1888,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_lshr_b32 s1, s2, 
16 ; GFX6-NEXT: s_lshr_b32 s7, s3, 8 ; GFX6-NEXT: s_bfe_u32 s10, s2, 0x80008 -; GFX6-NEXT: v_mul_lo_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_lo_u32 v4, v2, v3 ; GFX6-NEXT: s_or_b32 s8, s8, s9 ; GFX6-NEXT: s_and_b32 s9, s2, 0xff ; GFX6-NEXT: s_lshl_b32 s10, s10, 8 @@ -1908,7 +1908,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: v_or_b32_e32 v1, s2, v1 ; GFX6-NEXT: s_lshr_b32 s2, s4, 16 ; GFX6-NEXT: s_bfe_u32 s9, s4, 0x80008 -; GFX6-NEXT: v_mul_hi_u32 v3, v2, v3 +; GFX6-NEXT: v_mul_hi_u32 v4, v2, v4 ; GFX6-NEXT: s_and_b32 s7, s4, 0xff ; GFX6-NEXT: s_lshl_b32 s9, s9, 8 ; GFX6-NEXT: s_and_b32 s2, s2, 0xff @@ -1917,62 +1917,62 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX6-NEXT: s_and_b32 s7, 0xffff, s7 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_or_b32 s2, s7, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GFX6-NEXT: v_mul_hi_u32 v3, s2, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GFX6-NEXT: v_mul_hi_u32 v4, s2, v2 ; GFX6-NEXT: s_lshr_b32 s3, s5, 8 ; GFX6-NEXT: s_and_b32 s5, s5, 0xff -; GFX6-NEXT: v_mov_b32_e32 v4, s4 +; GFX6-NEXT: v_mov_b32_e32 v5, s4 ; GFX6-NEXT: s_and_b32 s3, s3, 0xff -; GFX6-NEXT: v_alignbit_b32 v4, s5, v4, 24 +; GFX6-NEXT: v_alignbit_b32 v5, s5, v5, 24 ; GFX6-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX6-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX6-NEXT: v_mul_lo_u32 v4, v4, 24 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 -; GFX6-NEXT: v_or_b32_e32 v4, s3, v4 -; GFX6-NEXT: v_mul_hi_u32 v2, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s2, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 +; GFX6-NEXT: v_or_b32_e32 v5, s3, v5 +; GFX6-NEXT: v_mul_hi_u32 v2, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s2, v4 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX6-NEXT: v_mul_lo_u32 v2, v2, 24 -; GFX6-NEXT: 
v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 24, v3 -; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, 23, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v4, v3 +; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 23, v4 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v2, v3 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: s_and_b32 s6, s6, 0xff -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc ; GFX6-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX6-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v2, v3 ; GFX6-NEXT: s_lshl_b32 s2, s6, 17 ; GFX6-NEXT: s_lshl_b32 s3, s8, 1 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX6-NEXT: s_or_b32 s2, s2, s3 -; GFX6-NEXT: v_and_b32_e32 v5, 0xffffff, v5 -; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_lshl_b32_e32 v5, s2, v5 -; GFX6-NEXT: v_lshr_b32_e32 v3, s1, v3 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffffff, v6 +; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX6-NEXT: v_lshl_b32_e32 v6, s2, v6 +; GFX6-NEXT: v_lshr_b32_e32 v4, s1, v4 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 23, v2 ; GFX6-NEXT: s_lshl_b32 s0, s0, 17 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX6-NEXT: v_or_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_or_b32_e32 v4, v6, v4 ; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 -; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; 
GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v3, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, v2, v1 -; GFX6-NEXT: v_bfe_u32 v2, v3, 8, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 8, 8 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_bfe_u32 v2, v3, 16, 8 +; GFX6-NEXT: v_bfe_u32 v2, v4, 16, 8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX6-NEXT: v_and_b32_e32 v2, 0xff, v0 @@ -2024,7 +2024,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: v_not_b32_e32 v1, 23 ; GFX8-NEXT: s_or_b32 s3, s10, s3 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX8-NEXT: v_mul_lo_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_lo_u32 v2, v0, v1 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s3, s3, s8 @@ -2034,75 +2034,75 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX8-NEXT: s_lshr_b32 s10, s4, 24 ; GFX8-NEXT: s_and_b32 s4, s4, 0xff ; GFX8-NEXT: s_lshl_b32 s8, s8, 8 -; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v0, v2 ; GFX8-NEXT: s_or_b32 s4, s4, s8 ; GFX8-NEXT: s_and_b32 s8, s9, 0xff ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s4, s4, s8 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, s4, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, s4, v0 ; GFX8-NEXT: s_lshr_b32 s11, s5, 8 ; GFX8-NEXT: s_and_b32 s5, s5, 0xff ; GFX8-NEXT: s_lshl_b32 s5, s5, 8 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, 24 ; GFX8-NEXT: s_and_b32 s8, s11, 0xff ; GFX8-NEXT: s_or_b32 s5, s10, s5 ; GFX8-NEXT: s_and_b32 s8, 0xffff, s8 ; GFX8-NEXT: 
s_and_b32 s5, 0xffff, s5 ; GFX8-NEXT: s_lshl_b32 s8, s8, 16 ; GFX8-NEXT: s_or_b32 s5, s5, s8 -; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s4, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s4, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s5, v0 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v1 -; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v2, v1 +; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_and_b32 s6, 0xffff, s6 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 23, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 23, v2 ; GFX8-NEXT: s_lshl_b32 s4, s6, 17 ; GFX8-NEXT: s_lshl_b32 s0, s0, 1 ; GFX8-NEXT: s_or_b32 s0, s4, s0 +; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 -; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 +; GFX8-NEXT: v_lshrrev_b32_e64 v2, v2, s2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s5, v0 -; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX8-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX8-NEXT: s_and_b32 s7, 0xffff, s7 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: 
v_sub_u32_e32 v2, vcc, 23, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 23, v0 ; GFX8-NEXT: s_lshl_b32 s0, s7, 17 ; GFX8-NEXT: s_lshl_b32 s1, s1, 1 ; GFX8-NEXT: s_or_b32 s0, s0, s1 -; GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX8-NEXT: v_and_b32_e32 v1, 0xffffff, v1 ; GFX8-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 +; GFX8-NEXT: v_lshlrev_b32_e64 v1, v1, s0 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, v0, s3 -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, 8 -; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 8 +; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_or_b32_sdwa v3, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v3, 0xff, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD -; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:BYTE_1 src1_sel:DWORD +; GFX8-NEXT: v_readfirstlane_b32 s0, v2 ; GFX8-NEXT: v_readfirstlane_b32 s1, v0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -2175,11 +2175,11 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: s_or_b32 s5, s5, s8 ; GFX9-NEXT: v_mul_hi_u32 v0, s5, v0 ; GFX9-NEXT: v_sub_u32_e32 v1, s4, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX9-NEXT: v_mul_lo_u32 v0, v0, 24 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v1 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v1 ; GFX9-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX9-NEXT: s_and_b32 s6, 0xffff, s6 @@ -2193,10 +2193,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX9-NEXT: v_lshrrev_b32_e64 v1, v1, s2 ; GFX9-NEXT: v_sub_u32_e32 v0, s5, v0 ; GFX9-NEXT: v_lshl_or_b32 v1, s0, v2, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v0 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v0 ; GFX9-NEXT: s_and_b32 s1, 0xffff, s1 ; GFX9-NEXT: s_and_b32 s7, 0xffff, s7 @@ -2294,23 +2294,23 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX10-NEXT: s_and_b32 s4, s11, 0xff ; GFX10-NEXT: s_lshl_b32 s0, s0, 1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v1 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: s_lshl_b32 s1, s1, 1 ; GFX10-NEXT: s_or_b32 s2, s2, s4 ; GFX10-NEXT: s_and_b32 s4, s13, 0xff ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 
24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v1 ; GFX10-NEXT: s_lshl_b32 s4, s4, 16 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX10-NEXT: s_or_b32 s3, s3, s4 ; GFX10-NEXT: s_lshl_b32 s4, s7, 17 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffe8, v0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX10-NEXT: s_or_b32 s0, s4, s0 @@ -2393,69 +2393,67 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i ; GFX11-NEXT: s_and_b32 s0, s0, 0xff ; GFX11-NEXT: v_mul_hi_u32 v1, s4, v0 ; GFX11-NEXT: v_mul_hi_u32 v0, s5, v0 +; GFX11-NEXT: s_lshr_b32 s13, s3, 8 +; GFX11-NEXT: s_and_b32 s3, s3, 0xff ; GFX11-NEXT: s_lshl_b32 s6, s6, 8 ; GFX11-NEXT: s_and_b32 s7, s7, 0xff +; GFX11-NEXT: s_lshl_b32 s3, s3, 8 +; GFX11-NEXT: s_and_b32 s13, s13, 0xff +; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 +; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 ; GFX11-NEXT: s_or_b32 s0, s0, s6 ; GFX11-NEXT: s_and_b32 s7, 0xffff, s7 +; GFX11-NEXT: s_or_b32 s3, s12, s3 ; GFX11-NEXT: s_and_b32 s0, 0xffff, s0 -; GFX11-NEXT: s_lshr_b32 s13, s3, 8 -; GFX11-NEXT: v_mul_lo_u32 v1, v1, 24 -; GFX11-NEXT: v_mul_lo_u32 v0, v0, 24 +; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX11-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-NEXT: s_and_b32 s3, s3, 0xff -; GFX11-NEXT: s_and_b32 s13, s13, 0xff -; GFX11-NEXT: s_lshl_b32 s3, s3, 8 -; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-NEXT: s_or_b32 s3, s12, s3 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, s4, v1 ; GFX11-NEXT: v_sub_nc_u32_e32 v0, s5, v0 ; GFX11-NEXT: s_lshl_b32 s4, s10, 8 ; GFX11-NEXT: s_and_b32 s10, 0xffff, s13 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v1 +; GFX11-NEXT: v_add_nc_u32_e32 v2, 
0xffffffe8, v1 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: s_lshl_b32 s4, s9, 16 ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2 -; GFX11-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_lshl_b32 s5, s10, 16 ; GFX11-NEXT: s_or_b32 s2, s2, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v2 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 ; GFX11-NEXT: s_lshl_b32 s4, s7, 17 -; GFX11-NEXT: s_lshl_b32 s5, s10, 16 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 24, v1 +; GFX11-NEXT: s_and_b32 s1, 0xffff, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffe8, v1 ; GFX11-NEXT: s_or_b32 s0, s4, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v1 ; GFX11-NEXT: s_lshl_b32 s1, s1, 1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 -; GFX11-NEXT: v_and_b32_e32 v1, 0xffffff, v1 -; GFX11-NEXT: v_subrev_nc_u32_e32 v2, 24, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_add_nc_u32 v2, 0xffffffe8, v0 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v0 -; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 -; GFX11-NEXT: s_or_b32 s2, s3, s5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v1 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e64 v1, v1, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v3, 23, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 +; GFX11-NEXT: s_or_b32 s2, s3, s5 ; GFX11-NEXT: v_lshl_or_b32 v1, s0, v2, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 +; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2 ; GFX11-NEXT: s_lshl_b32 s0, s8, 17 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v3 ; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffffff, v0 -; GFX11-NEXT: v_lshrrev_b32_e64 v0, v0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, s0, v2, v0 ; GFX11-NEXT: v_bfe_u32 v2, v1, 8, 8 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -2491,32 +2489,32 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX6-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX6-NEXT: 
v_mul_lo_u32 v8, v6, v7 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX6-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GFX6-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX6-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GFX6-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX6-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX6-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX6-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX6-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v7 -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_subrev_i32_e32 v7, vcc, 24, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v4, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, 23, v4 -; GFX6-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 23, v4 +; GFX6-NEXT: v_and_b32_e32 v8, 0xffffff, v8 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, v7, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, v8, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v5, v6 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v2, v7 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 24, v2 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xffffffe8, v2 ; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 23, v2 @@ -2540,32 +2538,32 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; 
GFX8-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 1, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v6, v7 +; GFX8-NEXT: v_mul_lo_u32 v8, v6, v7 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffffff, v3 -; GFX8-NEXT: v_mul_hi_u32 v7, v6, v7 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v7 -; GFX8-NEXT: v_mul_hi_u32 v7, v4, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_mul_hi_u32 v8, v4, v6 ; GFX8-NEXT: v_mul_hi_u32 v6, v5, v6 -; GFX8-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX8-NEXT: v_mul_lo_u32 v8, v8, 24 ; GFX8-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, 24, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, 23, v4 -; GFX8-NEXT: v_and_b32_e32 v7, 0xffffff, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 23, v4 +; GFX8-NEXT: v_and_b32_e32 v8, 0xffffff, v8 ; GFX8-NEXT: v_and_b32_e32 v4, 0xffffff, v4 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, v7, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, v8, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v5, v6 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v2, v7 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 24, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffffe8, v2 ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 23, v2 @@ -2599,10 
+2597,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v7 ; GFX9-NEXT: v_sub_u32_e32 v5, v5, v6 -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_subrev_u32_e32 v6, 24, v4 +; GFX9-NEXT: v_add_u32_e32 v6, 0xffffffe8, v4 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc ; GFX9-NEXT: v_sub_u32_e32 v6, 23, v4 @@ -2610,10 +2608,10 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v6 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, v4, v2 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, v6, v2 -; GFX9-NEXT: v_subrev_u32_e32 v2, 24, v5 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffe8, v5 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_subrev_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_add_u32_e32 v4, 0xffffffe8, v2 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, 24, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX9-NEXT: v_sub_u32_e32 v4, 23, v2 @@ -2645,15 +2643,15 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX10-NEXT: v_mul_lo_u32 v6, v6, 24 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX10-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 +; GFX10-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX10-NEXT: 
v_subrev_nc_u32_e32 v7, 24, v5 +; GFX10-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 23, v4 @@ -2675,12 +2673,11 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v6, 24 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffffff, v2 +; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v6, v6 -; GFX11-NEXT: v_and_b32_e32 v3, 0xffffff, v3 ; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 +; GFX11-NEXT: v_dual_mul_f32 v6, 0x4f7ffffe, v6 :: v_dual_lshlrev_b32 v1, 1, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_u32_f32_e32 v6, v6 ; GFX11-NEXT: v_mul_lo_u32 v7, 0xffffffe8, v6 @@ -2697,34 +2694,33 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mul_hi_u32 v6, v5, v6 ; GFX11-NEXT: v_mul_lo_u32 v6, v6, 24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v6 -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 -; GFX11-NEXT: v_subrev_nc_u32_e32 v6, 24, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v6, 0xffffffe8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v7, 0xffffffe8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v4 -; GFX11-NEXT: v_subrev_nc_u32_e32 v7, 24, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v4, v4, v6 :: v_dual_add_nc_u32 v7, 0xffffffe8, v5 ; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 24, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 23, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v6, 0xffffff, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_nc_u32_e32 v7, 23, v5 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v0, v0, v6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v1, v1, v4, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) @@ 
-6087,13 +6083,14 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_not_b32_e32 v0, v8 ; GFX6-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v15 +; GFX6-NEXT: v_not_b32_e32 v16, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[9:10], v0 ; GFX6-NEXT: v_lshl_b64 v[11:12], v[2:3], v15 -; GFX6-NEXT: v_subrev_i32_e32 v16, vcc, 64, v15 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v15, v16 ; GFX6-NEXT: v_lshl_b64 v[13:14], v[9:10], v15 ; GFX6-NEXT: v_or_b32_e32 v11, v0, v11 ; GFX6-NEXT: v_or_b32_e32 v12, v1, v12 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v16 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[9:10], v17 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX6-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v13, 0, v14, vcc @@ -6106,7 +6103,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v14 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[4:5], v14 ; GFX6-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v15, vcc, 64, v14 +; GFX6-NEXT: v_add_i32_e32 v15, vcc, v14, v16 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[6:7], v15 @@ -6135,13 +6132,14 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_not_b32_e32 v0, v8 ; GFX8-NEXT: v_and_b32_e32 v15, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v15 +; GFX8-NEXT: v_not_b32_e32 v16, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] ; GFX8-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v16, vcc, 64, v15 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v15, v16 ; GFX8-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] ; GFX8-NEXT: v_or_b32_e32 v11, v0, v11 ; GFX8-NEXT: v_or_b32_e32 v12, v1, v12 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v16, v[9:10] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v17, v[9:10] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 ; GFX8-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v13, 0, v14, 
vcc @@ -6154,7 +6152,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v14 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, 64, v14 +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v14, v16 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7] @@ -6185,7 +6183,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v15 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[9:10] ; GFX9-NEXT: v_lshlrev_b64 v[11:12], v15, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v16, 64, v15 +; GFX9-NEXT: v_add_u32_e32 v16, 0xffffffc0, v15 ; GFX9-NEXT: v_lshlrev_b64 v[13:14], v15, v[9:10] ; GFX9-NEXT: v_or_b32_e32 v11, v0, v11 ; GFX9-NEXT: v_or_b32_e32 v12, v1, v12 @@ -6202,7 +6200,7 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX9-NEXT: v_cndmask_b32_e32 v12, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v14, v[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GFX9-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v15, v[6:7] @@ -6232,9 +6230,9 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { ; GFX10-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v10 ; GFX10-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX10-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 +; GFX10-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v19 ; GFX10-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX10-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX10-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] ; GFX10-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] @@ -6273,47 +6271,48 @@ define i128 @v_fshr_i128(i128 %lhs, i128 %rhs, i128 %amt) { 
; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_not_b32_e32 v9, v8 +; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] ; GFX11-NEXT: v_lshrrev_b32_e32 v10, 31, v1 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v18, 0x7f, v9 ; GFX11-NEXT: v_or_b32_e32 v2, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 ; GFX11-NEXT: v_lshlrev_b64 v[14:15], v18, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v18 ; GFX11-NEXT: v_and_b32_e32 v19, 0x7f, v8 -; GFX11-NEXT: v_sub_nc_u32_e32 v10, 64, v18 -; GFX11-NEXT: v_subrev_nc_u32_e32 v20, 64, v18 +; GFX11-NEXT: v_add_nc_u32_e32 v20, 0xffffffc0, v18 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v18, v[2:3] +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] ; GFX11-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v16, 64, v19 -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[0:1] -; GFX11-NEXT: v_subrev_nc_u32_e32 v21, 64, v19 -; GFX11-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v20, v[0:1] -; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] +; GFX11-NEXT: v_lshrrev_b64 v[12:13], v19, v[4:5] ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v19 ; GFX11-NEXT: v_or_b32_e32 v10, v10, v8 +; GFX11-NEXT: v_add_nc_u32_e32 v21, 0xffffffc0, v19 +; GFX11-NEXT: v_lshlrev_b64 v[16:17], v16, v[6:7] ; GFX11-NEXT: v_or_b32_e32 v11, v11, v9 -; GFX11-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] ; GFX11-NEXT: v_cmp_eq_u32_e64 s1, 0, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v10, v0, v10, vcc_lo +; GFX11-NEXT: v_lshrrev_b64 v[8:9], v21, v[6:7] ; GFX11-NEXT: v_or_b32_e32 v12, v12, v16 ; GFX11-NEXT: v_or_b32_e32 v13, 
v13, v17 -; GFX11-NEXT: v_dual_cndmask_b32 v10, v0, v10 :: v_dual_cndmask_b32 v11, v1, v11 +; GFX11-NEXT: v_cndmask_b32_e32 v11, v1, v11, vcc_lo ; GFX11-NEXT: v_lshrrev_b64 v[0:1], v19, v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 ; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0, v18 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v12, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v9, v13, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0, v15, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s2 ; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v4, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v6, v5, s1 ; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v14, v4 ; GFX11-NEXT: v_or_b32_e32 v1, v7, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -6335,46 +6334,47 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX6-NEXT: v_and_b32_e32 v7, 0x7f, v1 ; GFX6-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 64, v7 +; GFX6-NEXT: v_not_b32_e32 v8, 63 ; GFX6-NEXT: v_lshr_b64 v[1:2], s[8:9], v1 ; GFX6-NEXT: v_lshl_b64 v[3:4], s[0:1], v7 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v7 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v7, v8 ; GFX6-NEXT: v_lshl_b64 v[5:6], s[8:9], v7 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v8 +; GFX6-NEXT: v_lshl_b64 v[1:2], s[8:9], v9 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX6-NEXT: 
v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX6-NEXT: v_mov_b32_e32 v3, s0 ; GFX6-NEXT: v_mov_b32_e32 v4, s1 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX6-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX6-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v10 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v10 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v11 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[4:5], v11 ; GFX6-NEXT: v_lshl_b64 v[2:3], s[6:7], v2 -; GFX6-NEXT: v_subrev_i32_e32 v11, vcc, 64, v10 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v11, v8 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v11 -; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v10 -; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX6-NEXT: v_lshr_b64 v[0:1], s[6:7], v8 +; GFX6-NEXT: v_lshr_b64 v[4:5], s[6:7], v11 +; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 -; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX6-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX6-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX6-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX6-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX6-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX6-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX6-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: v_fshr_i128_ssv: @@ -6387,46 +6387,47 @@ define 
amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX8-NEXT: v_and_b32_e32 v7, 0x7f, v1 ; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 64, v7 +; GFX8-NEXT: v_not_b32_e32 v8, 63 ; GFX8-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v7 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v7, v8 ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v4, v2, v4 -; GFX8-NEXT: v_lshlrev_b64 v[1:2], v8, s[8:9] +; GFX8-NEXT: v_lshlrev_b64 v[1:2], v9, s[8:9] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v8, 0, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v9, 0, v5, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v3, s0 ; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 -; GFX8-NEXT: v_and_b32_e32 v10, 0x7f, v0 +; GFX8-NEXT: v_and_b32_e32 v11, 0x7f, v0 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v2, v4, vcc -; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v10 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v11 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[4:5] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 64, v10 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v11, v8 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] -; GFX8-NEXT: v_lshrrev_b64 v[4:5], v10, s[6:7] -; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], v8, s[6:7] +; GFX8-NEXT: v_lshrrev_b64 v[4:5], v11, s[6:7] +; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v10 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v11 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc -; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX8-NEXT: v_or_b32_e32 v1, v6, v1 ; GFX8-NEXT: v_or_b32_e32 v2, v7, v2 -; GFX8-NEXT: v_or_b32_e32 v3, v9, v3 +; GFX8-NEXT: v_or_b32_e32 v3, v10, v3 ; GFX8-NEXT: ; return to shader part epilog ; ; GFX9-LABEL: v_fshr_i128_ssv: @@ -6441,7 +6442,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_sub_u32_e32 v1, 64, v7 ; GFX9-NEXT: v_lshrrev_b64 v[1:2], v1, s[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[3:4], v7, s[0:1] -; GFX9-NEXT: v_subrev_u32_e32 v8, 64, v7 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v7 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v7, s[8:9] ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_or_b32_e32 v4, v2, v4 @@ -6460,7 +6461,7 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v10, s[4:5] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[6:7] -; GFX9-NEXT: v_subrev_u32_e32 v11, 64, v10 +; GFX9-NEXT: v_add_u32_e32 v11, 0xffffffc0, v10 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v11, s[6:7] @@ -6492,10 +6493,10 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX10-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX10-NEXT: s_lshl_b64 s[0:1], s[0:1], 1 ; GFX10-NEXT: s_or_b64 s[8:9], s[2:3], s[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 +; GFX10-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX10-NEXT: v_lshlrev_b64 
v[0:1], v12, s[8:9] -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX10-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] ; GFX10-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] @@ -6544,11 +6545,11 @@ define amdgpu_ps <4 x float> @v_fshr_i128_ssv(i128 inreg %lhs, i128 inreg %rhs, ; GFX11-NEXT: v_and_b32_e32 v13, 0x7f, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v12 ; GFX11-NEXT: v_lshlrev_b64 v[0:1], v12, s[8:9] -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v12 +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v12 ; GFX11-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v8, 64, v13 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v2, s[0:1] -; GFX11-NEXT: v_subrev_nc_u32_e32 v14, 64, v13 +; GFX11-NEXT: v_add_nc_u32_e32 v14, 0xffffffc0, v13 ; GFX11-NEXT: v_lshrrev_b64 v[4:5], v13, s[4:5] ; GFX11-NEXT: v_lshlrev_b64 v[10:11], v10, s[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v8, s[6:7] @@ -7718,13 +7719,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_not_b32_e32 v0, v16 ; GFX6-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, 64, v19 +; GFX6-NEXT: v_not_b32_e32 v25, 63 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[17:18], v0 ; GFX6-NEXT: v_lshl_b64 v[21:22], v[2:3], v19 -; GFX6-NEXT: v_subrev_i32_e32 v25, vcc, 64, v19 +; GFX6-NEXT: v_add_i32_e32 v26, vcc, v19, v25 ; GFX6-NEXT: v_lshl_b64 v[23:24], v[17:18], v19 ; GFX6-NEXT: v_or_b32_e32 v21, v0, v21 ; GFX6-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v25 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[17:18], v26 ; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 ; GFX6-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc @@ -7737,7 +7739,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 64, v22 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[8:9], v22 ; GFX6-NEXT: 
v_lshl_b64 v[2:3], v[10:11], v2 -; GFX6-NEXT: v_subrev_i32_e32 v24, vcc, 64, v22 +; GFX6-NEXT: v_add_i32_e32 v24, vcc, v22, v25 ; GFX6-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX6-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX6-NEXT: v_lshr_b64 v[0:1], v[10:11], v24 @@ -7761,7 +7763,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_lshr_b64 v[4:5], v[8:9], v4 ; GFX6-NEXT: v_lshl_b64 v[10:11], v[6:7], v18 ; GFX6-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 ; GFX6-NEXT: v_lshl_b64 v[16:17], v[8:9], v18 ; GFX6-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX6-NEXT: v_or_b32_e32 v11, v5, v11 @@ -7778,7 +7780,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, 64, v18 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[12:13], v18 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[14:15], v6 -; GFX6-NEXT: v_subrev_i32_e32 v19, vcc, 64, v18 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v18, v25 ; GFX6-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX6-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[14:15], v19 @@ -7809,13 +7811,14 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_not_b32_e32 v0, v16 ; GFX8-NEXT: v_and_b32_e32 v19, 0x7f, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, 64, v19 +; GFX8-NEXT: v_not_b32_e32 v25, 63 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX8-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v25, vcc, 64, v19 +; GFX8-NEXT: v_add_u32_e32 v26, vcc, v19, v25 ; GFX8-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] ; GFX8-NEXT: v_or_b32_e32 v21, v0, v21 ; GFX8-NEXT: v_or_b32_e32 v22, v1, v22 -; GFX8-NEXT: v_lshlrev_b64 v[0:1], v25, v[17:18] +; GFX8-NEXT: v_lshlrev_b64 v[0:1], v26, v[17:18] ; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v19 ; GFX8-NEXT: v_cndmask_b32_e32 v18, 0, v23, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v23, 0, v24, vcc @@ -7828,7 +7831,7 @@ 
define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 64, v22 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v22, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX8-NEXT: v_subrev_u32_e32 v24, vcc, 64, v22 +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v22, v25 ; GFX8-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX8-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX8-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] @@ -7852,7 +7855,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX8-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX8-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 ; GFX8-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX8-NEXT: v_or_b32_e32 v10, v4, v10 ; GFX8-NEXT: v_or_b32_e32 v11, v5, v11 @@ -7869,7 +7872,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 64, v18 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, 64, v18 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v18, v25 ; GFX8-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX8-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] @@ -7902,7 +7905,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_sub_u32_e32 v0, 64, v19 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, v[17:18] ; GFX9-NEXT: v_lshlrev_b64 v[21:22], v19, v[2:3] -; GFX9-NEXT: v_subrev_u32_e32 v25, 64, v19 +; GFX9-NEXT: v_add_u32_e32 v25, 0xffffffc0, v19 ; GFX9-NEXT: v_lshlrev_b64 v[23:24], v19, v[17:18] ; GFX9-NEXT: v_or_b32_e32 v21, v0, v21 ; GFX9-NEXT: v_or_b32_e32 v22, v1, v22 @@ -7919,7 +7922,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e32 v21, v1, v3, vcc ; GFX9-NEXT: v_lshrrev_b64 
v[0:1], v22, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, v[10:11] -; GFX9-NEXT: v_subrev_u32_e32 v24, 64, v22 +; GFX9-NEXT: v_add_u32_e32 v24, 0xffffffc0, v22 ; GFX9-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-NEXT: v_or_b32_e32 v3, v1, v3 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], v24, v[10:11] @@ -7942,7 +7945,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[8:9] ; GFX9-NEXT: v_lshlrev_b64 v[10:11], v18, v[6:7] ; GFX9-NEXT: v_or_b32_e32 v2, v19, v2 -; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v17, vcc ; GFX9-NEXT: v_lshlrev_b64 v[16:17], v18, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v10, v4, v10 @@ -7960,7 +7963,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX9-NEXT: v_cndmask_b32_e32 v11, v5, v7, vcc ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v18, v[12:13] ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v6, v[14:15] -; GFX9-NEXT: v_subrev_u32_e32 v19, 64, v18 +; GFX9-NEXT: v_add_u32_e32 v19, 0xffffffc0, v18 ; GFX9-NEXT: v_or_b32_e32 v6, v4, v6 ; GFX9-NEXT: v_or_b32_e32 v7, v5, v7 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v19, v[14:15] @@ -7991,11 +7994,11 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_and_b32_e32 v25, 0x7f, v17 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 31, v1 ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1] -; GFX10-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 +; GFX10-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 ; GFX10-NEXT: v_cmp_gt_u32_e64 s4, 64, v26 ; GFX10-NEXT: v_sub_nc_u32_e32 v18, 64, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v2, v17 -; GFX10-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 +; GFX10-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 ; GFX10-NEXT: v_lshlrev_b64 v[23:24], v25, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX10-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] @@ -8035,12 +8038,12 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, 
<2 x i128> %rhs, <2 x i128> %a ; GFX10-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] ; GFX10-NEXT: v_lshlrev_b64 v[10:11], v25, v[6:7] ; GFX10-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 +; GFX10-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v25 ; GFX10-NEXT: v_or_b32_e32 v2, v18, v2 ; GFX10-NEXT: v_lshlrev_b64 v[16:17], v25, v[4:5] ; GFX10-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] ; GFX10-NEXT: v_or_b32_e32 v10, v8, v10 -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v23 ; GFX10-NEXT: v_lshlrev_b64 v[20:21], v20, v[14:15] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v25 ; GFX10-NEXT: v_lshlrev_b64 v[3:4], v3, v[4:5] @@ -8091,41 +8094,41 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: v_cndmask_b32_e32 v24, 0, v24, vcc_lo ; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v25 ; GFX11-NEXT: v_lshlrev_b64 v[21:22], v25, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v19, 64, v25 -; GFX11-NEXT: v_subrev_nc_u32_e32 v27, 64, v26 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v26 ; GFX11-NEXT: v_lshrrev_b64 v[17:18], v18, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v22, v18, v22 -; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add_nc_u32_e32 v19, 0xffffffc0, v25 ; GFX11-NEXT: v_or_b32_e32 v21, v17, v21 +; GFX11-NEXT: v_sub_nc_u32_e32 v18, 64, v26 ; GFX11-NEXT: v_lshrrev_b64 v[16:17], v26, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e32 v22, v1, v22, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_lshlrev_b64 v[0:1], v19, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[18:19], v18, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e32 v21, v0, v21, vcc_lo -; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v22, v1, v22 :: v_dual_cndmask_b32 v21, v0, v21 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v27, 0xffffffc0, v26 ; GFX11-NEXT: v_or_b32_e32 v16, v16, v18 ; GFX11-NEXT: v_or_b32_e32 v17, v17, v19 +; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b64 v[0:1], v27, v[10:11] ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s0 ; GFX11-NEXT: v_not_b32_e32 v16, v20 ; GFX11-NEXT: v_cndmask_b32_e32 v18, v21, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v22, v22, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s0 ; GFX11-NEXT: v_lshrrev_b64 v[2:3], v26, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v1, v1, v9 :: v_dual_cndmask_b32 v0, v0, v8 +; GFX11-NEXT: v_and_b32_e32 v25, 0x7f, v16 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_cndmask_b32 v1, v1, v9 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 31, v5 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] ; GFX11-NEXT: v_sub_nc_u32_e32 v9, 64, v25 ; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, v3, s0 -; GFX11-NEXT: v_subrev_nc_u32_e32 v3, 64, v25 +; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xffffffc0, v25 ; GFX11-NEXT: v_or_b32_e32 v6, v6, v8 ; GFX11-NEXT: v_or_b32_e32 v0, v23, v0 ; 
GFX11-NEXT: v_lshrrev_b64 v[8:9], v9, v[4:5] @@ -8143,7 +8146,7 @@ define <2 x i128> @v_fshr_v2i128(<2 x i128> %lhs, <2 x i128> %rhs, <2 x i128> %a ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_dual_cndmask_b32 v11, 0, v16 :: v_dual_cndmask_b32 v10, v3, v10 ; GFX11-NEXT: v_sub_nc_u32_e32 v20, 64, v23 -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v23 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v23 ; GFX11-NEXT: v_lshrrev_b64 v[18:19], v23, v[12:13] ; GFX11-NEXT: v_cmp_gt_u32_e64 s0, 64, v23 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v4, v5, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll index 5dd4fa0809131f..cc185aff9eff22 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll @@ -1577,7 +1577,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX6-NEXT: v_mov_b32_e32 v5, 0 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 64, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 ; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 ; GFX6-NEXT: v_lshr_b64 v[10:11], v[4:5], v3 @@ -1599,7 +1599,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] ; GFX8-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] @@ -1621,7 +1621,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 ; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, 
v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] ; GFX9-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] @@ -1643,7 +1643,7 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 ; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v3 @@ -1664,20 +1664,20 @@ define i65 @v_lshr_i65(i65 %value, i65 %amount) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX11-NEXT: v_subrev_nc_u32_e32 v10, 64, v3 ; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 ; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] -; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] -; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] ; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 ; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 +; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] +; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] ; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = lshr i65 %value, %amount ret i65 %result diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll index 2c2f8e914447d1..88eb0e4b848c95 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll @@ -325,7 +325,7 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5] ; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 @@ -353,29 +353,29 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v7, v5, v3 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; GISEL-NEXT: v_mul_lo_u32 v8, v7, v3 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v7 +; GISEL-NEXT: v_mul_lo_u32 v10, v4, v3 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5] -; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v0, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7] -; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 -; GISEL-NEXT: 
v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v11, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v0, v7, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc ; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2 @@ -398,29 +398,29 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v7, v4, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4 -; CGP-NEXT: v_mul_lo_u32 v9, v3, v5 -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9 +; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 +; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v7 +; CGP-NEXT: v_mul_lo_u32 v10, v3, v5 +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5] -; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v5 +; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v4, vcc, v0, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, 
s[6:7] -; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1 -; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5] -; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4 +; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[6:7] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1 +; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v7 ; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc +; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll index 4cf1c92539c36f..b12e915c7d21b1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll @@ -1583,7 +1583,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 64, v3 ; GFX6-NEXT: v_lshr_b64 v[4:5], v[0:1], v4 ; GFX6-NEXT: v_lshl_b64 v[5:6], v[2:3], v3 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 64, v3 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0xffffffc0, v3 ; GFX6-NEXT: v_lshl_b64 v[6:7], v[0:1], v3 ; GFX6-NEXT: v_or_b32_e32 v9, v4, v5 ; GFX6-NEXT: v_lshl_b64 v[4:5], v[0:1], v8 @@ -1601,7 +1601,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 64, v3 ; GFX8-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1] ; GFX8-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3] -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 64, v3 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffffc0, v3 ; GFX8-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] ; GFX8-NEXT: v_or_b32_e32 v9, v4, v5 ; GFX8-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1] @@ -1619,7 +1619,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX9-NEXT: v_sub_u32_e32 v4, 64, v3 ; GFX9-NEXT: v_lshrrev_b64 v[4:5], v4, v[0:1] ; GFX9-NEXT: v_lshlrev_b64 v[5:6], v3, v[2:3] -; 
GFX9-NEXT: v_subrev_u32_e32 v8, 64, v3 +; GFX9-NEXT: v_add_u32_e32 v8, 0xffffffc0, v3 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] ; GFX9-NEXT: v_or_b32_e32 v9, v4, v5 ; GFX9-NEXT: v_lshlrev_b64 v[4:5], v8, v[0:1] @@ -1636,7 +1636,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_nc_u32_e32 v6, 64, v3 ; GFX10-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3] -; GFX10-NEXT: v_subrev_nc_u32_e32 v8, 64, v3 +; GFX10-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v3 ; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX10-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1] ; GFX10-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] @@ -1654,7 +1654,7 @@ define i65 @v_shl_i65(i65 %value, i65 %amount) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v6, 64, v3 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], v3, v[2:3] -; GFX11-NEXT: v_subrev_nc_u32_e32 v8, 64, v3 +; GFX11-NEXT: v_add_nc_u32_e32 v8, 0xffffffc0, v3 ; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 ; GFX11-NEXT: v_lshrrev_b64 v[5:6], v6, v[0:1] ; GFX11-NEXT: v_lshlrev_b64 v[6:7], v3, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 1bb606f36e48d2..2b12e4b973acb2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -268,10 +268,10 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x1000, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x1000, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, 
v0, v1 @@ -297,23 +297,23 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_lshlrev_b32_e32 v5, 12, v5 +; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v7 ; GISEL-NEXT: v_lshlrev_b32_e32 v4, 12, v4 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x1000, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xfffff000, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 @@ -338,23 +338,23 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 +; CGP-NEXT: 
v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v4, 12, v4 +; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v7 ; CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x1000, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xfffff000, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 @@ -386,10 +386,10 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 ; CHECK-NEXT: v_mul_lo_u32 v2, v2, v4 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 @@ -415,23 +415,23 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6 ; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5 -; 
GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v0, v4 ; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4 -; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v7, v3 ; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v3 -; GISEL-NEXT: v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3 @@ -456,23 +456,23 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v4, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v3, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v0, v3 +; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 ; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v4, v4, v5 +; CGP-NEXT: v_mul_lo_u32 v7, v7, v5 ; CGP-NEXT: v_mul_lo_u32 v3, v3, v5 -; CGP-NEXT: 
v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v5 -; CGP-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll index 2572f8581f0edf..7214f4ab581d5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -12,9 +12,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -27,9 +27,9 @@ define i7 @v_ssubsat_i7(i7 %lhs, i7 %rhs) { ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 
v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -63,9 +63,9 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 25 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 25 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -79,11 +79,11 @@ define amdgpu_ps i7 @s_ssubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 9 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -122,9 +122,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -137,9 +137,9 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) { ; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v2, 
0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -173,9 +173,9 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 24 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -189,11 +189,11 @@ define amdgpu_ps i8 @s_ssubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -234,18 +234,19 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_max_i32_e32 v1, v4, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 ; GFX6-NEXT: 
v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -266,16 +267,16 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 ; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_e32 v1, v4, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v5 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 ; GFX8-NEXT: v_min_i16_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x8000, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8000, v4 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v4 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 @@ -355,18 +356,18 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshr_b32 s3, s1, 8 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s4, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: 
s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -387,11 +388,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_max_i32 s6, s4, s5 ; GFX8-NEXT: s_lshr_b32 s3, s1, 8 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -401,11 +402,11 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s2, s3, 8 ; GFX8-NEXT: s_sext_i32_i16 s3, s1 ; GFX8-NEXT: s_max_i32 s4, s3, s5 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s3, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -509,29 +510,29 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX6-NEXT: v_max_i32_e32 v1, v8, v1 ; GFX6-NEXT: 
v_min_i32_e32 v1, v1, v10 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v11 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GFX6-NEXT: v_max_i32_e32 v2, v5, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v3, v5, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -539,10 +540,10 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 24, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 24, v0 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_and_b32_e32 v1, 0xff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 24, v2 @@ -573,34 +574,34 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 -; GFX8-NEXT: 
v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v1, v8, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v9 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 ; GFX8-NEXT: v_max_i16_e32 v1, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 ; GFX8-NEXT: v_min_i16_e32 v8, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x8000, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8000, v8 ; GFX8-NEXT: v_max_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v8 ; GFX8-NEXT: v_max_i16_e32 v4, -1, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v3, v1 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v3, v4, v3 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u16_e32 v2, v2, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 ; GFX8-NEXT: v_max_i16_e32 v5, -1, v3 ; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x7fff, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8001, v5 ; GFX8-NEXT: v_min_i16_e32 v6, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x8000, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8000, v6 ; GFX8-NEXT: v_max_i16_e32 v4, v5, v4 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v6 ; GFX8-NEXT: v_sub_u16_e32 v3, v3, v4 @@ -727,27 +728,27 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_lshr_b32 s6, s1, 16 ; GFX6-NEXT: s_lshr_b32 s7, s1, 24 ; GFX6-NEXT: s_lshl_b32 s1, s1, 24 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s8, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 ; GFX6-NEXT: s_lshl_b32 s1, 
s2, 24 ; GFX6-NEXT: s_lshl_b32 s2, s5, 24 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s5, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 24 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_lshl_b32 s3, s6, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s5, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -755,10 +756,10 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_ashr_i32 s1, s1, 24 ; GFX6-NEXT: s_lshl_b32 s4, s7, 24 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 ; GFX6-NEXT: s_ashr_i32 s0, s0, 24 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_and_b32 s1, s1, 0xff ; GFX6-NEXT: s_ashr_i32 s2, s2, 24 @@ -789,11 +790,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshr_b32 s6, s1, 16 ; GFX8-NEXT: s_lshr_b32 s7, s1, 24 ; GFX8-NEXT: s_lshl_b32 s1, s1, 8 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff +; GFX8-NEXT: s_addk_i32 s10, 0x8001 ; GFX8-NEXT: s_min_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s10, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -803,11 +804,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, 
i32 inreg %rhs.arg) { ; GFX8-NEXT: s_lshl_b32 s2, s5, 8 ; GFX8-NEXT: s_sext_i32_i16 s5, s1 ; GFX8-NEXT: s_max_i32 s8, s5, s9 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s8, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -817,11 +818,11 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s5, s2 ; GFX8-NEXT: s_lshl_b32 s3, s6, 8 ; GFX8-NEXT: s_max_i32 s6, s5, s9 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -831,12 +832,12 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { ; GFX8-NEXT: s_sext_i32_i16 s5, s3 ; GFX8-NEXT: s_max_i32 s6, s5, s9 ; GFX8-NEXT: s_lshl_b32 s4, s7, 8 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s6, s4 ; GFX8-NEXT: s_sext_i32_i16 s0, s0 ; GFX8-NEXT: s_ashr_i32 s1, s1, 8 @@ -1004,9 +1005,9 @@ define i24 @v_ssubsat_i24(i24 %lhs, i24 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; 
GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1055,9 +1056,9 @@ define amdgpu_ps i24 @s_ssubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 8 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 8 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -1109,9 +1110,9 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1121,9 +1122,9 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000001, v2 ; GFX8-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x80000000, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000000, v3 ; GFX8-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 @@ -1148,9 +1149,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; GFX6-LABEL: s_ssubsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s2, s0, -1 -; 
GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -1159,9 +1160,9 @@ define amdgpu_ps i32 @s_ssubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { ; GFX8-LABEL: s_ssubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s2, s0, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX8-NEXT: s_min_i32 s3, s0, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX8-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX8-NEXT: s_max_i32 s1, s2, s1 ; GFX8-NEXT: s_min_i32 s1, s1, s3 ; GFX8-NEXT: s_sub_i32 s0, s0, s1 @@ -1187,9 +1188,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; GFX6-LABEL: ssubsat_i32_sv: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s1, s0, -1 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -1198,9 +1199,9 @@ define amdgpu_ps float @ssubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { ; GFX8-LABEL: ssubsat_i32_sv: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s1, s0, -1 -; GFX8-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX8-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX8-NEXT: s_min_i32 s2, s0, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX8-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX8-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 @@ -1224,9 +1225,9 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX6-LABEL: ssubsat_i32_vs: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 
0x7fffffff, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000001, v1 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -1235,9 +1236,9 @@ define amdgpu_ps float @ssubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { ; GFX8-LABEL: ssubsat_i32_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_max_i32_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, 0x7fffffff, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x80000001, v1 ; GFX8-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x80000000, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000000, v2 ; GFX8-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v1 @@ -1262,16 +1263,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -1281,16 +1282,16 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v4, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_max_i32_e32 v2, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x80000001, v2 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v2, v2, v3 ; GFX8-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v2 @@ -1317,16 +1318,16 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX6-LABEL: s_ssubsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s4, s0, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_max_i32 s2, s1, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s2, s3 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -1335,16 +1336,16 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inre ; GFX8-LABEL: s_ssubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s4, s0, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s5, s0, -1 -; GFX8-NEXT: 
s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s2, s4, s2 ; GFX8-NEXT: s_min_i32 s2, s2, s5 ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_max_i32 s2, s1, -1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX8-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX8-NEXT: s_min_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s2, s2, s3 ; GFX8-NEXT: s_min_i32 s2, s2, s4 ; GFX8-NEXT: s_sub_i32 s1, s1, s2 @@ -1376,24 +1377,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x7fffffff, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000001, v6 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; GFX6-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v8 -; GFX6-NEXT: v_bfrev_b32_e32 v7, -2 +; GFX6-NEXT: v_mov_b32_e32 v7, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v7 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX6-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -1403,24 
+1405,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v6, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x7fffffff, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000001, v6 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_bfrev_b32_e32 v9, 1 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v9 ; GFX8-NEXT: v_max_i32_e32 v3, v6, v3 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v8 -; GFX8-NEXT: v_bfrev_b32_e32 v7, -2 +; GFX8-NEXT: v_mov_b32_e32 v7, 0x80000001 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v7 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v6 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_max_i32_e32 v3, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, 0x7fffffff, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x80000001, v3 ; GFX8-NEXT: v_min_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x80000000, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000000, v4 ; GFX8-NEXT: v_max_i32_e32 v3, v3, v5 ; GFX8-NEXT: v_min_i32_e32 v3, v3, v4 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 @@ -1449,23 +1452,23 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX6-LABEL: s_ssubsat_v3i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s6, s0, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000001 ; GFX6-NEXT: s_min_i32 s7, s0, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s6, s3 ; GFX6-NEXT: s_min_i32 s3, s3, s7 ; GFX6-NEXT: s_sub_i32 s0, s0, s3 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; 
GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s1, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s4 ; GFX6-NEXT: s_min_i32 s3, s3, s6 ; GFX6-NEXT: s_sub_i32 s1, s1, s3 ; GFX6-NEXT: s_max_i32 s3, s2, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s3, s3, s5 ; GFX6-NEXT: s_min_i32 s3, s3, s4 ; GFX6-NEXT: s_sub_i32 s2, s2, s3 @@ -1474,23 +1477,23 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inre ; GFX8-LABEL: s_ssubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s6, s0, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fffffff +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000001 ; GFX8-NEXT: s_min_i32 s7, s0, -1 -; GFX8-NEXT: s_sub_i32 s7, s7, 0x80000000 +; GFX8-NEXT: s_add_i32 s7, s7, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_min_i32 s3, s3, s7 ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_max_i32 s3, s1, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s1, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s4 ; GFX8-NEXT: s_min_i32 s3, s3, s6 ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_max_i32 s3, s2, -1 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX8-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX8-NEXT: s_min_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX8-NEXT: s_max_i32 s3, s3, s5 ; GFX8-NEXT: s_min_i32 s3, s3, s4 ; GFX8-NEXT: s_sub_i32 s2, s2, s3 @@ -1527,32 +1530,32 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX6-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x80000000, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v9 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000000, v5 ; GFX6-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 @@ -1562,32 +1565,32 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x7fffffff, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x80000001, v8 ; 
GFX8-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v11 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v10 -; GFX8-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX8-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, 0x80000000, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v6 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_max_i32_e32 v4, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x80000001, v4 ; GFX8-NEXT: v_min_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x80000000, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000000, v5 ; GFX8-NEXT: v_max_i32_e32 v4, v4, v7 ; GFX8-NEXT: v_min_i32_e32 v4, v4, v5 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v4 @@ -1618,30 +1621,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX6-LABEL: s_ssubsat_v4i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s8, s0, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, 
s9 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_max_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s5 ; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_max_i32 s4, s2, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s6 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_max_i32 s4, s3, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s4, s7 ; GFX6-NEXT: s_min_i32 s4, s4, s5 ; GFX6-NEXT: s_sub_i32 s3, s3, s4 @@ -1650,30 +1653,30 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inre ; GFX8-LABEL: s_ssubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s8, s0, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX8-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX8-NEXT: s_min_i32 s9, s0, -1 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX8-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s8, s4 ; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_max_i32 s4, s1, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s8, s1, -1 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX8-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s5 ; GFX8-NEXT: s_min_i32 s4, s4, s8 ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_max_i32 s4, s2, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: 
s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s2, s2, s4 ; GFX8-NEXT: s_max_i32 s4, s3, -1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX8-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX8-NEXT: s_min_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX8-NEXT: s_max_i32 s4, s4, s7 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sub_i32 s3, s3, s4 @@ -1715,39 +1718,39 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, 0x80000001, v10 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v13 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX6-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX6-NEXT: v_mov_b32_e32 v11, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v13 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v11 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v13 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x7fffffff, v5 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, 0x80000001, v5 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX6-NEXT: v_subrev_i32_e32 v6, vcc, 0x80000000, v6 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0x80000000, v6 ; GFX6-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 @@ -1757,39 +1760,39 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v10, -1, v0 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x80000001, v10 ; GFX8-NEXT: v_min_i32_e32 v12, -1, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v13, 1 -; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v13 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v10, v5 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v12 -; GFX8-NEXT: v_bfrev_b32_e32 v11, -2 +; GFX8-NEXT: v_mov_b32_e32 v11, 0x80000001 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v10, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v13 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v10 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: 
v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v7 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v11 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v11 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v13 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v8 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v5 ; GFX8-NEXT: v_max_i32_e32 v5, -1, v4 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, 0x7fffffff, v5 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x80000001, v5 ; GFX8-NEXT: v_min_i32_e32 v6, -1, v4 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, 0x80000000, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x80000000, v6 ; GFX8-NEXT: v_max_i32_e32 v5, v5, v9 ; GFX8-NEXT: v_min_i32_e32 v5, v5, v6 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v5 @@ -1822,37 +1825,37 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX6-LABEL: s_ssubsat_v5i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s10, s0, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000001 ; GFX6-NEXT: s_min_i32 s11, s0, -1 -; GFX6-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX6-NEXT: s_add_i32 s11, s11, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s10, s5 ; GFX6-NEXT: s_min_i32 s5, s5, s11 ; GFX6-NEXT: s_sub_i32 s0, s0, s5 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s1, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s6 ; GFX6-NEXT: s_min_i32 s5, s5, s10 ; GFX6-NEXT: s_sub_i32 s1, s1, s5 ; GFX6-NEXT: s_max_i32 s5, s2, -1 -; GFX6-NEXT: s_sub_i32 s5, 
s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s7 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s2, s2, s5 ; GFX6-NEXT: s_max_i32 s5, s3, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s8 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s3, s3, s5 ; GFX6-NEXT: s_max_i32 s5, s4, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s4, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s5, s5, s9 ; GFX6-NEXT: s_min_i32 s5, s5, s6 ; GFX6-NEXT: s_sub_i32 s4, s4, s5 @@ -1861,37 +1864,37 @@ define amdgpu_ps <5 x i32> @s_ssubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inre ; GFX8-LABEL: s_ssubsat_v5i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s10, s0, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fffffff +; GFX8-NEXT: s_add_i32 s10, s10, 0x80000001 ; GFX8-NEXT: s_min_i32 s11, s0, -1 -; GFX8-NEXT: s_sub_i32 s11, s11, 0x80000000 +; GFX8-NEXT: s_add_i32 s11, s11, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s10, s5 ; GFX8-NEXT: s_min_i32 s5, s5, s11 ; GFX8-NEXT: s_sub_i32 s0, s0, s5 ; GFX8-NEXT: s_max_i32 s5, s1, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s10, s1, -1 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX8-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s6 ; GFX8-NEXT: s_min_i32 s5, s5, s10 ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_max_i32 s5, s2, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s2, -1 -; GFX8-NEXT: s_sub_i32 
s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s7 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_max_i32 s5, s3, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s3, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s8 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s3, s3, s5 ; GFX8-NEXT: s_max_i32 s5, s4, -1 -; GFX8-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX8-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX8-NEXT: s_min_i32 s6, s4, -1 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX8-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX8-NEXT: s_max_i32 s5, s5, s9 ; GFX8-NEXT: s_min_i32 s5, s5, s6 ; GFX8-NEXT: s_sub_i32 s4, s4, s5 @@ -1938,117 +1941,117 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v31, -2 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v31 +; GFX6-NEXT: v_mov_b32_e32 v31, 0x80000001 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 ; GFX6-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX6-NEXT: v_min_i32_e32 v33, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX6-NEXT: v_sub_i32_e32 v33, vcc, v33, v16 +; GFX6-NEXT: v_add_i32_e32 v33, vcc, v33, v16 ; GFX6-NEXT: v_min_i32_e32 v32, v32, v33 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v32 ; GFX6-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v31 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v31 ; GFX6-NEXT: v_max_i32_e32 v17, v32, v17 ; GFX6-NEXT: v_min_i32_e32 v32, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v32, v16 +; GFX6-NEXT: v_add_i32_e32 v32, vcc, v32, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, 
v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v4 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v5 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v6 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v16 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX6-NEXT: v_sub_i32_e32 
v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v7 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v24 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v25 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v26 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; 
GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX6-NEXT: v_sub_i32_e32 v19, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v19, vcc, v19, v16 ; GFX6-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX6-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v17 ; GFX6-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v17, v31 +; GFX6-NEXT: v_add_i32_e32 v17, vcc, v17, v31 ; GFX6-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v19, v16 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v19, v16 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX6-NEXT: v_min_i32_e32 v16, v17, v16 @@ -2059,117 +2062,117 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v32, -1, v0 -; GFX8-NEXT: v_bfrev_b32_e32 v31, -2 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v31 +; GFX8-NEXT: v_mov_b32_e32 v31, 0x80000001 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_max_i32_e32 v32, v32, v16 ; GFX8-NEXT: v_min_i32_e32 v33, -1, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v16, 1 -; GFX8-NEXT: 
v_sub_u32_e32 v33, vcc, v33, v16 +; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v16 ; GFX8-NEXT: v_min_i32_e32 v32, v32, v33 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v32 ; GFX8-NEXT: v_max_i32_e32 v32, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v31 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v31 ; GFX8-NEXT: v_max_i32_e32 v17, v32, v17 ; GFX8-NEXT: v_min_i32_e32 v32, -1, v1 -; GFX8-NEXT: v_sub_u32_e32 v32, vcc, v32, v16 +; GFX8-NEXT: v_add_u32_e32 v32, vcc, v32, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v32 ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v1, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v2 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v3 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v3 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v19 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v3, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v4 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v4 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v20 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v5 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v5 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v21 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; 
GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v5, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v6 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v18, -1, v6 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v22 -; GFX8-NEXT: v_sub_u32_e32 v18, vcc, v18, v16 +; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v18 ; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 ; GFX8-NEXT: v_sub_u32_e32 v6, vcc, v6, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v7 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v7 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v23 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v7, vcc, v7, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v8 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v8 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v24 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v8, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v9 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v9 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v25 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v9, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v10 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v10 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 
v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v26 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v10, vcc, v10, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v11 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v11 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v27 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v11, vcc, v11, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v12 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v12 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v28 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v12, vcc, v12, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v13 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v13 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v29 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v13, vcc, v13, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v14 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v14 -; GFX8-NEXT: v_sub_u32_e32 v19, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v16 ; GFX8-NEXT: v_max_i32_e32 v17, v17, v30 ; GFX8-NEXT: v_min_i32_e32 v17, v17, v19 ; GFX8-NEXT: v_sub_u32_e32 v14, vcc, v14, v17 ; GFX8-NEXT: v_max_i32_e32 v17, -1, v15 -; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v17, v31 +; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v31 ; GFX8-NEXT: v_min_i32_e32 v19, -1, v15 -; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v19, v16 +; GFX8-NEXT: v_add_u32_e32 v16, vcc, v19, v16 
; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_max_i32_e32 v17, v17, v18 ; GFX8-NEXT: v_min_i32_e32 v16, v17, v16 @@ -2252,114 +2255,114 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX6-LABEL: s_ssubsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_max_i32 s32, s0, -1 -; GFX6-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX6-NEXT: s_add_i32 s32, s32, 0x80000001 ; GFX6-NEXT: s_min_i32 s33, s0, -1 -; GFX6-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX6-NEXT: s_add_i32 s33, s33, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s32, s16 ; GFX6-NEXT: s_min_i32 s16, s16, s33 ; GFX6-NEXT: s_sub_i32 s0, s0, s16 ; GFX6-NEXT: s_max_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s32, s1, -1 -; GFX6-NEXT: s_sub_i32 s32, s32, 0x80000000 +; GFX6-NEXT: s_add_i32 s32, s32, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s17 ; GFX6-NEXT: s_min_i32 s16, s16, s32 ; GFX6-NEXT: s_sub_i32 s1, s1, s16 ; GFX6-NEXT: s_max_i32 s16, s2, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s2, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s18 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s2, s2, s16 ; GFX6-NEXT: s_max_i32 s16, s3, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s3, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s19 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s3, s3, s16 ; GFX6-NEXT: s_max_i32 s16, s4, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s4, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s20 ; 
GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s4, s4, s16 ; GFX6-NEXT: s_max_i32 s16, s5, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s5, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s21 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s5, s5, s16 ; GFX6-NEXT: s_max_i32 s16, s6, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s6, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s22 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s6, s6, s16 ; GFX6-NEXT: s_max_i32 s16, s7, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s7, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s23 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s7, s7, s16 ; GFX6-NEXT: s_max_i32 s16, s8, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s8, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s24 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s8, s8, s16 ; GFX6-NEXT: s_max_i32 s16, s9, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s9, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s25 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s9, s9, s16 ; GFX6-NEXT: s_max_i32 s16, s10, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; 
GFX6-NEXT: s_min_i32 s17, s10, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s26 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s10, s10, s16 ; GFX6-NEXT: s_max_i32 s16, s11, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s11, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s27 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s11, s11, s16 ; GFX6-NEXT: s_max_i32 s16, s12, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s12, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s28 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s12, s12, s16 ; GFX6-NEXT: s_max_i32 s16, s13, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s13, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s29 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s13, s13, s16 ; GFX6-NEXT: s_max_i32 s16, s14, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s14, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s30 ; GFX6-NEXT: s_min_i32 s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s14, s14, s16 ; GFX6-NEXT: s_max_i32 s16, s15, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s15, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s16, s16, s31 ; GFX6-NEXT: s_min_i32 
s16, s16, s17 ; GFX6-NEXT: s_sub_i32 s15, s15, s16 @@ -2368,114 +2371,114 @@ define amdgpu_ps <16 x i32> @s_ssubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> ; GFX8-LABEL: s_ssubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_max_i32 s32, s0, -1 -; GFX8-NEXT: s_sub_i32 s32, s32, 0x7fffffff +; GFX8-NEXT: s_add_i32 s32, s32, 0x80000001 ; GFX8-NEXT: s_min_i32 s33, s0, -1 -; GFX8-NEXT: s_sub_i32 s33, s33, 0x80000000 +; GFX8-NEXT: s_add_i32 s33, s33, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s32, s16 ; GFX8-NEXT: s_min_i32 s16, s16, s33 ; GFX8-NEXT: s_sub_i32 s0, s0, s16 ; GFX8-NEXT: s_max_i32 s16, s1, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s32, s1, -1 -; GFX8-NEXT: s_sub_i32 s32, s32, 0x80000000 +; GFX8-NEXT: s_add_i32 s32, s32, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s17 ; GFX8-NEXT: s_min_i32 s16, s16, s32 ; GFX8-NEXT: s_sub_i32 s1, s1, s16 ; GFX8-NEXT: s_max_i32 s16, s2, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s2, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s18 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s2, s2, s16 ; GFX8-NEXT: s_max_i32 s16, s3, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s3, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s19 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s3, s3, s16 ; GFX8-NEXT: s_max_i32 s16, s4, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s4, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s20 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s4, s4, 
s16 ; GFX8-NEXT: s_max_i32 s16, s5, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s5, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s21 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s5, s5, s16 ; GFX8-NEXT: s_max_i32 s16, s6, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s6, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s22 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s6, s6, s16 ; GFX8-NEXT: s_max_i32 s16, s7, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s7, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s23 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s7, s7, s16 ; GFX8-NEXT: s_max_i32 s16, s8, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s8, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s24 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s8, s8, s16 ; GFX8-NEXT: s_max_i32 s16, s9, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s9, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s25 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s9, s9, s16 ; GFX8-NEXT: s_max_i32 s16, s10, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s10, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 
0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s26 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s10, s10, s16 ; GFX8-NEXT: s_max_i32 s16, s11, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s11, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s27 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s11, s11, s16 ; GFX8-NEXT: s_max_i32 s16, s12, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s12, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s28 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s12, s12, s16 ; GFX8-NEXT: s_max_i32 s16, s13, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s13, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s29 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s13, s13, s16 ; GFX8-NEXT: s_max_i32 s16, s14, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s14, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s30 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s14, s14, s16 ; GFX8-NEXT: s_max_i32 s16, s15, -1 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX8-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX8-NEXT: s_min_i32 s17, s15, -1 -; GFX8-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX8-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX8-NEXT: s_max_i32 s16, s16, s31 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sub_i32 s15, s15, s16 @@ -2579,9 
+2582,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v1, v2, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v3 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -2592,9 +2595,9 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v1, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -2621,9 +2624,9 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: s_max_i32 s1, s2, s1 ; GFX6-NEXT: s_min_i32 s1, s1, s3 ; GFX6-NEXT: s_sub_i32 s0, s0, s1 @@ -2635,11 +2638,11 @@ define amdgpu_ps i16 @s_ssubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s2, s0 ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s4 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: 
s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s3, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -2669,9 +2672,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s1, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 @@ -2683,9 +2686,9 @@ define amdgpu_ps half @ssubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s1, s0 ; GFX8-NEXT: s_sext_i32_i16 s2, -1 ; GFX8-NEXT: s_max_i32 s3, s1, s2 -; GFX8-NEXT: s_sub_i32 s3, s3, 0x7fff +; GFX8-NEXT: s_addk_i32 s3, 0x8001 ; GFX8-NEXT: s_min_i32 s1, s1, s2 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s1, 0x8000 ; GFX8-NEXT: v_max_i16_e32 v0, s3, v0 ; GFX8-NEXT: v_min_i16_e32 v0, s1, v0 ; GFX8-NEXT: v_sub_u16_e32 v0, s0, v0 @@ -2711,9 +2714,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v1, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, 0x7fffffff, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0x80000001, v1 ; GFX6-NEXT: v_min_i32_e32 v2, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x80000000, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; GFX6-NEXT: v_max_i32_e32 v1, s0, v1 ; GFX6-NEXT: v_min_i32_e32 v1, v1, v2 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 @@ -2723,9 +2726,9 @@ define amdgpu_ps half @ssubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { ; GFX8-LABEL: ssubsat_i16_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 0x8001, v1 ; GFX8-NEXT: 
v_min_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_sub_u16_e32 v0, v0, v1 @@ -2752,18 +2755,19 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v4, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000001, v4 ; GFX6-NEXT: v_min_i32_e32 v5, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v5, vcc, 0x80000000, v5 +; GFX6-NEXT: v_bfrev_b32_e32 v6, 1 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GFX6-NEXT: v_max_i32_e32 v2, v4, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v5 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX6-NEXT: v_max_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000001, v3 ; GFX6-NEXT: v_min_i32_e32 v4, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v4, vcc, 0x80000000, v4 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x80000000, v4 ; GFX6-NEXT: v_max_i32_e32 v2, v3, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -2775,16 +2779,16 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x7fff, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8001, v2 ; GFX8-NEXT: v_min_i16_e32 v3, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_e32 v2, v2, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, -1 ; GFX8-NEXT: v_max_i16_sdwa v4, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 
v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x8000, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8000, v3 ; GFX8-NEXT: v_max_i16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v3 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 @@ -2813,18 +2817,18 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s4, s0, -1 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x7fffffff +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000001 ; GFX6-NEXT: s_min_i32 s5, s0, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x80000000 +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s4, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s5 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s2 ; GFX6-NEXT: s_lshl_b32 s2, s3, 16 ; GFX6-NEXT: s_max_i32 s3, s1, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x7fffffff +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000001 ; GFX6-NEXT: s_min_i32 s4, s1, -1 -; GFX6-NEXT: s_sub_i32 s4, s4, 0x80000000 +; GFX6-NEXT: s_add_i32 s4, s4, 0x80000000 ; GFX6-NEXT: s_max_i32 s2, s3, s2 ; GFX6-NEXT: s_min_i32 s2, s2, s4 ; GFX6-NEXT: s_sub_i32 s1, s1, s2 @@ -2841,12 +2845,12 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_sext_i32_i16 s4, s0 ; GFX8-NEXT: s_sext_i32_i16 s5, -1 ; GFX8-NEXT: s_max_i32 s6, s4, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_lshr_b32 s3, s1, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s1, s6, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -2855,11 +2859,11 @@ 
define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs ; GFX8-NEXT: s_sub_i32 s0, s0, s1 ; GFX8-NEXT: s_sext_i32_i16 s1, s2 ; GFX8-NEXT: s_max_i32 s4, s1, s5 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s1, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s4, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s1, s1 @@ -2894,18 +2898,18 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s2, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x7fffffff +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000001 ; GFX6-NEXT: s_min_i32 s3, s0, -1 -; GFX6-NEXT: s_sub_i32 s3, s3, 0x80000000 +; GFX6-NEXT: s_add_i32 s3, s3, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX6-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 ; GFX6-NEXT: s_max_i32 s1, s0, -1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: s_sub_i32 s1, s1, 0x7fffffff +; GFX6-NEXT: s_add_i32 s1, s1, 0x80000001 ; GFX6-NEXT: s_min_i32 s2, s0, -1 -; GFX6-NEXT: s_sub_i32 s2, s2, 0x80000000 +; GFX6-NEXT: s_add_i32 s2, s2, 0x80000000 ; GFX6-NEXT: v_max_i32_e32 v1, s1, v1 ; GFX6-NEXT: v_min_i32_e32 v1, s2, v1 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 @@ -2922,18 +2926,18 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: s_sext_i32_i16 s2, s0 ; GFX8-NEXT: s_sext_i32_i16 s3, -1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_max_i16_e32 v1, s4, v0 ; 
GFX8-NEXT: v_min_i16_e32 v1, s2, v1 ; GFX8-NEXT: s_sext_i32_i16 s2, s1 ; GFX8-NEXT: s_max_i32 s4, s2, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v0, s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 @@ -2962,18 +2966,20 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v0 ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 -; GFX6-NEXT: v_min_i32_e32 v3, -1, v0 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0x80000001, v2 +; GFX6-NEXT: v_min_i32_e32 v4, -1, v0 +; GFX6-NEXT: v_bfrev_b32_e32 v5, 1 +; GFX6-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 -; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 +; GFX6-NEXT: v_min_i32_e32 v2, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mov_b32_e32 v3, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_max_i32_e32 v2, -1, v1 ; GFX6-NEXT: s_lshl_b32 s0, s1, 16 -; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: v_min_i32_e32 v3, -1, v1 -; GFX6-NEXT: v_subrev_i32_e32 v3, vcc, 0x80000000, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0x80000000, v3 ; GFX6-NEXT: v_max_i32_e32 v2, s0, v2 ; GFX6-NEXT: v_min_i32_e32 v2, v2, v3 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 @@ -2988,17 +2994,17 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { ; GFX8-LABEL: ssubsat_v2i16_vs: ; GFX8: ; %bb.0: ; GFX8-NEXT: v_max_i16_e32 v1, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0x7fff, v1 +; GFX8-NEXT: v_add_u16_e32 v1, 
0x8001, v1 ; GFX8-NEXT: v_min_i16_e32 v2, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v1, s0, v1 ; GFX8-NEXT: v_min_i16_e32 v1, v1, v2 ; GFX8-NEXT: v_mov_b32_e32 v2, -1 ; GFX8-NEXT: v_max_i16_sdwa v3, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 -; GFX8-NEXT: v_subrev_u16_e32 v3, 0x7fff, v3 +; GFX8-NEXT: v_add_u16_e32 v3, 0x8001, v3 ; GFX8-NEXT: v_min_i16_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v2, 0x8000, v2 +; GFX8-NEXT: v_add_u16_e32 v2, 0x8000, v2 ; GFX8-NEXT: v_max_i16_e32 v3, s1, v3 ; GFX8-NEXT: v_min_i16_e32 v2, v3, v2 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 @@ -3038,38 +3044,38 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v8, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX6-NEXT: v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x80000001, v8 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v11, 1 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v11 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v8, v4 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v10 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_bfrev_b32_e32 v9, -2 +; GFX6-NEXT: v_mov_b32_e32 v9, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v11 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v8 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 
v1, vcc, v1, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 ; GFX6-NEXT: v_max_i32_e32 v5, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v9 +; GFX6-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; GFX6-NEXT: v_min_i32_e32 v6, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v11 +; GFX6-NEXT: v_add_i32_e32 v6, vcc, v6, v11 ; GFX6-NEXT: v_max_i32_e32 v4, v5, v4 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_min_i32_e32 v4, v4, v6 @@ -3091,28 +3097,28 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v4, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v4, 0x7fff, v4 +; GFX8-NEXT: v_add_u16_e32 v4, 0x8001, v4 ; GFX8-NEXT: v_min_i16_e32 v5, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_e32 v4, v4, v2 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v5 ; GFX8-NEXT: v_mov_b32_e32 v5, -1 ; GFX8-NEXT: v_max_i16_sdwa v6, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 ; GFX8-NEXT: v_min_i16_sdwa v7, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_sdwa v2, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v6, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v2, v2, v7 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; 
GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_max_i16_sdwa v7, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x7fff, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8001, v7 ; GFX8-NEXT: v_min_i16_sdwa v5, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v5, 0x8000, v5 +; GFX8-NEXT: v_add_u16_e32 v5, 0x8000, v5 ; GFX8-NEXT: v_max_i16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v5 ; GFX8-NEXT: v_sub_u16_e32 v4, v0, v4 @@ -3147,36 +3153,36 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s8, s0, -1 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x7fffffff +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000001 ; GFX6-NEXT: s_min_i32 s9, s0, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x80000000 +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s8, s4 ; GFX6-NEXT: s_min_i32 s4, s4, s9 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s4 ; GFX6-NEXT: s_lshl_b32 s4, s5, 16 ; GFX6-NEXT: s_max_i32 s5, s1, -1 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s1, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s8 ; GFX6-NEXT: s_max_i32 s5, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s4 ; GFX6-NEXT: s_lshl_b32 s4, s6, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s2, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 
0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 ; GFX6-NEXT: s_max_i32 s5, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s4 ; GFX6-NEXT: s_lshl_b32 s4, s7, 16 -; GFX6-NEXT: s_sub_i32 s5, s5, 0x7fffffff +; GFX6-NEXT: s_add_i32 s5, s5, 0x80000001 ; GFX6-NEXT: s_min_i32 s6, s3, -1 -; GFX6-NEXT: s_sub_i32 s6, s6, 0x80000000 +; GFX6-NEXT: s_add_i32 s6, s6, 0x80000000 ; GFX6-NEXT: s_max_i32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_min_i32 s4, s4, s6 @@ -3199,12 +3205,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s8, s0 ; GFX8-NEXT: s_sext_i32_i16 s9, -1 ; GFX8-NEXT: s_max_i32 s10, s8, s9 -; GFX8-NEXT: s_sub_i32 s10, s10, 0x7fff +; GFX8-NEXT: s_addk_i32 s10, 0x8001 ; GFX8-NEXT: s_lshr_b32 s6, s2, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s10, s10 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s2, s10, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3213,11 +3219,11 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sub_i32 s0, s0, s2 ; GFX8-NEXT: s_sext_i32_i16 s2, s4 ; GFX8-NEXT: s_max_i32 s8, s2, s9 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s2, s2, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s2, s2, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s2, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s2, s2 @@ -3225,12 +3231,12 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sub_i32 s2, s4, s2 ; GFX8-NEXT: s_sext_i32_i16 s4, s1 ; GFX8-NEXT: s_max_i32 s6, s4, s9 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff 
+; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_lshr_b32 s7, s3, 16 ; GFX8-NEXT: s_min_i32 s4, s4, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3239,11 +3245,11 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre ; GFX8-NEXT: s_sub_i32 s1, s1, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s5 ; GFX8-NEXT: s_max_i32 s4, s3, s9 -; GFX8-NEXT: s_sub_i32 s4, s4, 0x7fff +; GFX8-NEXT: s_addk_i32 s4, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s7 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s3, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s4, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3299,57 +3305,57 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v12, -1, v0 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX6-NEXT: v_subrev_i32_e32 v12, vcc, 0x7fffffff, v12 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, 0x80000001, v12 ; GFX6-NEXT: v_min_i32_e32 v14, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v15, 1 -; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v15 +; GFX6-NEXT: v_add_i32_e32 v14, vcc, v14, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v12, v6 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v14 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_bfrev_b32_e32 v13, -2 +; GFX6-NEXT: v_mov_b32_e32 v13, 0x80000001 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v12, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v15 +; GFX6-NEXT: v_add_i32_e32 v12, vcc, v12, v15 ; GFX6-NEXT: v_max_i32_e32 
v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v12 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 ; GFX6-NEXT: v_max_i32_e32 v7, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v13 +; GFX6-NEXT: v_add_i32_e32 v7, vcc, v7, v13 ; GFX6-NEXT: v_min_i32_e32 v8, -1, v5 ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v15 +; GFX6-NEXT: v_add_i32_e32 v8, vcc, v8, v15 ; GFX6-NEXT: v_max_i32_e32 v6, v7, v6 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX6-NEXT: v_min_i32_e32 v6, v6, v8 @@ -3376,40 +3382,40 @@ 
define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v6, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v6, 0x7fff, v6 +; GFX8-NEXT: v_add_u16_e32 v6, 0x8001, v6 ; GFX8-NEXT: v_min_i16_e32 v7, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_e32 v6, v6, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v7 ; GFX8-NEXT: v_mov_b32_e32 v7, -1 ; GFX8-NEXT: v_max_i16_sdwa v8, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_sdwa v9, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v8, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v3, v3, v9 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 ; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 ; GFX8-NEXT: v_max_i16_sdwa v9, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8001, v9 ; GFX8-NEXT: v_min_i16_sdwa v10, v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_max_i16_sdwa v4, v9, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v9, -1, v2 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v10 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x7fff, v9 +; 
GFX8-NEXT: v_add_u16_e32 v9, 0x8001, v9 ; GFX8-NEXT: v_min_i16_e32 v10, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x8000, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8000, v10 ; GFX8-NEXT: v_max_i16_e32 v9, v9, v5 ; GFX8-NEXT: v_min_i16_e32 v9, v9, v10 ; GFX8-NEXT: v_max_i16_sdwa v10, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 ; GFX8-NEXT: v_min_i16_sdwa v7, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v7, 0x8000, v7 +; GFX8-NEXT: v_add_u16_e32 v7, 0x8000, v7 ; GFX8-NEXT: v_max_i16_sdwa v5, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v7 ; GFX8-NEXT: v_sub_u16_e32 v6, v0, v6 @@ -3449,55 +3455,55 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s12, s0, -1 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 -; GFX6-NEXT: s_sub_i32 s12, s12, 0x7fffffff +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000001 ; GFX6-NEXT: s_min_i32 s13, s0, -1 -; GFX6-NEXT: s_sub_i32 s13, s13, 0x80000000 +; GFX6-NEXT: s_add_i32 s13, s13, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s12, s6 ; GFX6-NEXT: s_min_i32 s6, s6, s13 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s6 ; GFX6-NEXT: s_lshl_b32 s6, s7, 16 ; GFX6-NEXT: s_max_i32 s7, s1, -1 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s12, s1, -1 -; GFX6-NEXT: s_sub_i32 s12, s12, 0x80000000 +; GFX6-NEXT: s_add_i32 s12, s12, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s12 ; GFX6-NEXT: s_max_i32 s7, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s6 ; GFX6-NEXT: s_lshl_b32 s6, s8, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s2, -1 
-; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s6 ; GFX6-NEXT: s_lshl_b32 s6, s9, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s3, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s6 ; GFX6-NEXT: s_lshl_b32 s6, s10, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s4, -1 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 ; GFX6-NEXT: s_max_i32 s7, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s6 ; GFX6-NEXT: s_lshl_b32 s6, s11, 16 -; GFX6-NEXT: s_sub_i32 s7, s7, 0x7fffffff +; GFX6-NEXT: s_add_i32 s7, s7, 0x80000001 ; GFX6-NEXT: s_min_i32 s8, s5, -1 ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 -; GFX6-NEXT: s_sub_i32 s8, s8, 0x80000000 +; GFX6-NEXT: s_add_i32 s8, s8, 0x80000000 ; GFX6-NEXT: s_max_i32 s6, s7, s6 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 ; GFX6-NEXT: s_min_i32 s6, s6, s8 @@ -3525,12 +3531,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s12, s0 ; GFX8-NEXT: s_sext_i32_i16 s13, -1 ; GFX8-NEXT: s_max_i32 s14, s12, s13 -; GFX8-NEXT: s_sub_i32 s14, s14, 0x7fff +; GFX8-NEXT: s_addk_i32 s14, 0x8001 ; GFX8-NEXT: s_lshr_b32 s9, s3, 16 ; GFX8-NEXT: s_min_i32 s12, s12, s13 ; GFX8-NEXT: s_sext_i32_i16 s14, s14 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 -; GFX8-NEXT: s_sub_i32 s12, s12, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s12, 0x8000 ; GFX8-NEXT: 
s_max_i32 s3, s14, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 @@ -3539,11 +3545,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s0, s0, s3 ; GFX8-NEXT: s_sext_i32_i16 s3, s6 ; GFX8-NEXT: s_max_i32 s12, s3, s13 -; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_addk_i32 s12, 0x8001 ; GFX8-NEXT: s_min_i32 s3, s3, s13 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 -; GFX8-NEXT: s_sub_i32 s3, s3, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s3, 0x8000 ; GFX8-NEXT: s_max_i32 s9, s12, s9 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s3, s3 @@ -3551,12 +3557,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s3, s6, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s1 ; GFX8-NEXT: s_max_i32 s9, s6, s13 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_addk_i32 s9, 0x8001 ; GFX8-NEXT: s_lshr_b32 s10, s4, 16 ; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s6, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s9, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3565,11 +3571,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s1, s1, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s7 ; GFX8-NEXT: s_max_i32 s6, s4, s13 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s9, s10 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s6, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3577,12 +3583,12 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s6, s2 ; 
GFX8-NEXT: s_sub_i32 s4, s7, s4 ; GFX8-NEXT: s_max_i32 s7, s6, s13 -; GFX8-NEXT: s_sub_i32 s7, s7, 0x7fff +; GFX8-NEXT: s_addk_i32 s7, 0x8001 ; GFX8-NEXT: s_lshr_b32 s11, s5, 16 ; GFX8-NEXT: s_min_i32 s6, s6, s13 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s6, 0x8000 ; GFX8-NEXT: s_max_i32 s5, s7, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3591,11 +3597,11 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre ; GFX8-NEXT: s_sub_i32 s2, s2, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s8 ; GFX8-NEXT: s_max_i32 s6, s5, s13 -; GFX8-NEXT: s_sub_i32 s6, s6, 0x7fff +; GFX8-NEXT: s_addk_i32 s6, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s13 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s6, s7 ; GFX8-NEXT: s_and_b32 s3, 0xffff, s3 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 @@ -3648,66 +3654,66 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX6-NEXT: v_max_i32_e32 v16, -1, v0 -; GFX6-NEXT: v_bfrev_b32_e32 v17, -2 +; GFX6-NEXT: v_mov_b32_e32 v17, 0x80000001 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v17 ; GFX6-NEXT: v_min_i32_e32 v18, -1, v0 ; GFX6-NEXT: v_bfrev_b32_e32 v19, 1 -; GFX6-NEXT: v_sub_i32_e32 v18, vcc, v18, v19 +; GFX6-NEXT: v_add_i32_e32 v18, vcc, v18, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v16, v8 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v18 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; 
GFX6-NEXT: v_min_i32_e32 v16, -1, v1 -; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v16, v19 +; GFX6-NEXT: v_add_i32_e32 v16, vcc, v16, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v16 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v2 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v2 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v3 ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v3 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v4 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v5 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v5 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; 
GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 ; GFX6-NEXT: v_max_i32_e32 v9, -1, v6 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v6 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX6-NEXT: v_min_i32_e32 v8, v8, v10 @@ -3715,10 +3721,10 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 ; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 -; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v17 +; GFX6-NEXT: v_add_i32_e32 v9, vcc, v9, v17 ; GFX6-NEXT: v_min_i32_e32 v10, -1, v7 ; GFX6-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 +; GFX6-NEXT: v_add_i32_e32 v10, vcc, v10, v19 ; GFX6-NEXT: v_max_i32_e32 v8, v9, v8 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX6-NEXT: v_ashrrev_i32_e32 v2, 16, v2 @@ -3750,52 +3756,52 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_max_i16_e32 v8, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v8, 0x7fff, v8 +; GFX8-NEXT: v_add_u16_e32 v8, 0x8001, v8 ; GFX8-NEXT: v_min_i16_e32 v9, -1, v0 -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_e32 v8, v8, v4 ; GFX8-NEXT: v_min_i16_e32 v8, v8, v9 ; GFX8-NEXT: v_mov_b32_e32 v9, -1 ; GFX8-NEXT: v_max_i16_sdwa v10, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 ; GFX8-NEXT: v_min_i16_sdwa v11, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8000, v11 ; GFX8-NEXT: v_max_i16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v10, -1, v1 ; GFX8-NEXT: v_min_i16_e32 v4, v4, v11 -; GFX8-NEXT: v_subrev_u16_e32 v10, 0x7fff, v10 +; GFX8-NEXT: v_add_u16_e32 v10, 0x8001, v10 ; GFX8-NEXT: v_min_i16_e32 v11, -1, v1 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x8000, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8000, v11 ; GFX8-NEXT: v_max_i16_e32 v10, v10, v5 ; GFX8-NEXT: v_min_i16_e32 v10, v10, v11 ; GFX8-NEXT: v_max_i16_sdwa v11, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8001, v11 ; GFX8-NEXT: v_min_i16_sdwa v12, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8000, v12 ; GFX8-NEXT: v_max_i16_sdwa v5, v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v11, -1, v2 ; GFX8-NEXT: v_min_i16_e32 v5, v5, v12 -; GFX8-NEXT: v_subrev_u16_e32 v11, 0x7fff, v11 +; GFX8-NEXT: v_add_u16_e32 v11, 0x8001, v11 ; GFX8-NEXT: v_min_i16_e32 v12, -1, v2 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x8000, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8000, v12 ; GFX8-NEXT: v_max_i16_e32 v11, v11, v6 ; GFX8-NEXT: v_min_i16_e32 v11, v11, v12 ; GFX8-NEXT: v_max_i16_sdwa v12, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8001, v12 ; GFX8-NEXT: v_min_i16_sdwa v13, v2, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_add_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_max_i16_sdwa v6, v12, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 ; GFX8-NEXT: v_max_i16_e32 v12, -1, v3 ; GFX8-NEXT: v_min_i16_e32 v6, v6, v13 -; GFX8-NEXT: v_subrev_u16_e32 v12, 0x7fff, v12 +; GFX8-NEXT: v_add_u16_e32 v12, 0x8001, v12 ; GFX8-NEXT: v_min_i16_e32 v13, -1, v3 -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x8000, v13 +; GFX8-NEXT: v_add_u16_e32 v13, 0x8000, v13 ; GFX8-NEXT: v_max_i16_e32 v12, v12, v7 ; GFX8-NEXT: v_min_i16_e32 v12, v12, v13 ; GFX8-NEXT: v_max_i16_sdwa v13, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v13, 0x7fff, v13 +; GFX8-NEXT: v_add_u16_e32 v13, 0x8001, v13 ; GFX8-NEXT: v_min_i16_sdwa v9, v3, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_subrev_u16_e32 v9, 0x8000, v9 +; GFX8-NEXT: v_add_u16_e32 v9, 0x8000, v9 ; GFX8-NEXT: v_max_i16_sdwa v7, v13, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: v_sub_u16_e32 v8, v0, v8 ; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD @@ -3840,63 +3846,63 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_lshl_b32 s0, s0, 16 ; GFX6-NEXT: s_max_i32 s16, s0, -1 ; GFX6-NEXT: s_lshl_b32 s8, s8, 16 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x7fffffff +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000001 ; GFX6-NEXT: s_min_i32 s17, s0, -1 -; GFX6-NEXT: s_sub_i32 s17, s17, 0x80000000 +; GFX6-NEXT: s_add_i32 s17, s17, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s16, s8 ; GFX6-NEXT: s_min_i32 s8, s8, s17 ; GFX6-NEXT: s_lshl_b32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s0, s0, s8 ; GFX6-NEXT: s_lshl_b32 s8, s9, 16 ; GFX6-NEXT: s_max_i32 s9, s1, -1 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s16, s1, -1 -; GFX6-NEXT: s_sub_i32 s16, s16, 0x80000000 +; GFX6-NEXT: s_add_i32 s16, s16, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s2, s2, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s16 ; 
GFX6-NEXT: s_max_i32 s9, s2, -1 ; GFX6-NEXT: s_sub_i32 s1, s1, s8 ; GFX6-NEXT: s_lshl_b32 s8, s10, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s2, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s3, s3, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s3, -1 ; GFX6-NEXT: s_sub_i32 s2, s2, s8 ; GFX6-NEXT: s_lshl_b32 s8, s11, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s3, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s4, s4, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s4, -1 ; GFX6-NEXT: s_sub_i32 s3, s3, s8 ; GFX6-NEXT: s_lshl_b32 s8, s12, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s4, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s5, s5, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s5, -1 ; GFX6-NEXT: s_sub_i32 s4, s4, s8 ; GFX6-NEXT: s_lshl_b32 s8, s13, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s5, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s6, s6, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 ; GFX6-NEXT: s_max_i32 s9, s6, -1 ; GFX6-NEXT: s_sub_i32 s5, s5, s8 ; GFX6-NEXT: s_lshl_b32 s8, s14, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s6, -1 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; 
GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_lshl_b32 s7, s7, 16 ; GFX6-NEXT: s_min_i32 s8, s8, s10 @@ -3904,10 +3910,10 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX6-NEXT: s_ashr_i32 s1, s1, 16 ; GFX6-NEXT: s_sub_i32 s6, s6, s8 ; GFX6-NEXT: s_lshl_b32 s8, s15, 16 -; GFX6-NEXT: s_sub_i32 s9, s9, 0x7fffffff +; GFX6-NEXT: s_add_i32 s9, s9, 0x80000001 ; GFX6-NEXT: s_min_i32 s10, s7, -1 ; GFX6-NEXT: s_ashr_i32 s0, s0, 16 -; GFX6-NEXT: s_sub_i32 s10, s10, 0x80000000 +; GFX6-NEXT: s_add_i32 s10, s10, 0x80000000 ; GFX6-NEXT: s_max_i32 s8, s9, s8 ; GFX6-NEXT: s_and_b32 s1, s1, 0xffff ; GFX6-NEXT: s_ashr_i32 s2, s2, 16 @@ -3940,12 +3946,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s16, s0 ; GFX8-NEXT: s_sext_i32_i16 s17, -1 ; GFX8-NEXT: s_max_i32 s18, s16, s17 -; GFX8-NEXT: s_sub_i32 s18, s18, 0x7fff +; GFX8-NEXT: s_addk_i32 s18, 0x8001 ; GFX8-NEXT: s_lshr_b32 s12, s4, 16 ; GFX8-NEXT: s_min_i32 s16, s16, s17 ; GFX8-NEXT: s_sext_i32_i16 s18, s18 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 -; GFX8-NEXT: s_sub_i32 s16, s16, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s16, 0x8000 ; GFX8-NEXT: s_max_i32 s4, s18, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 ; GFX8-NEXT: s_sext_i32_i16 s16, s16 @@ -3954,11 +3960,11 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s0, s0, s4 ; GFX8-NEXT: s_sext_i32_i16 s4, s8 ; GFX8-NEXT: s_max_i32 s16, s4, s17 -; GFX8-NEXT: s_sub_i32 s16, s16, 0x7fff +; GFX8-NEXT: s_addk_i32 s16, 0x8001 ; GFX8-NEXT: s_min_i32 s4, s4, s17 ; GFX8-NEXT: s_sext_i32_i16 s16, s16 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 -; GFX8-NEXT: s_sub_i32 s4, s4, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s4, 0x8000 ; GFX8-NEXT: s_max_i32 s12, s16, s12 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s4, s4 @@ -3966,12 +3972,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: 
s_sub_i32 s4, s8, s4 ; GFX8-NEXT: s_sext_i32_i16 s8, s1 ; GFX8-NEXT: s_max_i32 s12, s8, s17 -; GFX8-NEXT: s_sub_i32 s12, s12, 0x7fff +; GFX8-NEXT: s_addk_i32 s12, 0x8001 ; GFX8-NEXT: s_lshr_b32 s13, s5, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s12, s12 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s5, s12, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -3980,11 +3986,11 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s1, s1, s5 ; GFX8-NEXT: s_sext_i32_i16 s5, s9 ; GFX8-NEXT: s_max_i32 s8, s5, s17 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s5, s5, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s12, s13 -; GFX8-NEXT: s_sub_i32 s5, s5, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s5, 0x8000 ; GFX8-NEXT: s_max_i32 s8, s8, s12 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s5, s5 @@ -3992,12 +3998,12 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sext_i32_i16 s8, s2 ; GFX8-NEXT: s_sub_i32 s5, s9, s5 ; GFX8-NEXT: s_max_i32 s9, s8, s17 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_addk_i32 s9, 0x8001 ; GFX8-NEXT: s_lshr_b32 s14, s6, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s6, s9, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4006,23 +4012,23 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s2, s2, s6 ; GFX8-NEXT: s_sext_i32_i16 s6, s10 ; GFX8-NEXT: s_max_i32 s8, s6, s17 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_min_i32 s6, s6, s17 
; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s14 -; GFX8-NEXT: s_sub_i32 s6, s6, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s6, 0x8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s6, s6 ; GFX8-NEXT: s_min_i32 s6, s8, s6 ; GFX8-NEXT: s_sext_i32_i16 s8, s3 ; GFX8-NEXT: s_max_i32 s9, s8, s17 -; GFX8-NEXT: s_sub_i32 s9, s9, 0x7fff +; GFX8-NEXT: s_addk_i32 s9, 0x8001 ; GFX8-NEXT: s_lshr_b32 s15, s7, 16 ; GFX8-NEXT: s_min_i32 s8, s8, s17 ; GFX8-NEXT: s_sext_i32_i16 s9, s9 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 -; GFX8-NEXT: s_sub_i32 s8, s8, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s8, 0x8000 ; GFX8-NEXT: s_max_i32 s7, s9, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s7 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 @@ -4031,14 +4037,14 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre ; GFX8-NEXT: s_sub_i32 s3, s3, s7 ; GFX8-NEXT: s_sext_i32_i16 s7, s11 ; GFX8-NEXT: s_max_i32 s8, s7, s17 -; GFX8-NEXT: s_sub_i32 s8, s8, 0x7fff +; GFX8-NEXT: s_addk_i32 s8, 0x8001 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s4 ; GFX8-NEXT: s_min_i32 s7, s7, s17 ; GFX8-NEXT: s_sext_i32_i16 s8, s8 ; GFX8-NEXT: s_sext_i32_i16 s9, s15 ; GFX8-NEXT: s_and_b32 s0, 0xffff, s0 ; GFX8-NEXT: s_lshl_b32 s4, s4, 16 -; GFX8-NEXT: s_sub_i32 s7, s7, 0xffff8000 +; GFX8-NEXT: s_addk_i32 s7, 0x8000 ; GFX8-NEXT: s_max_i32 s8, s8, s9 ; GFX8-NEXT: s_or_b32 s0, s0, s4 ; GFX8-NEXT: s_and_b32 s4, 0xffff, s5 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll index 855687281ce9ab..6c104709f5ee3a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll @@ -147,10 +147,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) { ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v1, 63 -; GFX8-NEXT: v_subrev_u16_e32 v2, 0xffc0, v0 
-; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat: @@ -179,9 +179,9 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_lo(<2 x i16> %a) { ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_lo: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v2, 4 -; GFX8-NEXT: v_subrev_u16_e32 v1, 0xffc0, v0 -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_mov_b32_e32 v2, -4 +; GFX8-NEXT: v_add_u16_e32 v1, 64, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -211,10 +211,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_hi(<2 x i16> %a) { ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_hi: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_not_b32_e32 v1, 63 -; GFX8-NEXT: v_subrev_u16_e32 v2, 4, v0 -; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 64 +; GFX8-NEXT: v_add_u16_e32 v1, -4, v0 +; GFX8-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_hi: @@ -245,8 +245,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 
0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 @@ -285,8 +285,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 0xffc0 -; GFX8-NEXT: s_sub_i32 s1, s1, 4 +; GFX8-NEXT: s_add_i32 s0, s0, 0xffff0040 +; GFX8-NEXT: s_add_i32 s1, s1, -4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 @@ -325,8 +325,8 @@ define amdgpu_ps i32 @s_sub_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_lshr_b32 s1, s0, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff -; GFX8-NEXT: s_sub_i32 s0, s0, 4 -; GFX8-NEXT: s_sub_i32 s1, s1, 0xffc0 +; GFX8-NEXT: s_add_i32 s0, s0, -4 +; GFX8-NEXT: s_add_i32 s1, s1, 0xffff0040 ; GFX8-NEXT: s_lshl_b32 s1, s1, 16 ; GFX8-NEXT: s_and_b32 s0, s0, 0xffff ; GFX8-NEXT: s_or_b32 s0, s1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll index 31f61b9968b8bf..24ec4fa48f7789 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll @@ -222,10 +222,10 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) { ; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1 ; CHECK-NEXT: v_mul_lo_u32 v1, v1, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; CHECK-NEXT: v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 0xffed2705, v0 ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, 
v1, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -243,23 +243,23 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 ; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v0, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4 +; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v0, v3 ; GISEL-NEXT: v_mul_hi_u32 v3, v1, v3 -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2 +; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2 ; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v2 -; GISEL-NEXT: v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 @@ -274,23 +274,23 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v2, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v0, v2 +; CGP-NEXT: 
v_mul_lo_u32 v5, v2, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v2, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v0, v2 ; CGP-NEXT: v_mul_hi_u32 v2, v1, v2 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v4 +; CGP-NEXT: v_mul_lo_u32 v5, v5, v4 ; CGP-NEXT: v_mul_lo_u32 v2, v2, v4 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 -; CGP-NEXT: v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v0, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4 -; CGP-NEXT: v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v0, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v1 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index 5889af70a8f092..c1a957dec3e867 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -364,9 +364,10 @@ entry: ; FUNC-LABEL: ptrtoint: ; SI-NOT: ds_write +; SI: s_add_i32 [[S_ADD_OFFSET:s[0-9]+]], s{{[0-9]+}}, 5 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen -; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5, -; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ; +; SI: v_mov_b32_e32 [[V_ADD_OFFSET:v[0-9]+]], [[S_ADD_OFFSET]] +; SI: buffer_load_dword v{{[0-9]+}}, [[V_ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ; define amdgpu_kernel void @ptrtoint(ptr addrspace(1) 
%out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], addrspace(5) %tmp0 = getelementptr [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 %a diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll index 96d7f02cf2422a..835e5e5f06ef0f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-kernarg-preload-num-sgprs.ll @@ -6,7 +6,7 @@ ; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000 ................ ; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000 ................ ; OBJDUMP-NOT: 0030 0000af00 94130000 1a000400 00000000 ................ -; OBJDUMP-NEXT: 0030 8000af00 98130000 1a000400 00000000 ................ +; OBJDUMP-NEXT: 0030 8000af00 98130000 1e000400 00000000 ................ ; ASM-LABEL: amdhsa_kernarg_preload_4_implicit_6: ; ASM: .amdhsa_user_sgpr_count 12 diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll index 90562e25a3e9c1..55ed11ac629724 100644 --- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll +++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll @@ -1,6 +1,6 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O2 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=OPT %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefixes=NOOPT,COV5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=NOOPT %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | opt -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx900 -O0 | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 | FileCheck -check-prefix=NOOPT %s ; Check that AMDGPUAttributor is not run with -O0. ; OPT: .amdhsa_user_sgpr_private_segment_buffer 1 @@ -19,8 +19,7 @@ ; NOOPT: .amdhsa_user_sgpr_private_segment_buffer 1 ; NOOPT: .amdhsa_user_sgpr_dispatch_ptr 1 -; COV4: .amdhsa_user_sgpr_queue_ptr 1 -; COV5: .amdhsa_user_sgpr_queue_ptr 0 +; NOOPT: .amdhsa_user_sgpr_queue_ptr 1 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1 ; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0 diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll index 5155a14f1f7963..d4c50cf2c7e4aa 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll +++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-no-sgpr-for-csrspill-xfail.ll @@ -1,4 +1,4 @@ -; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -filetype=null 2>&1 | FileCheck -enable-var-scope %s +; RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s ; CHECK: LLVM ERROR: failed to find free scratch register diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 8fde0dd2d28ed4..609425329e106b 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -31,7 +31,7 @@ define hidden void @use_queue_ptr() #1 { ; GCN-LABEL: {{^}}kern_indirect_use_queue_ptr: ; GCN: s_swappc_b64 s[30:31], s[10:11] -; GCN: .amdhsa_user_sgpr_queue_ptr 0 +; GCN: .amdhsa_user_sgpr_queue_ptr 1 define amdgpu_kernel void @kern_indirect_use_queue_ptr(i32) #1 { call void @use_queue_ptr() ret void @@ -470,7 +470,7 @@ define hidden void @use_every_sgpr_input() #1 { ; GCN: 
.amdhsa_user_sgpr_private_segment_buffer 1 ; GCN: .amdhsa_user_sgpr_dispatch_ptr 1 -; GCN: .amdhsa_user_sgpr_queue_ptr 0 +; GCN: .amdhsa_user_sgpr_queue_ptr 1 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN: .amdhsa_user_sgpr_dispatch_id 1 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 @@ -495,7 +495,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 { ; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 ; GCN: .amdhsa_user_sgpr_dispatch_ptr 1 -; GCN: .amdhsa_user_sgpr_queue_ptr 0 +; GCN: .amdhsa_user_sgpr_queue_ptr 1 ; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0 ; GCN: .amdhsa_user_sgpr_dispatch_id 1 ; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll index ca0c669056ee33..2ec4c074a892dc 100644 --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -147,19 +147,14 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(ptr addrspace(1) ; GCN-LABEL: {{^}}kernel_stored_fi_to_global_huge_frame_offset: ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} -; FIXME: Re-initialize -; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}} +; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]] - +; GCN-DAG: v_mov_b32_e32 [[V_BASE_1_OFF:v[0-9]+]], 0x4000{{$}} +; GCN: buffer_store_dword [[K]], [[V_BASE_1_OFF]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} -; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]] -; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} - -; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} +; GCN: buffer_store_dword [[V_BASE_1_OFF]], off, 
s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @kernel_stored_fi_to_global_huge_frame_offset(ptr addrspace(1) %ptr) #0 { %tmp0 = alloca [4096 x i32], addrspace(5) %tmp1 = alloca [4096 x i32], addrspace(5) @@ -171,20 +166,20 @@ define amdgpu_kernel void @kernel_stored_fi_to_global_huge_frame_offset(ptr addr ret void } +; FIXME: Shift of SP repeated twice ; GCN-LABEL: {{^}}func_stored_fi_to_global_huge_frame_offset: -; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} +; GCN-DAG: v_lshr_b32_e64 [[FI_TMP_0:v[0-9]+]], s32, 6 +; GCN-DAG: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4{{$}} -; GCN: v_lshr_b32_e64 [[FI_TMP:v[0-9]+]], s32, 6 -; GCN: v_add_i32_e32 [[BASE_0_1:v[0-9]+]], vcc, 4, [[FI_TMP]]{{$}} +; GCN-DAG: v_add_i32_e32 [[FI_0:v[0-9]+]], vcc, 0x4000, [[FI_TMP_0]]{{$}} ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]] -; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]] -; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} - -; GCN: buffer_store_dword [[BASE_1_OFF_2]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 +; GCN: buffer_store_dword [[K]], [[FI_0]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} +; GCN: v_lshr_b32_e64 [[FI_TMP_1:v[0-9]+]], s32, 6 +; GCN: v_add_i32_e32 [[BASE_0_1:v[0-9]+]], vcc, 60, [[FI_TMP_1]]{{$}} +; GCN: buffer_store_dword [[BASE_0_1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 define void @func_stored_fi_to_global_huge_frame_offset(ptr addrspace(1) %ptr) #0 { %tmp0 = alloca [4096 x i32], addrspace(5) %tmp1 = alloca [4096 x i32], addrspace(5) @@ -217,9 +212,9 @@ entry: ret void } -; FIXME: This is broken, and the sgpr input just gets replaced with a VGPR ; GCN-LABEL: {{^}}func_alloca_offset0__use_asm_sgpr: -; GCN: v_lshr_b32_e64 [[FI:v[0-9]+]], s32, 6 +; GCN: s_lshr_b32 [[FI:s[0-9]+]], s32, 6 +; GCN-NOT: [[FI]] 
; GCN: ; use [[FI]] define void @func_alloca_offset0__use_asm_sgpr() { %alloca = alloca i32, addrspace(5) @@ -238,9 +233,9 @@ define void @func_alloca_offset0__use_asm_vgpr() { } ; GCN-LABEL: {{^}}func_alloca_offset0__use_asm_phys_sgpr: -; GCN: s_lshr_b32 s8, s32, 6 +; GCN: s_lshr_b32 [[FI:s[0-9]+]], s32, 6 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use s8 +; GCN-NEXT: ; use [[FI]] define void @func_alloca_offset0__use_asm_phys_sgpr() { %alloca = alloca i32, addrspace(5) call void asm sideeffect "; use $0", "{s8}"(ptr addrspace(5) %alloca) @@ -258,12 +253,11 @@ define void @func_alloca_offset0__use_asm_phys_vgpr() { } ; GCN-LABEL: {{^}}func_alloca_offset_use_asm_sgpr: -; GCN: v_lshr_b32_e64 [[FI0_TMP0:v[0-9]+]], s32, 6 -; GCN-NEXT: v_add_i32_e32 [[FI0:v[0-9]+]], vcc, 16, [[FI0_TMP0]] +; GCN: s_lshr_b32 [[FI0_TMP0:s[0-9]+]], s32, 6 +; GCN-NEXT: s_add_i32 [[FI0:s[0-9]+]], [[FI0_TMP0]], 16 -; GCN: v_lshr_b32_e64 [[TMP:v[0-9]+]], s32, 6 -; GCN-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GCN-NEXT: v_add_i32_e32 [[TMP]], vcc, vcc_lo, [[TMP]] +; GCN: s_lshr_b32 [[TMP:s[0-9]+]], s32, 6 +; GCN-NEXT: s_addk_i32 [[TMP]], 0x4010 ; GCN-NEXT: ;;#ASMSTART ; GCN: ; use [[TMP]] define void @func_alloca_offset_use_asm_sgpr() { @@ -274,19 +268,17 @@ define void @func_alloca_offset_use_asm_sgpr() { ret void } -; FIXME: Shouldn't need to materialize constant ; GCN-LABEL: {{^}}func_alloca_offset_use_asm_vgpr: -; GCN: v_lshr_b32_e64 [[FI0_TMP:v[0-9]+]], s32, 6 -; GCN-NEXT: v_add_i32_e32 [[FI0:v[0-9]+]], vcc, 16, [[FI0_TMP]] +; GCN: s_lshr_b32 [[S_FI:s[0-9]+]], s32, 6 +; GCN: v_lshr_b32_e64 [[V_FI:v[0-9]+]], s32, 6 +; GCN: s_movk_i32 vcc_lo, 0x4010 +; GCN: s_add_i32 [[S_FI]], [[S_FI]], 16 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use [[FI0]] +; GCN-NEXT: ; use [[S_FI]] ; GCN-NEXT: ;;#ASMEND - -; GCN: v_lshr_b32_e64 [[FI1_TMP:v[0-9]+]], s32, 6 -; GCN-NEXT: s_movk_i32 vcc_lo, 0x4010 -; GCN-NEXT: v_add_i32_e32 [[FI1:v[0-9]+]], vcc, vcc_lo, [[FI1_TMP]] +; GCN-NEXT: v_add_i32_e32 [[V_FI:v[0-9]+]], vcc, 
vcc_lo, [[V_FI]] ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use [[FI1]] +; GCN-NEXT: ; use [[V_FI]] ; GCN-NEXT: ;;#ASMEND define void @func_alloca_offset_use_asm_vgpr() { %alloca0 = alloca [4096 x i32], align 16, addrspace(5) @@ -296,17 +288,15 @@ define void @func_alloca_offset_use_asm_vgpr() { ret void } -; FIXME: Using VGPR for SGPR input ; GCN-LABEL: {{^}}kernel_alloca_offset_use_asm_sgpr: -; GCN: v_mov_b32_e32 v0, 16 +; GCN: s_mov_b32 [[FI0:s[0-9]+]], 16 ; GCN-NOT: v0 ; GCN: ;;#ASMSTART -; GCN-NEXT: ; use v0 +; GCN-NEXT: ; use [[FI0]] ; GCN-NEXT: ;;#ASMEND - -; GCN: v_mov_b32_e32 v0, 0x4010 +; GCN: s_movk_i32 [[FI1:s[0-9]+]], 0x4010 ; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; use v0 +; GCN-NEXT: ; use [[FI1]] ; GCN-NEXT: ;;#ASMEND define amdgpu_kernel void @kernel_alloca_offset_use_asm_sgpr() { %alloca0 = alloca [4096 x i32], align 16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index d94e75c8c8e223..d36dcc247331c7 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -699,8 +699,8 @@ define amdgpu_kernel void @commute_uno_2.0_f64(ptr addrspace(1) %out, ptr addrsp ; GCN-LABEL: {{^}}commute_frameindex: ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} -; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} -; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]] +; GCN: s_mov_b32 [[FI:s[0-9]+]], 0{{$}} +; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} define amdgpu_kernel void @commute_frameindex(ptr addrspace(1) nocapture %out) #0 { entry: %stack0 = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index e1c9fed9df4892..3019d4d298eb45 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -567,7 +567,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v1 ; 
GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 24, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffe8, v1 ; GFX10-GISEL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-GISEL-NEXT: s_endpgm ; @@ -1566,7 +1566,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 24 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe8 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: global_store_byte v1, v0, s[0:1] @@ -1686,7 +1686,7 @@ define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias % ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v2, v1 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX10-GISEL-NEXT: v_min_u32_e32 v2, 32, v2 -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v2, 16, v2 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, -16, v2 ; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo ; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ; GFX10-GISEL-NEXT: v_ffbh_u32_e32 v1, v0 ; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10-GISEL-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v1, 25 +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffe7 ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo ; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7f, v0 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index d94ec56842ab87..147ddc4d4b75b2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -1313,7 +1313,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 
%rhs) { ; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8 ; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13] -; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8 +; GFX9-G-NEXT: v_add_u32_e32 v9, 0xffffffc0, v8 ; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 @@ -1338,7 +1338,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11] ; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13] -; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20 +; GFX9-G-NEXT: v_add_u32_e32 v24, 0xffffffc0, v20 ; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13] ; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2 ; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3 @@ -2070,8 +2070,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v4 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v18, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v18, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v18 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 @@ -2203,8 +2204,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v8, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_add_u32_e64 v2, v8, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v14, v0, v8 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 @@ -3453,7 +3455,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: 
v_sub_u32_e32 v8, 64, v16 ; GFX9-G-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1] ; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], v16, v[2:3] -; GFX9-G-NEXT: v_subrev_u32_e32 v14, 64, v16 +; GFX9-G-NEXT: v_add_u32_e32 v14, 0xffffffc0, v16 ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v16, v[0:1] ; GFX9-G-NEXT: v_or_b32_e32 v10, v8, v10 ; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11 @@ -3476,7 +3478,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader ; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18 -; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18 +; GFX9-G-NEXT: v_add_u32_e32 v22, 0xffffffc0, v18 ; GFX9-G-NEXT: v_lshrrev_b64 v[10:11], v18, v[0:1] ; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3] ; GFX9-G-NEXT: v_lshrrev_b64 v[16:17], v18, v[2:3] @@ -4175,8 +4177,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v7 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v12, v4 +; GFX9-G-O0-NEXT: s_mov_b32 s5, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-G-O0-NEXT: v_add_u32_e64 v4, v12, v4 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v12 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 @@ -4311,8 +4314,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-G-O0-NEXT: s_mov_b32 s7, 64 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v8 -; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-G-O0-NEXT: v_sub_u32_e64 v2, v3, v0 +; GFX9-G-O0-NEXT: s_mov_b32 s6, 0xffffffc0 +; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-G-O0-NEXT: v_add_u32_e64 v2, v3, v0 ; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s7 ; GFX9-G-O0-NEXT: v_sub_u32_e64 v8, v0, v3 ; GFX9-G-O0-NEXT: s_mov_b32 s6, 0 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index e04cd711256081..691f3d36bc7360 100644 --- 
a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -476,18 +476,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v2 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v8, v8, v2 -; GISEL-NEXT: v_or_b32_e32 v9, v1, v3 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_or_b32_e32 v9, v3, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -505,12 +505,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v2, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc -; GISEL-NEXT: 
v_subrev_i32_e64 v20, s[4:5], 64, v32 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v2 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v32, v2 ; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32 @@ -536,7 +537,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 0xffffffc0, v28 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28 @@ -665,18 +666,18 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v2 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: 
v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v10, v10, v2 -; GISEL-NEXT: v_or_b32_e32 v11, v1, v3 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v10, v10, v0 +; GISEL-NEXT: v_or_b32_e32 v11, v3, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc @@ -694,12 +695,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB0_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc -; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v2 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v30, v2 ; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30 @@ -725,7 +727,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26 @@ -1229,18 +1231,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; 
GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc -; GISEL-NEXT: v_sub_i32_e32 v20, vcc, v2, v3 -; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25] +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v2, v3 +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v20 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v22 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v2, v2, v22 -; GISEL-NEXT: v_or_b32_e32 v3, v21, v23 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v20 +; GISEL-NEXT: v_or_b32_e32 v3, v23, v21 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -1258,12 +1260,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20 -; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc -; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v20 -; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v22, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v23, vcc -; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v30 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v22 +; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v23, vcc +; 
GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v22 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v20, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v21, vcc +; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v30, v2 ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30 @@ -1289,7 +1292,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26 +; GISEL-NEXT: v_add_i32_e32 v32, vcc, 0xffffffc0, v26 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v26 ; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v26 @@ -1401,18 +1404,18 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[10:11] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v16 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v8, v8, v16 -; GISEL-NEXT: v_or_b32_e32 v9, v1, v17 -; GISEL-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[16:17] +; GISEL-NEXT: v_or_b32_e32 v8, v8, v0 +; GISEL-NEXT: v_or_b32_e32 v9, v17, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc @@ -1430,12 +1433,13 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB1_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v16, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v17, vcc -; GISEL-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v26 +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v16 +; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v17, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 +; GISEL-NEXT: v_not_b32_e32 v9, 63 +; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v26, v9 ; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26 @@ -1461,7 +1465,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v8 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8 ; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8 @@ -2072,18 +2076,18 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, 
v2, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v2 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v18, v18, v2 -; GISEL-NEXT: v_or_b32_e32 v19, v1, v3 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v0 +; GISEL-NEXT: v_or_b32_e32 v19, v3, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc @@ -2101,12 +2105,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v2, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v3, vcc -; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v3, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v2 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 
v34, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2 ; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24 @@ -2132,7 +2137,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31 +; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v31 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v31 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v31 @@ -2262,18 +2267,18 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v14, vcc -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v0, v1 +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v0, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v0 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v14 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v2, v2, v14 -; GISEL-NEXT: v_or_b32_e32 v3, v1, v15 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] +; GISEL-NEXT: v_or_b32_e32 v2, v2, v0 +; GISEL-NEXT: v_or_b32_e32 v3, v15, v1 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 
v21, v22, v21, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -2291,12 +2296,13 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB2_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc -; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0 -; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v14, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v15, vcc -; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24 +; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v14 +; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v15, vcc +; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v14 +; GISEL-NEXT: v_not_b32_e32 v2, 63 +; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v0, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v1, vcc +; GISEL-NEXT: v_add_i32_e64 v20, s[4:5], v24, v2 ; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24 ; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24 @@ -2322,7 +2328,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36 +; GISEL-NEXT: v_add_i32_e32 v24, vcc, 0xffffffc0, v36 ; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36 ; GISEL-NEXT: v_lshr_b64 v[0:1], v[6:7], v36 ; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v36 @@ -2903,18 +2909,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] ; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: 
v_subb_u32_e64 v19, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21] +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[18:19], v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v16 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v18 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v20, v20, v18 -; GISEL-NEXT: v_or_b32_e32 v21, v17, v19 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; GISEL-NEXT: v_or_b32_e32 v20, v20, v16 +; GISEL-NEXT: v_or_b32_e32 v21, v19, v17 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc @@ -2932,12 +2938,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_6 ; GISEL-NEXT: ; %bb.1: ; %udiv-bb15 -; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc -; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16 -; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v18, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc -; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v26 +; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v19, vcc +; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v18 +; GISEL-NEXT: v_not_b32_e32 v18, 63 +; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v16, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e64 v22, s[4:5], v26, v18 ; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26 ; GISEL-NEXT: 
v_lshl_b64 v[16:17], v[0:1], v26 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26 @@ -2963,7 +2970,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_5 ; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4 -; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30 +; GISEL-NEXT: v_add_i32_e32 v26, vcc, 0xffffffc0, v30 ; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[2:3], v30 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[0:1], v30 @@ -3075,18 +3082,18 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25] +; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v16, v17 +; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[22:23], v[24:25] ; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v16 -; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v22 +; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v18, v18, v22 -; GISEL-NEXT: v_or_b32_e32 v19, v17, v23 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23] +; GISEL-NEXT: v_or_b32_e32 v18, v18, v16 +; GISEL-NEXT: v_or_b32_e32 v19, v23, v17 +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] ; GISEL-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] ; GISEL-NEXT: 
v_cndmask_b32_e64 v18, 0, 1, vcc @@ -3104,12 +3111,13 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5] ; GISEL-NEXT: s_cbranch_execz .LBB3_12 ; GISEL-NEXT: ; %bb.7: ; %udiv-bb1 -; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16 -; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v17, vcc -; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16 -; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v22, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v23, vcc -; GISEL-NEXT: v_subrev_i32_e64 v24, s[4:5], 64, v28 +; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v22 +; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v23, vcc +; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v22 +; GISEL-NEXT: v_not_b32_e32 v18, 63 +; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v16, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v17, vcc +; GISEL-NEXT: v_add_i32_e64 v24, s[4:5], v28, v18 ; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28 ; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28 ; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28 @@ -3135,7 +3143,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_11 ; GISEL-NEXT: ; %bb.8: ; %udiv-preheader -; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34 +; GISEL-NEXT: v_add_i32_e32 v28, vcc, 0xffffffc0, v34 ; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34 ; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v34 ; GISEL-NEXT: v_lshr_b64 v[18:19], v[4:5], v34 diff --git a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll index 3de8db2c6a448e..0e5a68773a6ba8 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-lowbits.ll @@ -163,6 +163,31 @@ define i32 @bzhi32_d0(i32 %val, i32 %numlowbits) nounwind { ret i32 %masked } +define i32 @bzhi32_d0_even(i32 %val, i32 %numlowbits) nounwind { +; SI-LABEL: bzhi32_d0_even: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; SI-NEXT: v_sub_i32_e32 v1, vcc, 32, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; SI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: bzhi32_d0_even: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v1, 1, v1 +; VI-NEXT: v_sub_u32_e32 v1, vcc, 32, v1 +; VI-NEXT: v_lshlrev_b32_e32 v0, v1, v0 +; VI-NEXT: v_lshrrev_b32_e32 v0, v1, v0 +; VI-NEXT: s_setpc_b64 s[30:31] + %times2 = shl i32 %numlowbits, 1 + %numhighbits = sub i32 32, %times2 + %highbitscleared = shl i32 %val, %numhighbits + %masked = lshr i32 %highbitscleared, %numhighbits + ret i32 %masked +} + define i32 @bzhi32_d1_indexzext(i32 %val, i8 %numlowbits) nounwind { ; SI-LABEL: bzhi32_d1_indexzext: ; SI: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll index 77eb9c495cead5..066c04b1af088d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -15,18 +15,16 @@ define amdgpu_kernel void @soff1_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 
v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -145,18 +143,17 @@ define amdgpu_kernel void @soff1_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -282,18 +279,17 @@ define amdgpu_kernel void @soff1_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff1_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; 
GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -419,19 +415,17 @@ define amdgpu_kernel void @soff2_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 
4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -556,14 +550,13 @@ define amdgpu_kernel void @soff2_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 @@ -698,14 +691,13 @@ define amdgpu_kernel void @soff2_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff2_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 @@ -840,19 +832,17 @@ define amdgpu_kernel void @soff4_voff1(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff1: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword 
s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_add_u32_e32 v0, v1, v0 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX940-SDAG-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX940-SDAG-NEXT: scratch_store_byte v2, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v1, v2, off sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 +; GFX940-SDAG-NEXT: scratch_store_byte v3, v1, off sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 @@ -977,14 +967,13 @@ define amdgpu_kernel void @soff4_voff2(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff2: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v2 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v1, 2, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 @@ -1119,17 +1108,16 @@ define 
amdgpu_kernel void @soff4_voff4(i32 %soff) { ; GFX940-SDAG-LABEL: soff4_voff4: ; GFX940-SDAG: ; %bb.0: ; %bb ; GFX940-SDAG-NEXT: s_load_dword s0, s[4:5], 0x24 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 0 ; GFX940-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX940-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX940-SDAG-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-SDAG-NEXT: v_add_u32_e32 v1, s0, v1 -; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1 -; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 2 -; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:1 sc0 sc1 +; GFX940-SDAG-NEXT: v_mov_b32_e32 v3, s0 +; GFX940-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v3 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:1 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX940-SDAG-NEXT: scratch_store_byte v0, v1, off offset:2 sc0 sc1 +; GFX940-SDAG-NEXT: scratch_store_byte v0, v2, off offset:2 sc0 sc1 ; GFX940-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX940-SDAG-NEXT: v_add_u32_e32 v0, 4, v0 ; GFX940-SDAG-NEXT: v_mov_b32_e32 v1, 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll index 8123f1270ab65d..8290942e46e6a1 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -4688,13 +4688,13 @@ define amdgpu_ps void @large_offset() { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, 16 +; GFX9-NEXT: s_mov_b32 s0, 16 ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v0, 0x810 +; GFX9-NEXT: s_movk_i32 s0, 0x810 ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v0 +; GFX9-NEXT: ; use s0 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_endpgm ; @@ -4705,27 +4705,29 @@ define amdgpu_ps void @large_offset() { ; GFX10-NEXT: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_movk_i32 s0, 0xbd0 +; GFX10-NEXT: s_movk_i32 s0, 0x810 +; GFX10-NEXT: s_add_i32 s1, s0, 0x3c0 ; GFX10-NEXT: v_mov_b32_e32 v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 +; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s1 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc +; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x810 +; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v0 +; GFX10-NEXT: ; use s1 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v1 +; GFX10-NEXT: ; use s0 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: large_offset: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: s_mov_b32 s0, 16 +; GFX11-NEXT: s_movk_i32 s1, 0x810 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-NEXT: v_mov_b32_e32 v2, v0 @@ -4734,18 +4736,19 @@ define amdgpu_ps void @large_offset() { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 +; GFX11-NEXT: ; use s0 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v1 +; GFX11-NEXT: ; use s1 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: large_offset: ; GFX12: ; %bb.0: ; %bb ; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_movk_i32 s1, 0x800 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; 
GFX12-NEXT: v_mov_b32_e32 v3, v0 @@ -4753,12 +4756,11 @@ define amdgpu_ps void @large_offset() { ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800 ; GFX12-NEXT: ;;#ASMSTART -; GFX12-NEXT: ; use v0 +; GFX12-NEXT: ; use s0 ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: ;;#ASMSTART -; GFX12-NEXT: ; use v1 +; GFX12-NEXT: ; use s1 ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_endpgm ; @@ -4780,13 +4782,13 @@ define amdgpu_ps void @large_offset() { ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16 +; GFX9-PAL-NEXT: s_mov_b32 s0, 16 ; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ; use s0 ; GFX9-PAL-NEXT: ;;#ASMEND -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x810 +; GFX9-PAL-NEXT: s_movk_i32 s0, 0x810 ; GFX9-PAL-NEXT: ;;#ASMSTART -; GFX9-PAL-NEXT: ; use v0 +; GFX9-PAL-NEXT: ; use s0 ; GFX9-PAL-NEXT: ;;#ASMEND ; GFX9-PAL-NEXT: s_endpgm ; @@ -4800,49 +4802,82 @@ define amdgpu_ps void @large_offset() { ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v0, 16 +; GFX940-NEXT: s_mov_b32 s0, 16 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ; use s0 ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, 0x810 +; GFX940-NEXT: s_movk_i32 s0, 0x810 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use v0 +; GFX940-NEXT: ; use s0 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_endpgm ; -; GFX10-PAL-LABEL: large_offset: -; GFX10-PAL: ; %bb.0: ; %bb -; GFX10-PAL-NEXT: s_getpc_b64 s[2:3] -; GFX10-PAL-NEXT: s_mov_b32 s2, s0 -; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 -; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff -; 
GFX10-PAL-NEXT: s_add_u32 s2, s2, s0 -; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 -; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-PAL-NEXT: s_movk_i32 s0, 0xbd0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0 -; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 -; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc -; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 16 -; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x810 -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v0 -; GFX10-PAL-NEXT: ;;#ASMEND -; GFX10-PAL-NEXT: ;;#ASMSTART -; GFX10-PAL-NEXT: ; use v1 -; GFX10-PAL-NEXT: ;;#ASMEND -; GFX10-PAL-NEXT: s_endpgm +; GFX1010-PAL-LABEL: large_offset: +; GFX1010-PAL: ; %bb.0: ; %bb +; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1010-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s0 +; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x810 +; GFX1010-PAL-NEXT: s_add_i32 s1, s0, 0x3c0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 +; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1010-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc +; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1010-PAL-NEXT: s_mov_b32 s1, 16 +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; 
use s1 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: ;;#ASMSTART +; GFX1010-PAL-NEXT: ; use s0 +; GFX1010-PAL-NEXT: ;;#ASMEND +; GFX1010-PAL-NEXT: s_endpgm +; +; GFX1030-PAL-LABEL: large_offset: +; GFX1030-PAL: ; %bb.0: ; %bb +; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3] +; GFX1030-PAL-NEXT: s_mov_b32 s2, s0 +; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0) +; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff +; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s0 +; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x810 +; GFX1030-PAL-NEXT: s_add_i32 s1, s0, 0x3c0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, v0 +; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, v0 +; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s1 +; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1030-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s1 glc dlc +; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) +; GFX1030-PAL-NEXT: s_mov_b32 s1, 16 +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use s1 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: ;;#ASMSTART +; GFX1030-PAL-NEXT: ; use s0 +; GFX1030-PAL-NEXT: ;;#ASMEND +; GFX1030-PAL-NEXT: s_endpgm ; ; GFX11-PAL-LABEL: large_offset: ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-PAL-NEXT: s_mov_b32 s0, 16 +; GFX11-PAL-NEXT: s_movk_i32 s1, 0x810 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0 @@ -4851,18 +4886,19 @@ define amdgpu_ps void @large_offset() { ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX11-PAL-NEXT: 
;;#ASMSTART -; GFX11-PAL-NEXT: ; use v0 +; GFX11-PAL-NEXT: ; use s0 ; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: ;;#ASMSTART -; GFX11-PAL-NEXT: ; use v1 +; GFX11-PAL-NEXT: ; use s1 ; GFX11-PAL-NEXT: ;;#ASMEND ; GFX11-PAL-NEXT: s_endpgm ; ; GFX12-PAL-LABEL: large_offset: ; GFX12-PAL: ; %bb.0: ; %bb ; GFX12-PAL-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-PAL-NEXT: s_mov_b32 s0, 0 +; GFX12-PAL-NEXT: s_movk_i32 s1, 0x800 ; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-PAL-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0 ; GFX12-PAL-NEXT: v_mov_b32_e32 v3, v0 @@ -4870,12 +4906,11 @@ define amdgpu_ps void @large_offset() { ; GFX12-PAL-NEXT: s_wait_storecnt 0x0 ; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS ; GFX12-PAL-NEXT: s_wait_loadcnt 0x0 -; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x800 ; GFX12-PAL-NEXT: ;;#ASMSTART -; GFX12-PAL-NEXT: ; use v0 +; GFX12-PAL-NEXT: ; use s0 ; GFX12-PAL-NEXT: ;;#ASMEND ; GFX12-PAL-NEXT: ;;#ASMSTART -; GFX12-PAL-NEXT: ; use v1 +; GFX12-PAL-NEXT: ; use s1 ; GFX12-PAL-NEXT: ;;#ASMEND ; GFX12-PAL-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 05a4c2a1c04644..a0fe9d88e31cf9 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -234,17 +234,17 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB0_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 
v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -602,17 +602,17 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB1_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -963,17 +963,17 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB2_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 
v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -1318,17 +1318,17 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7] ; GISEL-NEXT: s_cbranch_execz .LBB3_4 ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else -; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0 -; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6 -; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 +; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] -; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 ; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], 
v12, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7] @@ -1699,7 +1699,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] -; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_lshl_or_b32 v11, v11, 16, v11 @@ -2048,7 +2048,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[7:8] -; GISEL-NEXT: v_subrev_u32_e32 v4, 64, v6 +; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6 ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6 ; GISEL-NEXT: v_lshl_or_b32 v11, v11, 16, v11 diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index e3cd8028422ddb..c11b7d67a8a214 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -82,13 +82,13 @@ define void @func_add_constant_to_fi_i32() #0 { ; vgpr ; GCN-LABEL: {{^}}func_other_fi_user_i32: +; MUBUF: s_lshr_b32 [[SCALED:s[0-9]+]], s32, 6 +; MUBUF: s_mul_i32 [[MUL:s[0-9]+]], [[SCALED]], 9 +; MUBUF: v_mov_b32_e32 v0, [[MUL]] -; CI: v_lshr_b32_e64 v0, s32, 6 +; GFX9-FLATSCR: s_mul_i32 [[MUL:s[0-9]+]], s32, 9 +; GFX9-FLATSCR: v_mov_b32_e32 v0, [[MUL]] -; GFX9-MUBUF: v_lshrrev_b32_e64 v0, 6, s32 -; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 - -; GCN-NEXT: v_mul_lo_u32 v0, v0, 9 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_other_fi_user_i32() #0 { @@ -126,8 +126,7 @@ define void @func_load_private_arg_i32_ptr(ptr addrspace(5) %ptr) #0 { ; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GFX9-MUBUF-NEXT: 
v_or_b32_e32 v0, 4, [[SHIFT]] -; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 -; GFX9-FLATSCR-NEXT: v_or_b32_e32 v0, 4, [[SP]] +; GFX9-FLATSCR: v_or_b32_e64 v0, s32, 4 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -190,17 +189,16 @@ ret: ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 -; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] +; MUBUF: s_lshr_b32 [[SCALED:s[0-9]+]], s32, 6 +; MUBUF: s_addk_i32 [[SCALED]], 0x200 -; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; MUBUF: s_mul_i32 [[Z:s[0-9]+]], [[SCALED]], 9 +; MUBUF: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] -; GFX9-FLATSCR-DAG: s_add_i32 [[SZ:[^,]+]], s32, 0x200 -; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] +; GFX9-FLATSCR: s_add_i32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: s_mul_i32 [[MUL:s[0-9]+]], [[SZ]], 9 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[MUL]] -; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9 ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) @@ -215,17 +213,15 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: -; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 -; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] - -; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; MUBUF: s_lshr_b32 [[SCALED:s[0-9]+]], s32, 6 +; MUBUF: s_addk_i32 [[SCALED]], 0x200 +; MUBUF: s_mul_i32 [[Z:s[0-9]+]], [[SCALED]], 9 +; MUBUF: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] -; GFX9-FLATSCR-DAG: s_add_i32 [[SZ:[^,]+]], s32, 0x200 
-; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] +; GFX9-FLATSCR: s_add_i32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: s_mul_i32 [[MUL:s[0-9]+]], [[SZ]], 9 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[MUL]] -; GCN: v_mul_lo_u32 [[VZ]], [[VZ]], 9 ; GCN: ds_write_b32 v0, [[VZ]] define void @func_other_fi_user_non_inline_imm_offset_i32_vcc_live() #0 { %alloca0 = alloca [128 x i32], align 4, addrspace(5) @@ -284,8 +280,7 @@ bb5: ; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GFX9-MUBUF-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] -; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 -; GFX9-FLATSCR-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SP]] +; GFX9-FLATSCR: v_or_b32_e64 [[PTR:v[0-9]+]], s32, 4 ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll index 6007dede902209..ea39df0ca01e2b 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -93,7 +93,7 @@ define amdgpu_kernel void @minimal_kernel_inputs_with_stack() #0 { ; WORKAROUND: .amdhsa_user_sgpr_count 15 ; NOWORKAROUND: .amdhsa_user_sgpr_count 4 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 0 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 @@ -140,7 +140,7 @@ define amdgpu_kernel void @queue_ptr() #1 { ; WORKAROUND: .amdhsa_user_sgpr_count 13 ; NOWORKAROUND: .amdhsa_user_sgpr_count 8 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_ptr 1 -; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 0 +; GCN-NEXT: .amdhsa_user_sgpr_queue_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_kernarg_segment_ptr 1 ; GCN-NEXT: .amdhsa_user_sgpr_dispatch_id 1 ; GCN-NEXT: .amdhsa_user_sgpr_private_segment_size 0 diff --git 
a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll index cb0452d4c99b59..77924222919984 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -4275,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4300,26 +4300,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; 
GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -4331,16 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; 
GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7823,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_max_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7848,26 +7848,26 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; 
GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -7879,16 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], v[41:42] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: 
buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll index 75447fc971c8b8..cb3291df891af4 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -4275,20 +4275,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB7_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -4300,26 +4300,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: s_load_dwordx2 s[44:45], s[34:35], 0x24 ; 
GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB7_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -4331,16 +4331,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42] +; 
GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] @@ -7823,20 +7823,20 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_movk_i32 s32, 0x800 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0x7ff80000 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0x7ff80000 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: .LBB11_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1] -; GFX1064-NEXT: v_max_f64 v[2:3], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4 ; GFX1064-NEXT: v_max_f64 v[4:5], s[2:3], s[2:3] ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX1064-NEXT: v_min_f64 v[3:4], v[2:3], v[4:5] +; GFX1064-NEXT: v_min_f64 v[2:3], v[2:3], v[4:5] ; GFX1064-NEXT: s_cbranch_scc1 .LBB11_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 @@ -7848,26 +7848,26 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: ; %bb.3: ; GFX1064-NEXT: 
s_load_dwordx2 s[44:45], s[34:35], 0x24 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 -; GFX1064-NEXT: v_max_f64 v[41:42], v[3:4], v[3:4] +; GFX1064-NEXT: v_max_f64 v[41:42], v[2:3], v[2:3] ; GFX1064-NEXT: s_mov_b64 s[46:47], 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45] +; GFX1064-NEXT: global_load_dwordx2 v[4:5], v0, s[44:45] ; GFX1064-NEXT: .LBB11_4: ; %atomicrmw.start ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) -; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2] +; GFX1064-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5] ; GFX1064-NEXT: s_add_u32 s8, s34, 44 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0 ; GFX1064-NEXT: s_getpc_b64 s[0:1] ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12 -; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4 -; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 +; GFX1064-NEXT: buffer_store_dword v5, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40 -; GFX1064-NEXT: v_mov_b32_e32 v0, 8 -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: v_mov_b32_e32 v2, s44 +; GFX1064-NEXT: v_mov_b32_e32 v3, s45 +; GFX1064-NEXT: v_mov_b32_e32 v4, 0 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0 @@ -7879,16 +7879,16 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau ; GFX1064-NEXT: s_mov_b32 s13, s42 ; GFX1064-NEXT: s_mov_b32 s14, s33 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51] -; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], v[41:42] -; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12 -; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8 -; GFX1064-NEXT: v_mov_b32_e32 v3, s45 -; GFX1064-NEXT: v_mov_b32_e32 v4, 0 +; GFX1064-NEXT: 
v_min_f64 v[0:1], v[0:1], v[41:42] +; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12 +; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8 +; GFX1064-NEXT: v_mov_b32_e32 v0, 8 +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0 -; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4 +; GFX1064-NEXT: buffer_load_dword v4, off, s[48:51], 0 +; GFX1064-NEXT: buffer_load_dword v5, off, s[48:51], 0 offset:4 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_or_b64 s[46:47], vcc, s[46:47] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 605a58125bb79a..6bafaad582901b 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -950,6 +950,7 @@ entry: ret void } +; FIXME: Fold out s_or_b32 s2, 0, s3 define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 %sel) { ; GCN-LABEL: bit4_inselt: ; GCN: ; %bb.0: ; %entry @@ -962,21 +963,20 @@ define amdgpu_kernel void @bit4_inselt(ptr addrspace(1) %out, <4 x i1> %vec, i32 ; GCN-NEXT: s_addc_u32 s13, s13, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_bfe_u32 s6, s2, 0x10003 -; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v0, s2 ; GCN-NEXT: s_bfe_u32 s5, s2, 0x20002 -; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: s_bfe_u32 s4, s2, 0x10001 -; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:3 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:3 +; GCN-NEXT: v_mov_b32_e32 v0, s5 ; GCN-NEXT: s_and_b32 s3, s3, 3 -; 
GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:2 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: v_or_b32_e32 v0, s3, v0 -; GCN-NEXT: buffer_store_byte v1, off, s[12:15], 0 offset:1 -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: buffer_store_byte v1, v0, s[12:15], 0 offen +; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:2 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_or_b32_e64 v1, s3, 0 +; GCN-NEXT: buffer_store_byte v0, off, s[12:15], 0 offset:1 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: buffer_store_byte v0, v1, s[12:15], 0 offen ; GCN-NEXT: buffer_load_ubyte v0, off, s[12:15], 0 ; GCN-NEXT: buffer_load_ubyte v1, off, s[12:15], 0 offset:1 ; GCN-NEXT: buffer_load_ubyte v2, off, s[12:15], 0 offset:2 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 2999ddb8315883..f372a54894604c 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -193,32 +193,32 @@ define float @sitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v13, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v14, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: 
v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 @@ -438,32 +438,32 @@ define float @uitofp_i128_to_f32(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v12, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v10, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v13, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: 
v_lshrrev_b64 v[8:9], v13, -1 ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 -; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 @@ -723,34 +723,34 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v14, v[2:3] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 +; GISEL-NEXT: v_add_u32_e32 v15, 0xffffffc0, v14 ; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[4:5] ; GISEL-NEXT: v_or_b32_e32 v10, v0, v10 ; GISEL-NEXT: v_or_b32_e32 v11, v1, v11 ; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GISEL-NEXT: v_add_u32_e32 v9, 55, v9 +; GISEL-NEXT: v_add_u32_e32 v15, 55, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc -; GISEL-NEXT: v_sub_u32_e32 v12, 64, v9 +; 
GISEL-NEXT: v_sub_u32_e32 v12, 64, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v0, v2, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v10, v1, v3, s[4:5] -; GISEL-NEXT: v_lshrrev_b64 v[0:1], v9, -1 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, -1 ; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v9 +; GISEL-NEXT: v_add_u32_e32 v9, -9, v9 ; GISEL-NEXT: v_or_b32_e32 v16, v0, v12 ; GISEL-NEXT: v_or_b32_e32 v17, v1, v13 -; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v9, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v9, v12, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v0, v0, v4 ; GISEL-NEXT: v_and_b32_e32 v1, v1, v5 ; GISEL-NEXT: v_and_or_b32 v0, v9, v2, v0 @@ -999,35 +999,35 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13 +; GISEL-NEXT: v_add_u32_e32 v14, 0xffffffc0, v13 ; GISEL-NEXT: v_lshrrev_b64 v[11:12], v13, v[2:3] ; GISEL-NEXT: v_or_b32_e32 v9, v4, v9 ; GISEL-NEXT: v_or_b32_e32 v10, v5, v10 ; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 -; GISEL-NEXT: v_add_u32_e32 v8, 55, v8 +; GISEL-NEXT: v_add_u32_e32 v15, 55, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc ; 
GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc -; GISEL-NEXT: v_sub_u32_e32 v12, 64, v8 +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5] ; GISEL-NEXT: v_cndmask_b32_e64 v9, v5, v1, s[4:5] -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v8, -1 +; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, -1 ; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v8 +; GISEL-NEXT: v_add_u32_e32 v8, -9, v8 ; GISEL-NEXT: v_or_b32_e32 v16, v4, v12 ; GISEL-NEXT: v_or_b32_e32 v17, v5, v13 -; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v8, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 ; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v8, v12, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v4, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v5, v3 ; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2 @@ -1284,32 +1284,32 @@ define half @sitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v13, 0xffffffc0, v4 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v12, v10, v12 ; GISEL-NEXT: v_lshrrev_b64 
v[9:10], v13, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v14, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 ; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 ; GISEL-NEXT: v_or_b32_e32 v16, v10, v12 -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[11:12], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v11, v12, v16, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 @@ -1531,32 +1531,32 @@ define half @uitofp_i128_to_f16(i128 %x) { ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] -; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4 +; GISEL-NEXT: v_add_u32_e32 v12, 0xffffffc0, v4 ; 
GISEL-NEXT: v_or_b32_e32 v10, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v11, v9, v11 ; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 -; GISEL-NEXT: v_add_u32_e32 v5, 26, v5 +; GISEL-NEXT: v_add_u32_e32 v13, 26, v5 ; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5 +; GISEL-NEXT: v_sub_u32_e32 v10, 64, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc -; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1 +; GISEL-NEXT: v_lshrrev_b64 v[8:9], v13, -1 ; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1 -; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5 +; GISEL-NEXT: v_add_u32_e32 v5, 0xffffffda, v5 ; GISEL-NEXT: v_or_b32_e32 v14, v8, v10 ; GISEL-NEXT: v_or_b32_e32 v15, v9, v11 -; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5 +; GISEL-NEXT: v_lshrrev_b64 v[10:11], v5, -1 +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v15, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13 ; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, -1, s[4:5] ; GISEL-NEXT: v_and_b32_e32 v2, v8, v2 ; GISEL-NEXT: v_and_b32_e32 v3, v9, v3 ; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll index e30d9331b0e341..3149d7c19b9292 100644 --- 
a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -19,9 +19,6 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v40, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: v_mov_b32_e32 v1, v0 ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v40, s8, 1 @@ -39,8 +36,8 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_addc_u32 s15, s9, s15 ; CHECK-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9 ; CHECK-NEXT: s_mov_b32 s9, s15 -; CHECK-NEXT: v_mov_b32_e32 v0, 0x2000 -; CHECK-NEXT: ; implicit-def: $sgpr15 +; CHECK-NEXT: s_mov_b32 s15, 0x2000 +; CHECK-NEXT: s_mov_b32 s18, s15 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, device_func@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s17, s17, device_func@gotpcrel32@hi+12 @@ -48,13 +45,14 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 { ; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3] ; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1] ; CHECK-NEXT: s_mov_b32 s15, 20 -; CHECK-NEXT: v_lshlrev_b32_e64 v3, s15, v3 -; CHECK-NEXT: s_mov_b32 s15, 10 ; CHECK-NEXT: v_lshlrev_b32_e64 v2, s15, v2 -; CHECK-NEXT: v_or3_b32 v31, v1, v2, v3 +; CHECK-NEXT: s_mov_b32 s15, 10 +; CHECK-NEXT: v_lshlrev_b32_e64 v1, s15, v1 +; CHECK-NEXT: v_or3_b32 v31, v0, v1, v2 ; CHECK-NEXT: ; implicit-def: $sgpr15 ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] +; CHECK-NEXT: v_mov_b32_e32 v0, s18 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 7698372b687797..da98eb40a129dd 100644 --- 
a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -38,7 +38,7 @@ ; GCNHSA: .amdhsa_system_sgpr_workgroup_id_z 1 ; GCNHSA: .amdhsa_system_sgpr_workgroup_info 0 ; GCNHSA: .amdhsa_system_vgpr_workitem_id 2 -; GCNHSA: .amdhsa_next_free_vgpr 3 +; GCNHSA: .amdhsa_next_free_vgpr {{2|3}} ; GCNHSA: .amdhsa_next_free_sgpr 18 ; GCNHSA: .amdhsa_float_round_mode_32 0 ; GCNHSA: .amdhsa_float_round_mode_16_64 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index e4b9299869334c..0dcfb840dec069 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -139,7 +139,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 { ; VARIANT6-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; VARIANT6-NEXT: s_wait_kmcnt 0x0 ; VARIANT6-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_and_b32 v4, 0x3ff, v0 -; VARIANT6-NEXT: s_sub_co_i32 s2, s2, 1 +; VARIANT6-NEXT: s_add_co_i32 s2, s2, -1 ; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) ; VARIANT6-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_lshlrev_b32 v5, 2, v4 ; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index b7c566f682e349..1d869559d9e772 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -395,7 +395,7 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -459,7 +459,7 @@ define i1 
@negnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -513,7 +513,7 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX7GLISEL-LABEL: possubnormal_f16: ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -575,7 +575,7 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v2, 0xffff, v1 ; GFX7GLISEL-NEXT: v_cmp_ne_u32_e32 vcc, v0, v2 -; GFX7GLISEL-NEXT: v_subrev_i32_e64 v0, s[4:5], 1, v1 +; GFX7GLISEL-NEXT: v_add_i32_e64 v0, s[4:5], -1, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 @@ -1587,7 +1587,7 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 @@ -1647,7 +1647,7 @@ define i1 @isnormal_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; 
GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1780,7 +1780,7 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v3, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1853,7 +1853,7 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], vcc, s[6:7] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v3, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v1 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v1 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1911,7 +1911,7 @@ define i1 @issubnormal_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -1974,7 +1974,7 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: 
v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2081,7 +2081,7 @@ define i1 @not_iszero_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2091,7 +2091,7 @@ define i1 @not_iszero_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_gt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2568,7 +2568,7 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2576,7 +2576,7 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2635,7 +2635,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2643,7 +2643,7 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2702,7 +2702,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2710,7 +2710,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x7c00 ; GFX7GLISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2902,7 +2902,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; 
GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2914,7 +2914,7 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x1ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 @@ -2983,7 +2983,7 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7GLISEL: ; %bb.0: ; %entry ; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v1, vcc, 1, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v1, vcc, -1, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v2, 0x3ff ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v1, v2 @@ -2994,7 +2994,7 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GFX7GLISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v2 ; GFX7GLISEL-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7GLISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0x400, v0 +; GFX7GLISEL-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7GLISEL-NEXT: v_mov_b32_e32 v1, 0x7800 ; GFX7GLISEL-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index df7460156e6556..12afc267354220 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -20,25 +20,24 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; 
MUBUF-LABEL: local_stack_offset_uses_sp: ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s17 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 -; MUBUF-NEXT: v_mov_b32_e32 v2, 0 -; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0x3040 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 -; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB0_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v3, s4, v1 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x3000 +; MUBUF-NEXT: v_add_u32_e32 v2, s4, v3 ; MUBUF-NEXT: s_add_i32 s4, s4, 1 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_1 ; MUBUF-NEXT: ; %bb.2: ; %split -; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 -; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x50d0 ; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 glc @@ -111,26 +110,27 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_add_i32 s33, s32, 0x7ffc0 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 -; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 -; MUBUF-NEXT: v_mov_b32_e32 v4, 0 -; MUBUF-NEXT: v_mov_b32_e32 v5, 0x2000 +; MUBUF-NEXT: v_add_u32_e32 v2, 0x3040, v3 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 -; MUBUF-NEXT: buffer_store_dword v4, v5, s[0:3], s33 
offen +; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB1_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3 +; MUBUF-NEXT: v_lshrrev_b32_e64 v5, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v4, s4, v5 +; MUBUF-NEXT: v_mov_b32_e32 v5, 0x3000 ; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: v_add_u32_e32 v4, v5, v4 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_byte v3, v4, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB1_1 ; MUBUF-NEXT: ; %bb.2: ; %split -; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 -; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 +; MUBUF-NEXT: v_lshrrev_b32_e64 v4, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x50d0, v4 ; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v5, v3, s[0:3], 0 offen offset:4 glc @@ -203,41 +203,51 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF: ; %bb.0: ; %entry ; MUBUF-NEXT: s_add_u32 s0, s0, s17 ; MUBUF-NEXT: s_addc_u32 s1, s1, 0 -; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 -; MUBUF-NEXT: v_mov_b32_e32 v1, 0 -; MUBUF-NEXT: v_mov_b32_e32 v2, 0x2000 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 -; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: .LBB2_1: ; %loadstoreloop ; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 -; MUBUF-NEXT: v_add_u32_e32 v2, s4, v0 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_add_u32_e32 v1, s4, v2 ; MUBUF-NEXT: s_add_i32 s4, s4, 1 ; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; MUBUF-NEXT: buffer_store_byte v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: 
buffer_store_byte v0, v1, s[0:3], 0 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_cbranch_scc1 .LBB2_1 ; MUBUF-NEXT: ; %bb.2: ; %split -; MUBUF-NEXT: v_mov_b32_e32 v0, 0x4000 -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d4, v0 -; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 +; MUBUF-NEXT: s_movk_i32 s4, 0x12d4 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c0, v1 +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: s_movk_i32 s4, 0x12d0 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12d0, v0 -; MUBUF-NEXT: v_or_b32_e32 v1, 0x12c0, v0 -; MUBUF-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: s_movk_i32 s4, 0x12c4 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v2, 0x12c4, v0 -; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen glc +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_or_b32_e32 v1, 0x12cc, v0 -; MUBUF-NEXT: v_or_b32_e32 v0, 0x12c8, v0 +; MUBUF-NEXT: s_movk_i32 s4, 0x12cc +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x4000 +; MUBUF-NEXT: v_or_b32_e32 v0, s4, v1 +; MUBUF-NEXT: s_movk_i32 s4, 0x12c8 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 -; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc +; MUBUF-NEXT: v_or_b32_e32 v1, s4, v2 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v3, 0x4000 -; MUBUF-NEXT: buffer_load_dword v0, v0, 
s[0:3], 0 offen glc +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v10, 0x4000 ; MUBUF-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen glc @@ -254,10 +264,10 @@ define amdgpu_kernel void @local_stack_offset_uses_sp_flat(ptr addrspace(1) %out ; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0 ; MUBUF-NEXT: buffer_load_dword v10, v11, s[0:3], 0 offen offset:16 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v0, v2 +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v1, v2 ; MUBUF-NEXT: buffer_load_dword v11, v12, s[0:3], 0 offen offset:20 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc ; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v7, v8 ; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v6, v9, vcc ; MUBUF-NEXT: v_add_co_u32_e32 v4, vcc, v4, v10 diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index 95914857b87328..b77c3a9bb532bc 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -1423,26 +1423,25 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill +; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32 -; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0 +; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 +; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: s_add_i32 s59, s4, 0x442c ; 
GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v2, 0 +; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload +; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) @@ -1453,25 +1452,24 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill +; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v3, 5, s32 -; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0 +; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 +; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: s_add_i32 s59, s4, 0x442c ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v2, 0 +; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 
s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload +; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1481,25 +1479,23 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v2, s1 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v1, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: v_writelane_b32 v1, s59, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_writelane_b32 v2, s59, 0 -; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s32 +; GFX11-NEXT: s_add_i32 s59, s32, 0x442c +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x442c, v3 -; GFX11-NEXT: v_readfirstlane_b32 s59, v1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v2, 0 +; GFX11-NEXT: v_readlane_b32 s59, v1, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v2, off, s1 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1512,24 +1508,22 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 
4-byte Folded Spill +; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v3, s32 -; GFX12-NEXT: v_writelane_b32 v2, s59, 0 +; GFX12-NEXT: v_writelane_b32 v1, s59, 0 +; GFX12-NEXT: s_add_co_i32 s59, s32, 0x43ec +; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x43ec, v3 -; GFX12-NEXT: v_readfirstlane_b32 s59, v1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v2, 0 +; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1541,25 +1535,24 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x442c, v1 -; GFX8-NEXT: v_writelane_b32 v2, s59, 0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v1 +; GFX8-NEXT: s_lshr_b32 s4, s32, 6 +; GFX8-NEXT: v_writelane_b32 v1, s59, 0 +; GFX8-NEXT: s_add_i32 s59, s4, 0x442c +; GFX8-NEXT: v_lshrrev_b32_e64 
v0, 6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use alloca0 v1 +; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v2, 0 +; GFX8-NEXT: v_readlane_b32 s59, v1, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1569,26 +1562,24 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_lshr_b32 s4, s32, 6 +; GFX900-NEXT: v_writelane_b32 v1, s59, 0 +; GFX900-NEXT: s_add_i32 s59, s4, 0x442c ; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 -; GFX900-NEXT: v_add_u32_e32 v0, 0x3ec, v0 -; GFX900-NEXT: v_writelane_b32 v2, s59, 0 -; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 -; GFX900-NEXT: v_add_u32_e32 v1, 64, v1 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use alloca0 v1 +; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v2, 0 +; GFX900-NEXT: v_readlane_b32 s59, v1, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: 
s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1598,26 +1589,23 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_immoffset( ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_store_dword off, v2, s2 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: scratch_store_dword off, v1, s2 sc0 sc1 ; 4-byte Folded Spill ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_add_i32 s0, s32, 0x4040 -; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_add_u32_e32 v0, 0x3ec, v0 -; GFX940-NEXT: v_writelane_b32 v2, s59, 0 +; GFX940-NEXT: v_writelane_b32 v1, s59, 0 +; GFX940-NEXT: s_add_i32 s59, s32, 0x442c ; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_readfirstlane_b32 s59, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v1 +; GFX940-NEXT: ; use alloca0 v0 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_and_b64 s[0:1], 0, exec ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v2, 0 +; GFX940-NEXT: v_readlane_b32 s59, v1, 0 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_load_dword v2, off, s2 ; 4-byte Folded Reload +; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1635,28 +1623,27 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 
; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill +; GFX10_1-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v3, 5, s32 -; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_1-NEXT: v_writelane_b32 v2, s59, 0 +; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, s4, v3 -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_lshl_b32 s4, s16, 2 +; GFX10_1-NEXT: s_lshr_b32 s59, s32, 5 +; GFX10_1-NEXT: s_add_i32 s59, s59, s4 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_1-NEXT: s_addk_i32 s59, 0x4040 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 0x4040, v1 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v2, 0 +; GFX10_1-NEXT: v_readlane_b32 s59, v1, 0 ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload +; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) @@ -1667,27 +1654,26 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v2, off, s[0:3], s5 ; 4-byte Folded Spill +; GFX10_3-NEXT: buffer_store_dword v1, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: 
v_lshrrev_b32_e64 v3, 5, s32 -; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 -; GFX10_3-NEXT: v_writelane_b32 v2, s59, 0 +; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, s4, v3 -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_lshl_b32 s4, s16, 2 +; GFX10_3-NEXT: s_lshr_b32 s59, s32, 5 +; GFX10_3-NEXT: s_add_i32 s59, s59, s4 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 +; GFX10_3-NEXT: s_addk_i32 s59, 0x4040 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 0x4040, v1 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v1 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v2, 0 +; GFX10_3-NEXT: v_readlane_b32 s59, v1, 0 ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v2, off, s[0:3], s5 ; 4-byte Folded Reload +; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1697,27 +1683,25 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v2, s2 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v1, s2 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: s_add_i32 s1, s32, 64 -; GFX11-NEXT: v_add_nc_u32_e64 v1, s0, s32 +; GFX11-NEXT: v_writelane_b32 v1, s59, 0 +; GFX11-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-NEXT: v_writelane_b32 v2, s59, 0 -; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x4040, v1 +; GFX11-NEXT: s_add_i32 s59, s32, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readfirstlane_b32 s59, v1 +; GFX11-NEXT: s_addk_i32 s59, 0x4040 +; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_readlane_b32 s59, v2, 0 +; GFX11-NEXT: v_readlane_b32 s59, v1, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v2, off, s1 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v1, off, s1 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1730,27 +1714,26 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 -; GFX12-NEXT: scratch_store_b32 off, v2, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:32768 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 +; GFX12-NEXT: v_writelane_b32 v1, s59, 0 ; GFX12-NEXT: s_lshl_b32 s0, s0, 2 -; GFX12-NEXT: v_writelane_b32 v2, s59, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_add_nc_u32_e64 v1, s0, s32 ; GFX12-NEXT: v_mov_b32_e32 v0, s32 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, 0x4000, v1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s59, s32, s0 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readfirstlane_b32 s59, v1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_addk_co_i32 s59, 0x4000 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: 
;;#ASMSTART ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_readlane_b32 s59, v2, 0 +; GFX12-NEXT: v_readlane_b32 s59, v1, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v2, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: scratch_load_b32 v1, off, s32 offset:32768 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 @@ -1762,28 +1745,26 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: s_movk_i32 vcc_lo, 0x4040 +; GFX8-NEXT: v_writelane_b32 v1, s59, 0 ; GFX8-NEXT: s_lshl_b32 s4, s16, 2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, vcc_lo, v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: v_writelane_b32 v2, s59, 0 -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 64, v1 +; GFX8-NEXT: s_lshr_b32 s59, s32, 6 +; GFX8-NEXT: s_add_i32 s59, s59, s4 +; GFX8-NEXT: s_addk_i32 s59, 0x4040 +; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 ; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use alloca0 v1 +; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_and_b64 s[4:5], 0, exec ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v2, 0 +; GFX8-NEXT: v_readlane_b32 s59, v1, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v1, 
off, s[0:3], s6 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1793,27 +1774,26 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_store_dword v2, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX900-NEXT: v_writelane_b32 v1, s59, 0 ; GFX900-NEXT: s_lshl_b32 s4, s16, 2 -; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 -; GFX900-NEXT: v_add_u32_e32 v0, s4, v0 -; GFX900-NEXT: v_writelane_b32 v2, s59, 0 -; GFX900-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 -; GFX900-NEXT: v_add_u32_e32 v1, 64, v1 +; GFX900-NEXT: s_lshr_b32 s59, s32, 6 +; GFX900-NEXT: s_add_i32 s59, s59, s4 +; GFX900-NEXT: s_addk_i32 s59, 0x4040 +; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use alloca0 v1 +; GFX900-NEXT: ; use alloca0 v0 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: s_and_b64 s[4:5], 0, exec ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_readlane_b32 s59, v2, 0 +; GFX900-NEXT: v_readlane_b32 s59, v1, 0 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s6, s32, 0x201000 -; GFX900-NEXT: buffer_load_dword v2, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1823,27 +1803,25 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc__gep_sgpr_offse ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; GFX940-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX940-NEXT: scratch_store_dword off, v2, s1 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: scratch_store_dword off, v1, s1 sc0 sc1 ; 4-byte Folded Spill ; GFX940-NEXT: s_mov_b64 exec, s[2:3] ; GFX940-NEXT: s_lshl_b32 s0, s0, 2 -; GFX940-NEXT: s_add_i32 s1, s32, 0x4040 -; GFX940-NEXT: v_mov_b32_e32 v0, s1 -; GFX940-NEXT: v_add_u32_e32 v0, s0, v0 -; GFX940-NEXT: v_writelane_b32 v2, s59, 0 +; GFX940-NEXT: v_writelane_b32 v1, s59, 0 +; GFX940-NEXT: s_add_i32 s59, s32, s0 +; GFX940-NEXT: s_addk_i32 s59, 0x4040 ; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_readfirstlane_b32 s59, v0 -; GFX940-NEXT: v_mov_b32_e32 v1, s0 +; GFX940-NEXT: v_mov_b32_e32 v0, s0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use alloca0 v1 +; GFX940-NEXT: ; use alloca0 v0 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_and_b64 s[0:1], 0, exec ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s59, v2, 0 +; GFX940-NEXT: v_readlane_b32 s59, v1, 0 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_load_dword v2, off, s2 ; 4-byte Folded Reload +; GFX940-NEXT: scratch_load_dword v1, off, s2 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll index 308411fa225dae..d4110850f32066 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll @@ -1520,9 +1520,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: s_add_i32 s6, s32, 0x202000 +; GFX7-NEXT: 
s_add_i32 s6, s32, 0x201000 ; GFX7-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX7-NEXT: s_add_i32 s6, s32, 0x202100 +; GFX7-NEXT: s_add_i32 s6, s32, 0x201100 ; GFX7-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_writelane_b32 v23, s28, 28 @@ -1552,66 +1552,23 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_writelane_b32 v23, s53, 22 ; GFX7-NEXT: v_writelane_b32 v23, s54, 23 ; GFX7-NEXT: v_writelane_b32 v23, s55, 24 -; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 +; GFX7-NEXT: s_lshr_b32 s5, s32, 6 ; GFX7-NEXT: v_writelane_b32 v23, s56, 25 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 +; GFX7-NEXT: v_lshr_b32_e64 v0, s32, 6 +; GFX7-NEXT: s_add_i32 s4, s5, 0x4240 +; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane ; GFX7-NEXT: v_writelane_b32 v23, s57, 26 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 64, v0 +; GFX7-NEXT: v_writelane_b32 v22, s4, 0 +; GFX7-NEXT: s_and_b64 s[4:5], 0, exec +; GFX7-NEXT: v_writelane_b32 v23, s59, 27 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use alloca0 v0 ; GFX7-NEXT: ;;#ASMEND ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX7-NEXT: ;;#ASMEND -; GFX7-NEXT: buffer_store_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_mov_b32_e32 v16, 0x8040 -; GFX7-NEXT: buffer_store_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v6, v16, s[0:3], s32 offen 
offset:24 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX7-NEXT: v_lshr_b32_e64 v1, s32, 6 -; GFX7-NEXT: v_writelane_b32 v22, vcc_lo, 0 -; GFX7-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x4240, v1 -; GFX7-NEXT: v_writelane_b32 v23, s59, 27 -; GFX7-NEXT: v_readfirstlane_b32 s59, v0 -; GFX7-NEXT: s_and_b64 vcc, 0, exec -; GFX7-NEXT: v_readlane_b32 vcc_lo, v22, 0 -; GFX7-NEXT: v_readlane_b32 vcc_hi, v22, 1 -; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: buffer_store_dword v16, off, s[0:3], s32 -; GFX7-NEXT: v_mov_b32_e32 v16, 0x8040 -; GFX7-NEXT: buffer_load_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v5, v16, s[0:3], 
s32 offen offset:20 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_readlane_b32 s59, v22, 0 ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX7-NEXT: ;;#ASMEND @@ -1646,9 +1603,9 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX7-NEXT: v_readlane_b32 s28, v23, 28 ; GFX7-NEXT: v_readlane_b32 s29, v23, 29 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: s_add_i32 s6, s32, 0x202000 +; GFX7-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX7-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX7-NEXT: s_add_i32 s6, s32, 0x202100 +; GFX7-NEXT: s_add_i32 s6, s32, 0x201100 ; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1658,137 +1615,81 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x202000 -; GFX8-NEXT: buffer_store_dword v23, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX8-NEXT: s_add_i32 s6, s32, 0x202100 +; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX8-NEXT: buffer_store_dword v22, off, s[0:3], s6 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v23, s58, 28 -; GFX8-NEXT: v_writelane_b32 v23, s59, 29 -; GFX8-NEXT: v_writelane_b32 v23, s30, 0 -; GFX8-NEXT: v_writelane_b32 v23, s31, 1 -; GFX8-NEXT: v_writelane_b32 v23, s33, 2 -; GFX8-NEXT: v_writelane_b32 v23, s34, 3 -; GFX8-NEXT: v_writelane_b32 v23, s35, 4 -; GFX8-NEXT: v_writelane_b32 v23, s36, 5 -; GFX8-NEXT: v_writelane_b32 v23, s37, 6 -; GFX8-NEXT: v_writelane_b32 v23, s38, 7 -; GFX8-NEXT: v_writelane_b32 v23, s39, 8 -; GFX8-NEXT: v_writelane_b32 v23, s40, 9 -; GFX8-NEXT: v_writelane_b32 v23, s41, 10 -; GFX8-NEXT: v_writelane_b32 v23, s42, 11 -; GFX8-NEXT: v_writelane_b32 v23, s43, 12 -; GFX8-NEXT: v_writelane_b32 v23, s44, 13 -; GFX8-NEXT: v_writelane_b32 v23, s45, 14 -; GFX8-NEXT: v_writelane_b32 v23, s46, 15 -; GFX8-NEXT: v_writelane_b32 v23, s47, 16 -; GFX8-NEXT: v_writelane_b32 v23, s48, 17 -; GFX8-NEXT: v_writelane_b32 v23, s49, 18 -; GFX8-NEXT: v_writelane_b32 v23, s50, 19 -; GFX8-NEXT: v_writelane_b32 v23, s51, 20 -; GFX8-NEXT: v_writelane_b32 v23, s52, 21 -; GFX8-NEXT: v_writelane_b32 v23, s53, 22 -; GFX8-NEXT: v_writelane_b32 v23, s54, 23 -; GFX8-NEXT: v_writelane_b32 v23, s55, 24 +; GFX8-NEXT: v_writelane_b32 v22, s30, 0 +; GFX8-NEXT: v_writelane_b32 v22, s31, 1 +; GFX8-NEXT: v_writelane_b32 v22, s33, 2 +; GFX8-NEXT: v_writelane_b32 v22, s34, 3 +; GFX8-NEXT: v_writelane_b32 v22, s35, 4 +; GFX8-NEXT: v_writelane_b32 v22, s36, 5 +; GFX8-NEXT: v_writelane_b32 v22, s37, 6 +; GFX8-NEXT: v_writelane_b32 v22, s38, 7 +; GFX8-NEXT: v_writelane_b32 v22, s39, 8 +; GFX8-NEXT: v_writelane_b32 v22, s40, 9 +; GFX8-NEXT: v_writelane_b32 
v22, s41, 10 +; GFX8-NEXT: v_writelane_b32 v22, s42, 11 +; GFX8-NEXT: v_writelane_b32 v22, s43, 12 +; GFX8-NEXT: v_writelane_b32 v22, s44, 13 +; GFX8-NEXT: v_writelane_b32 v22, s45, 14 +; GFX8-NEXT: v_writelane_b32 v22, s46, 15 +; GFX8-NEXT: v_writelane_b32 v22, s47, 16 +; GFX8-NEXT: v_writelane_b32 v22, s48, 17 +; GFX8-NEXT: v_writelane_b32 v22, s49, 18 +; GFX8-NEXT: v_writelane_b32 v22, s50, 19 +; GFX8-NEXT: v_writelane_b32 v22, s51, 20 +; GFX8-NEXT: v_writelane_b32 v22, s52, 21 +; GFX8-NEXT: v_writelane_b32 v22, s53, 22 +; GFX8-NEXT: v_writelane_b32 v22, s54, 23 +; GFX8-NEXT: v_writelane_b32 v22, s55, 24 +; GFX8-NEXT: v_writelane_b32 v22, s56, 25 +; GFX8-NEXT: v_writelane_b32 v22, s57, 26 +; GFX8-NEXT: s_lshr_b32 s4, s32, 6 +; GFX8-NEXT: v_writelane_b32 v22, s59, 27 ; GFX8-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; GFX8-NEXT: v_writelane_b32 v23, s56, 25 +; GFX8-NEXT: s_add_i32 s59, s4, 0x4240 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, 64, v0 -; GFX8-NEXT: v_writelane_b32 v23, s57, 26 +; GFX8-NEXT: s_and_b64 s[4:5], 0, exec ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use alloca0 v0 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: buffer_store_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_mov_b32_e32 v16, 0x8040 -; GFX8-NEXT: buffer_store_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Spill -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_store_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v5, v16, s[0:3], s32 offen offset:20 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Spill -; 
GFX8-NEXT: buffer_store_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_store_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Spill -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane -; GFX8-NEXT: v_lshrrev_b32_e64 v1, 6, s32 -; GFX8-NEXT: v_writelane_b32 v22, vcc_lo, 0 -; GFX8-NEXT: v_writelane_b32 v22, vcc_hi, 1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x4240, v1 -; GFX8-NEXT: v_writelane_b32 v23, s59, 27 -; GFX8-NEXT: v_readfirstlane_b32 s59, v0 -; GFX8-NEXT: s_and_b64 vcc, 0, exec -; GFX8-NEXT: v_readlane_b32 vcc_lo, v22, 0 -; GFX8-NEXT: v_readlane_b32 vcc_hi, v22, 1 -; GFX8-NEXT: v_readlane_b32 s58, v23, 28 -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: buffer_store_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_mov_b32_e32 v16, 0x8040 -; GFX8-NEXT: buffer_load_dword v0, v16, s[0:3], s32 offen ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v1, v16, s[0:3], s32 offen offset:4 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v2, v16, s[0:3], s32 offen offset:8 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v3, v16, s[0:3], s32 offen offset:12 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v4, v16, s[0:3], s32 offen offset:16 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v5, v16, 
s[0:3], s32 offen offset:20 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v6, v16, s[0:3], s32 offen offset:24 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v7, v16, s[0:3], s32 offen offset:28 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v8, v16, s[0:3], s32 offen offset:32 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v9, v16, s[0:3], s32 offen offset:36 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v10, v16, s[0:3], s32 offen offset:40 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v11, v16, s[0:3], s32 offen offset:44 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v12, v16, s[0:3], s32 offen offset:48 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v13, v16, s[0:3], s32 offen offset:52 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v14, v16, s[0:3], s32 offen offset:56 ; 4-byte Folded Reload -; GFX8-NEXT: buffer_load_dword v15, v16, s[0:3], s32 offen offset:60 ; 4-byte Folded Reload -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX8-NEXT: ;;#ASMEND -; GFX8-NEXT: v_readlane_b32 s59, v23, 27 -; GFX8-NEXT: v_readlane_b32 s57, v23, 26 -; GFX8-NEXT: v_readlane_b32 s56, v23, 25 -; GFX8-NEXT: v_readlane_b32 s55, v23, 24 -; GFX8-NEXT: v_readlane_b32 s54, v23, 23 -; GFX8-NEXT: v_readlane_b32 s53, v23, 22 -; GFX8-NEXT: v_readlane_b32 s52, v23, 21 -; GFX8-NEXT: v_readlane_b32 s51, v23, 20 -; GFX8-NEXT: v_readlane_b32 s50, v23, 19 -; GFX8-NEXT: v_readlane_b32 s49, v23, 18 -; GFX8-NEXT: v_readlane_b32 s48, v23, 17 -; GFX8-NEXT: v_readlane_b32 s47, v23, 16 -; GFX8-NEXT: v_readlane_b32 s46, v23, 15 -; GFX8-NEXT: v_readlane_b32 s45, v23, 14 -; GFX8-NEXT: v_readlane_b32 s44, v23, 13 -; GFX8-NEXT: v_readlane_b32 s43, v23, 12 -; GFX8-NEXT: v_readlane_b32 s42, v23, 11 -; GFX8-NEXT: v_readlane_b32 s41, v23, 10 -; 
GFX8-NEXT: v_readlane_b32 s40, v23, 9 -; GFX8-NEXT: v_readlane_b32 s39, v23, 8 -; GFX8-NEXT: v_readlane_b32 s38, v23, 7 -; GFX8-NEXT: v_readlane_b32 s37, v23, 6 -; GFX8-NEXT: v_readlane_b32 s36, v23, 5 -; GFX8-NEXT: v_readlane_b32 s35, v23, 4 -; GFX8-NEXT: v_readlane_b32 s34, v23, 3 -; GFX8-NEXT: v_readlane_b32 s33, v23, 2 -; GFX8-NEXT: v_readlane_b32 s31, v23, 1 -; GFX8-NEXT: v_readlane_b32 s30, v23, 0 -; GFX8-NEXT: v_readlane_b32 s59, v23, 29 +; GFX8-NEXT: v_readlane_b32 s59, v22, 27 +; GFX8-NEXT: v_readlane_b32 s57, v22, 26 +; GFX8-NEXT: v_readlane_b32 s56, v22, 25 +; GFX8-NEXT: v_readlane_b32 s55, v22, 24 +; GFX8-NEXT: v_readlane_b32 s54, v22, 23 +; GFX8-NEXT: v_readlane_b32 s53, v22, 22 +; GFX8-NEXT: v_readlane_b32 s52, v22, 21 +; GFX8-NEXT: v_readlane_b32 s51, v22, 20 +; GFX8-NEXT: v_readlane_b32 s50, v22, 19 +; GFX8-NEXT: v_readlane_b32 s49, v22, 18 +; GFX8-NEXT: v_readlane_b32 s48, v22, 17 +; GFX8-NEXT: v_readlane_b32 s47, v22, 16 +; GFX8-NEXT: v_readlane_b32 s46, v22, 15 +; GFX8-NEXT: v_readlane_b32 s45, v22, 14 +; GFX8-NEXT: v_readlane_b32 s44, v22, 13 +; GFX8-NEXT: v_readlane_b32 s43, v22, 12 +; GFX8-NEXT: v_readlane_b32 s42, v22, 11 +; GFX8-NEXT: v_readlane_b32 s41, v22, 10 +; GFX8-NEXT: v_readlane_b32 s40, v22, 9 +; GFX8-NEXT: v_readlane_b32 s39, v22, 8 +; GFX8-NEXT: v_readlane_b32 s38, v22, 7 +; GFX8-NEXT: v_readlane_b32 s37, v22, 6 +; GFX8-NEXT: v_readlane_b32 s36, v22, 5 +; GFX8-NEXT: v_readlane_b32 s35, v22, 4 +; GFX8-NEXT: v_readlane_b32 s34, v22, 3 +; GFX8-NEXT: v_readlane_b32 s33, v22, 2 +; GFX8-NEXT: v_readlane_b32 s31, v22, 1 +; GFX8-NEXT: v_readlane_b32 s30, v22, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: s_add_i32 s6, s32, 0x202000 -; GFX8-NEXT: buffer_load_dword v23, off, s[0:3], s6 ; 4-byte Folded Reload -; GFX8-NEXT: s_add_i32 s6, s32, 0x202100 +; GFX8-NEXT: s_add_i32 s6, s32, 0x201000 ; GFX8-NEXT: buffer_load_dword v22, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt 
vmcnt(0) @@ -1824,21 +1725,19 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX900-NEXT: v_writelane_b32 v22, s51, 20 ; GFX900-NEXT: v_writelane_b32 v22, s52, 21 ; GFX900-NEXT: v_writelane_b32 v22, s53, 22 -; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_writelane_b32 v22, s54, 23 -; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 ; GFX900-NEXT: v_writelane_b32 v22, s55, 24 -; GFX900-NEXT: ;;#ASMSTART -; GFX900-NEXT: ; use alloca0 v0 -; GFX900-NEXT: ;;#ASMEND -; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GFX900-NEXT: v_writelane_b32 v22, s56, 25 -; GFX900-NEXT: v_add_u32_e32 v0, 0x4040, v0 ; GFX900-NEXT: v_writelane_b32 v22, s57, 26 -; GFX900-NEXT: v_add_u32_e32 v0, 0x200, v0 -; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: s_lshr_b32 s4, s32, 6 ; GFX900-NEXT: v_writelane_b32 v22, s59, 27 -; GFX900-NEXT: v_readfirstlane_b32 s59, v0 +; GFX900-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX900-NEXT: s_add_i32 s59, s4, 0x4240 +; GFX900-NEXT: v_add_u32_e32 v0, 64, v0 +; GFX900-NEXT: s_and_b64 s[4:5], 0, exec +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ; use alloca0 v0 +; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: ;;#ASMSTART ; GFX900-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX900-NEXT: ;;#ASMEND @@ -1885,87 +1784,84 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_store_dword off, v23, s2 sc0 sc1 ; 4-byte Folded Spill +; GFX940-NEXT: scratch_store_dword off, v22, s2 sc0 sc1 ; 4-byte Folded Spill ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: v_writelane_b32 v23, s30, 0 -; GFX940-NEXT: v_writelane_b32 v23, s31, 1 -; GFX940-NEXT: v_writelane_b32 v23, s33, 2 -; GFX940-NEXT: v_writelane_b32 v23, s34, 3 -; GFX940-NEXT: v_writelane_b32 v23, s35, 4 -; GFX940-NEXT: v_writelane_b32 v23, 
s36, 5 -; GFX940-NEXT: v_writelane_b32 v23, s37, 6 -; GFX940-NEXT: v_writelane_b32 v23, s38, 7 -; GFX940-NEXT: v_writelane_b32 v23, s39, 8 -; GFX940-NEXT: v_writelane_b32 v23, s40, 9 -; GFX940-NEXT: v_writelane_b32 v23, s41, 10 -; GFX940-NEXT: v_writelane_b32 v23, s42, 11 -; GFX940-NEXT: v_writelane_b32 v23, s43, 12 -; GFX940-NEXT: v_writelane_b32 v23, s44, 13 -; GFX940-NEXT: v_writelane_b32 v23, s45, 14 -; GFX940-NEXT: v_writelane_b32 v23, s46, 15 -; GFX940-NEXT: v_writelane_b32 v23, s47, 16 -; GFX940-NEXT: v_writelane_b32 v23, s48, 17 -; GFX940-NEXT: v_writelane_b32 v23, s49, 18 -; GFX940-NEXT: v_writelane_b32 v23, s50, 19 -; GFX940-NEXT: v_writelane_b32 v23, s51, 20 -; GFX940-NEXT: v_writelane_b32 v23, s52, 21 -; GFX940-NEXT: v_writelane_b32 v23, s53, 22 -; GFX940-NEXT: v_writelane_b32 v23, s54, 23 -; GFX940-NEXT: v_writelane_b32 v23, s55, 24 -; GFX940-NEXT: v_writelane_b32 v23, s56, 25 +; GFX940-NEXT: v_writelane_b32 v22, s30, 0 +; GFX940-NEXT: v_writelane_b32 v22, s31, 1 +; GFX940-NEXT: v_writelane_b32 v22, s33, 2 +; GFX940-NEXT: v_writelane_b32 v22, s34, 3 +; GFX940-NEXT: v_writelane_b32 v22, s35, 4 +; GFX940-NEXT: v_writelane_b32 v22, s36, 5 +; GFX940-NEXT: v_writelane_b32 v22, s37, 6 +; GFX940-NEXT: v_writelane_b32 v22, s38, 7 +; GFX940-NEXT: v_writelane_b32 v22, s39, 8 +; GFX940-NEXT: v_writelane_b32 v22, s40, 9 +; GFX940-NEXT: v_writelane_b32 v22, s41, 10 +; GFX940-NEXT: v_writelane_b32 v22, s42, 11 +; GFX940-NEXT: v_writelane_b32 v22, s43, 12 +; GFX940-NEXT: v_writelane_b32 v22, s44, 13 +; GFX940-NEXT: v_writelane_b32 v22, s45, 14 +; GFX940-NEXT: v_writelane_b32 v22, s46, 15 +; GFX940-NEXT: v_writelane_b32 v22, s47, 16 +; GFX940-NEXT: v_writelane_b32 v22, s48, 17 +; GFX940-NEXT: v_writelane_b32 v22, s49, 18 +; GFX940-NEXT: v_writelane_b32 v22, s50, 19 +; GFX940-NEXT: v_writelane_b32 v22, s51, 20 +; GFX940-NEXT: v_writelane_b32 v22, s52, 21 +; GFX940-NEXT: v_writelane_b32 v22, s53, 22 +; GFX940-NEXT: v_writelane_b32 v22, s54, 23 +; GFX940-NEXT: 
v_writelane_b32 v22, s55, 24 +; GFX940-NEXT: v_writelane_b32 v22, s56, 25 +; GFX940-NEXT: v_writelane_b32 v22, s57, 26 ; GFX940-NEXT: s_add_i32 s0, s32, 64 -; GFX940-NEXT: v_writelane_b32 v23, s57, 26 +; GFX940-NEXT: v_writelane_b32 v22, s59, 27 ; GFX940-NEXT: v_mov_b32_e32 v0, s0 -; GFX940-NEXT: v_writelane_b32 v23, s59, 27 +; GFX940-NEXT: v_writelane_b32 v22, s60, 28 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use alloca0 v0 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_add_i32 s59, s32, 0x4040 -; GFX940-NEXT: v_mov_b32_e32 v22, s59 -; GFX940-NEXT: v_writelane_b32 v23, s60, 28 -; GFX940-NEXT: v_add_u32_e32 v22, 0x200, v22 -; GFX940-NEXT: v_writelane_b32 v23, s61, 29 -; GFX940-NEXT: v_readfirstlane_b32 s59, v22 +; GFX940-NEXT: s_add_i32 s59, s32, 0x4240 +; GFX940-NEXT: v_writelane_b32 v22, s61, 29 ; GFX940-NEXT: s_and_b64 s[60:61], 0, exec ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_readlane_b32 s61, v23, 29 -; GFX940-NEXT: v_readlane_b32 s60, v23, 28 -; GFX940-NEXT: v_readlane_b32 s59, v23, 27 -; GFX940-NEXT: v_readlane_b32 s57, v23, 26 -; GFX940-NEXT: v_readlane_b32 s56, v23, 25 -; GFX940-NEXT: v_readlane_b32 s55, v23, 24 -; GFX940-NEXT: v_readlane_b32 s54, v23, 23 -; GFX940-NEXT: v_readlane_b32 s53, v23, 22 -; GFX940-NEXT: v_readlane_b32 s52, v23, 21 -; GFX940-NEXT: v_readlane_b32 s51, v23, 20 -; GFX940-NEXT: v_readlane_b32 s50, v23, 19 -; GFX940-NEXT: v_readlane_b32 s49, v23, 18 -; GFX940-NEXT: v_readlane_b32 s48, v23, 17 -; GFX940-NEXT: v_readlane_b32 s47, v23, 16 -; GFX940-NEXT: v_readlane_b32 s46, v23, 15 -; GFX940-NEXT: v_readlane_b32 s45, v23, 14 -; GFX940-NEXT: v_readlane_b32 s44, v23, 13 -; GFX940-NEXT: v_readlane_b32 s43, v23, 12 -; GFX940-NEXT: v_readlane_b32 s42, v23, 11 -; 
GFX940-NEXT: v_readlane_b32 s41, v23, 10 -; GFX940-NEXT: v_readlane_b32 s40, v23, 9 -; GFX940-NEXT: v_readlane_b32 s39, v23, 8 -; GFX940-NEXT: v_readlane_b32 s38, v23, 7 -; GFX940-NEXT: v_readlane_b32 s37, v23, 6 -; GFX940-NEXT: v_readlane_b32 s36, v23, 5 -; GFX940-NEXT: v_readlane_b32 s35, v23, 4 -; GFX940-NEXT: v_readlane_b32 s34, v23, 3 -; GFX940-NEXT: v_readlane_b32 s33, v23, 2 -; GFX940-NEXT: v_readlane_b32 s31, v23, 1 -; GFX940-NEXT: v_readlane_b32 s30, v23, 0 +; GFX940-NEXT: v_readlane_b32 s61, v22, 29 +; GFX940-NEXT: v_readlane_b32 s60, v22, 28 +; GFX940-NEXT: v_readlane_b32 s59, v22, 27 +; GFX940-NEXT: v_readlane_b32 s57, v22, 26 +; GFX940-NEXT: v_readlane_b32 s56, v22, 25 +; GFX940-NEXT: v_readlane_b32 s55, v22, 24 +; GFX940-NEXT: v_readlane_b32 s54, v22, 23 +; GFX940-NEXT: v_readlane_b32 s53, v22, 22 +; GFX940-NEXT: v_readlane_b32 s52, v22, 21 +; GFX940-NEXT: v_readlane_b32 s51, v22, 20 +; GFX940-NEXT: v_readlane_b32 s50, v22, 19 +; GFX940-NEXT: v_readlane_b32 s49, v22, 18 +; GFX940-NEXT: v_readlane_b32 s48, v22, 17 +; GFX940-NEXT: v_readlane_b32 s47, v22, 16 +; GFX940-NEXT: v_readlane_b32 s46, v22, 15 +; GFX940-NEXT: v_readlane_b32 s45, v22, 14 +; GFX940-NEXT: v_readlane_b32 s44, v22, 13 +; GFX940-NEXT: v_readlane_b32 s43, v22, 12 +; GFX940-NEXT: v_readlane_b32 s42, v22, 11 +; GFX940-NEXT: v_readlane_b32 s41, v22, 10 +; GFX940-NEXT: v_readlane_b32 s40, v22, 9 +; GFX940-NEXT: v_readlane_b32 s39, v22, 8 +; GFX940-NEXT: v_readlane_b32 s38, v22, 7 +; GFX940-NEXT: v_readlane_b32 s37, v22, 6 +; GFX940-NEXT: v_readlane_b32 s36, v22, 5 +; GFX940-NEXT: v_readlane_b32 s35, v22, 4 +; GFX940-NEXT: v_readlane_b32 s34, v22, 3 +; GFX940-NEXT: v_readlane_b32 s33, v22, 2 +; GFX940-NEXT: v_readlane_b32 s31, v22, 1 +; GFX940-NEXT: v_readlane_b32 s30, v22, 0 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s2, s32, 0x8040 -; GFX940-NEXT: scratch_load_dword v23, off, s2 ; 4-byte Folded Reload +; GFX940-NEXT: scratch_load_dword v22, off, s2 ; 4-byte 
Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1975,83 +1871,82 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill +; GFX10_1-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: v_writelane_b32 v23, s30, 0 -; GFX10_1-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_1-NEXT: v_writelane_b32 v22, s30, 0 ; GFX10_1-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_1-NEXT: v_writelane_b32 v23, s31, 1 -; GFX10_1-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 +; GFX10_1-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_1-NEXT: v_writelane_b32 v22, s31, 1 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_1-NEXT: v_writelane_b32 v23, s34, 3 -; GFX10_1-NEXT: v_writelane_b32 v23, s35, 4 -; GFX10_1-NEXT: v_writelane_b32 v23, s36, 5 -; GFX10_1-NEXT: v_writelane_b32 v23, s37, 6 -; GFX10_1-NEXT: v_writelane_b32 v23, s38, 7 -; GFX10_1-NEXT: v_writelane_b32 v23, s39, 8 -; GFX10_1-NEXT: v_writelane_b32 v23, s40, 9 -; GFX10_1-NEXT: v_writelane_b32 v23, s41, 10 -; GFX10_1-NEXT: v_writelane_b32 v23, s42, 11 -; GFX10_1-NEXT: v_writelane_b32 v23, s43, 12 -; GFX10_1-NEXT: v_writelane_b32 v23, s44, 13 -; GFX10_1-NEXT: v_writelane_b32 v23, s45, 14 -; GFX10_1-NEXT: v_writelane_b32 v23, s46, 15 -; GFX10_1-NEXT: v_writelane_b32 v23, s47, 16 -; GFX10_1-NEXT: v_writelane_b32 v23, s48, 17 -; GFX10_1-NEXT: v_writelane_b32 v23, s49, 18 -; GFX10_1-NEXT: v_writelane_b32 v23, s50, 19 -; GFX10_1-NEXT: v_writelane_b32 v23, 
s51, 20 -; GFX10_1-NEXT: v_writelane_b32 v23, s52, 21 -; GFX10_1-NEXT: v_writelane_b32 v23, s53, 22 -; GFX10_1-NEXT: v_writelane_b32 v23, s54, 23 -; GFX10_1-NEXT: v_writelane_b32 v23, s55, 24 -; GFX10_1-NEXT: v_writelane_b32 v23, s56, 25 -; GFX10_1-NEXT: v_writelane_b32 v23, s57, 26 +; GFX10_1-NEXT: v_writelane_b32 v22, s33, 2 +; GFX10_1-NEXT: v_writelane_b32 v22, s34, 3 +; GFX10_1-NEXT: v_writelane_b32 v22, s35, 4 +; GFX10_1-NEXT: v_writelane_b32 v22, s36, 5 +; GFX10_1-NEXT: v_writelane_b32 v22, s37, 6 +; GFX10_1-NEXT: v_writelane_b32 v22, s38, 7 +; GFX10_1-NEXT: v_writelane_b32 v22, s39, 8 +; GFX10_1-NEXT: v_writelane_b32 v22, s40, 9 +; GFX10_1-NEXT: v_writelane_b32 v22, s41, 10 +; GFX10_1-NEXT: v_writelane_b32 v22, s42, 11 +; GFX10_1-NEXT: v_writelane_b32 v22, s43, 12 +; GFX10_1-NEXT: v_writelane_b32 v22, s44, 13 +; GFX10_1-NEXT: v_writelane_b32 v22, s45, 14 +; GFX10_1-NEXT: v_writelane_b32 v22, s46, 15 +; GFX10_1-NEXT: v_writelane_b32 v22, s47, 16 +; GFX10_1-NEXT: v_writelane_b32 v22, s48, 17 +; GFX10_1-NEXT: v_writelane_b32 v22, s49, 18 +; GFX10_1-NEXT: v_writelane_b32 v22, s50, 19 +; GFX10_1-NEXT: v_writelane_b32 v22, s51, 20 +; GFX10_1-NEXT: v_writelane_b32 v22, s52, 21 +; GFX10_1-NEXT: v_writelane_b32 v22, s53, 22 +; GFX10_1-NEXT: v_writelane_b32 v22, s54, 23 +; GFX10_1-NEXT: v_writelane_b32 v22, s55, 24 +; GFX10_1-NEXT: v_writelane_b32 v22, s56, 25 +; GFX10_1-NEXT: v_writelane_b32 v22, s57, 26 +; GFX10_1-NEXT: v_writelane_b32 v22, s59, 27 +; GFX10_1-NEXT: s_add_i32 s59, s4, 0x4240 +; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_writelane_b32 v23, s59, 27 -; GFX10_1-NEXT: v_readfirstlane_b32 s59, v22 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX10_1-NEXT: ;;#ASMEND -; GFX10_1-NEXT: v_readlane_b32 s59, v23, 27 
-; GFX10_1-NEXT: v_readlane_b32 s57, v23, 26 -; GFX10_1-NEXT: v_readlane_b32 s56, v23, 25 -; GFX10_1-NEXT: v_readlane_b32 s55, v23, 24 -; GFX10_1-NEXT: v_readlane_b32 s54, v23, 23 -; GFX10_1-NEXT: v_readlane_b32 s53, v23, 22 -; GFX10_1-NEXT: v_readlane_b32 s52, v23, 21 -; GFX10_1-NEXT: v_readlane_b32 s51, v23, 20 -; GFX10_1-NEXT: v_readlane_b32 s50, v23, 19 -; GFX10_1-NEXT: v_readlane_b32 s49, v23, 18 -; GFX10_1-NEXT: v_readlane_b32 s48, v23, 17 -; GFX10_1-NEXT: v_readlane_b32 s47, v23, 16 -; GFX10_1-NEXT: v_readlane_b32 s46, v23, 15 -; GFX10_1-NEXT: v_readlane_b32 s45, v23, 14 -; GFX10_1-NEXT: v_readlane_b32 s44, v23, 13 -; GFX10_1-NEXT: v_readlane_b32 s43, v23, 12 -; GFX10_1-NEXT: v_readlane_b32 s42, v23, 11 -; GFX10_1-NEXT: v_readlane_b32 s41, v23, 10 -; GFX10_1-NEXT: v_readlane_b32 s40, v23, 9 -; GFX10_1-NEXT: v_readlane_b32 s39, v23, 8 -; GFX10_1-NEXT: v_readlane_b32 s38, v23, 7 -; GFX10_1-NEXT: v_readlane_b32 s37, v23, 6 -; GFX10_1-NEXT: v_readlane_b32 s36, v23, 5 -; GFX10_1-NEXT: v_readlane_b32 s35, v23, 4 -; GFX10_1-NEXT: v_readlane_b32 s34, v23, 3 -; GFX10_1-NEXT: v_readlane_b32 s33, v23, 2 -; GFX10_1-NEXT: v_readlane_b32 s31, v23, 1 -; GFX10_1-NEXT: v_readlane_b32 s30, v23, 0 +; GFX10_1-NEXT: v_readlane_b32 s59, v22, 27 +; GFX10_1-NEXT: v_readlane_b32 s57, v22, 26 +; GFX10_1-NEXT: v_readlane_b32 s56, v22, 25 +; GFX10_1-NEXT: v_readlane_b32 s55, v22, 24 +; GFX10_1-NEXT: v_readlane_b32 s54, v22, 23 +; GFX10_1-NEXT: v_readlane_b32 s53, v22, 22 +; GFX10_1-NEXT: v_readlane_b32 s52, v22, 21 +; GFX10_1-NEXT: v_readlane_b32 s51, v22, 20 +; GFX10_1-NEXT: v_readlane_b32 s50, v22, 19 +; GFX10_1-NEXT: v_readlane_b32 s49, v22, 18 +; GFX10_1-NEXT: v_readlane_b32 s48, v22, 17 +; GFX10_1-NEXT: v_readlane_b32 s47, v22, 16 +; GFX10_1-NEXT: v_readlane_b32 s46, v22, 15 +; GFX10_1-NEXT: v_readlane_b32 s45, v22, 14 +; GFX10_1-NEXT: v_readlane_b32 s44, v22, 13 +; GFX10_1-NEXT: v_readlane_b32 s43, v22, 12 +; GFX10_1-NEXT: v_readlane_b32 s42, v22, 11 +; GFX10_1-NEXT: 
v_readlane_b32 s41, v22, 10 +; GFX10_1-NEXT: v_readlane_b32 s40, v22, 9 +; GFX10_1-NEXT: v_readlane_b32 s39, v22, 8 +; GFX10_1-NEXT: v_readlane_b32 s38, v22, 7 +; GFX10_1-NEXT: v_readlane_b32 s37, v22, 6 +; GFX10_1-NEXT: v_readlane_b32 s36, v22, 5 +; GFX10_1-NEXT: v_readlane_b32 s35, v22, 4 +; GFX10_1-NEXT: v_readlane_b32 s34, v22, 3 +; GFX10_1-NEXT: v_readlane_b32 s33, v22, 2 +; GFX10_1-NEXT: v_readlane_b32 s31, v22, 1 +; GFX10_1-NEXT: v_readlane_b32 s30, v22, 0 ; GFX10_1-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_1-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_1-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload +; GFX10_1-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) @@ -2062,82 +1957,81 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX10_3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_store_dword v23, off, s[0:3], s5 ; 4-byte Folded Spill +; GFX10_3-NEXT: buffer_store_dword v22, off, s[0:3], s5 ; 4-byte Folded Spill ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: v_writelane_b32 v23, s30, 0 -; GFX10_3-NEXT: v_lshrrev_b32_e64 v1, 5, s32 +; GFX10_3-NEXT: v_writelane_b32 v22, s30, 0 ; GFX10_3-NEXT: v_lshrrev_b32_e64 v0, 5, s32 -; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo -; GFX10_3-NEXT: v_writelane_b32 v23, s31, 1 -; GFX10_3-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 +; GFX10_3-NEXT: s_lshr_b32 s4, s32, 5 +; GFX10_3-NEXT: v_writelane_b32 v22, s31, 1 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_writelane_b32 v23, s33, 2 -; GFX10_3-NEXT: v_writelane_b32 v23, s34, 3 -; GFX10_3-NEXT: v_writelane_b32 v23, s35, 4 -; GFX10_3-NEXT: v_writelane_b32 v23, s36, 5 -; GFX10_3-NEXT: 
v_writelane_b32 v23, s37, 6 -; GFX10_3-NEXT: v_writelane_b32 v23, s38, 7 -; GFX10_3-NEXT: v_writelane_b32 v23, s39, 8 -; GFX10_3-NEXT: v_writelane_b32 v23, s40, 9 -; GFX10_3-NEXT: v_writelane_b32 v23, s41, 10 -; GFX10_3-NEXT: v_writelane_b32 v23, s42, 11 -; GFX10_3-NEXT: v_writelane_b32 v23, s43, 12 -; GFX10_3-NEXT: v_writelane_b32 v23, s44, 13 -; GFX10_3-NEXT: v_writelane_b32 v23, s45, 14 -; GFX10_3-NEXT: v_writelane_b32 v23, s46, 15 -; GFX10_3-NEXT: v_writelane_b32 v23, s47, 16 -; GFX10_3-NEXT: v_writelane_b32 v23, s48, 17 -; GFX10_3-NEXT: v_writelane_b32 v23, s49, 18 -; GFX10_3-NEXT: v_writelane_b32 v23, s50, 19 -; GFX10_3-NEXT: v_writelane_b32 v23, s51, 20 -; GFX10_3-NEXT: v_writelane_b32 v23, s52, 21 -; GFX10_3-NEXT: v_writelane_b32 v23, s53, 22 -; GFX10_3-NEXT: v_writelane_b32 v23, s54, 23 -; GFX10_3-NEXT: v_writelane_b32 v23, s55, 24 -; GFX10_3-NEXT: v_writelane_b32 v23, s56, 25 -; GFX10_3-NEXT: v_writelane_b32 v23, s57, 26 +; GFX10_3-NEXT: v_writelane_b32 v22, s33, 2 +; GFX10_3-NEXT: v_writelane_b32 v22, s34, 3 +; GFX10_3-NEXT: v_writelane_b32 v22, s35, 4 +; GFX10_3-NEXT: v_writelane_b32 v22, s36, 5 +; GFX10_3-NEXT: v_writelane_b32 v22, s37, 6 +; GFX10_3-NEXT: v_writelane_b32 v22, s38, 7 +; GFX10_3-NEXT: v_writelane_b32 v22, s39, 8 +; GFX10_3-NEXT: v_writelane_b32 v22, s40, 9 +; GFX10_3-NEXT: v_writelane_b32 v22, s41, 10 +; GFX10_3-NEXT: v_writelane_b32 v22, s42, 11 +; GFX10_3-NEXT: v_writelane_b32 v22, s43, 12 +; GFX10_3-NEXT: v_writelane_b32 v22, s44, 13 +; GFX10_3-NEXT: v_writelane_b32 v22, s45, 14 +; GFX10_3-NEXT: v_writelane_b32 v22, s46, 15 +; GFX10_3-NEXT: v_writelane_b32 v22, s47, 16 +; GFX10_3-NEXT: v_writelane_b32 v22, s48, 17 +; GFX10_3-NEXT: v_writelane_b32 v22, s49, 18 +; GFX10_3-NEXT: v_writelane_b32 v22, s50, 19 +; GFX10_3-NEXT: v_writelane_b32 v22, s51, 20 +; GFX10_3-NEXT: v_writelane_b32 v22, s52, 21 +; GFX10_3-NEXT: v_writelane_b32 v22, s53, 22 +; GFX10_3-NEXT: v_writelane_b32 v22, s54, 23 +; GFX10_3-NEXT: v_writelane_b32 v22, s55, 24 +; 
GFX10_3-NEXT: v_writelane_b32 v22, s56, 25 +; GFX10_3-NEXT: v_writelane_b32 v22, s57, 26 +; GFX10_3-NEXT: v_writelane_b32 v22, s59, 27 +; GFX10_3-NEXT: s_add_i32 s59, s4, 0x4240 +; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_writelane_b32 v23, s59, 27 -; GFX10_3-NEXT: v_readfirstlane_b32 s59, v22 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX10_3-NEXT: ;;#ASMEND -; GFX10_3-NEXT: v_readlane_b32 s59, v23, 27 -; GFX10_3-NEXT: v_readlane_b32 s57, v23, 26 -; GFX10_3-NEXT: v_readlane_b32 s56, v23, 25 -; GFX10_3-NEXT: v_readlane_b32 s55, v23, 24 -; GFX10_3-NEXT: v_readlane_b32 s54, v23, 23 -; GFX10_3-NEXT: v_readlane_b32 s53, v23, 22 -; GFX10_3-NEXT: v_readlane_b32 s52, v23, 21 -; GFX10_3-NEXT: v_readlane_b32 s51, v23, 20 -; GFX10_3-NEXT: v_readlane_b32 s50, v23, 19 -; GFX10_3-NEXT: v_readlane_b32 s49, v23, 18 -; GFX10_3-NEXT: v_readlane_b32 s48, v23, 17 -; GFX10_3-NEXT: v_readlane_b32 s47, v23, 16 -; GFX10_3-NEXT: v_readlane_b32 s46, v23, 15 -; GFX10_3-NEXT: v_readlane_b32 s45, v23, 14 -; GFX10_3-NEXT: v_readlane_b32 s44, v23, 13 -; GFX10_3-NEXT: v_readlane_b32 s43, v23, 12 -; GFX10_3-NEXT: v_readlane_b32 s42, v23, 11 -; GFX10_3-NEXT: v_readlane_b32 s41, v23, 10 -; GFX10_3-NEXT: v_readlane_b32 s40, v23, 9 -; GFX10_3-NEXT: v_readlane_b32 s39, v23, 8 -; GFX10_3-NEXT: v_readlane_b32 s38, v23, 7 -; GFX10_3-NEXT: v_readlane_b32 s37, v23, 6 -; GFX10_3-NEXT: v_readlane_b32 s36, v23, 5 -; GFX10_3-NEXT: v_readlane_b32 s35, v23, 4 -; GFX10_3-NEXT: v_readlane_b32 s34, v23, 3 -; GFX10_3-NEXT: v_readlane_b32 s33, v23, 2 -; GFX10_3-NEXT: v_readlane_b32 s31, v23, 1 -; GFX10_3-NEXT: v_readlane_b32 s30, v23, 0 +; GFX10_3-NEXT: v_readlane_b32 s59, v22, 27 +; GFX10_3-NEXT: v_readlane_b32 s57, v22, 26 +; GFX10_3-NEXT: v_readlane_b32 s56, v22, 
25 +; GFX10_3-NEXT: v_readlane_b32 s55, v22, 24 +; GFX10_3-NEXT: v_readlane_b32 s54, v22, 23 +; GFX10_3-NEXT: v_readlane_b32 s53, v22, 22 +; GFX10_3-NEXT: v_readlane_b32 s52, v22, 21 +; GFX10_3-NEXT: v_readlane_b32 s51, v22, 20 +; GFX10_3-NEXT: v_readlane_b32 s50, v22, 19 +; GFX10_3-NEXT: v_readlane_b32 s49, v22, 18 +; GFX10_3-NEXT: v_readlane_b32 s48, v22, 17 +; GFX10_3-NEXT: v_readlane_b32 s47, v22, 16 +; GFX10_3-NEXT: v_readlane_b32 s46, v22, 15 +; GFX10_3-NEXT: v_readlane_b32 s45, v22, 14 +; GFX10_3-NEXT: v_readlane_b32 s44, v22, 13 +; GFX10_3-NEXT: v_readlane_b32 s43, v22, 12 +; GFX10_3-NEXT: v_readlane_b32 s42, v22, 11 +; GFX10_3-NEXT: v_readlane_b32 s41, v22, 10 +; GFX10_3-NEXT: v_readlane_b32 s40, v22, 9 +; GFX10_3-NEXT: v_readlane_b32 s39, v22, 8 +; GFX10_3-NEXT: v_readlane_b32 s38, v22, 7 +; GFX10_3-NEXT: v_readlane_b32 s37, v22, 6 +; GFX10_3-NEXT: v_readlane_b32 s36, v22, 5 +; GFX10_3-NEXT: v_readlane_b32 s35, v22, 4 +; GFX10_3-NEXT: v_readlane_b32 s34, v22, 3 +; GFX10_3-NEXT: v_readlane_b32 s33, v22, 2 +; GFX10_3-NEXT: v_readlane_b32 s31, v22, 1 +; GFX10_3-NEXT: v_readlane_b32 s30, v22, 0 ; GFX10_3-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10_3-NEXT: s_add_i32 s5, s32, 0x100800 -; GFX10_3-NEXT: buffer_load_dword v23, off, s[0:3], s5 ; 4-byte Folded Reload +; GFX10_3-NEXT: buffer_load_dword v22, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -2147,83 +2041,81 @@ define void @scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_store_b32 off, v23, s1 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v22, s1 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v23, s30, 0 +; GFX11-NEXT: v_writelane_b32 v22, s30, 0 ; GFX11-NEXT: s_add_i32 s0, 
s32, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v1, s32 :: v_dual_mov_b32 v0, s0 -; GFX11-NEXT: s_and_b32 s0, 0, exec_lo -; GFX11-NEXT: v_writelane_b32 v23, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use alloca0 v0 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_add_nc_u32_e32 v22, 0x4240, v1 -; GFX11-NEXT: v_writelane_b32 v23, s33, 2 -; GFX11-NEXT: v_writelane_b32 v23, s34, 3 -; GFX11-NEXT: v_writelane_b32 v23, s35, 4 -; GFX11-NEXT: v_writelane_b32 v23, s36, 5 -; GFX11-NEXT: v_writelane_b32 v23, s37, 6 -; GFX11-NEXT: v_writelane_b32 v23, s38, 7 -; GFX11-NEXT: v_writelane_b32 v23, s39, 8 -; GFX11-NEXT: v_writelane_b32 v23, s40, 9 -; GFX11-NEXT: v_writelane_b32 v23, s41, 10 -; GFX11-NEXT: v_writelane_b32 v23, s42, 11 -; GFX11-NEXT: v_writelane_b32 v23, s43, 12 -; GFX11-NEXT: v_writelane_b32 v23, s44, 13 -; GFX11-NEXT: v_writelane_b32 v23, s45, 14 -; GFX11-NEXT: v_writelane_b32 v23, s46, 15 -; GFX11-NEXT: v_writelane_b32 v23, s47, 16 -; GFX11-NEXT: v_writelane_b32 v23, s48, 17 -; GFX11-NEXT: v_writelane_b32 v23, s49, 18 -; GFX11-NEXT: v_writelane_b32 v23, s50, 19 -; GFX11-NEXT: v_writelane_b32 v23, s51, 20 -; GFX11-NEXT: v_writelane_b32 v23, s52, 21 -; GFX11-NEXT: v_writelane_b32 v23, s53, 22 -; GFX11-NEXT: v_writelane_b32 v23, s54, 23 -; GFX11-NEXT: v_writelane_b32 v23, s55, 24 -; GFX11-NEXT: v_writelane_b32 v23, s56, 25 -; GFX11-NEXT: v_writelane_b32 v23, s57, 26 +; GFX11-NEXT: v_writelane_b32 v22, s31, 1 +; GFX11-NEXT: v_writelane_b32 v22, s33, 2 +; GFX11-NEXT: v_writelane_b32 v22, s34, 3 +; GFX11-NEXT: v_writelane_b32 v22, s35, 4 +; GFX11-NEXT: v_writelane_b32 v22, s36, 5 +; GFX11-NEXT: v_writelane_b32 v22, s37, 6 +; GFX11-NEXT: v_writelane_b32 v22, s38, 7 +; GFX11-NEXT: v_writelane_b32 v22, s39, 8 +; GFX11-NEXT: v_writelane_b32 v22, s40, 9 +; GFX11-NEXT: v_writelane_b32 v22, s41, 10 +; GFX11-NEXT: 
v_writelane_b32 v22, s42, 11 +; GFX11-NEXT: v_writelane_b32 v22, s43, 12 +; GFX11-NEXT: v_writelane_b32 v22, s44, 13 +; GFX11-NEXT: v_writelane_b32 v22, s45, 14 +; GFX11-NEXT: v_writelane_b32 v22, s46, 15 +; GFX11-NEXT: v_writelane_b32 v22, s47, 16 +; GFX11-NEXT: v_writelane_b32 v22, s48, 17 +; GFX11-NEXT: v_writelane_b32 v22, s49, 18 +; GFX11-NEXT: v_writelane_b32 v22, s50, 19 +; GFX11-NEXT: v_writelane_b32 v22, s51, 20 +; GFX11-NEXT: v_writelane_b32 v22, s52, 21 +; GFX11-NEXT: v_writelane_b32 v22, s53, 22 +; GFX11-NEXT: v_writelane_b32 v22, s54, 23 +; GFX11-NEXT: v_writelane_b32 v22, s55, 24 +; GFX11-NEXT: v_writelane_b32 v22, s56, 25 +; GFX11-NEXT: v_writelane_b32 v22, s57, 26 +; GFX11-NEXT: v_writelane_b32 v22, s59, 27 +; GFX11-NEXT: s_add_i32 s59, s32, 0x4240 +; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v23, s59, 27 -; GFX11-NEXT: v_readfirstlane_b32 s59, v22 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_readlane_b32 s59, v23, 27 -; GFX11-NEXT: v_readlane_b32 s57, v23, 26 -; GFX11-NEXT: v_readlane_b32 s56, v23, 25 -; GFX11-NEXT: v_readlane_b32 s55, v23, 24 -; GFX11-NEXT: v_readlane_b32 s54, v23, 23 -; GFX11-NEXT: v_readlane_b32 s53, v23, 22 -; GFX11-NEXT: v_readlane_b32 s52, v23, 21 -; GFX11-NEXT: v_readlane_b32 s51, v23, 20 -; GFX11-NEXT: v_readlane_b32 s50, v23, 19 -; GFX11-NEXT: v_readlane_b32 s49, v23, 18 -; GFX11-NEXT: v_readlane_b32 s48, v23, 17 -; GFX11-NEXT: v_readlane_b32 s47, v23, 16 -; GFX11-NEXT: v_readlane_b32 s46, v23, 15 -; GFX11-NEXT: v_readlane_b32 s45, v23, 14 -; GFX11-NEXT: v_readlane_b32 s44, v23, 13 -; GFX11-NEXT: v_readlane_b32 s43, v23, 12 -; GFX11-NEXT: v_readlane_b32 s42, v23, 11 -; GFX11-NEXT: 
v_readlane_b32 s41, v23, 10 -; GFX11-NEXT: v_readlane_b32 s40, v23, 9 -; GFX11-NEXT: v_readlane_b32 s39, v23, 8 -; GFX11-NEXT: v_readlane_b32 s38, v23, 7 -; GFX11-NEXT: v_readlane_b32 s37, v23, 6 -; GFX11-NEXT: v_readlane_b32 s36, v23, 5 -; GFX11-NEXT: v_readlane_b32 s35, v23, 4 -; GFX11-NEXT: v_readlane_b32 s34, v23, 3 -; GFX11-NEXT: v_readlane_b32 s33, v23, 2 -; GFX11-NEXT: v_readlane_b32 s31, v23, 1 -; GFX11-NEXT: v_readlane_b32 s30, v23, 0 +; GFX11-NEXT: v_readlane_b32 s59, v22, 27 +; GFX11-NEXT: v_readlane_b32 s57, v22, 26 +; GFX11-NEXT: v_readlane_b32 s56, v22, 25 +; GFX11-NEXT: v_readlane_b32 s55, v22, 24 +; GFX11-NEXT: v_readlane_b32 s54, v22, 23 +; GFX11-NEXT: v_readlane_b32 s53, v22, 22 +; GFX11-NEXT: v_readlane_b32 s52, v22, 21 +; GFX11-NEXT: v_readlane_b32 s51, v22, 20 +; GFX11-NEXT: v_readlane_b32 s50, v22, 19 +; GFX11-NEXT: v_readlane_b32 s49, v22, 18 +; GFX11-NEXT: v_readlane_b32 s48, v22, 17 +; GFX11-NEXT: v_readlane_b32 s47, v22, 16 +; GFX11-NEXT: v_readlane_b32 s46, v22, 15 +; GFX11-NEXT: v_readlane_b32 s45, v22, 14 +; GFX11-NEXT: v_readlane_b32 s44, v22, 13 +; GFX11-NEXT: v_readlane_b32 s43, v22, 12 +; GFX11-NEXT: v_readlane_b32 s42, v22, 11 +; GFX11-NEXT: v_readlane_b32 s41, v22, 10 +; GFX11-NEXT: v_readlane_b32 s40, v22, 9 +; GFX11-NEXT: v_readlane_b32 s39, v22, 8 +; GFX11-NEXT: v_readlane_b32 s38, v22, 7 +; GFX11-NEXT: v_readlane_b32 s37, v22, 6 +; GFX11-NEXT: v_readlane_b32 s36, v22, 5 +; GFX11-NEXT: v_readlane_b32 s35, v22, 4 +; GFX11-NEXT: v_readlane_b32 s34, v22, 3 +; GFX11-NEXT: v_readlane_b32 s33, v22, 2 +; GFX11-NEXT: v_readlane_b32 s31, v22, 1 +; GFX11-NEXT: v_readlane_b32 s30, v22, 0 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: s_add_i32 s1, s32, 0x8040 -; GFX11-NEXT: scratch_load_b32 v23, off, s1 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v22, off, s1 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2236,82 +2128,79 @@ define void 
@scalar_mov_materializes_frame_index_no_live_scc_no_live_sgprs_gep_i ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v23, s32 offset:32768 ; 4-byte Folded Spill +; GFX12-NEXT: scratch_store_b32 off, v22, s32 offset:32768 ; 4-byte Folded Spill ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v23, s30, 0 -; GFX12-NEXT: v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, s32 -; GFX12-NEXT: s_and_b32 s0, 0, exec_lo +; GFX12-NEXT: v_writelane_b32 v22, s30, 0 +; GFX12-NEXT: v_mov_b32_e32 v0, s32 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use alloca0 v0 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v23, s31, 1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_add_nc_u32_e32 v22, 0x4200, v1 -; GFX12-NEXT: v_writelane_b32 v23, s33, 2 -; GFX12-NEXT: v_writelane_b32 v23, s34, 3 -; GFX12-NEXT: v_writelane_b32 v23, s35, 4 -; GFX12-NEXT: v_writelane_b32 v23, s36, 5 -; GFX12-NEXT: v_writelane_b32 v23, s37, 6 -; GFX12-NEXT: v_writelane_b32 v23, s38, 7 -; GFX12-NEXT: v_writelane_b32 v23, s39, 8 -; GFX12-NEXT: v_writelane_b32 v23, s40, 9 -; GFX12-NEXT: v_writelane_b32 v23, s41, 10 -; GFX12-NEXT: v_writelane_b32 v23, s42, 11 -; GFX12-NEXT: v_writelane_b32 v23, s43, 12 -; GFX12-NEXT: v_writelane_b32 v23, s44, 13 -; GFX12-NEXT: v_writelane_b32 v23, s45, 14 -; GFX12-NEXT: v_writelane_b32 v23, s46, 15 -; GFX12-NEXT: v_writelane_b32 v23, s47, 16 -; GFX12-NEXT: v_writelane_b32 v23, s48, 17 -; GFX12-NEXT: v_writelane_b32 v23, s49, 18 -; GFX12-NEXT: v_writelane_b32 v23, s50, 19 -; GFX12-NEXT: v_writelane_b32 v23, s51, 20 -; GFX12-NEXT: v_writelane_b32 v23, s52, 21 -; GFX12-NEXT: v_writelane_b32 v23, s53, 22 -; GFX12-NEXT: v_writelane_b32 v23, s54, 23 -; GFX12-NEXT: v_writelane_b32 v23, s55, 24 -; GFX12-NEXT: v_writelane_b32 v23, s56, 25 -; GFX12-NEXT: v_writelane_b32 v23, s57, 26 +; GFX12-NEXT: v_writelane_b32 v22, s31, 1 +; GFX12-NEXT: 
v_writelane_b32 v22, s33, 2 +; GFX12-NEXT: v_writelane_b32 v22, s34, 3 +; GFX12-NEXT: v_writelane_b32 v22, s35, 4 +; GFX12-NEXT: v_writelane_b32 v22, s36, 5 +; GFX12-NEXT: v_writelane_b32 v22, s37, 6 +; GFX12-NEXT: v_writelane_b32 v22, s38, 7 +; GFX12-NEXT: v_writelane_b32 v22, s39, 8 +; GFX12-NEXT: v_writelane_b32 v22, s40, 9 +; GFX12-NEXT: v_writelane_b32 v22, s41, 10 +; GFX12-NEXT: v_writelane_b32 v22, s42, 11 +; GFX12-NEXT: v_writelane_b32 v22, s43, 12 +; GFX12-NEXT: v_writelane_b32 v22, s44, 13 +; GFX12-NEXT: v_writelane_b32 v22, s45, 14 +; GFX12-NEXT: v_writelane_b32 v22, s46, 15 +; GFX12-NEXT: v_writelane_b32 v22, s47, 16 +; GFX12-NEXT: v_writelane_b32 v22, s48, 17 +; GFX12-NEXT: v_writelane_b32 v22, s49, 18 +; GFX12-NEXT: v_writelane_b32 v22, s50, 19 +; GFX12-NEXT: v_writelane_b32 v22, s51, 20 +; GFX12-NEXT: v_writelane_b32 v22, s52, 21 +; GFX12-NEXT: v_writelane_b32 v22, s53, 22 +; GFX12-NEXT: v_writelane_b32 v22, s54, 23 +; GFX12-NEXT: v_writelane_b32 v22, s55, 24 +; GFX12-NEXT: v_writelane_b32 v22, s56, 25 +; GFX12-NEXT: v_writelane_b32 v22, s57, 26 +; GFX12-NEXT: v_writelane_b32 v22, s59, 27 +; GFX12-NEXT: s_add_co_i32 s59, s32, 0x4200 +; GFX12-NEXT: s_and_b32 s0, 0, exec_lo ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; def s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v23, s59, 27 -; GFX12-NEXT: v_readfirstlane_b32 s59, v22 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s[0:15], s[16:31], s[32:47], s[48:55], s[56:57], v[0:15], v[16:21], vcc, s59, scc ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_readlane_b32 s59, v23, 27 -; GFX12-NEXT: v_readlane_b32 s57, v23, 26 -; GFX12-NEXT: v_readlane_b32 s56, v23, 25 -; GFX12-NEXT: v_readlane_b32 s55, v23, 24 -; GFX12-NEXT: v_readlane_b32 s54, v23, 23 -; GFX12-NEXT: v_readlane_b32 s53, v23, 22 -; GFX12-NEXT: v_readlane_b32 s52, v23, 21 -; GFX12-NEXT: v_readlane_b32 s51, v23, 20 -; GFX12-NEXT: 
v_readlane_b32 s50, v23, 19 -; GFX12-NEXT: v_readlane_b32 s49, v23, 18 -; GFX12-NEXT: v_readlane_b32 s48, v23, 17 -; GFX12-NEXT: v_readlane_b32 s47, v23, 16 -; GFX12-NEXT: v_readlane_b32 s46, v23, 15 -; GFX12-NEXT: v_readlane_b32 s45, v23, 14 -; GFX12-NEXT: v_readlane_b32 s44, v23, 13 -; GFX12-NEXT: v_readlane_b32 s43, v23, 12 -; GFX12-NEXT: v_readlane_b32 s42, v23, 11 -; GFX12-NEXT: v_readlane_b32 s41, v23, 10 -; GFX12-NEXT: v_readlane_b32 s40, v23, 9 -; GFX12-NEXT: v_readlane_b32 s39, v23, 8 -; GFX12-NEXT: v_readlane_b32 s38, v23, 7 -; GFX12-NEXT: v_readlane_b32 s37, v23, 6 -; GFX12-NEXT: v_readlane_b32 s36, v23, 5 -; GFX12-NEXT: v_readlane_b32 s35, v23, 4 -; GFX12-NEXT: v_readlane_b32 s34, v23, 3 -; GFX12-NEXT: v_readlane_b32 s33, v23, 2 -; GFX12-NEXT: v_readlane_b32 s31, v23, 1 -; GFX12-NEXT: v_readlane_b32 s30, v23, 0 +; GFX12-NEXT: v_readlane_b32 s59, v22, 27 +; GFX12-NEXT: v_readlane_b32 s57, v22, 26 +; GFX12-NEXT: v_readlane_b32 s56, v22, 25 +; GFX12-NEXT: v_readlane_b32 s55, v22, 24 +; GFX12-NEXT: v_readlane_b32 s54, v22, 23 +; GFX12-NEXT: v_readlane_b32 s53, v22, 22 +; GFX12-NEXT: v_readlane_b32 s52, v22, 21 +; GFX12-NEXT: v_readlane_b32 s51, v22, 20 +; GFX12-NEXT: v_readlane_b32 s50, v22, 19 +; GFX12-NEXT: v_readlane_b32 s49, v22, 18 +; GFX12-NEXT: v_readlane_b32 s48, v22, 17 +; GFX12-NEXT: v_readlane_b32 s47, v22, 16 +; GFX12-NEXT: v_readlane_b32 s46, v22, 15 +; GFX12-NEXT: v_readlane_b32 s45, v22, 14 +; GFX12-NEXT: v_readlane_b32 s44, v22, 13 +; GFX12-NEXT: v_readlane_b32 s43, v22, 12 +; GFX12-NEXT: v_readlane_b32 s42, v22, 11 +; GFX12-NEXT: v_readlane_b32 s41, v22, 10 +; GFX12-NEXT: v_readlane_b32 s40, v22, 9 +; GFX12-NEXT: v_readlane_b32 s39, v22, 8 +; GFX12-NEXT: v_readlane_b32 s38, v22, 7 +; GFX12-NEXT: v_readlane_b32 s37, v22, 6 +; GFX12-NEXT: v_readlane_b32 s36, v22, 5 +; GFX12-NEXT: v_readlane_b32 s35, v22, 4 +; GFX12-NEXT: v_readlane_b32 s34, v22, 3 +; GFX12-NEXT: v_readlane_b32 s33, v22, 2 +; GFX12-NEXT: v_readlane_b32 s31, v22, 1 +; 
GFX12-NEXT: v_readlane_b32 s30, v22, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v23, off, s32 offset:32768 ; 4-byte Folded Reload +; GFX12-NEXT: scratch_load_b32 v22, off, s32 offset:32768 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll index 343925528a520e..37a261cab75635 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -10,7 +10,7 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) { ; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 ; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off ; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 -; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; MUBUF-NEXT: s_lshr_b32 s4, s32, 6 ; MUBUF-NEXT: s_waitcnt vmcnt(2) ; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32 ; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 @@ -25,7 +25,7 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) { ; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: ;;#ASMSTART -; MUBUF-NEXT: ; use v0 +; MUBUF-NEXT: ; use s4 ; MUBUF-NEXT: ;;#ASMEND ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -36,7 +36,7 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) { ; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off ; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32 -; FLATSCR-NEXT: v_mov_b32_e32 v0, s32 +; FLATSCR-NEXT: s_mov_b32 s0, s32 ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 ; FLATSCR-NEXT: 
s_waitcnt vmcnt(2) @@ -44,7 +44,7 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) { ; FLATSCR-NEXT: s_waitcnt vmcnt(2) ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32 ; FLATSCR-NEXT: ;;#ASMSTART -; FLATSCR-NEXT: ; use v0 +; FLATSCR-NEXT: ; use s0 ; FLATSCR-NEXT: ;;#ASMEND ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll index ebc209bd4d4510..a37e2bf4eb2945 100644 --- a/llvm/test/CodeGen/AMDGPU/required-export-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/required-export-priority.ll @@ -263,10 +263,10 @@ define amdgpu_ps void @test_export_across_store_load(i32 %idx, float %v) #0 { ; GCN-LABEL: test_export_across_store_load: ; GCN: ; %bb.0: ; GCN-NEXT: s_setprio 2 -; GCN-NEXT: v_mov_b32_e32 v2, 16 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GCN-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GCN-NEXT: v_cndmask_b32_e32 v0, 16, v2, vcc_lo ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: scratch_store_b32 v0, v1, off ; GCN-NEXT: scratch_load_b32 v0, off, off diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll index 5f9e9b8280326b..4ada73029716dc 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -46,7 +46,7 @@ done: ; GCN-LABEL: {{^}}legal_offset_fi_offset: ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} -; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 +; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 4, ; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} @@ -84,8 +84,10 @@ done: } ; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds: -; GCN: 
v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} +; GCN: s_add_i32 [[ADD0:s[0-9]+]], s{{[0-9]+}}, 4 +; GCN: s_add_i32 [[ADD1:s[0-9]+]], [[ADD0]], 16 +; GCN: v_mov_b32_e32 [[V_ADD:v[0-9]+]], [[ADD1]] +; GCN: buffer_store_dword v{{[0-9]+}}, [[V_ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) @@ -96,8 +98,10 @@ entry: } ; GCN-LABEL: {{^}}neg_vaddr_offset: -; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} +; GCN: s_add_i32 [[ADD0:s[0-9]+]], s{{[0-9]+}}, 4 +; GCN: s_add_i32 [[ADD1:s[0-9]+]], [[ADD0]], 16 +; GCN: v_mov_b32_e32 [[V_ADD:v[0-9]+]], [[ADD1]] +; GCN: buffer_store_dword v{{[0-9]+}}, [[V_ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index da6f90561b517e..674c7a67303e49 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -51,11 +51,16 @@ ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 -; MUBUF-DAG: s_mov_b32 s2, -1 -; SI-DAG: s_mov_b32 s3, 0xe8f000 -; VI-DAG: s_mov_b32 s3, 0xe80000 +; SIVI-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SIVI-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SIVI-DAG: s_mov_b32 s6, -1 + +; GFX9-MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9-MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 +; GFX9-MUBUF-DAG: s_mov_b32 s2, -1 + +; SI-DAG: s_mov_b32 s7, 0xe8f000 +; VI-DAG: s_mov_b32 s7, 0xe80000 ; GFX9-MUBUF-DAG: 
s_mov_b32 s3, 0xe00000 ; GFX10_W32-MUBUF-DAG: s_mov_b32 s3, 0x31c16000 ; GFX10_W64-MUBUF-DAG: s_mov_b32 s3, 0x31e16000 @@ -117,7 +122,7 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; MUBUF-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD @@ -170,7 +175,7 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 -; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; MUBUF-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; FLATSCR-NOT: SCRATCH_RSRC_DWORD @@ -199,8 +204,8 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; SIVI-NOT: s_mov_b32 s0 +; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SIVI-NOT: s_mov_b32 s4 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen @@ -252,11 +257,11 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 ; GFX10-FLATSCR-PAL: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 -; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-MUBUF: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0 ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 10fdaaa17da0a4..a1a466fb04440d 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -41,7 +41,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -78,44 +78,79 @@ define amdgpu_kernel void @v_test_i32_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i32_x_sub_64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i32_x_sub_64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u32_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: 
s_endpgm ; -; GFX10-LABEL: v_test_i32_x_sub_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i32_x_sub_64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 0xffffffc0, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i32_x_sub_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i32_x_sub_64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i32_x_sub_64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: 
global_load_dword v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i32_x_sub_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i32_x_sub_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -163,8 +198,8 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; SI-GISEL-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt 
vmcnt(0) ; SI-GISEL-NEXT: buffer_store_dword v3, v[0:1], s[0:3], 0 addr64 @@ -211,66 +246,119 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3 -; VI-GISEL-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v3, vcc, 0xffffffc0, v4 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_store_dword v[0:1], v3 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v2, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u32_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: v_subrev_u32_e32 v2, 64, v2 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; 
GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_dword v2, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v0, v2, s[0:1] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_load_dword v2, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 0xffffffc0, v1 +; GFX9-GISEL-NEXT: v_add_u32_e32 v2, 0xffffffc0, v2 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i32_x_sub_64_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 -; GFX11-NEXT: 
v_subrev_nc_u32_e32 v2, 64, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b32 v0, v2, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX10-SDAG-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_dword v2, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v2, 0xffffffc0, v2 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_dword v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX11-SDAG-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i32_x_sub_64_multi_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_b32 v2, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 0xffffffc0, v1 +; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v2, 0xffffffc0, v2 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b32 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -425,7 +513,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0x41, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, 
vcc, 0xffffffbf, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -462,79 +550,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_65(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 0x41, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 0xffffffbf, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_i32_x_sub_65: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_i32_x_sub_65: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0x41, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_i32_x_sub_65: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_i32_x_sub_65: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_i32_x_sub_65: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_i32_x_sub_65: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_i32_x_sub_65: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i32_x_sub_65: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: 
global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0x41, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_i32_x_sub_65: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -686,7 +739,7 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, -16, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 16, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -723,79 +776,44 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg16(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, -16, v3 +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 16, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg16: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: 
s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg16: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, -16, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg16: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg16: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; GFX9-LABEL: v_test_i32_x_sub_neg16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg16: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: 
v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm +; GFX10-LABEL: v_test_i32_x_sub_neg16: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg16: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, -16, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX11-LABEL: v_test_i32_x_sub_neg16: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -947,7 +965,7 @@ define amdgpu_kernel void 
@v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0xffffffef, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 17, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -983,80 +1001,45 @@ define amdgpu_kernel void @v_test_i32_x_sub_neg17(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u32_e32 v2, vcc, 0xffffffef, v3 -; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 -; VI-GISEL-NEXT: s_endpgm -; -; GFX9-SDAG-LABEL: v_test_i32_x_sub_neg17: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX9-SDAG-NEXT: v_add_u32_e32 v1, 17, v1 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: v_test_i32_x_sub_neg17: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX9-GISEL-NEXT: v_subrev_u32_e32 v1, 0xffffffef, v1 -; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX10-SDAG-LABEL: v_test_i32_x_sub_neg17: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-SDAG-NEXT: global_load_dword v1, v0, s[2:3] -; 
GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 -; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-SDAG-NEXT: s_endpgm -; -; GFX10-GISEL-LABEL: v_test_i32_x_sub_neg17: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-GISEL-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX10-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 -; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-GISEL-NEXT: s_endpgm +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) +; VI-GISEL-NEXT: v_add_u32_e32 v2, vcc, 17, v3 +; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 +; VI-GISEL-NEXT: s_endpgm ; -; GFX11-SDAG-LABEL: v_test_i32_x_sub_neg17: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX11-SDAG-NEXT: v_add_nc_u32_e32 v1, 17, v1 -; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-SDAG-NEXT: s_endpgm +; GFX9-LABEL: v_test_i32_x_sub_neg17: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u32_e32 v1, 17, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX11-GISEL-LABEL: v_test_i32_x_sub_neg17: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX11-GISEL-NEXT: v_subrev_nc_u32_e32 v1, 0xffffffef, v1 -; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-GISEL-NEXT: s_endpgm +; GFX10-LABEL: v_test_i32_x_sub_neg17: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v1, 17, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: v_test_i32_x_sub_neg17: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_add_nc_u32_e32 v1, 17, v1 +; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, ptr addrspace(1) %in, i64 %tid.ext @@ -1263,7 +1246,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_endpgm @@ -1300,44 +1283,79 @@ define amdgpu_kernel void @v_test_i16_x_sub_64(ptr addrspace(1) %out, ptr addrsp ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; 
VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 ; VI-GISEL-NEXT: flat_store_short v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i16_x_sub_64: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i16_x_sub_64: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i16_x_sub_64: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i16_x_sub_64: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u16_e32 v1, 0xffc0, v1 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i16_x_sub_64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: 
v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i16_x_sub_64: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX10-SDAG-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i16_x_sub_64: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i16_x_sub_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i16_x_sub_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: 
global_load_u16 v1, v0, s[2:3] +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1379,7 +1397,7 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; SI-GISEL-NEXT: buffer_load_ushort v3, v[1:2], s[4:7], 0 addr64 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) -; SI-GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-GISEL-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 @@ -1419,50 +1437,91 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v2 +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort 
v1, v1, s[2:3] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX10-NEXT: global_store_dword v0, v1, s[0:1] -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u16_e32 v1, 0xffc0, v1 +; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i16_x_sub_64_zext_to_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 1, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v1, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v1, v1, s[2:3] +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-GISEL-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v1, v1, s[2:3] +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_zext_to_i32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v1, 1, v0 +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v1, v1, s[2:3] +; 
GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1511,8 +1570,8 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[4:7], 0 addr64 glc ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64 @@ -1559,66 +1618,119 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(ptr addrspace(1) %out, ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-GISEL-NEXT: v_subrev_u16_e32 v3, 64, v4 +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 +; VI-GISEL-NEXT: v_add_u16_e32 v3, 0xffc0, v4 ; VI-GISEL-NEXT: flat_store_short v[0:1], v2 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: flat_store_short v[0:1], v3 ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: s_endpgm ; -; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] glc -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 -; GFX9-NEXT: global_store_short v0, v1, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_short v0, v2, s[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: s_endpgm +; GFX9-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_load_ushort v2, v0, s[2:3] glc +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-SDAG-NEXT: v_subrev_u16_e32 v2, 64, v2 +; GFX9-SDAG-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: global_store_short v0, v2, s[0:1] +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9-SDAG-NEXT: s_endpgm ; -; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX10-NEXT: v_sub_nc_u16 v2, v2, 64 -; GFX10-NEXT: global_store_short v0, v1, s[0:1] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_short v0, v2, s[0:1] -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: s_endpgm +; GFX9-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; 
GFX9-GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: v_add_u16_e32 v1, 0xffc0, v1 +; GFX9-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v2 +; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: global_store_short v0, v2, s[0:1] +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9-GISEL-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_i16_x_sub_64_multi_use: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sub_nc_u16 v1, v1, 64 -; GFX11-NEXT: v_sub_nc_u16 v2, v2, 64 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_store_b16 v0, v2, s[0:1] dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_endpgm +; GFX10-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX10-SDAG-NEXT: v_sub_nc_u16 v2, v2, 64 +; GFX10-SDAG-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-SDAG-NEXT: s_endpgm +; +; GFX10-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: global_load_ushort v2, v0, s[2:3] glc dlc +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX10-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX10-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0 +; GFX10-GISEL-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: global_store_short v0, v2, s[0:1] +; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX11-SDAG-NEXT: v_sub_nc_u16 v1, v1, 64 +; GFX11-SDAG-NEXT: v_sub_nc_u16 v2, v2, 64 +; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-SDAG-NEXT: s_endpgm +; +; GFX11-GISEL-LABEL: v_test_i16_x_sub_64_multi_use: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: global_load_u16 v2, v0, s[2:3] glc dlc +; 
GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX11-GISEL-NEXT: v_add_nc_u16 v1, v1, 0xffc0 +; GFX11-GISEL-NEXT: v_add_nc_u16 v2, v2, 0xffc0 +; GFX11-GISEL-NEXT: global_store_b16 v0, v1, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: global_store_b16 v0, v2, s[0:1] dlc +; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i64 %tid.ext @@ -1664,8 +1776,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1698,7 +1810,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 +; VI-GISEL-NEXT: v_not_b32_e32 v4, 63 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1710,8 +1822,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-GISEL-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 +; VI-GISEL-NEXT: 
v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -1792,8 +1904,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 7, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 64, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, -7, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffffc0, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1826,7 +1938,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 64 +; VI-GISEL-NEXT: v_not_b32_e32 v4, 63 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1838,8 +1950,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 7, v3 -; VI-GISEL-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_e32 v2, -7, v3 +; VI-GISEL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -1933,8 +2045,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; SI-GISEL-NEXT: 
buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 64, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x7b, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffff85, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -1967,7 +2079,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; VI-GISEL: ; %bb.0: ; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0x7b +; VI-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff85 ; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s3 @@ -1979,8 +2091,8 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_subrev_u16_e32 v2, 64, v3 -; VI-GISEL-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_e32 v2, 0xffc0, v3 +; VI-GISEL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -2074,7 +2186,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 7, v2 +; SI-GISEL-NEXT: v_add_i32_e32 v2, vcc, -7, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 
v3, 16, v3 ; SI-GISEL-NEXT: v_or_b32_e32 v2, v2, v3 @@ -2117,7 +2229,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(ptr addrspace(1) %out, ptr add ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) ; VI-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; VI-GISEL-NEXT: v_subrev_u16_e32 v3, 7, v3 +; VI-GISEL-NEXT: v_add_u16_e32 v3, -7, v3 ; VI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; VI-GISEL-NEXT: v_or_b32_e32 v2, v3, v2 ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 @@ -2197,7 +2309,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 16, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, -16, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2237,11 +2349,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 16 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, -16 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -2319,7 +2431,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 
addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0xffffc400, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0x3c00, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2359,11 +2471,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0xffffc400 +; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x3c00 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm @@ -2454,7 +2566,7 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; SI-GISEL-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 ; SI-GISEL-NEXT: s_waitcnt vmcnt(0) ; SI-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; SI-GISEL-NEXT: v_subrev_i32_e32 v3, vcc, 0x4400, v3 +; SI-GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffffbc00, v3 ; SI-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; SI-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; SI-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2494,11 +2606,11 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt ; VI-GISEL-NEXT: flat_load_dword v3, v[0:1] ; VI-GISEL-NEXT: v_mov_b32_e32 v0, s0 ; VI-GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; VI-GISEL-NEXT: v_mov_b32_e32 v2, 0x4400 +; VI-GISEL-NEXT: v_mov_b32_e32 
v2, 0xffffbc00 ; VI-GISEL-NEXT: v_mov_b32_e32 v1, s1 ; VI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-GISEL-NEXT: s_waitcnt vmcnt(0) -; VI-GISEL-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-GISEL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-GISEL-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-GISEL-NEXT: flat_store_dword v[0:1], v2 ; VI-GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/LoongArch/e_flags.ll b/llvm/test/CodeGen/LoongArch/e_flags.ll index 2feb9d832bca9a..5d2aa9695b21ae 100644 --- a/llvm/test/CodeGen/LoongArch/e_flags.ll +++ b/llvm/test/CodeGen/LoongArch/e_flags.ll @@ -1,3 +1,6 @@ +; RUN: llc --mtriple=loongarch32 --filetype=obj %s -o %t-la32s +; RUN: llvm-readelf -h %t-la32s | FileCheck %s --check-prefixes=ILP32,ABI-S --match-full-lines + ; RUN: llc --mtriple=loongarch32 -mattr=+d --filetype=obj %s -o %t-la32 ; RUN: llvm-readelf -h %t-la32 | FileCheck %s --check-prefixes=ILP32,ABI-D --match-full-lines @@ -10,6 +13,9 @@ ; RUN: llc --mtriple=loongarch32 -mattr=+d --filetype=obj %s --target-abi=ilp32d -o %t-ilp32d ; RUN: llvm-readelf -h %t-ilp32d | FileCheck %s --check-prefixes=ILP32,ABI-D --match-full-lines +; RUN: llc --mtriple=loongarch64 --filetype=obj %s -o %t-la64d +; RUN: llvm-readelf -h %t-la64d | FileCheck %s --check-prefixes=LP64,ABI-D --match-full-lines + ; RUN: llc --mtriple=loongarch64 -mattr=+d --filetype=obj %s -o %t-la64 ; RUN: llvm-readelf -h %t-la64 | FileCheck %s --check-prefixes=LP64,ABI-D --match-full-lines diff --git a/llvm/test/CodeGen/LoongArch/jr-without-ra.ll b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll new file mode 100644 index 00000000000000..d1c4459aaa6ee0 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/jr-without-ra.ll @@ -0,0 +1,265 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; 
RUN: llc --mtriple=loongarch64 -mattr=+d < %s | FileCheck %s + +;; Check the `jr` instruction does not use `ra` register. +;; Ensure that this function has only one `ret` instruction. +;; ret = jr $ra + +define void @jr_without_ra(ptr %rtwdev, ptr %chan, ptr %h2c, i8 %.pre, i1 %cmp.i, ptr %tssi_trim.i, i64 %indvars.iv, ptr %arrayidx14.i, i8 %0, ptr %curr_tssi_trim_de, ptr %arrayidx, ptr %switch.gep, ptr %tssi_cck, i64 %switch.load, ptr %curr_tssi_cck_de, ptr %arrayidx14, ptr %curr_tssi_cck_de_20m, ptr %tssi_trim_6g.i, i64 %indvars.iv14, ptr %tssi_mcs.i) nounwind { +; CHECK-LABEL: jr_without_ra: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s0, $sp, 72 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s1, $sp, 64 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s2, $sp, 56 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s3, $sp, 48 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s4, $sp, 40 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s5, $sp, 32 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s8, $sp, 8 # 8-byte Folded Spill +; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: move $s0, $zero +; CHECK-NEXT: ld.d $t0, $sp, 184 +; CHECK-NEXT: ld.d $s2, $sp, 176 +; CHECK-NEXT: ld.d $s1, $sp, 168 +; CHECK-NEXT: ld.d $t1, $sp, 160 +; CHECK-NEXT: ld.d $t2, $sp, 152 +; CHECK-NEXT: ld.d $t3, $sp, 144 +; CHECK-NEXT: ld.d $t4, $sp, 136 +; CHECK-NEXT: ld.d $t5, $sp, 128 +; CHECK-NEXT: ld.d $t6, $sp, 120 +; CHECK-NEXT: ld.d $t7, $sp, 112 +; CHECK-NEXT: ld.d $t8, $sp, 104 +; CHECK-NEXT: ld.d $fp, $sp, 96 +; CHECK-NEXT: andi $a4, $a4, 1 +; CHECK-NEXT: alsl.d $a6, $a6, $s1, 4 +; CHECK-NEXT: pcalau12i $s1, %pc_hi20(.LJTI0_0) +; CHECK-NEXT: addi.d $s1, $s1, %pc_lo12(.LJTI0_0) +; CHECK-NEXT: slli.d $s3, $s2, 2 +; CHECK-NEXT: alsl.d $s2, $s2, $s3, 1 +; CHECK-NEXT: 
add.d $s2, $t5, $s2 +; CHECK-NEXT: addi.w $s4, $zero, -41 +; CHECK-NEXT: ori $s3, $zero, 1 +; CHECK-NEXT: slli.d $s4, $s4, 3 +; CHECK-NEXT: ori $s6, $zero, 3 +; CHECK-NEXT: lu32i.d $s6, 262144 +; CHECK-NEXT: b .LBB0_4 +; CHECK-NEXT: .p2align 4, , 16 +; CHECK-NEXT: .LBB0_1: # %sw.bb27.i.i +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: ori $s8, $zero, 1 +; CHECK-NEXT: .LBB0_2: # %if.else.i106 +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: alsl.d $s5, $s0, $s0, 3 +; CHECK-NEXT: alsl.d $s0, $s5, $s0, 1 +; CHECK-NEXT: add.d $s0, $t0, $s0 +; CHECK-NEXT: ldx.bu $s8, $s0, $s8 +; CHECK-NEXT: .LBB0_3: # %phy_tssi_get_ofdm_de.exit +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: st.b $zero, $t5, 0 +; CHECK-NEXT: st.b $s7, $t3, 0 +; CHECK-NEXT: st.b $zero, $t8, 0 +; CHECK-NEXT: st.b $zero, $t1, 0 +; CHECK-NEXT: st.b $zero, $a1, 0 +; CHECK-NEXT: st.b $zero, $t2, 0 +; CHECK-NEXT: st.b $s8, $a5, 0 +; CHECK-NEXT: ori $s0, $zero, 1 +; CHECK-NEXT: move $s7, $a3 +; CHECK-NEXT: .LBB0_4: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: beqz $a4, .LBB0_9 +; CHECK-NEXT: # %bb.5: # %calc_6g.i +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: move $s7, $zero +; CHECK-NEXT: bnez $zero, .LBB0_8 +; CHECK-NEXT: # %bb.6: # %calc_6g.i +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: slli.d $s8, $zero, 3 +; CHECK-NEXT: ldx.d $s8, $s8, $s1 +; CHECK-NEXT: jr $s8 +; CHECK-NEXT: .LBB0_7: # %sw.bb12.i.i +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: ori $s7, $zero, 1 +; CHECK-NEXT: .LBB0_8: # %if.else58.i +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: ldx.bu $s7, $a6, $s7 +; CHECK-NEXT: b .LBB0_11 +; CHECK-NEXT: .p2align 4, , 16 +; CHECK-NEXT: .LBB0_9: # %if.end.i +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: andi $s7, $s7, 255 +; CHECK-NEXT: ori $s5, $zero, 50 +; CHECK-NEXT: bltu $s5, $s7, .LBB0_15 +; CHECK-NEXT: # %bb.10: # %if.end.i +; CHECK-NEXT: # in Loop: 
Header=BB0_4 Depth=1 +; CHECK-NEXT: sll.d $s7, $s3, $s7 +; CHECK-NEXT: and $s8, $s7, $s6 +; CHECK-NEXT: move $s7, $fp +; CHECK-NEXT: beqz $s8, .LBB0_15 +; CHECK-NEXT: .LBB0_11: # %phy_tssi_get_ofdm_trim_de.exit +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: move $s8, $zero +; CHECK-NEXT: st.b $zero, $t7, 0 +; CHECK-NEXT: ldx.b $ra, $s2, $t4 +; CHECK-NEXT: st.b $zero, $a2, 0 +; CHECK-NEXT: st.b $zero, $a7, 0 +; CHECK-NEXT: st.b $zero, $t6, 0 +; CHECK-NEXT: st.b $ra, $a0, 0 +; CHECK-NEXT: bnez $s3, .LBB0_13 +; CHECK-NEXT: # %bb.12: # %phy_tssi_get_ofdm_trim_de.exit +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: pcalau12i $ra, %pc_hi20(.LJTI0_1) +; CHECK-NEXT: addi.d $ra, $ra, %pc_lo12(.LJTI0_1) +; CHECK-NEXT: ldx.d $s5, $s4, $ra +; CHECK-NEXT: jr $s5 +; CHECK-NEXT: .LBB0_13: # %phy_tssi_get_ofdm_trim_de.exit +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: bnez $s3, .LBB0_1 +; CHECK-NEXT: # %bb.14: # %phy_tssi_get_ofdm_trim_de.exit +; CHECK-NEXT: # in Loop: Header=BB0_4 Depth=1 +; CHECK-NEXT: bnez $zero, .LBB0_3 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_15: # %sw.bb9.i.i +; CHECK-NEXT: ld.d $s8, $sp, 8 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s7, $sp, 16 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s6, $sp, 24 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s5, $sp, 32 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s4, $sp, 40 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s3, $sp, 48 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s2, $sp, 56 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s1, $sp, 64 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s0, $sp, 72 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: ret +entry: + br label %for.body + +for.body: + %1 = phi i8 [ 0, %entry ], [ %.pre, %phy_tssi_get_ofdm_de.exit ] + %indvars.iv143 = phi i64 [ 0, %entry ], [ 1, %phy_tssi_get_ofdm_de.exit ] + br i1 
%cmp.i, label %calc_6g.i, label %if.end.i + +if.end.i: + switch i8 %1, label %sw.bb9.i.i [ + i8 1, label %phy_tssi_get_ofdm_trim_de.exit + i8 50, label %phy_tssi_get_ofdm_trim_de.exit + i8 0, label %phy_tssi_get_ofdm_trim_de.exit + ] + +sw.bb9.i.i: + ret void + +calc_6g.i: + switch i8 1, label %if.else58.i [ + i8 55, label %sw.bb5.i125.i + i8 54, label %sw.bb5.i125.i + i8 53, label %sw.bb5.i125.i + i8 52, label %sw.bb5.i125.i + i8 51, label %sw.bb5.i125.i + i8 50, label %sw.bb5.i125.i + i8 49, label %sw.bb5.i125.i + i8 56, label %sw.bb5.i125.i + i8 57, label %sw.bb5.i125.i + i8 58, label %sw.bb5.i125.i + i8 59, label %sw.bb5.i125.i + i8 60, label %sw.bb5.i125.i + i8 61, label %sw.bb5.i125.i + i8 -115, label %sw.bb12.i.i + i8 -116, label %sw.bb12.i.i + i8 -117, label %sw.bb12.i.i + i8 -118, label %sw.bb12.i.i + i8 -119, label %sw.bb12.i.i + i8 -120, label %sw.bb12.i.i + i8 -121, label %sw.bb12.i.i + i8 -122, label %sw.bb12.i.i + i8 -123, label %sw.bb12.i.i + i8 -124, label %sw.bb12.i.i + i8 -125, label %sw.bb12.i.i + i8 -126, label %sw.bb12.i.i + i8 -127, label %sw.bb12.i.i + i8 77, label %sw.bb6.i124.i + i8 76, label %sw.bb6.i124.i + i8 75, label %sw.bb6.i124.i + i8 74, label %sw.bb6.i124.i + i8 73, label %sw.bb6.i124.i + i8 72, label %sw.bb6.i124.i + i8 71, label %sw.bb6.i124.i + i8 1, label %sw.bb6.i124.i + i8 69, label %sw.bb6.i124.i + i8 68, label %sw.bb6.i124.i + i8 67, label %sw.bb6.i124.i + i8 66, label %sw.bb6.i124.i + i8 65, label %sw.bb6.i124.i + ] + +sw.bb5.i125.i: + br label %if.else58.i + +sw.bb6.i124.i: + br label %if.else58.i + +sw.bb12.i.i: + br label %if.else58.i + +if.else58.i: + %retval.0.i120.ph.i = phi i64 [ 0, %calc_6g.i ], [ 1, %sw.bb5.i125.i ], [ 1, %sw.bb6.i124.i ], [ 1, %sw.bb12.i.i ] + %arrayidx63.i = getelementptr [4 x [16 x i8]], ptr %tssi_trim_6g.i, i64 0, i64 %indvars.iv, i64 %retval.0.i120.ph.i + %2 = load i8, ptr %arrayidx63.i, align 1 + br label %phy_tssi_get_ofdm_trim_de.exit + +phy_tssi_get_ofdm_trim_de.exit: + %retval.0.i = phi 
i8 [ %2, %if.else58.i ], [ %0, %if.end.i ], [ %0, %if.end.i ], [ %0, %if.end.i ] + store i8 0, ptr %arrayidx, align 1 + %arrayidx8 = getelementptr [4 x [6 x i8]], ptr %tssi_cck, i64 0, i64 %indvars.iv14, i64 %switch.load + %3 = load i8, ptr %arrayidx8, align 1 + store i8 0, ptr %h2c, align 1 + store i8 0, ptr %arrayidx14.i, align 1 + store i8 0, ptr %switch.gep, align 1 + store i8 %3, ptr %rtwdev, align 1 + switch i8 0, label %if.else.i106 [ + i8 -87, label %sw.bb27.i.i + i8 0, label %sw.bb27.i.i + i8 -89, label %sw.bb27.i.i + i8 -90, label %sw.bb27.i.i + i8 -91, label %sw.bb27.i.i + i8 -92, label %phy_tssi_get_ofdm_de.exit + i8 -93, label %phy_tssi_get_ofdm_de.exit + i8 1, label %phy_tssi_get_ofdm_de.exit + i8 -95, label %sw.bb25.i.i + i8 -96, label %sw.bb25.i.i + i8 -97, label %sw.bb25.i.i + i8 -98, label %sw.bb25.i.i + i8 -99, label %sw.bb25.i.i + i8 43, label %phy_tssi_get_ofdm_de.exit + i8 42, label %phy_tssi_get_ofdm_de.exit + i8 41, label %phy_tssi_get_ofdm_de.exit + ] + +sw.bb25.i.i: + br label %if.else.i106 + +sw.bb27.i.i: + br label %if.else.i106 + +if.else.i106: + %retval.0.i.ph.i107 = phi i64 [ 0, %phy_tssi_get_ofdm_trim_de.exit ], [ 1, %sw.bb25.i.i ], [ 1, %sw.bb27.i.i ] + %arrayidx26.i109 = getelementptr [4 x [19 x i8]], ptr %tssi_mcs.i, i64 0, i64 %indvars.iv143, i64 %retval.0.i.ph.i107 + %4 = load i8, ptr %arrayidx26.i109, align 1 + br label %phy_tssi_get_ofdm_de.exit + +phy_tssi_get_ofdm_de.exit: + %retval.0.i110 = phi i8 [ %4, %if.else.i106 ], [ 0, %phy_tssi_get_ofdm_trim_de.exit ], [ 0, %phy_tssi_get_ofdm_trim_de.exit ], [ 0, %phy_tssi_get_ofdm_trim_de.exit ], [ 0, %phy_tssi_get_ofdm_trim_de.exit ], [ 0, %phy_tssi_get_ofdm_trim_de.exit ], [ 0, %phy_tssi_get_ofdm_trim_de.exit ] + store i8 0, ptr %tssi_cck, align 1 + store i8 %retval.0.i, ptr %curr_tssi_cck_de, align 1 + store i8 0, ptr %curr_tssi_trim_de, align 1 + store i8 0, ptr %curr_tssi_cck_de_20m, align 1 + store i8 0, ptr %chan, align 1 + store i8 0, ptr %arrayidx14, align 1 + store i8 
%retval.0.i110, ptr %tssi_trim.i, align 1 + br label %for.body +} diff --git a/llvm/test/CodeGen/M68k/multiple-return.ll b/llvm/test/CodeGen/M68k/multiple-return.ll index f52f422b194f59..8e97908324f057 100644 --- a/llvm/test/CodeGen/M68k/multiple-return.ll +++ b/llvm/test/CodeGen/M68k/multiple-return.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=m68k-linux -verify-machineinstrs | FileCheck %s -define { i32, i32, i32, i32 } @test() { -; CHECK-LABEL: test: +define { i32, i32, i32, i32 } @test0() { +; CHECK-LABEL: test0: ; CHECK: .cfi_startproc ; CHECK-NEXT: ; %bb.0: ; %start ; CHECK-NEXT: move.l (4,%sp), %a0 @@ -18,3 +18,73 @@ define { i32, i32, i32, i32 } @test() { start: ret { i32, i32, i32, i32 } { i32 13, i32 17, i32 19, i32 23 } } + +define void @call_test0() { +; CHECK-LABEL: call_test0: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %start +; CHECK-NEXT: suba.l #20, %sp +; CHECK-NEXT: .cfi_def_cfa_offset -24 +; CHECK-NEXT: lea (4,%sp), %a0 +; CHECK-NEXT: move.l %a0, (%sp) +; CHECK-NEXT: jsr test0 +; CHECK-NEXT: adda.l #16, %sp +; CHECK-NEXT: rts +start: + %val = call { i32, i32, i32, i32 } @test0() + ret void +} + +define void @test1(ptr sret({ i32, i32, i32, i32 }) %ret_val) { +; CHECK-LABEL: test1: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %start +; CHECK-NEXT: move.l (4,%sp), %d0 +; CHECK-NEXT: move.l (%sp), %a1 +; CHECK-NEXT: adda.l #4, %sp +; CHECK-NEXT: move.l %a1, (%sp) +; CHECK-NEXT: rts +start: + ret void +} + +define void @call_test1() { +; CHECK-LABEL: call_test1: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %start +; CHECK-NEXT: suba.l #20, %sp +; CHECK-NEXT: .cfi_def_cfa_offset -24 +; CHECK-NEXT: lea (4,%sp), %a0 +; CHECK-NEXT: move.l %a0, (%sp) +; CHECK-NEXT: jsr test1 +; CHECK-NEXT: adda.l #16, %sp +; CHECK-NEXT: rts +start: + %ret_val = alloca { i32, i32, i32, i32 } + call void @test1(ptr %ret_val) + ret void +} + +define i32 @test2() { +; 
CHECK-LABEL: test2: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %start +; CHECK-NEXT: moveq #13, %d0 +; CHECK-NEXT: rts +start: + ret i32 13 +} + +define void @call_test2() { +; CHECK-LABEL: call_test2: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: ; %start +; CHECK-NEXT: suba.l #4, %sp +; CHECK-NEXT: .cfi_def_cfa_offset -8 +; CHECK-NEXT: jsr test2 +; CHECK-NEXT: adda.l #4, %sp +; CHECK-NEXT: rts +start: + %0 = call i32 @test2() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll new file mode 100644 index 00000000000000..cb3b0c03f75d09 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll @@ -0,0 +1,144 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX %s +; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tm, i32 %d0, i64 %ch, i1 %flag); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %flag); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %flag); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag); + +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 %f1); +declare void 
@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 %f1); + +; CHECK-LABEL: cp_async_bulk_tensor_prefetch_tile_1d +define void @cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_tile_1d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<2>; +; CHECK-PTX-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 %ch, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_prefetch_tile_2d +define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_tile_2d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4]; +; CHECK-PTX-NEXT: 
cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_prefetch_3d +define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2; +; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1}; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1}, %rd2; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_prefetch_4d +define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2; +; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 undef, i1 0) + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 1) + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_prefetch_5d +define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch) { +; CHECK-PTX-LABEL: cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX: { +; CHECK-PTX-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-EMPTY: +; CHECK-PTX-NEXT: // %bb.0: +; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}]; +; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10]; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2; +; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX-NEXT: 
cp.async.bulk.prefetch.tensor.5d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}; +; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 undef, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 1) + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll index 8f12b182283f53..656bc3661178ac 100644 --- a/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll +++ b/llvm/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll @@ -2499,11 +2499,9 @@ define <2 x i64> @buildi2(i64 %arg, i32 %arg1) { ; ; CHECK-LE-LABEL: buildi2: ; CHECK-LE: # %bb.0: # %entry -; CHECK-LE-NEXT: mtfprd f0, r4 +; CHECK-LE-NEXT: mtfprwz f0, r4 ; CHECK-LE-NEXT: mtfprd f1, r3 -; CHECK-LE-NEXT: xxswapd vs0, vs0 -; CHECK-LE-NEXT: xxswapd v2, vs1 -; CHECK-LE-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-NEXT: blr ; ; CHECK-AIX-LABEL: buildi2: diff --git a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll index 31d0960e19f4ef..3ab49cd39f8d80 100644 --- a/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v16i8_scalar_to_vector_shuffle.ll @@ -266,56 +266,54 @@ entry: 
define <16 x i8> @test_v16i8_v8i16(i16 %arg, i8 %arg1) { ; CHECK-LE-P8-LABEL: test_v16i8_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrd v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrd v2, r4 +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r4, r4, 56 -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtvsrd v2, r4 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r4, r4, 56 -; CHECK-BE-P9-NEXT: sldi r3, r3, 48 -; CHECK-BE-P9-NEXT: mtvsrd v2, r4 -; CHECK-BE-P9-NEXT: mtvsrd v3, r3 -; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI3_0@toc@ha +; CHECK-BE-P9-NEXT: mtfprwz f0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI3_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v16i8_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; 
CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 -; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_v8i16: @@ -348,56 +346,54 @@ entry: define <16 x i8> @test_v8i16_v16i8(i16 %arg, i8 %arg1) { ; CHECK-LE-P8-LABEL: test_v8i16_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrd v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: mtvsrd v2, r4 +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r4, r4, 56 
-; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtvsrd v2, r4 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r4, r4, 56 -; CHECK-BE-P9-NEXT: sldi r3, r3, 48 -; CHECK-BE-P9-NEXT: mtvsrd v2, r4 -; CHECK-BE-P9-NEXT: mtvsrd v3, r3 -; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI4_0@toc@ha +; CHECK-BE-P9-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI4_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C4(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 -; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; 
CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v16i8: @@ -472,7 +468,7 @@ define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { ; CHECK-AIX-64-P8-LABEL: test_none_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C5(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 @@ -481,7 +477,7 @@ define <16 x i8> @test_none_v8i16(i16 %arg, ptr nocapture noundef readonly %b) { ; CHECK-AIX-64-P9-LABEL: test_none_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r4) ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 @@ -545,7 +541,7 @@ define <8 x i16> @test_v8i16_none(<8 x i16> %a, i16 %b) { ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r4, L..C4(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C6(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 @@ -580,53 +576,54 @@ entry: define <16 x i8> @test_v16i8_v4i32(i8 %arg, i32 %arg1, <16 x i8> %a, <4 x i32> %b) { ; CHECK-LE-P8-LABEL: test_v16i8_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-LE-P8-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: 
mtvsrws v3, r4 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrd v2, r3 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r4 +; CHECK-LE-P9-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 56 -; CHECK-BE-P8-NEXT: mtvsrd v2, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 32 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r3, 56 -; CHECK-BE-P9-NEXT: mtvsrws v3, r4 -; CHECK-BE-P9-NEXT: mtvsrd v2, r3 -; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-BE-P9-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v16i8_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r4 -; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 -; 
CHECK-AIX-64-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_v4i32: @@ -660,53 +657,54 @@ entry: define <16 x i8> @test_v4i32_v16i8(i32 %arg, i8 %arg1) { ; CHECK-LE-P8-LABEL: test_v4i32_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: mtvsrws v3, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: mtvsrd v2, r4 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r4, r4, 56 -; CHECK-BE-P8-NEXT: sldi r3, r3, 32 -; CHECK-BE-P8-NEXT: mtvsrd v2, r4 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI8_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI8_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r4, r4, 56 -; CHECK-BE-P9-NEXT: mtvsrws v3, r3 -; CHECK-BE-P9-NEXT: mtvsrd v2, r4 -; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, 
.LCPI8_0@toc@ha +; CHECK-BE-P9-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI8_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r4, r4, 56 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 -; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r4, r4, 56 -; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r3 -; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r4 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v16i8: @@ -781,9 +779,9 @@ define <4 x i32> @test_none_v4i32(<4 x i32> %a, i64 %b) { ; ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r4, L..C5(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C9(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C6(r2) # %const.1 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C10(r2) # %const.1 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v4, v3 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r3 @@ -875,7 +873,7 @@ define <16 x i8> @test_v4i32_none(ptr nocapture noundef readonly %a, ptr nocaptu ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lbzx r4, 0, r4 ; 
CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C11(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vspltb v2, v2, 7 @@ -885,7 +883,7 @@ define <16 x i8> @test_v4i32_none(ptr nocapture noundef readonly %a, ptr nocaptu ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsibzx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: vspltb v2, v2, 7 @@ -928,20 +926,16 @@ entry: define <16 x i8> @test_v16i8_v2i64(i8 %arg, i64 %arg1, <16 x i8> %a, <2 x i64> %b) { ; CHECK-LE-P8-LABEL: test_v16i8_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrd v2, r3 +; CHECK-LE-P9-NEXT: mtvsrd v3, r4 +; CHECK-LE-P9-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v2i64: @@ -1007,20 +1001,16 @@ entry: define <16 x i8> @test_v2i64_v16i8(i64 %arg, i8 %arg1) { ; CHECK-LE-P8-LABEL: test_v2i64_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; 
CHECK-LE-P8-NEXT: mtvsrd v2, r4 +; CHECK-LE-P8-NEXT: mtvsrd v3, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: mtvsrd v2, r4 +; CHECK-LE-P9-NEXT: mtvsrd v3, r3 +; CHECK-LE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v16i8: @@ -1392,7 +1382,7 @@ define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) { ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16rhs: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C12(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r3 ; CHECK-AIX-64-P8-NEXT: lxvw4x v2, 0, r5 @@ -1401,7 +1391,7 @@ define <16 x i8> @test_v8i16_v8i16rhs(i16 %arg, i16 %arg1) { ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16rhs: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: ld r5, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r5, L..C8(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 ; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r3 ; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r5) @@ -1439,53 +1429,54 @@ entry: define <16 x i8> @test_v8i16_v4i32(<8 x i16> %a, <4 x i32> %b, i16 %arg, i32 %arg1) { ; CHECK-LE-P8-LABEL: test_v8i16_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r7 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r8 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrd v2, r7 +; CHECK-LE-P8-NEXT: mtvsrwz v3, r8 +; CHECK-LE-P8-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r7 -; CHECK-LE-P9-NEXT: mtvsrws v3, r8 -; 
CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrd v2, r7 +; CHECK-LE-P9-NEXT: mtvsrwz v3, r8 +; CHECK-LE-P9-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r7, 48 -; CHECK-BE-P8-NEXT: mtvsrd v2, r3 -; CHECK-BE-P8-NEXT: sldi r3, r8, 32 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v2, r7 +; CHECK-BE-P8-NEXT: mtvsrwz v3, r8 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI17_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r7, 48 -; CHECK-BE-P9-NEXT: mtvsrws v3, r8 -; CHECK-BE-P9-NEXT: mtvsrd v2, r3 -; CHECK-BE-P9-NEXT: vmrghb v2, v2, v3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI17_0@toc@ha +; CHECK-BE-P9-NEXT: mtfprwz f0, r7 +; CHECK-BE-P9-NEXT: mtvsrwz v2, r8 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI17_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghb v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C13(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P9-NEXT: mtvsrws v3, r4 -; CHECK-AIX-64-P9-NEXT: mtvsrd v2, r3 -; CHECK-AIX-64-P9-NEXT: 
vmrghb v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: @@ -1519,20 +1510,16 @@ entry: define <16 x i8> @test_v8i16_v2i64(<8 x i16> %a, <2 x i64> %b, i16 %arg, i64 %arg1) { ; CHECK-LE-P8-LABEL: test_v8i16_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r7 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r8 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrd v2, r7 +; CHECK-LE-P8-NEXT: mtvsrd v3, r8 +; CHECK-LE-P8-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r7 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r8 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglb v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrd v2, r7 +; CHECK-LE-P9-NEXT: mtvsrd v3, r8 +; CHECK-LE-P9-NEXT: vmrghb v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v2i64: @@ -1669,53 +1656,54 @@ entry: define <16 x i8> @test_v4i32_v8i16(i32 %arg, i16 %arg1) { ; CHECK-LE-P8-LABEL: test_v4i32_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: mtvsrws v2, r3 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrwz v2, r3 +; 
CHECK-LE-P9-NEXT: mtvsrd v3, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 32 -; CHECK-BE-P8-NEXT: mtvsrd v2, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrwz v3, r4 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: mtvsrws v2, r3 -; CHECK-BE-P9-NEXT: sldi r3, r4, 48 -; CHECK-BE-P9-NEXT: mtvsrd v3, r3 -; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI20_0@toc@ha +; CHECK-BE-P9-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI20_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 -; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C14(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: mtvsrws v2, r3 -; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C10(r2) # %const.0 
+; CHECK-AIX-64-P9-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: @@ -1824,18 +1812,15 @@ define <16 x i8> @test_v2i64_v4i32(i64 %arg, i32 %arg1, <2 x i64> %a, <4 x i32> ; CHECK-LE-P8-LABEL: test_v2i64_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrglw v2, vs1, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghw v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxmrglw v2, vs1, vs0 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghw v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v4i32: @@ -1899,20 +1884,16 @@ entry: define <16 x i8> @test_v2i64_v8i16(i64 %arg, i16 %arg1) { ; CHECK-LE-P8-LABEL: test_v2i64_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, vs0 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: mtvsrd v3, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxswapd v3, vs0 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: mtvsrd v2, r3 +; CHECK-LE-P9-NEXT: mtvsrd v3, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v8i16: @@ -1978,27 +1959,23 @@ entry: define <16 x i8> 
@test_v4i32_v2i64(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI24_0@toc@ha -; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI24_0@toc@l -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v4, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-LE-P8-NEXT: addis r5, r2, .LCPI24_0@toc@ha +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v4, 0, r4 +; CHECK-LE-P8-NEXT: addi r5, r5, .LCPI24_0@toc@l +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r5 +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v4, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI24_0@toc@ha -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l -; CHECK-LE-P9-NEXT: xxswapd v2, f0 ; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v2, v3, vs0 +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI24_0@toc@l +; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-LE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v2i64: @@ -2026,7 +2003,7 @@ define <16 x i8> @test_v4i32_v2i64(ptr nocapture noundef readonly %a, ptr nocapt ; CHECK-AIX-64-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C15(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 @@ -2036,7 +2013,7 @@ define <16 x i8> @test_v4i32_v2i64(ptr 
nocapture noundef readonly %a, ptr nocapt ; CHECK-AIX-64-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C11(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) diff --git a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll index 56c8c128ba9f40..fcfcda586694d5 100644 --- a/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v2i64_scalar_to_vector_shuffle.ll @@ -30,42 +30,42 @@ define <2 x i64> @test_v16i8_v16i8(i8 %arg1, i8 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: mtfprwz f1, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: mtfprwz f1, r4 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v16i8_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; 
CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_v16i8: @@ -102,42 +102,42 @@ define <2 x i64> @test_none_v16i8(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r4) ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxv v2, 0(r4) ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P9-NEXT: blr ; ; 
CHECK-AIX-32-P8-LABEL: test_none_v16i8: @@ -170,42 +170,42 @@ define <2 x i64> @test_v16i8_none(i8 %arg1, ptr nocapture noundef readonly %b) { ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_none: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r4) ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_none: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxv v2, 0(r4) ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v16i8_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_none: @@ -237,54 +237,42 @@ define <2 x i64> @test_v16i8_v8i16(i8 %arg1, i16 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; 
CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 56 -; CHECK-BE-P8-NEXT: mtfprd f0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r3, 56 -; CHECK-BE-P9-NEXT: mtfprd f0, r3 -; CHECK-BE-P9-NEXT: sldi r3, r4, 48 -; CHECK-BE-P9-NEXT: mtfprd f1, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v16i8_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; 
CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_v8i16: @@ -320,54 +308,42 @@ define <2 x i64> @test_v8i16_v16i8(i8 %arg1, i16 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 56 -; CHECK-BE-P8-NEXT: mtfprd f0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r3, 56 -; CHECK-BE-P9-NEXT: mtfprd f0, r3 -; CHECK-BE-P9-NEXT: sldi r3, r4, 48 -; CHECK-BE-P9-NEXT: mtfprd f1, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: 
mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v16i8: @@ -404,42 +380,42 @@ define <2 x i64> @test_v8i16_none(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_none: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r4) ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_none: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxv v2, 0(r4) ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_none: ; CHECK-AIX-64-P9: 
# %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_none: @@ -472,42 +448,42 @@ define <2 x i64> @test_none_v8i16(i16 %arg1, ptr nocapture noundef readonly %b) ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r4) ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxv v2, 0(r4) ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v8i16: @@ -538,52 +514,43 @@ define <2 x i64> 
@test_v16i8_v4i32(i8 %arg1, i32 %arg) { ; CHECK-LE-P8-LABEL: test_v16i8_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 56 -; CHECK-BE-P8-NEXT: mtfprd f0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 32 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v16i8_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r3, 56 -; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-BE-P9-NEXT: mtfprd f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v16i8_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v16i8_v4i32: ; CHECK-AIX-64-P9: # 
%bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 -; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v16i8_v4i32: @@ -618,52 +585,43 @@ define <2 x i64> @test_v4i32_v16i8(i8 %arg1, i32 %arg) { ; CHECK-LE-P8-LABEL: test_v4i32_v16i8: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, vs1 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v16i8: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 56 -; CHECK-BE-P8-NEXT: mtfprd f0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 32 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v16i8: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r3, 56 -; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-BE-P9-NEXT: mtfprd f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v16i8: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; 
CHECK-AIX-64-P8-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 56 -; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 -; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs1, vs0 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs1, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v16i8: @@ -700,42 +658,42 @@ define <2 x i64> @test_none_v4i32(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprwz f0, r3 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_none_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r4) ; CHECK-LE-P9-NEXT: mtfprwz f0, r3 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: xxpermdi v2, vs0, v2, 1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_none_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_none_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxv v2, 0(r4) ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz 
f0, r3 ; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_none_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, v2, vs0, 1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_none_v4i32: @@ -768,42 +726,42 @@ define <2 x i64> @test_v4i32_none(i32 %arg1, ptr nocapture noundef readonly %b) ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: mtfprwz f0, r3 -; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_none: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxv v2, 0(r4) ; CHECK-LE-P9-NEXT: mtfprwz f0, r3 -; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: xxpermdi v2, v2, vs0, 2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_none: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxv v2, 0(r4) ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-BE-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: lxvd2x v2, 0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P8-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxv v2, 0(r4) ; CHECK-AIX-64-P9-NEXT: 
mtfprwz f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, v2 +; CHECK-AIX-64-P9-NEXT: xxpermdi v2, vs0, v2, 2 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_none: @@ -835,18 +793,14 @@ define <2 x i64> @test_v16i8_v2i64(i8 %arg1, i64 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v16i8_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v16i8_v2i64: @@ -923,18 +877,14 @@ define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v16i8: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs0, vs1 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v16i8: @@ -942,14 +892,14 @@ define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) { ; CHECK-BE-P8-NEXT: mtfprd f0, r4 ; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v16i8: ; CHECK-BE-P9: # %bb.0: # 
%entry ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: mtvsrdd v2, r4, r4 -; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v16i8: @@ -957,14 +907,14 @@ define <2 x i64> @test_v2i64_v16i8(i8 %arg1, i64 %arg) { ; CHECK-AIX-64-P8-NEXT: mtfprd f0, r4 ; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v16i8: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r4, r4 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v16i8: @@ -1159,42 +1109,42 @@ define <2 x i64> @test_v8i16_v8i16(i16 %arg1, i16 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: mtfprwz f1, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: mtfprwz f1, r4 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # 
%entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v8i16: @@ -1229,52 +1179,43 @@ define <2 x i64> @test_v8i16_v4i32(i16 %arg1, i32 %arg) { ; CHECK-LE-P8-LABEL: test_v8i16_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtfprd f0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 32 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: sldi r3, r3, 48 -; CHECK-BE-P9-NEXT: mtvsrws vs1, r4 -; CHECK-BE-P9-NEXT: mtfprd f0, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; 
CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 32 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P9-NEXT: mtvsrws vs1, r4 -; CHECK-AIX-64-P9-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: @@ -1310,18 +1251,14 @@ define <2 x i64> @test_v8i16_v2i64(i16 %arg1, i64 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v2i64: @@ -1398,42 +1335,42 @@ define <2 x i64> @test_v4i32_v4i32(i32 %arg1, i32 %arg) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprwz f0, r3 ; CHECK-LE-P8-NEXT: mtfprwz f1, r4 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: 
test_v4i32_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: mtfprwz f1, r4 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: mtfprwz f0, r3 ; CHECK-BE-P8-NEXT: mtfprwz f1, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: mtfprwz f0, r3 ; CHECK-BE-P9-NEXT: mtfprwz f1, r4 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 ; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: @@ -1467,53 +1404,44 @@ entry: define <2 x i64> @test_v4i32_v8i16(i32 %arg1, i16 %arg) { ; CHECK-LE-P8-LABEL: test_v4i32_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 ; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd vs1, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 -; CHECK-LE-P9-NEXT: xxswapd 
vs1, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: sldi r3, r3, 32 -; CHECK-BE-P8-NEXT: mtfprd f0, r3 -; CHECK-BE-P8-NEXT: sldi r3, r4, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P8-NEXT: mtfprwz f0, r3 +; CHECK-BE-P8-NEXT: mtfprwz f1, r4 +; CHECK-BE-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: mtvsrws vs0, r3 -; CHECK-BE-P9-NEXT: sldi r3, r4, 48 -; CHECK-BE-P9-NEXT: mtfprd f1, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-BE-P9-NEXT: mtfprwz f0, r3 +; CHECK-BE-P9-NEXT: mtfprwz f1, r4 +; CHECK-BE-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 32 -; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 -; CHECK-AIX-64-P8-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: mtvsrws vs0, r3 -; CHECK-AIX-64-P9-NEXT: sldi r3, r4, 48 -; CHECK-AIX-64-P9-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r3 +; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: @@ -1547,19 +1475,16 @@ entry: define <2 x i64> @test_v4i32_v2i64(i32 %arg1, i64 %arg) { ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: mtfprd f0, r3 +; CHECK-LE-P8-NEXT: mtfprwz f0, r3 
; CHECK-LE-P8-NEXT: mtfprd f1, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxswapd v2, vs1 -; CHECK-LE-P8-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry +; CHECK-LE-P9-NEXT: mtfprwz f0, r3 ; CHECK-LE-P9-NEXT: mtfprd f1, r4 -; CHECK-LE-P9-NEXT: mtvsrws vs0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs1 -; CHECK-LE-P9-NEXT: xxmrgld v2, v2, vs0 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v2i64: @@ -1730,18 +1655,15 @@ define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) { ; CHECK-LE-P8-LABEL: test_v2i64_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: mtfprwz f1, r4 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtvsrws vs0, r4 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: mtfprwz f1, r4 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v4i32: @@ -1749,14 +1671,14 @@ define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) { ; CHECK-BE-P8-NEXT: mtfprd f0, r3 ; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 ; CHECK-BE-P8-NEXT: mtfprwz f0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: mtfprwz f0, r4 ; CHECK-BE-P9-NEXT: mtvsrdd v2, r3, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v4i32: @@ -1764,14 
+1686,14 @@ define <2 x i64> @test_v2i64_v4i32(i64 %arg1, i32 %arg) { ; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 ; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 ; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r3, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: @@ -1813,19 +1735,15 @@ define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) { ; CHECK-LE-P8-LABEL: test_v2i64_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P8-NEXT: mtfprd f1, r4 +; CHECK-LE-P8-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: mtfprd f0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, vs0 -; CHECK-LE-P9-NEXT: mtfprd f0, r4 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P9-NEXT: xxmrgld v2, vs0, v2 +; CHECK-LE-P9-NEXT: mtfprd f1, r4 +; CHECK-LE-P9-NEXT: xxmrghd v2, vs1, vs0 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v8i16: @@ -1833,14 +1751,14 @@ define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) { ; CHECK-BE-P8-NEXT: mtfprd f0, r3 ; CHECK-BE-P8-NEXT: xxspltd v2, vs0, 0 ; CHECK-BE-P8-NEXT: mtfprwz f0, r4 -; CHECK-BE-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v2i64_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: mtfprwz f0, r4 ; CHECK-BE-P9-NEXT: mtvsrdd v2, r3, r3 -; CHECK-BE-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-BE-P9-NEXT: xxmrgld v2, v2, 
vs0 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v2i64_v8i16: @@ -1848,14 +1766,14 @@ define <2 x i64> @test_v2i64_v8i16(i64 %arg1, i16 %arg) { ; CHECK-AIX-64-P8-NEXT: mtfprd f0, r3 ; CHECK-AIX-64-P8-NEXT: xxmrghd v2, vs0, vs0 ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P8-NEXT: xxmrgld v2, v2, vs0 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 ; CHECK-AIX-64-P9-NEXT: mtvsrdd v2, r3, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghd v2, v2, vs0 +; CHECK-AIX-64-P9-NEXT: xxmrgld v2, v2, vs0 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: diff --git a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll index c8e7b20e4b8c37..402a4f34e62b24 100644 --- a/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v4i32_scalar_to_vector_shuffle.ll @@ -28,15 +28,11 @@ define void @test_none_v8i16(ptr %a) { ; CHECK-LE-P8-LABEL: test_none_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI0_0@toc@ha -; CHECK-LE-P8-NEXT: lxsdx v4, 0, r3 -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI0_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 ; CHECK-LE-P8-NEXT: lhz r4, 0(r3) -; CHECK-LE-P8-NEXT: mtvsrd v3, r4 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: vperm v2, v3, v4, v2 -; CHECK-LE-P8-NEXT: xxswapd vs0, v2 +; CHECK-LE-P8-NEXT: lfdx f1, 0, r3 +; CHECK-LE-P8-NEXT: mtfprd f0, r4 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stfdx f0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; @@ -44,11 +40,8 @@ define void @test_none_v8i16(ptr %a) { ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-LE-P9-NEXT: lfd f1, 0(r3) -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI0_0@toc@ha -; CHECK-LE-P9-NEXT: addi r3, r3, 
.LCPI0_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs2, 0(r3) -; CHECK-LE-P9-NEXT: xxperm vs1, vs0, vs2 -; CHECK-LE-P9-NEXT: xxswapd vs0, vs1 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-LE-P9-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P9-NEXT: stfd f0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; @@ -131,11 +124,15 @@ define void @test_v8i16_none(ptr %a) { ; CHECK-LE-P8-LABEL: test_v8i16_none: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 +; CHECK-LE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-LE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-LE-P8-NEXT: xxswapd v2, vs0 +; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 +; CHECK-LE-P8-NEXT: mtvsrd v4, r3 +; CHECK-LE-P8-NEXT: xxswapd v3, vs0 +; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 +; CHECK-LE-P8-NEXT: xxswapd vs0, v2 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; @@ -143,59 +140,76 @@ define void @test_v8i16_none(ptr %a) { ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-LE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-LE-P9-NEXT: lxv vs2, 0(r3) +; CHECK-LE-P9-NEXT: xxperm vs0, vs1, vs2 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_none: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lhz r4, 0(r3) -; CHECK-BE-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-BE-P8-NEXT: mtfprwz f0, r4 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 +; 
CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_none: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) -; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI1_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI1_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs2, 0(r3) +; CHECK-BE-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-BE-P9-NEXT: stxv vs1, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-64-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: ld r3, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-AIX-64-P9-NEXT: stxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_none: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: mtfprwz f0, r4 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 
+; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_none: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-AIX-32-P9-NEXT: stxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr undef, align 1 @@ -264,7 +278,7 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-AIX-64-P8-NEXT: mffprwz r4, f0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 -; CHECK-AIX-64-P8-NEXT: ld r4, L..C0(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 @@ -275,7 +289,7 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-64-P9-NEXT: li r4, 0 ; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 -; CHECK-AIX-64-P9-NEXT: ld r4, L..C0(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r4, L..C1(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxperm vs0, v2, vs1 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) @@ -286,7 +300,7 @@ define void @test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: lwz r4, L..C0(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 @@ -297,7 +311,7 @@ define void 
@test_none_v4i32(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 ; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P9-NEXT: lwz r4, L..C0(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r4, L..C1(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r4) ; CHECK-AIX-32-P9-NEXT: xxperm vs0, v2, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) @@ -369,7 +383,7 @@ define void @test_v4i32_none(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, v2, v2, 3 ; CHECK-AIX-64-P8-NEXT: mffprwz r4, f0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r4 -; CHECK-AIX-64-P8-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r4, L..C2(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 @@ -380,7 +394,7 @@ define void @test_v4i32_none(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-64-P9-NEXT: li r4, 0 ; CHECK-AIX-64-P9-NEXT: vextuwlx r4, r4, v2 ; CHECK-AIX-64-P9-NEXT: mtfprwz f0, r4 -; CHECK-AIX-64-P9-NEXT: ld r4, L..C1(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r4, L..C2(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: stxv v2, 0(r3) @@ -391,7 +405,7 @@ define void @test_v4i32_none(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-32-P8-NEXT: addi r4, r1, -16 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 -; CHECK-AIX-32-P8-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r4, L..C2(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r4 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 @@ -402,7 +416,7 @@ define void @test_v4i32_none(<2 x i32> %vec, ptr %ptr1) { ; CHECK-AIX-32-P9-NEXT: addi r4, r1, -16 ; CHECK-AIX-32-P9-NEXT: stxv v2, -16(r1) ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P9-NEXT: lwz r4, L..C1(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r4, L..C2(r2) # 
%const.0 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r4) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv v2, 0(r3) @@ -474,12 +488,12 @@ define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_ad ; ; CHECK-AIX-64-P8-LABEL: test_none_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r5, L..C2(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r5, L..C3(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: mtvsrwz v4, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r5 ; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 ; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C3(r2) # %const.1 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C4(r2) # %const.1 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 @@ -487,7 +501,7 @@ define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_ad ; CHECK-AIX-64-P9-LABEL: test_none_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: mtfprwz f1, r4 ; CHECK-AIX-64-P9-NEXT: xxinsertw v2, vs1, 0 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -497,12 +511,12 @@ define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_ad ; CHECK-AIX-32-P8-LABEL: test_none_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C2(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: stw r4, -16(r1) ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 ; CHECK-AIX-32-P8-NEXT: lxvw4x v5, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C3(r2) # %const.1 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.1 ; CHECK-AIX-32-P8-NEXT: vperm v2, v5, v2, v4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, 
v4 @@ -511,7 +525,7 @@ define void @test_none_v2i64(ptr %ptr, i32 %v1, <2 x i32> %vec) local_unnamed_ad ; CHECK-AIX-32-P9-LABEL: test_none_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C2(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: mtfprwz f1, r4 ; CHECK-AIX-32-P9-NEXT: xxinsertw v2, vs1, 0 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) @@ -595,7 +609,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) ; CHECK-LE-P8-NEXT: mtfprd f0, r4 ; CHECK-LE-P8-NEXT: mtfprd f1, r3 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -604,7 +618,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-LE-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; @@ -614,7 +628,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) ; CHECK-BE-P8-NEXT: mtfprwz f0, r4 ; CHECK-BE-P8-NEXT: mtfprwz f1, r3 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P8-NEXT: xxmrglw vs0, vs0, vs1 ; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-BE-P8-NEXT: blr ; @@ -622,7 +636,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-BE-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-BE-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-BE-P9-NEXT: xxmrglw vs0, vs0, vs1 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-BE-P9-NEXT: blr ; @@ -632,7 +646,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) ; CHECK-AIX-64-P8-NEXT: mtfprwz f0, r4 ; CHECK-AIX-64-P8-NEXT: mtfprwz f1, r3 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P8-NEXT: 
xxmrglw vs0, vs0, vs1 ; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; @@ -640,7 +654,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-AIX-64-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-64-P9-NEXT: xxmrglw vs0, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; @@ -650,7 +664,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) ; CHECK-AIX-32-P8-NEXT: mtfprwz f0, r4 ; CHECK-AIX-32-P8-NEXT: mtfprwz f1, r3 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P8-NEXT: xxmrglw vs0, vs0, vs1 ; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; @@ -658,7 +672,7 @@ define void @test_v8i16_v8i16(ptr %a) { ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 ; CHECK-AIX-32-P9-NEXT: lxsihzx f1, 0, r3 -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 +; CHECK-AIX-32-P9-NEXT: xxmrglw vs0, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -681,85 +695,82 @@ define void @test_v8i16_v4i32(ptr %a) { ; CHECK-LE-P8-NEXT: lhz r4, 0(r3) ; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 ; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2 +; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; 
CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: xxmrghw vs0, v2, vs0 -; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) +; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs2, 0(r3) +; CHECK-BE-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-BE-P9-NEXT: stxv vs1, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 +; 
CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, v2, vs0 -; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-AIX-64-P9-NEXT: stxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C5(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm vs1, vs0, vs2 +; 
CHECK-AIX-32-P9-NEXT: stxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr undef, align 1 @@ -780,20 +791,16 @@ define void @test_v8i16_v2i64(ptr %a) { ; CHECK-LE-P8-NEXT: lhz r4, 0(r3) ; CHECK-LE-P8-NEXT: lfdx f1, 0, r3 ; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r3) -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, v2 +; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lfd f1, 0(r3) +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; @@ -838,23 +845,22 @@ define void @test_v8i16_v2i64(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs1, vs0 -; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C6(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx 
vs1, 0, r3 -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C5(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm vs1, vs0, vs2 +; CHECK-AIX-32-P9-NEXT: stxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr undef, align 1 @@ -914,7 +920,7 @@ define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) { ; CHECK-AIX-64-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C4(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C6(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 @@ -923,7 +929,7 @@ define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) { ; CHECK-AIX-64-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 @@ -932,7 +938,7 @@ define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) { ; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C4(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C7(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 @@ -941,7 +947,7 @@ define <16 x i8> @test_v4i32_v4i32(ptr %a, ptr %b) { ; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C3(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: 
lwz r3, L..C6(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 @@ -960,84 +966,81 @@ define void @test_v4i32_v8i16(ptr %a) { ; CHECK-LE-P8-NEXT: lhz r4, 0(r3) ; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 ; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0 +; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtfprd f1, r3 -; CHECK-BE-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-BE-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-BE-P8-NEXT: lhz r4, 0(r3) +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-BE-P8-NEXT: mtvsrwz v2, r4 +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-BE-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-BE-P9-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: xxmrghw vs0, 
vs0, v2 +; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-BE-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs2, 0(r3) +; CHECK-BE-P9-NEXT: xxperm vs0, vs1, vs2 ; CHECK-BE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtfprd f1, r3 -; CHECK-AIX-64-P8-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-64-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-64-P8-NEXT: lhz r4, 0(r3) +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-64-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry -; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: xxsldwi vs0, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: xxmrghw vs0, vs0, v2 +; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm vs0, vs1, vs2 ; CHECK-AIX-64-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 -; CHECK-AIX-32-P8-NEXT: 
xxmrghw vs0, vs0, vs1 -; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C7(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm vs0, vs1, vs2 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: @@ -1058,9 +1061,7 @@ define void @test_v4i32_v2i64(ptr %a) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lfdx f0, 0, r3 ; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -1069,9 +1070,7 @@ define void @test_v4i32_v2i64(ptr %a) { ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lfd f0, 0(r3) ; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: xxswapd vs1, f1 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; @@ -1227,9 +1226,7 @@ define void @test_v2i64_v4i32(ptr %a) { ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lfdx f0, 0, r3 ; CHECK-LE-P8-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd vs0, f0 -; CHECK-LE-P8-NEXT: 
xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr @@ -1238,9 +1235,7 @@ define void @test_v2i64_v4i32(ptr %a) { ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lfd f0, 0(r3) ; CHECK-LE-P9-NEXT: lfiwzx f1, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: xxswapd vs1, f1 -; CHECK-LE-P9-NEXT: xxmrglw vs0, vs1, vs0 +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs1, vs0 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; @@ -1315,20 +1310,16 @@ define void @test_v2i64_v8i16(ptr %a) { ; CHECK-LE-P8-NEXT: lhz r4, 0(r3) ; CHECK-LE-P8-NEXT: lfdx f1, 0, r3 ; CHECK-LE-P8-NEXT: mtfprd f0, r4 -; CHECK-LE-P8-NEXT: xxswapd vs1, f1 -; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 -; CHECK-LE-P8-NEXT: xxmrglw vs0, vs0, vs1 +; CHECK-LE-P8-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P8-NEXT: xxswapd vs0, vs0 ; CHECK-LE-P8-NEXT: stxvd2x vs0, 0, r3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r3) -; CHECK-LE-P9-NEXT: xxswapd vs0, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: xxmrglw vs0, v2, vs0 +; CHECK-LE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-LE-P9-NEXT: lfd f1, 0(r3) +; CHECK-LE-P9-NEXT: xxmrghw vs0, vs0, vs1 ; CHECK-LE-P9-NEXT: stxv vs0, 0(r3) ; CHECK-LE-P9-NEXT: blr ; @@ -1373,22 +1364,21 @@ define void @test_v2i64_v8i16(ptr %a) { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lxvw4x vs1, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw vs0, vs0, 1 -; CHECK-AIX-32-P8-NEXT: xxmrghw vs0, vs0, vs1 -; CHECK-AIX-32-P8-NEXT: stxvw4x vs0, 0, r3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r3 +; 
CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r4 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 +; CHECK-AIX-32-P8-NEXT: stxvw4x v2, 0, r3 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r4, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r4, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv vs0, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx vs1, 0, r3 -; CHECK-AIX-32-P9-NEXT: xxmrghw vs0, vs1, vs0 +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lfiwzx f1, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxv vs2, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm vs0, vs1, vs2 ; CHECK-AIX-32-P9-NEXT: stxv vs0, 0(r3) ; CHECK-AIX-32-P9-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll index e1aa531db449e5..47ffdb4625ed39 100644 --- a/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll +++ b/llvm/test/CodeGen/PowerPC/v8i16_scalar_to_vector_shuffle.ll @@ -241,16 +241,13 @@ entry: define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 { ; CHECK-LE-P8-LABEL: test_none_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry +; CHECK-LE-P8-NEXT: mtfprd f0, r5 ; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha -; CHECK-LE-P8-NEXT: mtvsrd v3, r5 ; CHECK-LE-P8-NEXT: lxsiwzx v4, 0, r3 ; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l -; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: addis r4, r2, .LCPI2_1@toc@ha -; CHECK-LE-P8-NEXT: addi r4, r4, .LCPI2_1@toc@l ; CHECK-LE-P8-NEXT: xxswapd v2, vs0 ; CHECK-LE-P8-NEXT: lxvd2x vs0, 0, r4 -; CHECK-LE-P8-NEXT: vperm v2, v3, v3, v2 +; CHECK-LE-P8-NEXT: vmrglh v2, v2, v2 ; CHECK-LE-P8-NEXT: xxswapd v3, vs0 ; CHECK-LE-P8-NEXT: vperm v2, v2, v4, v3 ; CHECK-LE-P8-NEXT: xxswapd vs0, v2 @@ -261,13 +258,11 @@ define void 
@test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 ; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-LE-P9-NEXT: mtvsrd v3, r5 +; CHECK-LE-P9-NEXT: mtfprd f0, r5 ; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_0@toc@l -; CHECK-LE-P9-NEXT: lxv vs0, 0(r3) -; CHECK-LE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha -; CHECK-LE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l +; CHECK-LE-P9-NEXT: xxswapd v3, vs0 ; CHECK-LE-P9-NEXT: lxv v4, 0(r3) -; CHECK-LE-P9-NEXT: xxperm v3, v3, vs0 +; CHECK-LE-P9-NEXT: vmrglh v3, v3, v3 ; CHECK-LE-P9-NEXT: vperm v2, v3, v2, v4 ; CHECK-LE-P9-NEXT: xxswapd vs0, v2 ; CHECK-LE-P9-NEXT: stfd f0, 0(r3) @@ -275,15 +270,13 @@ define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 ; ; CHECK-BE-P8-LABEL: test_none_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry -; CHECK-BE-P8-NEXT: addis r4, r2, .LCPI2_0@toc@ha -; CHECK-BE-P8-NEXT: mtvsrwz v2, r5 -; CHECK-BE-P8-NEXT: addi r4, r4, .LCPI2_0@toc@l -; CHECK-BE-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-BE-P8-NEXT: vperm v2, v2, v2, v3 +; CHECK-BE-P8-NEXT: sldi r4, r5, 56 ; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_1@toc@ha -; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_1@toc@l +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI2_0@toc@ha +; CHECK-BE-P8-NEXT: mtvsrd v2, r4 +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI2_0@toc@l ; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vmrghh v2, v2, v2 ; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-P8-NEXT: stxsdx v2, 0, r3 ; CHECK-BE-P8-NEXT: blr @@ -291,27 +284,24 @@ define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 ; CHECK-BE-P9-LABEL: test_none_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-BE-P9-NEXT: sldi r3, r5, 56 +; CHECK-BE-P9-NEXT: mtvsrd v3, r3 ; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_0@toc@ha -; CHECK-BE-P9-NEXT: mtvsrwz v3, r5 ; CHECK-BE-P9-NEXT: addi r3, r3, 
.LCPI2_0@toc@l -; CHECK-BE-P9-NEXT: lxv vs0, 0(r3) -; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI2_1@toc@ha -; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI2_1@toc@l +; CHECK-BE-P9-NEXT: vmrghh v3, v3, v3 ; CHECK-BE-P9-NEXT: lxv v4, 0(r3) -; CHECK-BE-P9-NEXT: xxperm v3, v3, vs0 ; CHECK-BE-P9-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P9-NEXT: stxsd v2, 0(r3) ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_none_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry -; CHECK-AIX-64-P8-NEXT: ld r4, L..C3(r2) # %const.0 -; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r5 -; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 -; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: sldi r4, r5, 56 ; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C4(r2) # %const.1 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C3(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: mtvsrd v2, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v2 ; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: stxsdx v2, 0, r3 ; CHECK-AIX-64-P8-NEXT: blr @@ -319,12 +309,11 @@ define void @test_none_v4i32(ptr %ptr, ptr %ptr2, i8 %v3) local_unnamed_addr #0 ; CHECK-AIX-64-P9-LABEL: test_none_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: sldi r3, r5, 56 +; CHECK-AIX-64-P9-NEXT: mtvsrd v3, r3 ; CHECK-AIX-64-P9-NEXT: ld r3, L..C2(r2) # %const.0 -; CHECK-AIX-64-P9-NEXT: mtvsrwz v3, r5 -; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.1 +; CHECK-AIX-64-P9-NEXT: vmrghh v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv v4, 0(r3) -; CHECK-AIX-64-P9-NEXT: xxperm v3, v3, vs0 ; CHECK-AIX-64-P9-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P9-NEXT: stxsd v2, 0(r3) ; CHECK-AIX-64-P9-NEXT: blr @@ -421,7 +410,7 @@ define void @test_v4i32_none(ptr nocapture readonly %ptr1, ptr nocapture readonl ; CHECK-AIX-64-P8-LABEL: test_v4i32_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lxsiwzx v2, 0, 
r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C4(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 @@ -431,7 +420,7 @@ define void @test_v4i32_none(ptr nocapture readonly %ptr1, ptr nocapture readonl ; CHECK-AIX-64-P9-LABEL: test_v4i32_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C3(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: xxlxor vs2, vs2, vs2 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm vs0, vs2, vs1 @@ -531,7 +520,7 @@ define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonl ; CHECK-AIX-64-P8-LABEL: test_none_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C6(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C5(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 @@ -543,7 +532,7 @@ define void @test_none_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readonl ; CHECK-AIX-64-P9-LABEL: test_none_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C4(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxv vs0, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -635,7 +624,7 @@ define void @test_v2i64_none(ptr nocapture readonly %ptr1) { ; CHECK-AIX-64-P8-LABEL: test_v2i64_none: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C6(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: xxlxor v4, v4, v4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v3, 0, r3 ; 
CHECK-AIX-64-P8-NEXT: vperm v2, v4, v2, v3 @@ -645,7 +634,7 @@ define void @test_v2i64_none(ptr nocapture readonly %ptr1) { ; CHECK-AIX-64-P9-LABEL: test_v2i64_none: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C5(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: xxlxor vs2, vs2, vs2 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm vs0, vs2, vs1 @@ -739,7 +728,7 @@ define <16 x i8> @test_v8i16_v8i16(ptr %a, ptr %b) { ; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r4) ; CHECK-AIX-64-P8-NEXT: mtvsrwz v3, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C7(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr @@ -747,7 +736,7 @@ define <16 x i8> @test_v8i16_v8i16(ptr %a, ptr %b) { ; CHECK-AIX-64-P9-LABEL: test_v8i16_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 -; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C6(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r4 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 @@ -784,78 +773,75 @@ define <16 x i8> @test_v8i16_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v8i16_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 -; 
CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v4i32: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v8i16_v4i32: ; CHECK-BE-P9: # %bb.0: # %entry -; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-BE-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI7_0@toc@ha +; CHECK-BE-P9-NEXT: lxsiwzx v2, 0, r4 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI7_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C8(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-64-P9: # %bb.0: # 
%entry -; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C7(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: lxsiwzx v2, 0, r4 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 -; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C7(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a @@ -874,20 +860,16 @@ define <16 x i8> @test_v8i16_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v8i16_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; 
CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v8i16_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v8i16_v2i64: @@ -927,21 +909,20 @@ define <16 x i8> @test_v8i16_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v8i16_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 -; CHECK-AIX-32-P9-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P9-NEXT: lxsihzx f0, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a @@ -1045,7 +1026,7 @@ define void @test_v4i32_v4i32(ptr nocapture readonly %ptr1, ptr nocapture 
readon ; CHECK-AIX-32-P8-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C10(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 @@ -1057,7 +1038,7 @@ define void @test_v4i32_v4i32(ptr nocapture readonly %ptr1, ptr nocapture readon ; CHECK-AIX-32-P9-LABEL: test_v4i32_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C7(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C9(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) @@ -1078,78 +1059,75 @@ define <16 x i8> @test_v4i32_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v4i32_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: lxsiwzx v3, 0, r4 +; CHECK-LE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v8i16: ; CHECK-BE-P8: # %bb.0: # %entry ; CHECK-BE-P8-NEXT: lhz r3, 0(r3) -; CHECK-BE-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-BE-P8-NEXT: sldi r3, r3, 48 -; CHECK-BE-P8-NEXT: mtvsrd v3, r3 -; CHECK-BE-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-BE-P8-NEXT: vmrghh 
v2, v2, v3 +; CHECK-BE-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-BE-P8-NEXT: mtvsrwz v2, r3 +; CHECK-BE-P8-NEXT: addis r3, r2, .LCPI10_0@toc@ha +; CHECK-BE-P8-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-BE-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-BE-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-BE-P8-NEXT: blr ; ; CHECK-BE-P9-LABEL: test_v4i32_v8i16: ; CHECK-BE-P9: # %bb.0: # %entry ; CHECK-BE-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-BE-P9-NEXT: addis r3, r2, .LCPI10_0@toc@ha ; CHECK-BE-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-BE-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-BE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-BE-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-BE-P9-NEXT: addi r3, r3, .LCPI10_0@toc@l +; CHECK-BE-P9-NEXT: lxv vs1, 0(r3) +; CHECK-BE-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-BE-P9-NEXT: blr ; ; CHECK-AIX-64-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-64-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-64-P8-NEXT: sldi r3, r3, 48 -; CHECK-AIX-64-P8-NEXT: mtvsrd v3, r3 -; CHECK-AIX-64-P8-NEXT: xxsldwi v2, f0, f0, 1 -; CHECK-AIX-64-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-64-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-64-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-64-P8-NEXT: blr ; ; CHECK-AIX-64-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-64-P9-NEXT: xxsldwi v3, f0, f0, 1 -; CHECK-AIX-64-P9-NEXT: vsplth v2, v2, 3 -; CHECK-AIX-64-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-64-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-64-P9-NEXT: blr ; ; CHECK-AIX-32-P8-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: 
addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz v2, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C11(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v4i32_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 -; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C10(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a @@ -1167,20 +1145,16 @@ entry: define <16 x i8> @test_v4i32_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v4i32_v2i64: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v4i32_v2i64: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vmrglh v2, v3, v2 +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: vmrghh v2, v3, v2 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v4i32_v2i64: @@ -1218,7 +1192,7 @@ define <16 x i8> 
@test_v4i32_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C12(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: vperm v2, v2, v3, v4 @@ -1227,7 +1201,7 @@ define <16 x i8> @test_v4i32_v2i64(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P9-LABEL: test_v4i32_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C8(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C11(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r4 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 @@ -1310,7 +1284,7 @@ define void @test_v2i64_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readon ; CHECK-AIX-64-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P8: # %bb.0: # %entry ; CHECK-AIX-64-P8-NEXT: lxsdx v2, 0, r3 -; CHECK-AIX-64-P8-NEXT: ld r3, L..C10(r2) # %const.0 +; CHECK-AIX-64-P8-NEXT: ld r3, L..C11(r2) # %const.0 ; CHECK-AIX-64-P8-NEXT: lxsdx v3, 0, r4 ; CHECK-AIX-64-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-64-P8-NEXT: vperm v2, v3, v2, v4 @@ -1322,7 +1296,7 @@ define void @test_v2i64_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readon ; CHECK-AIX-64-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-64-P9: # %bb.0: # %entry ; CHECK-AIX-64-P9-NEXT: lxsd v2, 0(r3) -; CHECK-AIX-64-P9-NEXT: ld r3, L..C9(r2) # %const.0 +; CHECK-AIX-64-P9-NEXT: ld r3, L..C10(r2) # %const.0 ; CHECK-AIX-64-P9-NEXT: lfd f0, 0(r4) ; CHECK-AIX-64-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-64-P9-NEXT: lxv vs1, 0(r3) @@ -1334,7 +1308,7 @@ define void @test_v2i64_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readon ; CHECK-AIX-32-P8-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; 
CHECK-AIX-32-P8-NEXT: lwz r3, L..C10(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C13(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 ; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 @@ -1346,7 +1320,7 @@ define void @test_v2i64_v2i64(ptr nocapture readonly %ptr1, ptr nocapture readon ; CHECK-AIX-32-P9-LABEL: test_v2i64_v2i64: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C9(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C12(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P9-NEXT: xxlxor v3, v3, v3 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) @@ -1366,20 +1340,16 @@ entry: define <16 x i8> @test_v2i64_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v2i64_v4i32: ; CHECK-LE-P8: # %bb.0: # %entry -; CHECK-LE-P8-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, f0 -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v4i32: ; CHECK-LE-P9: # %bb.0: # %entry -; CHECK-LE-P9-NEXT: lfiwzx f0, 0, r3 -; CHECK-LE-P9-NEXT: xxswapd v2, f0 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: lxsiwzx v2, 0, r3 +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v4i32: @@ -1417,7 +1387,7 @@ define <16 x i8> @test_v2i64_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P8-NEXT: lwz r3, L..C11(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C14(r2) # %const.0 ; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 ; 
CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 ; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 @@ -1426,7 +1396,7 @@ define <16 x i8> @test_v2i64_v4i32(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P9-LABEL: test_v2i64_v4i32: ; CHECK-AIX-32-P9: # %bb.0: # %entry ; CHECK-AIX-32-P9-NEXT: lxsiwzx v2, 0, r3 -; CHECK-AIX-32-P9-NEXT: lwz r3, L..C10(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C13(r2) # %const.0 ; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 ; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) ; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 @@ -1448,20 +1418,16 @@ define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-LE-P8-LABEL: test_v2i64_v8i16: ; CHECK-LE-P8: # %bb.0: # %entry ; CHECK-LE-P8-NEXT: lhz r3, 0(r3) -; CHECK-LE-P8-NEXT: mtfprd f0, r3 -; CHECK-LE-P8-NEXT: xxswapd v2, vs0 -; CHECK-LE-P8-NEXT: lfdx f0, 0, r4 -; CHECK-LE-P8-NEXT: xxswapd v3, f0 -; CHECK-LE-P8-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P8-NEXT: lxsdx v3, 0, r4 +; CHECK-LE-P8-NEXT: mtvsrd v2, r3 +; CHECK-LE-P8-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P8-NEXT: blr ; ; CHECK-LE-P9-LABEL: test_v2i64_v8i16: ; CHECK-LE-P9: # %bb.0: # %entry ; CHECK-LE-P9-NEXT: lxsihzx v2, 0, r3 -; CHECK-LE-P9-NEXT: lfd f0, 0(r4) -; CHECK-LE-P9-NEXT: xxswapd v3, f0 -; CHECK-LE-P9-NEXT: vsplth v2, v2, 3 -; CHECK-LE-P9-NEXT: vmrglh v2, v2, v3 +; CHECK-LE-P9-NEXT: lxsd v3, 0(r4) +; CHECK-LE-P9-NEXT: vmrghh v2, v2, v3 ; CHECK-LE-P9-NEXT: blr ; ; CHECK-BE-P8-LABEL: test_v2i64_v8i16: @@ -1501,21 +1467,20 @@ define <16 x i8> @test_v2i64_v8i16(ptr %a, ptr %b) local_unnamed_addr { ; CHECK-AIX-32-P8-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P8: # %bb.0: # %entry ; CHECK-AIX-32-P8-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P8-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P8-NEXT: addi r3, r1, -16 -; CHECK-AIX-32-P8-NEXT: lfiwzx f0, 0, r4 -; CHECK-AIX-32-P8-NEXT: lxvw4x v3, 0, r3 -; CHECK-AIX-32-P8-NEXT: xxspltw v2, vs0, 1 -; CHECK-AIX-32-P8-NEXT: vmrghh v2, v2, v3 +; CHECK-AIX-32-P8-NEXT: lxsiwzx v3, 0, r4 +; CHECK-AIX-32-P8-NEXT: mtvsrwz 
v2, r3 +; CHECK-AIX-32-P8-NEXT: lwz r3, L..C15(r2) # %const.0 +; CHECK-AIX-32-P8-NEXT: lxvw4x v4, 0, r3 +; CHECK-AIX-32-P8-NEXT: vperm v2, v3, v2, v4 ; CHECK-AIX-32-P8-NEXT: blr ; ; CHECK-AIX-32-P9-LABEL: test_v2i64_v8i16: ; CHECK-AIX-32-P9: # %bb.0: # %entry -; CHECK-AIX-32-P9-NEXT: lhz r3, 0(r3) -; CHECK-AIX-32-P9-NEXT: sth r3, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxv v2, -16(r1) -; CHECK-AIX-32-P9-NEXT: lxvwsx v3, 0, r4 -; CHECK-AIX-32-P9-NEXT: vmrghh v2, v3, v2 +; CHECK-AIX-32-P9-NEXT: lxsihzx v2, 0, r3 +; CHECK-AIX-32-P9-NEXT: lwz r3, L..C14(r2) # %const.0 +; CHECK-AIX-32-P9-NEXT: lfiwzx f0, 0, r4 +; CHECK-AIX-32-P9-NEXT: lxv vs1, 0(r3) +; CHECK-AIX-32-P9-NEXT: xxperm v2, vs0, vs1 ; CHECK-AIX-32-P9-NEXT: blr entry: %0 = load <2 x i8>, ptr %a diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll index c503d6541b0a57..14ff9e01ab3bc2 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/alu-roundtrip.ll @@ -457,19 +457,19 @@ entry: define i64 @subi_i64(i64 %a) { ; RV32IM-LABEL: subi_i64: ; RV32IM: # %bb.0: # %entry -; RV32IM-NEXT: lui a2, 301 -; RV32IM-NEXT: addi a3, a2, 1548 -; RV32IM-NEXT: sub a2, a0, a3 -; RV32IM-NEXT: sltu a0, a0, a3 -; RV32IM-NEXT: sub a1, a1, a0 -; RV32IM-NEXT: mv a0, a2 +; RV32IM-NEXT: lui a2, 1048275 +; RV32IM-NEXT: addi a2, a2, -1548 +; RV32IM-NEXT: add a0, a0, a2 +; RV32IM-NEXT: sltu a2, a0, a2 +; RV32IM-NEXT: addi a1, a1, -1 +; RV32IM-NEXT: add a1, a1, a2 ; RV32IM-NEXT: ret ; ; RV64IM-LABEL: subi_i64: ; RV64IM: # %bb.0: # %entry -; RV64IM-NEXT: lui a1, 301 -; RV64IM-NEXT: addiw a1, a1, 1548 -; RV64IM-NEXT: sub a0, a0, a1 +; RV64IM-NEXT: lui a1, 1048275 +; RV64IM-NEXT: addiw a1, a1, -1548 +; RV64IM-NEXT: add a0, a0, a1 ; RV64IM-NEXT: ret entry: %0 = sub i64 %a, 1234444 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll index 32593a74d307ef..d8f20b29e2f064 100644 --- 
a/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/iabs.ll @@ -17,8 +17,7 @@ define i8 @abs8(i8 %x) { ; RV32I-LABEL: abs8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 24 -; RV32I-NEXT: srai a1, a1, 24 -; RV32I-NEXT: srai a1, a1, 7 +; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: ret @@ -33,8 +32,7 @@ define i8 @abs8(i8 %x) { ; RV64I-LABEL: abs8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 24 -; RV64I-NEXT: sraiw a1, a1, 24 -; RV64I-NEXT: sraiw a1, a1, 7 +; RV64I-NEXT: sraiw a1, a1, 31 ; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: ret @@ -53,8 +51,7 @@ define i16 @abs16(i16 %x) { ; RV32I-LABEL: abs16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a0, 16 -; RV32I-NEXT: srai a1, a1, 16 -; RV32I-NEXT: srai a1, a1, 15 +; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: xor a0, a0, a1 ; RV32I-NEXT: ret @@ -69,8 +66,7 @@ define i16 @abs16(i16 %x) { ; RV64I-LABEL: abs16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a1, a0, 16 -; RV64I-NEXT: sraiw a1, a1, 16 -; RV64I-NEXT: sraiw a1, a1, 15 +; RV64I-NEXT: sraiw a1, a1, 31 ; RV64I-NEXT: addw a0, a0, a1 ; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir index 2ef5de501ee711..39d0ee7c382dfc 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv32.mir @@ -200,8 +200,9 @@ body: | ; RV32I: liveins: $x10 ; RV32I-NEXT: {{ $}} ; RV32I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV32I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1234 - ; RV32I-NEXT: $x10 = COPY [[ADDI]] + ; RV32I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234 + ; RV32I-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[ADDI]] + ; RV32I-NEXT: $x10 = COPY [[SUB]] ; RV32I-NEXT: PseudoRET implicit $x10 %0:gprb(s32) = COPY 
$x10 %1:gprb(s32) = G_CONSTANT i32 -1234 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir index be12333e1499b2..527036d8b750fc 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu-rv64.mir @@ -188,8 +188,9 @@ body: | ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV64I-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1234 - ; RV64I-NEXT: $x10 = COPY [[ADDIW]] + ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234 + ; RV64I-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[ADDI]] + ; RV64I-NEXT: $x10 = COPY [[SUBW]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s32) = G_TRUNC %0(s64) @@ -440,8 +441,9 @@ body: | ; RV64I: liveins: $x10 ; RV64I-NEXT: {{ $}} ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1234 - ; RV64I-NEXT: $x10 = COPY [[ADDI]] + ; RV64I-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, -1234 + ; RV64I-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[ADDI]] + ; RV64I-NEXT: $x10 = COPY [[SUB]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s64) = G_CONSTANT i64 -1234 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir index f748f0811a99c6..605830ff4f971b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/alu_m-rv64.mir @@ -47,39 +47,9 @@ body: | ; RV64I-NEXT: $x10 = COPY [[DIVW]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 - %1:gprb(s32) = G_TRUNC %0(s64) - %2:gprb(s64) = COPY $x11 - %3:gprb(s32) = G_TRUNC %2(s64) - %4:gprb(s32) = G_SDIV %1, %3 - %5:gprb(s64) = G_ANYEXT %4(s32) - $x10 = COPY %5(s64) - PseudoRET implicit $x10 - -... 
---- -name: srem_i32 -legalized: true -regBankSelected: true -tracksRegLiveness: true -body: | - bb.0.entry: - liveins: $x10, $x11 - - ; RV64I-LABEL: name: srem_i32 - ; RV64I: liveins: $x10, $x11 - ; RV64I-NEXT: {{ $}} - ; RV64I-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; RV64I-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 - ; RV64I-NEXT: [[REMW:%[0-9]+]]:gpr = REMW [[COPY]], [[COPY1]] - ; RV64I-NEXT: $x10 = COPY [[REMW]] - ; RV64I-NEXT: PseudoRET implicit $x10 - %0:gprb(s64) = COPY $x10 - %1:gprb(s32) = G_TRUNC %0(s64) - %2:gprb(s64) = COPY $x11 - %3:gprb(s32) = G_TRUNC %2(s64) - %4:gprb(s32) = G_SREM %1, %3 - %5:gprb(s64) = G_ANYEXT %4(s32) - $x10 = COPY %5(s64) + %1:gprb(s64) = COPY $x11 + %2:gprb(s64) = G_DIVW %0, %1 + $x10 = COPY %2(s64) PseudoRET implicit $x10 ... @@ -101,12 +71,9 @@ body: | ; RV64I-NEXT: $x10 = COPY [[DIVUW]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 - %1:gprb(s32) = G_TRUNC %0(s64) - %2:gprb(s64) = COPY $x11 - %3:gprb(s32) = G_TRUNC %2(s64) - %4:gprb(s32) = G_UDIV %1, %3 - %5:gprb(s64) = G_ANYEXT %4(s32) - $x10 = COPY %5(s64) + %1:gprb(s64) = COPY $x11 + %2:gprb(s64) = G_DIVUW %0, %1 + $x10 = COPY %2(s64) PseudoRET implicit $x10 ... @@ -128,12 +95,9 @@ body: | ; RV64I-NEXT: $x10 = COPY [[REMUW]] ; RV64I-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 - %1:gprb(s32) = G_TRUNC %0(s64) - %2:gprb(s64) = COPY $x11 - %3:gprb(s32) = G_TRUNC %2(s64) - %4:gprb(s32) = G_UREM %1, %3 - %5:gprb(s64) = G_ANYEXT %4(s32) - $x10 = COPY %5(s64) + %1:gprb(s64) = COPY $x11 + %2:gprb(s64) = G_REMUW %0, %1 + $x10 = COPY %2(s64) PseudoRET implicit $x10 ... 
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir index 5d980e7721458e..d0237892d132f3 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-medium-rv64.mir @@ -111,8 +111,8 @@ body: | %2:gprb(s64) = G_ASSERT_SEXT %1, 32 %7:gprb(s64) = G_CONSTANT i64 5 %3:gprb(s64) = G_SEXT_INREG %2, 32 - %4:gprb(s64) = G_CONSTANT i64 1 - %5:gprb(s64) = G_SUB %3, %4 + %4:gprb(s64) = G_CONSTANT i64 -1 + %5:gprb(s64) = G_ADD %3, %4 %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7 G_BRCOND %26(s64), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir index 27fe465ccf696b..396421a4ba739a 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv32.mir @@ -115,8 +115,8 @@ body: | %12:gprb(s32) = G_CONSTANT i32 3 %13:gprb(s32) = G_CONSTANT i32 4 %14:gprb(s32) = G_CONSTANT i32 1000 - %1:gprb(s32) = G_CONSTANT i32 1 - %2:gprb(s32) = G_SUB %0, %1 + %1:gprb(s32) = G_CONSTANT i32 -1 + %2:gprb(s32) = G_ADD %0, %1 %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4 G_BRCOND %16(s32), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir index 77156b913c5e8b..0a08586bc1af4f 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-pic-rv64.mir @@ -112,8 +112,8 @@ body: | %2:gprb(s64) = G_ASSERT_SEXT %1, 32 %7:gprb(s64) = G_CONSTANT i64 5 %3:gprb(s64) = 
G_SEXT_INREG %2, 32 - %4:gprb(s64) = G_CONSTANT i64 1 - %5:gprb(s64) = G_SUB %3, %4 + %4:gprb(s64) = G_CONSTANT i64 -1 + %5:gprb(s64) = G_ADD %3, %4 %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7 G_BRCOND %26(s64), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir index 388c238b86eb6f..efa1a6c86027db 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-rv32.mir @@ -171,8 +171,8 @@ body: | %12:gprb(s32) = G_CONSTANT i32 3 %13:gprb(s32) = G_CONSTANT i32 4 %14:gprb(s32) = G_CONSTANT i32 1000 - %1:gprb(s32) = G_CONSTANT i32 1 - %2:gprb(s32) = G_SUB %0, %1 + %1:gprb(s32) = G_CONSTANT i32 -1 + %2:gprb(s32) = G_ADD %0, %1 %16:gprb(s32) = G_ICMP intpred(ugt), %2(s32), %4 G_BRCOND %16(s32), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir index 09a855105c2627..12b1517e2cfb54 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/jump-table-brjt-small-rv64.mir @@ -112,8 +112,8 @@ body: | %2:gprb(s64) = G_ASSERT_SEXT %1, 32 %7:gprb(s64) = G_CONSTANT i64 5 %3:gprb(s64) = G_SEXT_INREG %2, 32 - %4:gprb(s64) = G_CONSTANT i64 1 - %5:gprb(s64) = G_SUB %3, %4 + %4:gprb(s64) = G_CONSTANT i64 -1 + %5:gprb(s64) = G_ADD %3, %4 %26:gprb(s64) = G_ICMP intpred(ugt), %5(s64), %7 G_BRCOND %26(s64), %bb.8 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir new file mode 100644 index 00000000000000..3c078e9b7e2ddf --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir @@ -0,0 +1,870 @@ +# 
NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=riscv32-- -run-pass=legalizer %s \ +# RUN: -mattr=+m,+zbb,+zfh,+v -o - 2>&1 | FileCheck %s --check-prefixes=CHECK +# RUN: llc -mtriple=riscv64-- -run-pass=legalizer %s \ +# RUN: -mattr=+m,+zbb,+zfh,+v -o - 2>&1 | FileCheck %s --check-prefixes=CHECK + +# RUN: llc -mtriple=riscv32-- -run-pass=legalizer %s -debug-only=legalizer-info \ +# RUN: -mattr=+m,+zbb,+zfh,+v -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,DEBUG,DEBUG-RV32 +# RUN: llc -mtriple=riscv64-- -run-pass=legalizer %s -debug-only=legalizer-info \ +# RUN: -mattr=+m,+zbb,+zfh,+v -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,DEBUG,DEBUG-RV64 + +# REQUIRES: asserts + +# The main purpose of this test is to make sure we don't over-relax +# LegalizerInfo validation and lose its ability to catch bugs. +# +# Watch out for every "SKIPPED: user-defined predicate detected" in the +# check-lines below and keep each and every one of them justified. + + +# DEBUG: G_ADD (opcode [[ADD_OPC:[0-9]+]]): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_SUB (opcode [[SUB_OPC:[0-9]+]]): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode [[SUB_OPC]] is aliased to [[ADD_OPC]] +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_MUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_SDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_UDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_SREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_UREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_SDIVREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_UDIVREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_AND (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_OR (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_XOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. 
opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_IMPLICIT_DEF (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_PHI (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_FRAME_INDEX (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_GLOBAL_VALUE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_PTRAUTH_GLOBAL_VALUE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_CONSTANT_POOL (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_EXTRACT (opcode {{[0-9]+}}): 2 type indices, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_UNMERGE_VALUES (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-RV32-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-RV32-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-RV64-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-RV64-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_INSERT (opcode {{[0-9]+}}): 2 type indices, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_MERGE_VALUES (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-RV32-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-RV32-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-RV64-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-RV64-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_BUILD_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_BUILD_VECTOR_TRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_CONCAT_VECTORS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# +# DEBUG-NEXT: G_PTRTOINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_INTTOPTR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# +# DEBUG-NEXT: G_BITCAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# +# DEBUG-NEXT: G_FREEZE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected + +# DEBUG-NEXT: G_CONSTANT_FOLD_BARRIER (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected + +# DEBUG-NEXT: G_INTRINSIC_FPTRUNC_ROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined + +# DEBUG-NEXT: G_INTRINSIC_TRUNC (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_INTRINSIC_ROUND (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC_LLRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_READSTEADYCOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined + +# DEBUG-NEXT: G_LOAD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SEXTLOAD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ZEXTLOAD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_INDEXED_LOAD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INDEXED_SEXTLOAD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INDEXED_ZEXTLOAD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STORE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_INDEXED_STORE (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMIC_CMPXCHG_WITH_SUCCESS (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMIC_CMPXCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_XCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_SUB (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_NAND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_OR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_XOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_MAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_MIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_UMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_UMIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_FADD (opcode {{[0-9]+}}): 2 type indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_FSUB (opcode {{[0-9]+}}): 2 type indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_FMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_FMIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_UINC_WRAP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_UDEC_WRAP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_USUB_COND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ATOMICRMW_USUB_SAT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FENCE (opcode {{[0-9]+}}): 0 type indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_PREFETCH (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_BRCOND (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. 
the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_BRINDIRECT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_INVOKE_REGION_START (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC_CONVERGENT (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_ANYEXT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_TRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_CONSTANT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FCONSTANT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_VASTART (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_VAARG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_SEXT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SEXT_INREG (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ZEXT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SHL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_LSHR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ASHR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_FSHL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_FSHR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ROTR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ROTL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ICMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_FCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_SCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UCMP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SELECT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_UADDO (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_UADDE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_USUBO (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_USUBE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SADDO (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SADDE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SSUBO (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SSUBE (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULO (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SMULO (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_UMULH (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SMULH (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_UADDSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SADDSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_USUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SSUBSAT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_USHLSAT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SSHLSAT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SMULFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UMULFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SDIVFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UDIVFIX (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SDIVFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UDIVFIXSAT (opcode {{[0-9]+}}): 1 type index, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. 
the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMAD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FPOW (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FPOWI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FEXP (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FEXP2 (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FEXP10 (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FLOG (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FLOG2 (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FLOG10 (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FLDEXP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FFREXP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT:.. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FNEG (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FPEXT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FPTRUNC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FPTOSI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FPTOUI (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_SITOFP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_UITOFP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FPTOSI_SAT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FPTOUI_SAT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FABS (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FCOPYSIGN (opcode {{[0-9]+}}): 2 type indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_IS_FPCLASS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FCANONICALIZE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FMINNUM (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMAXNUM (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FMINNUM_IEEE (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FMAXNUM_IEEE (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FMINIMUM (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FMAXIMUM (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_RESET_FPENV (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_GET_FPMODE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SET_FPMODE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_RESET_FPMODE (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_PTR_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_PTRMASK (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_SMIN (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_SMAX (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_UMIN (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_UMAX (opcode {{[0-9]+}}): 1 type index +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_ABS (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_LROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_LLROUND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_BR (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_BRJT (opcode {{[0-9]+}}): 2 type indices +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_VSCALE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_INSERT_SUBVECTOR (opcode {{[0-9]+}}): 2 type indices, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_EXTRACT_SUBVECTOR (opcode {{[0-9]+}}): 2 type indices, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_INSERT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_EXTRACT_VECTOR_ELT (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SHUFFLE_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STEP_VECTOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECTOR_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_CTTZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_CTLZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_CTLZ_ZERO_UNDEF (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_CTPOP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_BSWAP (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_BITREVERSE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FSIN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FSINCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FTAN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FACOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FASIN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FATAN (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FATAN2 (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FCOSH (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FSINH (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FTANH (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FRINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FNEARBYINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_ADDRSPACE_CAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_BLOCK_ADDR (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_JUMP_TABLE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_DYN_STACKALLOC (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STACKSAVE (opcode [[STACKSAVE:[0-9]+]]): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STACKRESTORE (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_STRICT_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_STRICT_FLDEXP (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT:.. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT:.. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_READ_REGISTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_WRITE_REGISTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_MEMCPY (opcode {{[0-9]+}}): 3 type indices, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_MEMCPY_INLINE (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_MEMMOVE (opcode {{[0-9]+}}): 3 type indices, 1 imm index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_MEMSET (opcode {{[0-9]+}}): 3 type indices, 1 imm index +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: G_BZERO (opcode {{[0-9]+}}): 2 type indices, 1 imm index +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_TRAP (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_DEBUGTRAP (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UBSANTRAP (opcode {{[0-9]+}}): 0 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_SEQ_FADD (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_SEQ_FMUL (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_FADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_FMUL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_FMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_FMIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_FMAXIMUM (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_FMINIMUM (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_MUL (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_OR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_XOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_SMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_SMIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_UMAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_VECREDUCE_UMIN (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SBFX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UBFX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# CHECK-NOT: ill-defined + +--- +name: dummy +body: | + bb.0: +... diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir index 183f5e59282396..657dd3cc63226b 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-div-rv64.mir @@ -27,16 +27,13 @@ body: | ; CHECK-M-LABEL: name: sdiv_i8 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY 
[[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) + ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) + ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[ASHR]], [[ASHR1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -72,16 +69,13 @@ body: | ; CHECK-M-LABEL: name: sdiv_i15 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 - ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 49 + ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) + ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) + ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[ASHR]], [[ASHR1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -117,16 +111,13 @@ body: | ; CHECK-M-LABEL: name: sdiv_i16 ; 
CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[ASHR]], [[ASHR1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) + ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) + ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[ASHR]], [[ASHR1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -159,11 +150,8 @@ body: | ; CHECK-M-LABEL: name: sdiv_i32 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SDIV:%[0-9]+]]:_(s32) = G_SDIV [[TRUNC]], [[TRUNC1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[DIVW:%[0-9]+]]:_(s64) = G_DIVW [[COPY]], [[COPY1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ 
-343,14 +331,11 @@ body: | ; CHECK-M-LABEL: name: udiv_i8 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]] - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[AND]], [[AND1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -384,14 +369,11 @@ body: | ; CHECK-M-LABEL: name: udiv_i15 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]] - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[AND]], 
[[AND1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -425,14 +407,11 @@ body: | ; CHECK-M-LABEL: name: udiv_i16 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]] - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[AND]], [[AND1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -466,11 +445,8 @@ body: | ; CHECK-M-LABEL: name: udiv_i32 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[TRUNC]], [[TRUNC1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[DIVUW:%[0-9]+]]:_(s64) = G_DIVUW [[COPY]], [[COPY1]] + ; CHECK-M-NEXT: $x10 = COPY [[DIVUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir 
b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir index cd951688843eee..8239bb69508675 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/legalize-rem-rv64.mir @@ -27,16 +27,13 @@ body: | ; CHECK-M-LABEL: name: srem_i8 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 - ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 56 + ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) + ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) + ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[ASHR]], [[ASHR1]] + ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -72,16 +69,13 @@ body: | ; CHECK-M-LABEL: name: srem_i15 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 - ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) - ; 
CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 49 + ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) + ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) + ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[ASHR]], [[ASHR1]] + ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -117,16 +111,13 @@ body: | ; CHECK-M-LABEL: name: srem_i16 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C]](s32) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C]](s32) - ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[SHL1]], [[C]](s32) - ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[ASHR]], [[ASHR1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK-M-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK-M-NEXT: 
[[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[SHL]], [[C]](s64) + ; CHECK-M-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C]](s64) + ; CHECK-M-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C]](s64) + ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[ASHR]], [[ASHR1]] + ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -159,11 +150,10 @@ body: | ; CHECK-M-LABEL: name: srem_i32 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s32) = G_SREM [[TRUNC]], [[TRUNC1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY]], 32 + ; CHECK-M-NEXT: [[SEXT_INREG1:%[0-9]+]]:_(s64) = G_SEXT_INREG [[COPY1]], 32 + ; CHECK-M-NEXT: [[SREM:%[0-9]+]]:_(s64) = G_SREM [[SEXT_INREG]], [[SEXT_INREG1]] + ; CHECK-M-NEXT: $x10 = COPY [[SREM]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -343,14 +333,11 @@ body: | ; CHECK-M-LABEL: name: urem_i8 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]] - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 + ; 
CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[AND]], [[AND1]] + ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -384,14 +371,11 @@ body: | ; CHECK-M-LABEL: name: urem_i15 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]] - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[AND]], [[AND1]] + ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -425,14 +409,11 @@ body: | ; CHECK-M-LABEL: name: urem_i16 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C]] - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]] - ; CHECK-M-NEXT: 
[[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] + ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[AND]], [[AND1]] + ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -466,11 +447,8 @@ body: | ; CHECK-M-LABEL: name: urem_i32 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[TRUNC]], [[TRUNC1]] - ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) - ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) + ; CHECK-M-NEXT: [[REMUW:%[0-9]+]]:_(s64) = G_REMUW [[COPY]], [[COPY1]] + ; CHECK-M-NEXT: $x10 = COPY [[REMUW]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll index 1b323fe35b8e38..c558639fda424e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb-zbkb.ll @@ -2,9 +2,9 @@ ; RUN: llc -mtriple=riscv32 -global-isel -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefixes=CHECK,RV32I ; RUN: llc -mtriple=riscv32 -global-isel -mattr=+zbb -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB-ZBKB +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB-ZBKB,RV32ZBB ; RUN: llc -mtriple=riscv32 -global-isel -mattr=+zbkb -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB-ZBKB +; RUN: | FileCheck %s 
-check-prefixes=CHECK,RV32ZBB-ZBKB,RV32ZBKB define i32 @andn_i32(i32 %a, i32 %b) nounwind { ; RV32I-LABEL: andn_i32: @@ -143,8 +143,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; CHECK-NEXT: bltu a6, a4, .LBB7_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: addi a5, a6, -32 -; CHECK-NEXT: sll a7, a0, a5 +; CHECK-NEXT: sll a7, a0, a6 ; CHECK-NEXT: j .LBB7_3 ; CHECK-NEXT: .LBB7_2: ; CHECK-NEXT: sll a3, a0, a2 @@ -162,8 +161,7 @@ define i64 @rol_i64(i64 %a, i64 %b) nounwind { ; CHECK-NEXT: andi a6, a5, 63 ; CHECK-NEXT: bltu a6, a4, .LBB7_7 ; CHECK-NEXT: # %bb.6: -; CHECK-NEXT: addi a7, a6, -32 -; CHECK-NEXT: srl a7, a1, a7 +; CHECK-NEXT: srl a7, a1, a6 ; CHECK-NEXT: bnez a6, .LBB7_8 ; CHECK-NEXT: j .LBB7_9 ; CHECK-NEXT: .LBB7_7: @@ -220,8 +218,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; CHECK-NEXT: li a4, 32 ; CHECK-NEXT: bltu a5, a4, .LBB9_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: addi a3, a5, -32 -; CHECK-NEXT: srl a6, a1, a3 +; CHECK-NEXT: srl a6, a1, a5 ; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: bnez a5, .LBB9_3 ; CHECK-NEXT: j .LBB9_4 @@ -255,8 +252,7 @@ define i64 @ror_i64(i64 %a, i64 %b) nounwind { ; CHECK-NEXT: bltu a5, a4, .LBB9_6 ; CHECK-NEXT: .LBB9_8: ; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: addi a6, a5, -32 -; CHECK-NEXT: sll a0, a0, a6 +; CHECK-NEXT: sll a0, a0, a5 ; CHECK-NEXT: beqz a5, .LBB9_10 ; CHECK-NEXT: .LBB9_9: ; CHECK-NEXT: mv a1, a0 @@ -338,21 +334,30 @@ define i8 @srli_i8(i8 %a) nounwind { ret i8 %1 } -; We could use sext.b+srai, but slli+srai offers more opportunities for -; comppressed instructions. +; FIXME: We should use slli+srai with Zbb for better compression. 
define i8 @srai_i8(i8 %a) nounwind { ; RV32I-LABEL: srai_i8: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: srai a0, a0, 5 +; RV32I-NEXT: srai a0, a0, 29 ; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: srai_i8: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sext.b a0, a0 +; RV32ZBB-NEXT: srai a0, a0, 5 +; RV32ZBB-NEXT: ret +; +; RV32ZBKB-LABEL: srai_i8: +; RV32ZBKB: # %bb.0: +; RV32ZBKB-NEXT: slli a0, a0, 24 +; RV32ZBKB-NEXT: srai a0, a0, 29 +; RV32ZBKB-NEXT: ret %1 = ashr i8 %a, 5 ret i8 %1 } -; We could use zext.h+srli, but slli+srli offers more opportunities for -; comppressed instructions. +; FIXME: We should use slli+srli. define i16 @srli_i16(i16 %a) nounwind { ; RV32I-LABEL: srli_i16: ; RV32I: # %bb.0: @@ -371,15 +376,25 @@ define i16 @srli_i16(i16 %a) nounwind { ret i16 %1 } -; We could use sext.h+srai, but slli+srai offers more opportunities for -; comppressed instructions. +; FIXME: We should use slli+srai with Zbb/Zbkb for better compression. define i16 @srai_i16(i16 %a) nounwind { ; RV32I-LABEL: srai_i16: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 9 +; RV32I-NEXT: srai a0, a0, 25 ; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: srai_i16: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: sext.h a0, a0 +; RV32ZBB-NEXT: srai a0, a0, 9 +; RV32ZBB-NEXT: ret +; +; RV32ZBKB-LABEL: srai_i16: +; RV32ZBKB: # %bb.0: +; RV32ZBKB-NEXT: slli a0, a0, 16 +; RV32ZBKB-NEXT: srai a0, a0, 25 +; RV32ZBKB-NEXT: ret %1 = ashr i16 %a, 9 ret i16 %1 } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll index 8990c4dd3f26d5..1184905c17edea 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbb.ll @@ -616,12 +616,13 @@ define i32 @sextb_i32(i32 %a) nounwind { ret i32 %shr } +; FIXME: Combine back to back srai. 
define i64 @sextb_i64(i64 %a) nounwind { ; RV32I-LABEL: sextb_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: srai a0, a0, 24 -; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: slli a1, a0, 24 +; RV32I-NEXT: srai a0, a1, 24 +; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: sextb_i64: @@ -650,12 +651,13 @@ define i32 @sexth_i32(i32 %a) nounwind { ret i32 %shr } +; FIXME: Combine back to back srai. define i64 @sexth_i64(i64 %a) nounwind { ; RV32I-LABEL: sexth_i64: ; RV32I: # %bb.0: -; RV32I-NEXT: slli a0, a0, 16 -; RV32I-NEXT: srai a0, a0, 16 -; RV32I-NEXT: srai a1, a0, 31 +; RV32I-NEXT: slli a1, a0, 16 +; RV32I-NEXT: srai a0, a1, 16 +; RV32I-NEXT: srai a1, a1, 31 ; RV32I-NEXT: ret ; ; RV32ZBB-LABEL: sexth_i64: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll index d9b093448cb46e..80e43c94aab0e6 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv32zbkb.ll @@ -106,6 +106,7 @@ define i64 @pack_i64_3(ptr %0, ptr %1) { ret i64 %8 } +; FIXME: Use packh. define i32 @packh_i32(i32 %a, i32 %b) nounwind { ; CHECK-LABEL: packh_i32: ; CHECK: # %bb.0: @@ -143,6 +144,7 @@ define i32 @packh_i32_2(i32 %a, i32 %b) nounwind { ret i32 %or } +; FIXME: Use packh define i64 @packh_i64(i64 %a, i64 %b) nounwind { ; CHECK-LABEL: packh_i64: ; CHECK: # %bb.0: @@ -161,6 +163,7 @@ define i64 @packh_i64(i64 %a, i64 %b) nounwind { ret i64 %or } +; FIXME: The andi+srli for RV32ZBKB should fold to 0.
define i64 @packh_i64_2(i64 %a, i64 %b) nounwind { ; RV32I-LABEL: packh_i64_2: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll index d9b7f16131c352..a6d3ddbf199931 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb-zbkb.ll @@ -2,10 +2,11 @@ ; RUN: llc -mtriple=riscv64 -global-isel -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefixes=CHECK,RV64I ; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbb -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=CHECK,RV64ZBB-ZBKB +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64ZBB-ZBKB,RV64ZBB ; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbkb -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefixes=CHECK,RV64ZBB-ZBKB +; RUN: | FileCheck %s -check-prefixes=CHECK,RV64ZBB-ZBKB,RV64ZBKB +; FIXME: sext.w is unneeded. define signext i32 @andn_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: andn_i32: ; RV64I: # %bb.0: @@ -40,6 +41,7 @@ define i64 @andn_i64(i64 %a, i64 %b) nounwind { ret i64 %and } +; FIXME: sext.w is unneeded. define signext i32 @orn_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: orn_i32: ; RV64I: # %bb.0: @@ -74,6 +76,7 @@ define i64 @orn_i64(i64 %a, i64 %b) nounwind { ret i64 %or } +; FIXME: sext.w is unneeded. define signext i32 @xnor_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: xnor_i32: ; RV64I: # %bb.0: @@ -424,6 +427,7 @@ define i64 @rori_i64_fshr(i64 %a) nounwind { ret i64 %1 } +; FIXME: We should use srli instead of srliw for better compression. define i8 @srli_i8(i8 %a) nounwind { ; CHECK-LABEL: srli_i8: ; CHECK: # %bb.0: @@ -434,21 +438,30 @@ define i8 @srli_i8(i8 %a) nounwind { ret i8 %1 } -; We could use sext.b+srai, but slli+srai offers more opportunities for -; comppressed instructions. +; FIXME: We should use slli+srai with Zbb for better compression. 
define i8 @srai_i8(i8 %a) nounwind { ; RV64I-LABEL: srai_i8: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 24 -; RV64I-NEXT: sraiw a0, a0, 24 -; RV64I-NEXT: sraiw a0, a0, 5 +; RV64I-NEXT: sraiw a0, a0, 29 ; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: srai_i8: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.b a0, a0 +; RV64ZBB-NEXT: sraiw a0, a0, 5 +; RV64ZBB-NEXT: ret +; +; RV64ZBKB-LABEL: srai_i8: +; RV64ZBKB: # %bb.0: +; RV64ZBKB-NEXT: slli a0, a0, 24 +; RV64ZBKB-NEXT: sraiw a0, a0, 29 +; RV64ZBKB-NEXT: ret %1 = ashr i8 %a, 5 ret i8 %1 } -; We could use zext.h+srli, but slli+srli offers more opportunities for -; comppressed instructions. +; FIXME: We should use slli+srli. define i16 @srli_i16(i16 %a) nounwind { ; RV64I-LABEL: srli_i16: ; RV64I: # %bb.0: @@ -457,19 +470,43 @@ define i16 @srli_i16(i16 %a) nounwind { ; RV64I-NEXT: and a0, a0, a1 ; RV64I-NEXT: srliw a0, a0, 6 ; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: srli_i16: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: zext.h a0, a0 +; RV64ZBB-NEXT: srliw a0, a0, 6 +; RV64ZBB-NEXT: ret +; +; RV64ZBKB-LABEL: srli_i16: +; RV64ZBKB: # %bb.0: +; RV64ZBKB-NEXT: lui a1, 16 +; RV64ZBKB-NEXT: addi a1, a1, -1 +; RV64ZBKB-NEXT: and a0, a0, a1 +; RV64ZBKB-NEXT: srliw a0, a0, 6 +; RV64ZBKB-NEXT: ret %1 = lshr i16 %a, 6 ret i16 %1 } -; We could use sext.h+srai, but slli+srai offers more opportunities for -; comppressed instructions. +; FIXME: We should use slli+srai with Zbb/Zbkb for better compression. 
define i16 @srai_i16(i16 %a) nounwind { ; RV64I-LABEL: srai_i16: ; RV64I: # %bb.0: ; RV64I-NEXT: slli a0, a0, 16 -; RV64I-NEXT: sraiw a0, a0, 16 -; RV64I-NEXT: sraiw a0, a0, 9 +; RV64I-NEXT: sraiw a0, a0, 25 ; RV64I-NEXT: ret +; +; RV64ZBB-LABEL: srai_i16: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: sext.h a0, a0 +; RV64ZBB-NEXT: sraiw a0, a0, 9 +; RV64ZBB-NEXT: ret +; +; RV64ZBKB-LABEL: srai_i16: +; RV64ZBKB: # %bb.0: +; RV64ZBKB-NEXT: slli a0, a0, 16 +; RV64ZBKB-NEXT: sraiw a0, a0, 25 +; RV64ZBKB-NEXT: ret %1 = ashr i16 %a, 9 ret i16 %1 } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll index 835b4e32ae3206..1d0e38d2f91c44 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbb.ll @@ -6,6 +6,7 @@ declare i32 @llvm.ctlz.i32(i32, i1) +; FIXME: We don't need the shift pair before the beqz for RV64I. define signext i32 @ctlz_i32(i32 signext %a) nounwind { ; RV64I-LABEL: ctlz_i32: ; RV64I: # %bb.0: @@ -126,6 +127,7 @@ define signext i32 @log2_i32(i32 signext %a) nounwind { ret i32 %2 } +; FIXME: We don't need the shift pair before the beqz for RV64I. define signext i32 @log2_ceil_i32(i32 signext %a) nounwind { ; RV64I-LABEL: log2_ceil_i32: ; RV64I: # %bb.0: @@ -264,20 +266,21 @@ define signext i32 @findLastSet_i32(i32 signext %a) nounwind { ret i32 %4 } +; FIXME: We don't need the shift pair before the beqz for RV64I. 
define i32 @ctlz_lshr_i32(i32 signext %a) { ; RV64I-LABEL: ctlz_lshr_i32: ; RV64I: # %bb.0: -; RV64I-NEXT: srliw a0, a0, 1 -; RV64I-NEXT: slli a1, a0, 32 -; RV64I-NEXT: srli a1, a1, 32 -; RV64I-NEXT: beqz a1, .LBB4_2 +; RV64I-NEXT: srliw a1, a0, 1 +; RV64I-NEXT: slli a2, a1, 32 +; RV64I-NEXT: srli a2, a2, 32 +; RV64I-NEXT: beqz a2, .LBB4_2 ; RV64I-NEXT: # %bb.1: # %cond.false ; RV64I-NEXT: addi sp, sp, -16 ; RV64I-NEXT: .cfi_def_cfa_offset 16 ; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill ; RV64I-NEXT: .cfi_offset ra, -8 -; RV64I-NEXT: srliw a1, a0, 1 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: srliw a0, a0, 2 +; RV64I-NEXT: or a0, a1, a0 ; RV64I-NEXT: srliw a1, a0, 2 ; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: srliw a1, a0, 4 @@ -995,6 +998,8 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind { ret i64 %cond } +; FIXME: We don't need the shift pairs. The inputs are sign extended, we can +; compare them directly. define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: minu_i32: ; RV64I: # %bb.0: @@ -1041,6 +1046,8 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind { ret i64 %cond } +; FIXME: We don't need the shift pairs. The inputs are sign extended, we can +; compare them directly. 
define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: maxu_i32: ; RV64I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll index 6b57b179240d70..b449b7d1beaaec 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rv64zbkb.ll @@ -4,6 +4,7 @@ ; RUN: llc -mtriple=riscv64 -global-isel -mattr=+zbkb -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV64ZBKB +; FIXME: Use packw define signext i32 @pack_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: pack_i32: ; RV64I: # %bb.0: @@ -30,6 +31,7 @@ define signext i32 @pack_i32(i32 signext %a, i32 signext %b) nounwind { ret i32 %or } +; FIXME: Use packw define signext i32 @pack_i32_2(i16 zeroext %a, i16 zeroext %b) nounwind { ; RV64I-LABEL: pack_i32_2: ; RV64I: # %bb.0: @@ -52,6 +54,7 @@ define signext i32 @pack_i32_2(i16 zeroext %a, i16 zeroext %b) nounwind { } ; Test case where we don't have a sign_extend_inreg after the or. +; FIXME: Use packw define signext i32 @pack_i32_3(i16 zeroext %0, i16 zeroext %1, i32 signext %2) { ; RV64I-LABEL: pack_i32_3: ; RV64I: # %bb.0: @@ -93,6 +96,7 @@ define i64 @pack_i64(i64 %a, i64 %b) nounwind { ret i64 %or } +; FIXME: The slli+srli isn't needed with pack. 
define i64 @pack_i64_2(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: pack_i64_2: ; RV64I: # %bb.0: @@ -141,6 +145,7 @@ define i64 @pack_i64_3(ptr %0, ptr %1) { ret i64 %8 } +; FIXME: Use packh define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind { ; RV64I-LABEL: packh_i32: ; RV64I: # %bb.0: @@ -168,6 +173,7 @@ define signext i32 @packh_i32(i32 signext %a, i32 signext %b) nounwind { ret i32 %or } +; FIXME: Use packh define i32 @packh_i32_2(i32 %a, i32 %b) nounwind { ; RV64I-LABEL: packh_i32_2: ; RV64I: # %bb.0: @@ -191,6 +197,7 @@ define i32 @packh_i32_2(i32 %a, i32 %b) nounwind { ret i32 %or } +; FIXME: Use packh define i64 @packh_i64(i64 %a, i64 %b) nounwind { ; RV64I-LABEL: packh_i64: ; RV64I: # %bb.0: @@ -238,6 +245,7 @@ define i64 @packh_i64_2(i64 %a, i64 %b) nounwind { ret i64 %or } +; FIXME: Use packh define zeroext i16 @packh_i16(i8 zeroext %a, i8 zeroext %b) nounwind { ; RV64I-LABEL: packh_i16: ; RV64I: # %bb.0: @@ -261,6 +269,7 @@ define zeroext i16 @packh_i16(i8 zeroext %a, i8 zeroext %b) nounwind { ret i16 %or } +; FIXME: Use packh define zeroext i16 @packh_i16_2(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2) { ; RV64I-LABEL: packh_i16_2: ; RV64I: # %bb.0: @@ -289,6 +298,7 @@ define zeroext i16 @packh_i16_2(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2) { ret i16 %8 } +; FIXME: Use packh define void @packh_i16_3(i8 zeroext %0, i8 zeroext %1, i8 zeroext %2, ptr %p) { ; RV64I-LABEL: packh_i16_3: ; RV64I: # %bb.0: @@ -394,6 +404,7 @@ define i64 @pack_i64_imm() { ret i64 1157442765409226768 ; 0x0101010101010101 } +; FIXME: Use zext.h define i32 @zexth_i32(i32 %a) nounwind { ; RV64I-LABEL: zexth_i32: ; RV64I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index a89ae1742bb3af..7624071f4f93ec 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -592,13 +592,13 @@ ; RVI20U64: .attribute 5, "rv64i2p1" ; RVA20U64: .attribute 5, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zmmul1p0_za128rs1p0" ; RVA20S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zmmul1p0_za128rs1p0_ssccptr1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0" -; RVA22U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0" -; RVA22S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscounterenw1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0_svinval1p0_svpbmt1p0" -; RVA23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0" -; RVA23S64: .attribute 5, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_sha1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm1p0_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_supm1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0" -; RVB23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0" -; RVB23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0" -; RVM23U32: .attribute 5, "rv32i2p1_m2p0_zicbop1p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zimop1p0_zmmul1p0_zca1p0_zcb1p0_zce1p0_zcmop1p0_zcmp1p0_zcmt1p0_zba1p0_zbb1p0_zbs1p0" +; RVA22U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0" +; RVA22S64: .attribute 5, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscounterenw1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0_svinval1p0_svpbmt1p0" +; RVA23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0" +; RVA23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_sha1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm1p0_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_supm1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0" +; RVB23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0" +; RVB23S64: .attribute 5, 
"rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0" +; RVM23U32: .attribute 5, "rv32i2p1_m2p0_b1p0_zicbop1p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zimop1p0_zmmul1p0_zca1p0_zcb1p0_zce1p0_zcmop1p0_zcmp1p0_zcmt1p0_zba1p0_zbb1p0_zbs1p0" define i32 @addi(i32 %a) { %1 = add i32 %a, 1 diff --git a/llvm/test/CodeGen/RISCV/float-maximum-minimum.ll b/llvm/test/CodeGen/RISCV/float-maximum-minimum.ll index 0e00dff0b64245..2e9f8cbf6d2eff 100644 --- a/llvm/test/CodeGen/RISCV/float-maximum-minimum.ll +++ b/llvm/test/CodeGen/RISCV/float-maximum-minimum.ll @@ -8,6 +8,9 @@ ; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+d \ ; RUN: -verify-machineinstrs -target-abi=ilp32f \ ; RUN: | FileCheck -check-prefix=RV32IF %s +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 \ +; RUN: -verify-machineinstrs -target-abi=ilp32 \ +; RUN: | FileCheck -check-prefix=RV32I %s ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f \ ; RUN: -verify-machineinstrs -target-abi=lp64f \ ; RUN: | FileCheck -check-prefix=RV64IF %s @@ -17,6 +20,9 @@ ; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+d \ ; RUN: -verify-machineinstrs -target-abi=lp64d \ ; RUN: | FileCheck -check-prefix=RV64IF %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 \ +; RUN: -verify-machineinstrs -target-abi=lp64 \ +; RUN: | FileCheck -check-prefix=RV64I %s declare float @llvm.minimum.f32(float, float) @@ -59,6 +65,15 @@ define float @fminimum_f32(float %a, float %b) nounwind { ; RV32IZFINX-NEXT: fmin.s a0, a1, a2 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: fminimum_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, 
-16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call fminimumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fminimum_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: feq.s a0, fa0, fa0 @@ -96,6 +111,15 @@ define float @fminimum_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: .LBB0_4: ; RV64IZFINX-NEXT: fmin.s a0, a1, a2 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fminimum_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call fminimumf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = call float @llvm.minimum.f32(float %a, float %b) ret float %1 } @@ -141,6 +165,15 @@ define float @fmaximum_f32(float %a, float %b) nounwind { ; RV32IZFINX-NEXT: fmax.s a0, a1, a2 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: fmaximum_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call fmaximumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fmaximum_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: feq.s a0, fa0, fa0 @@ -178,6 +211,15 @@ define float @fmaximum_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: .LBB1_4: ; RV64IZFINX-NEXT: fmax.s a0, a1, a2 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fmaximum_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call fmaximumf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = call float @llvm.maximum.f32(float %a, float %b) ret float %1 } @@ -193,6 +235,15 @@ define float @fminimum_nnan_f32(float %a, float %b) nounwind { ; RV32IZFINX-NEXT: fmin.s a0, a0, a1 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: fminimum_nnan_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; 
RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call fminimumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fminimum_nnan_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: fmin.s fa0, fa0, fa1 @@ -202,6 +253,15 @@ define float @fminimum_nnan_f32(float %a, float %b) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fminimum_nnan_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call fminimumf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = call nnan float @llvm.minimum.f32(float %a, float %b) ret float %1 } @@ -217,6 +277,15 @@ define float @fmaximum_nnan_f32(float %a, float %b) nounwind { ; RV32IZFINX-NEXT: fmax.s a0, a0, a1 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: fmaximum_nnan_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call fmaximumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fmaximum_nnan_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: fmax.s fa0, fa0, fa1 @@ -226,6 +295,15 @@ define float @fmaximum_nnan_f32(float %a, float %b) nounwind { ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: fmax.s a0, a0, a1 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fmaximum_nnan_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call fmaximumf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = call nnan float @llvm.maximum.f32(float %a, float %b) ret float %1 } @@ -241,6 +319,15 @@ define float @fminimum_nnan_attr_f32(float %a, float %b) nounwind "no-nans-fp-ma ; RV32IZFINX-NEXT: fmin.s a0, a0, a1 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: 
fminimum_nnan_attr_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: call fminimumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fminimum_nnan_attr_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: fmin.s fa0, fa0, fa1 @@ -250,6 +337,15 @@ define float @fminimum_nnan_attr_f32(float %a, float %b) nounwind "no-nans-fp-ma ; RV64IZFINX: # %bb.0: ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fminimum_nnan_attr_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: call fminimumf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %1 = call float @llvm.minimum.f32(float %a, float %b) ret float %1 } @@ -279,6 +375,22 @@ define float @fminimum_nnan_op_f32(float %a, float %b) nounwind { ; RV32IZFINX-NEXT: fmin.s a0, a0, a1 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: fminimum_nnan_op_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a0 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: call __addsf3 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: mv a0, s0 +; RV32I-NEXT: call fminimumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fminimum_nnan_op_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: feq.s a0, fa0, fa0 @@ -302,6 +414,22 @@ define float @fminimum_nnan_op_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: fadd.s a1, a0, a0 ; RV64IZFINX-NEXT: fmin.s a0, a0, a1 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fminimum_nnan_op_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 0(sp) # 8-byte Folded 
Spill +; RV64I-NEXT: mv s0, a0 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: call __addsf3 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s0 +; RV64I-NEXT: call fminimumf +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret %c = fadd nnan float %a, %a %1 = call float @llvm.minimum.f32(float %a, float %c) ret float %1 @@ -322,6 +450,30 @@ define float @fmaximum_nnan_op_f32(float %a, float %b) nounwind { ; RV32IZFINX-NEXT: fmax.s a0, a2, a0 ; RV32IZFINX-NEXT: ret ; +; RV32I-LABEL: fmaximum_nnan_op_f32: +; RV32I: # %bb.0: +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill +; RV32I-NEXT: mv s0, a1 +; RV32I-NEXT: mv s1, a0 +; RV32I-NEXT: call __addsf3 +; RV32I-NEXT: mv s2, a0 +; RV32I-NEXT: mv a0, s1 +; RV32I-NEXT: mv a1, s0 +; RV32I-NEXT: call __subsf3 +; RV32I-NEXT: mv a1, a0 +; RV32I-NEXT: mv a0, s2 +; RV32I-NEXT: call fmaximumf +; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload +; RV32I-NEXT: addi sp, sp, 16 +; RV32I-NEXT: ret +; ; RV64IF-LABEL: fmaximum_nnan_op_f32: ; RV64IF: # %bb.0: ; RV64IF-NEXT: fadd.s fa5, fa0, fa1 @@ -335,6 +487,30 @@ define float @fmaximum_nnan_op_f32(float %a, float %b) nounwind { ; RV64IZFINX-NEXT: fsub.s a0, a0, a1 ; RV64IZFINX-NEXT: fmax.s a0, a2, a0 ; RV64IZFINX-NEXT: ret +; +; RV64I-LABEL: fmaximum_nnan_op_f32: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -32 +; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; RV64I-NEXT: mv s0, a1 +; RV64I-NEXT: mv s1, a0 +; 
RV64I-NEXT: call __addsf3 +; RV64I-NEXT: mv s2, a0 +; RV64I-NEXT: mv a0, s1 +; RV64I-NEXT: mv a1, s0 +; RV64I-NEXT: call __subsf3 +; RV64I-NEXT: mv a1, a0 +; RV64I-NEXT: mv a0, s2 +; RV64I-NEXT: call fmaximumf +; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 32 +; RV64I-NEXT: ret %c = fadd nnan float %a, %b %d = fsub nnan float %a, %b %1 = call float @llvm.maximum.f32(float %c, float %d) diff --git a/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-nonzero.ll b/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-nonzero.ll index c5471389302124..39c677ac20b3a5 100644 --- a/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-nonzero.ll +++ b/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-nonzero.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -riscv-enable-global-merge -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=SMALL-DATA +; RUN: llc -mtriple=riscv64 -riscv-enable-global-merge -riscv-force-enable-global-merge-external-globals \ +; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=SMALL-DATA ; RUN: llc -mtriple=riscv64 -riscv-enable-global-merge -global-merge-min-data-size=0 \ -; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=MINSIZE +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=MINSIZE @ig1 = internal global i32 0, align 4 @ig2 = internal global i32 0, align 4 diff --git a/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-zero.ll b/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-zero.ll index 8e4d72af00ebce..d2b714577db9a8 100644 --- a/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-zero.ll +++ 
b/llvm/test/CodeGen/RISCV/global-merge-minsize-smalldata-zero.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -riscv-enable-global-merge -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=SMALL-DATA +; RUN: llc -mtriple=riscv32 -riscv-enable-global-merge -riscv-force-enable-global-merge-external-globals \ +; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=SMALL-DATA ; RUN: llc -mtriple=riscv64 -riscv-enable-global-merge -global-merge-min-data-size=5 \ -; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=MINSIZE +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=MINSIZE @ig1 = internal global i32 0, align 4 @ig2 = internal global i32 0, align 4 diff --git a/llvm/test/CodeGen/RISCV/global-merge-minsize.ll b/llvm/test/CodeGen/RISCV/global-merge-minsize.ll index e405425832acbb..696d163bdcb2c4 100644 --- a/llvm/test/CodeGen/RISCV/global-merge-minsize.ll +++ b/llvm/test/CodeGen/RISCV/global-merge-minsize.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -riscv-enable-global-merge -verify-machineinstrs < %s \ -; RUN: | FileCheck %s -check-prefix=RV32 +; RUN: llc -mtriple=riscv32 -riscv-enable-global-merge -riscv-force-enable-global-merge-external-globals \ +; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=RV32 ; RUN: llc -mtriple=riscv32 -riscv-enable-global-merge -global-merge-min-data-size=5 \ -; RUN: -verify-machineinstrs < %s | FileCheck %s -check-prefix=RV32-MINSIZE +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs < %s | FileCheck %s -check-prefix=RV32-MINSIZE @ig1 = internal global i32 0, align 4 @ig2 = internal global i32 0, align 4 diff --git a/llvm/test/CodeGen/RISCV/global-merge-offset.ll b/llvm/test/CodeGen/RISCV/global-merge-offset.ll index 
13afcba181719e..0c0881ddf28737 100644 --- a/llvm/test/CodeGen/RISCV/global-merge-offset.ll +++ b/llvm/test/CodeGen/RISCV/global-merge-offset.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: sed 's/ArrSize/100/g' %s | llc -mtriple=riscv32 -riscv-enable-global-merge \ -; RUN: -verify-machineinstrs | FileCheck %s +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs | FileCheck %s ; RUN: sed 's/ArrSize/100/g' %s | llc -mtriple=riscv64 -riscv-enable-global-merge \ -; RUN: -verify-machineinstrs | FileCheck %s +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs | FileCheck %s ; RUN: sed 's/ArrSize/101/g' %s | llc -mtriple=riscv32 -riscv-enable-global-merge \ -; RUN: -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-TOOBIG +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-TOOBIG ; RUN: sed 's/ArrSize/101/g' %s | llc -mtriple=riscv64 -riscv-enable-global-merge \ -; RUN: -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-TOOBIG +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-TOOBIG ; This test demonstrates that the MaxOffset is set correctly for RISC-V by ; constructing an input that is at the limit and comparing. 
diff --git a/llvm/test/CodeGen/RISCV/global-merge.ll b/llvm/test/CodeGen/RISCV/global-merge.ll index 20379ee2e7dacd..633ba719c6a305 100644 --- a/llvm/test/CodeGen/RISCV/global-merge.ll +++ b/llvm/test/CodeGen/RISCV/global-merge.ll @@ -3,6 +3,12 @@ ; RUN: | FileCheck %s ; RUN: llc -mtriple=riscv64 -riscv-enable-global-merge -verify-machineinstrs < %s \ ; RUN: | FileCheck %s +; RUN: llc -mtriple=riscv32 -riscv-enable-global-merge \ +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=CHECK-WEXTERN %s +; RUN: llc -mtriple=riscv64 -riscv-enable-global-merge \ +; RUN: -riscv-force-enable-global-merge-external-globals -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=CHECK-WEXTERN %s @ig1 = internal global i32 0, align 4 @ig2 = internal global i32 0, align 4 @@ -21,9 +27,21 @@ define void @f1(i32 %a) nounwind { ; CHECK-NEXT: sw a0, %lo(.L_MergedGlobals)(a1) ; CHECK-NEXT: addi a1, a1, %lo(.L_MergedGlobals) ; CHECK-NEXT: sw a0, 4(a1) -; CHECK-NEXT: sw a0, 8(a1) -; CHECK-NEXT: sw a0, 12(a1) +; CHECK-NEXT: lui a1, %hi(eg1) +; CHECK-NEXT: sw a0, %lo(eg1)(a1) +; CHECK-NEXT: lui a1, %hi(eg2) +; CHECK-NEXT: sw a0, %lo(eg2)(a1) ; CHECK-NEXT: ret +; +; CHECK-WEXTERN-LABEL: f1: +; CHECK-WEXTERN: # %bb.0: +; CHECK-WEXTERN-NEXT: lui a1, %hi(.L_MergedGlobals) +; CHECK-WEXTERN-NEXT: sw a0, %lo(.L_MergedGlobals)(a1) +; CHECK-WEXTERN-NEXT: addi a1, a1, %lo(.L_MergedGlobals) +; CHECK-WEXTERN-NEXT: sw a0, 4(a1) +; CHECK-WEXTERN-NEXT: sw a0, 8(a1) +; CHECK-WEXTERN-NEXT: sw a0, 12(a1) +; CHECK-WEXTERN-NEXT: ret store i32 %a, ptr @ig1, align 4 store i32 %a, ptr @ig2, align 4 store i32 %a, ptr @eg1, align 4 diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-blsi.mir b/llvm/test/CodeGen/X86/GlobalISel/select-blsi.mir index e532f5c72e27e6..af4032b28c5c14 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-blsi.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-blsi.mir @@ -27,9 +27,8 @@ body: | ; CHECK: liveins: 
$edi ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi - ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: [[SUB32rr:%[0-9]+]]:gr32 = SUB32rr [[MOV32r0_]], [[COPY]], implicit-def dead $eflags - ; CHECK-NEXT: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[SUB32rr]], [[COPY]], implicit-def dead $eflags + ; CHECK-NEXT: [[NEG32r:%[0-9]+]]:gr32 = NEG32r [[COPY]], implicit-def dead $eflags + ; CHECK-NEXT: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[NEG32r]], [[COPY]], implicit-def dead $eflags ; CHECK-NEXT: $edi = COPY [[AND32rr]] %0(s32) = COPY $edi %1(s32) = G_CONSTANT i32 0 @@ -58,8 +57,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi ; CHECK-NEXT: [[MOV32r0_:%[0-9]+]]:gr32 = MOV32r0 implicit-def dead $eflags - ; CHECK-NEXT: [[SUB32ri:%[0-9]+]]:gr32 = SUB32ri [[MOV32r0_]], 0, implicit-def dead $eflags - ; CHECK-NEXT: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[SUB32ri]], [[COPY]], implicit-def dead $eflags + ; CHECK-NEXT: [[NEG32r:%[0-9]+]]:gr32 = NEG32r [[MOV32r0_]], implicit-def dead $eflags + ; CHECK-NEXT: [[AND32rr:%[0-9]+]]:gr32 = AND32rr [[NEG32r]], [[COPY]], implicit-def dead $eflags ; CHECK-NEXT: $edi = COPY [[AND32rr]] %0(s32) = COPY $edi %1(s32) = G_CONSTANT i32 0 diff --git a/llvm/test/CodeGen/X86/amx-tf32-internal.ll b/llvm/test/CodeGen/X86/amx-tf32-internal.ll new file mode 100644 index 00000000000000..6d0f3c57c08d89 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx-tf32-internal.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+avx512f, \ +; RUN: -mattr=+amx-tf32,+amx-transpose -verify-machineinstrs | FileCheck %s + +define void @test_amx(i8* %pointer, i8* %base, i64 %stride) { +; CHECK-LABEL: test_amx: +; CHECK: # %bb.0: +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, 
-{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, %ax +; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 +; CHECK-NEXT: tilezero %tmm1 +; CHECK-NEXT: tilezero %tmm2 +; CHECK-NEXT: tmmultf32ps %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: ttmmultf32ps %tmm1, %tmm0, %tmm2 +; CHECK-NEXT: tilestored %tmm2, (%rdi,%rdx) +; CHECK-NEXT: tilerelease +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + + %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride) + %b = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) + + %c1 = call x86_amx @llvm.x86.tmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b) + %c2 = call x86_amx @llvm.x86.ttmmultf32ps.internal(i16 8, i16 8, i16 8, x86_amx %c1, x86_amx %a, x86_amx %b) + + call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %c2) + ret void +} + +declare x86_amx @llvm.x86.tilezero.internal(i16, i16) +declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) +declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) + + +declare x86_amx @llvm.x86.tmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) +declare x86_amx @llvm.x86.ttmmultf32ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) diff --git a/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll new file mode 100644 index 00000000000000..af1a7ae1029756 --- /dev/null +++ b/llvm/test/CodeGen/X86/amx-tf32-intrinsics.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-tf32,+amx-transpose -verify-machineinstrs | 
FileCheck %s + +define void @test_tmmultf32ps() { +; CHECK-LABEL: test_tmmultf32ps: +; CHECK: # %bb.0: +; CHECK-NEXT: tmmultf32ps %tmm3, %tmm2, %tmm1 +; CHECK-NEXT: retq + call void @llvm.x86.tmmultf32ps(i8 1, i8 2, i8 3) + ret void +} +declare void @llvm.x86.tmmultf32ps(i8 %A, i8 %B, i8 %C) + +define void @test_ttmmultf32ps() { +; CHECK-LABEL: test_ttmmultf32ps: +; CHECK: # %bb.0: +; CHECK-NEXT: ttmmultf32ps %tmm3, %tmm2, %tmm1 +; CHECK-NEXT: retq + call void @llvm.x86.ttmmultf32ps(i8 1, i8 2, i8 3) + ret void +} +declare void @llvm.x86.ttmmultf32ps(i8 %A, i8 %B, i8 %C) + diff --git a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll index b06a9369b9762d..2025ee94a97405 100644 --- a/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll +++ b/llvm/test/CodeGen/X86/amx_transpose_intrinsics.ll @@ -80,18 +80,18 @@ define void @test_amx_spill(i8* %pointer, i8* %base, i64 %stride) #0 { ; CHECK-NEXT: tileloadd (%rsi,%rdx), %tmm0 ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm4 ; CHECK-NEXT: t2rpntlvwz0t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: movabsq $64, %rcx +; CHECK-NEXT: tilestored %tmm6, 4032(%rsp,%rcx) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, 5056(%rsp,%rcx) # 1024-byte Folded Spill ; CHECK-NEXT: t2rpntlvwz1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm6, 1984(%rsp,%rcx) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, 3008(%rsp,%rcx) # 1024-byte Folded Spill ; CHECK-NEXT: t2rpntlvwz1t1 (%rsi,%rdx), %tmm6 -; CHECK-NEXT: tilestored %tmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill -; CHECK-NEXT: tilestored %tmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 1024-byte Folded Spill +; 
CHECK-NEXT: tilestored %tmm6, -64(%rsp,%rcx) # 1024-byte Folded Spill +; CHECK-NEXT: tilestored %tmm7, 960(%rsp,%rcx) # 1024-byte Folded Spill ; CHECK-NEXT: t2rpntlvwz0 (%rsi,%rdx), %tmm6 ; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) ; CHECK-NEXT: tilestored %tmm5, (%rsi,%rdx) -; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 4032(%rsp,%rcx), %tmm4 # 1024-byte Folded Reload ; CHECK-NEXT: tileloadd 5056(%rsp,%rcx), %tmm5 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm4, (%rsi,%rdx) diff --git a/llvm/test/CodeGen/X86/lrshrink-debug.ll b/llvm/test/CodeGen/X86/lrshrink-debug.ll new file mode 100755 index 00000000000000..dd52968529902c --- /dev/null +++ b/llvm/test/CodeGen/X86/lrshrink-debug.ll @@ -0,0 +1,82 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc %s -o - | FileCheck %s +target triple = "i686-unknown-linux-gnu" + +define noundef i32 @test(i1 %tobool1.not, i32 %sh.012, i1 %cmp, i64 %sh_prom, i64 %shl) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushl %ebp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_offset %esi, -20 +; CHECK-NEXT: .cfi_offset %edi, -16 +; CHECK-NEXT: .cfi_offset %ebx, -12 +; CHECK-NEXT: .cfi_offset %ebp, -8 +; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movb {{[0-9]+}}(%esp), %dh +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: jmp .LBB0_1 +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_4: # %if.end +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: orl %ecx, %ebx +; CHECK-NEXT: orl %eax, %ebp +; CHECK-NEXT: movl %ebx, %esi +; CHECK-NEXT: movl %ebp, %edi +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK-NEXT: # 
=>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testb $1, %dh +; CHECK-NEXT: je .LBB0_1 +; CHECK-NEXT: # %bb.2: # %if.end +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: xorl %ebx, %ebx +; CHECK-NEXT: testb $1, %dl +; CHECK-NEXT: movl $0, %ebp +; CHECK-NEXT: jne .LBB0_4 +; CHECK-NEXT: # %bb.3: # %if.end +; CHECK-NEXT: # in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: movl %esi, %ebx +; CHECK-NEXT: movl %edi, %ebp +; CHECK-NEXT: jmp .LBB0_4 +entry: + br label %for.body + +for.body: ; preds = %for.inc, %entry + %bitmap.013 = phi i64 [ 0, %entry ], [ %bitmap.2, %for.inc ] + br i1 %tobool1.not, label %if.end, label %for.inc + +if.end: ; preds = %for.body + %spec.select10 = select i1 %cmp, i64 0, i64 %bitmap.013 + %shl6 = shl nuw i64 1, %sh_prom + %or = or i64 %shl, %spec.select10 + tail call void @llvm.dbg.value(metadata i64 %or, metadata !17, metadata !DIExpression()), !dbg !21 + br label %for.inc + +for.inc: ; preds = %if.end, %for.body + %bitmap.2 = phi i64 [ %bitmap.013, %for.body ], [ %or, %if.end ] + %tobool.not = icmp eq i32 0, 0 + br label %for.body +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!16} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, enums: !2) +!1 = !DIFile(filename: "test.c", directory: "test") +!2 = !{} +!16 = !{i32 2, !"Debug Info Version", i32 3} +!17 = !DILocalVariable(name: "bitmap", scope: !18, file: !1, line: 8, type: !20) +!18 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 6, type: !19, scopeLine: 6, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!19 = !DISubroutineType(types: !2) +!20 = !DIBasicType(name: "long long", size: 64, encoding: DW_ATE_signed) +!21 = !DILocation(line: 0, scope: !18) diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 5bce0bb5a60dc8..ed668c6ef4b043 100644 --- 
a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -2563,15 +2563,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm4, %ymm12 ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 ; AVX512F-NEXT: vmulps %ymm13, %ymm11, %ymm13 ; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0] +; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 ; AVX512F-NEXT: vmulps %ymm1, %ymm13, %ymm13 ; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 ; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 ; AVX512F-NEXT: vmulps %ymm13, %ymm10, %ymm13 ; AVX512F-NEXT: vaddps %ymm13, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13 @@ -2627,15 +2627,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm5, %ymm13 ; AVX512F-NEXT: vmulps %ymm0, %ymm13, %ymm13 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14 ; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 -; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0] +; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14 ; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 ; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm14, %ymm10, 
%ymm14 ; AVX512F-NEXT: vaddps %ymm14, %ymm13, %ymm13 ; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm14 @@ -2689,15 +2689,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12 ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm14 = xmm6[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm14, %ymm11, %ymm14 ; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm14 = xmm6[1,0] +; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm1, %ymm14, %ymm14 ; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 ; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm6[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512F-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512F-NEXT: vmulps %ymm14, %ymm10, %ymm14 ; AVX512F-NEXT: vaddps %ymm14, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm14 @@ -2753,15 +2753,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512F-NEXT: vbroadcastss %xmm7, %ymm12 ; AVX512F-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm15 = xmm7[1,1,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm15, %ymm11, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 -; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,2,2] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vshufpd {{.*#+}} xmm15 = xmm7[1,0] +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps %ymm1, %ymm15, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 ; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm7[3,3,3,3] -; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512F-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512F-NEXT: vmulps 
%ymm15, %ymm10, %ymm15 ; AVX512F-NEXT: vaddps %ymm15, %ymm12, %ymm12 ; AVX512F-NEXT: vextractf128 $1, %ymm7, %xmm15 @@ -2828,15 +2828,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm12 ; AVX512VL-NEXT: vmulps %ymm0, %ymm12, %ymm12 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm4[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 ; AVX512VL-NEXT: vmulps %ymm13, %ymm11, %ymm13 ; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm13 = xmm4[1,0] +; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 ; AVX512VL-NEXT: vmulps %ymm1, %ymm13, %ymm13 ; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm4[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13 +; AVX512VL-NEXT: vbroadcastss %xmm13, %ymm13 ; AVX512VL-NEXT: vmulps %ymm13, %ymm10, %ymm13 ; AVX512VL-NEXT: vaddps %ymm13, %ymm12, %ymm12 ; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm13 @@ -2890,15 +2890,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm5, %ymm13 ; AVX512VL-NEXT: vmulps %ymm0, %ymm13, %ymm13 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm14 = xmm5[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512VL-NEXT: vmulps %ymm14, %ymm11, %ymm14 ; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm14 = xmm5[1,0] +; AVX512VL-NEXT: vbroadcastss %xmm14, %ymm14 ; AVX512VL-NEXT: vmulps %ymm1, %ymm14, %ymm14 ; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm14 = xmm5[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm14, %ymm14 +; AVX512VL-NEXT: 
vbroadcastss %xmm14, %ymm14 ; AVX512VL-NEXT: vmulps %ymm14, %ymm10, %ymm14 ; AVX512VL-NEXT: vaddps %ymm14, %ymm13, %ymm13 ; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm14 @@ -2952,15 +2952,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm6, %ymm14 ; AVX512VL-NEXT: vmulps %ymm0, %ymm14, %ymm14 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm6[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm15, %ymm11, %ymm15 ; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm15 = xmm6[1,0] +; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm1, %ymm15, %ymm15 ; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 +; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %ymm15, %ymm10, %ymm15 ; AVX512VL-NEXT: vaddps %ymm15, %ymm14, %ymm14 ; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm15 @@ -3014,15 +3014,15 @@ define <64 x float> @test_mul8x8_f32(<64 x float> %a0, <64 x float> %a1) nounwin ; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm15 ; AVX512VL-NEXT: vmulps %ymm0, %ymm15, %ymm15 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm16 = xmm7[1,1,3,3] -; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 +; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm11, %ymm16 ; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 -; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[2,2,2,2] -; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 +; AVX512VL-NEXT: vshufpd {{.*#+}} xmm16 = xmm7[1,0] +; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm1, %ymm16 ; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 ; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm7[3,3,3,3] -; 
AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 +; AVX512VL-NEXT: vbroadcastss %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %ymm16, %ymm10, %ymm16 ; AVX512VL-NEXT: vaddps %ymm16, %ymm15, %ymm15 ; AVX512VL-NEXT: vextractf32x4 $1, %ymm7, %xmm16 diff --git a/llvm/test/CodeGen/X86/optnone.mir b/llvm/test/CodeGen/X86/optnone.mir new file mode 100644 index 00000000000000..ba852cebc35e03 --- /dev/null +++ b/llvm/test/CodeGen/X86/optnone.mir @@ -0,0 +1,20 @@ +# RUN: llc -mtriple=x86_64-- -passes=machine-cse -debug-pass-manager %s -o - 2>&1 | FileCheck %s + +# CHECK: Skipping pass MachineCSEPass on test_optnone due to optnone attribute +# CHECK: Running pass: MachineCSEPass on test_opt +--- | + define void @test_optnone() noinline optnone { ret void } + define void @test_opt() { ret void } +... +--- +name: test_optnone +body: | + bb.0: + RET64 +... +--- +name: test_opt +body: | + bb.0: + RET64 +... diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll index 1d87a5773296a1..164bf203d0545d 100644 --- a/llvm/test/CodeGen/X86/pr40730.ll +++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -20,7 +20,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { ; CHECK-NEXT: .quad 0x0000000e0000000d ; CHECK-NEXT: .quad 0x0000000e0000000d ; CHECK-NEXT: .quad 0x0000001000000000 -; CHECK-NEXT: .quad 0x0000000e0000000d +; CHECK-NEXT: .zero 8 define <8 x i32> @shuffle_v8i32_0dcd3f14_constant(<8 x i32> %a0) { ; CHECK-LABEL: shuffle_v8i32_0dcd3f14_constant: diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 85e782e9083492..358b2a503df261 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -3303,7 +3303,7 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-ONLY: # %bb.0: ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 
{{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,19,19,19,19,20,20,20,20,20,21,21,21,21,21,22,22,38,38,38,39,39,39,39,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF @@ -9332,7 +9332,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-ONLY: # %bb.0: ; AVX512BW-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,18,18,18,18,18,19,19,19,19,19,19,19,20,20,20,20,36,36,36,37,37,37,37,37,37,37,38,38,38,38,38,38,54,55,55,55,55,55,55,55,u,u,u,u,u,u,u,u] ; AVX512BW-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF @@ -12935,7 +12935,7 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-ONLY: # %bb.0: ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-ONLY-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 ; AVX512BW-ONLY-NEXT: kshiftrd $16, %k1, %k2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 5dd16c7b257903..cf0e40ce521382 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ 
b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3086,7 +3086,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z ; ; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[28,29],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero ; AVX2-NEXT: retq ; @@ -3110,7 +3110,7 @@ define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_z ; ; XOPAVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[28,29],zero,zero,zero,zero,zero,zero,ymm0[30,31],zero,zero,zero,zero,zero,zero ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 8a6e3c244a1cb6..e29848295eaabd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4092,13 +4092,13 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; ; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[8],zero,zero,zero,ymm0[9],zero,zero,zero,ymm0[10],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[28],zero,zero,zero,ymm0[29],zero,zero,zero,ymm0[30],zero,zero,zero,ymm0[31],zero,zero,zero ; AVX2-NEXT: retq ; ; AVX512VLBW-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,zero,zero,ymm0[9],zero,zero,zero,ymm0[10],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[28],zero,zero,zero,ymm0[29],zero,zero,zero,ymm0[30],zero,zero,zero,ymm0[31],zero,zero,zero ; AVX512VLBW-NEXT: retq ; @@ -4122,7 +4122,7 @@ define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz ; ; XOPAVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8],zero,zero,zero,ymm0[9],zero,zero,zero,ymm0[10],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[28],zero,zero,zero,ymm0[29],zero,zero,zero,ymm0[30],zero,zero,zero,ymm0[31],zero,zero,zero ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> @@ -5141,26 +5141,11 @@ define <4 x i64> @PR66150(ptr %b) { ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: retq ; -; AVX2-LABEL: PR66150: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] -; AVX2-NEXT: retq -; -; AVX512VLBW-LABEL: PR66150: -; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; 
AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] -; AVX512VLBW-NEXT: retq -; -; AVX512VLVBMI-LABEL: PR66150: -; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3] -; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 -; AVX512VLVBMI-NEXT: retq +; AVX2OR512VL-LABEL: PR66150: +; AVX2OR512VL: # %bb.0: +; AVX2OR512VL-NEXT: vpbroadcastd (%rdi), %ymm0 +; AVX2OR512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX2OR512VL-NEXT: retq ; ; XOPAVX1-LABEL: PR66150: ; XOPAVX1: # %bb.0: @@ -5174,8 +5159,7 @@ define <4 x i64> @PR66150(ptr %b) { ; ; XOPAVX2-LABEL: PR66150: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; XOPAVX2-NEXT: vpbroadcastd (%rdi), %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] ; XOPAVX2-NEXT: retq %tmp1 = load i32, ptr %b, align 4 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 96559cf7729a20..97c6c4afa59909 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -1449,6 +1449,73 @@ define <64 x i8> @shuffle_v64i8_shift_00_02_04_06_08_10_12_14_64_66_68_70_72_74_ ret <64 x i8> %5 } +; PR113396 +define <64 x i8> @shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01(<8 x i8> %0) { +; AVX512F-LABEL: 
shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VBMI-LABEL: shuffle_v8i8_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01_01: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VBMI-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VBMI-NEXT: retq + %s = shufflevector <8 x i8> %0, <8 x i8> poison, <64 x i32> + ret <64 x i8> %s +} + +; PR114001 +define <64 x i8> @shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_02_02_02_02_02_02_02_02_03_03_03_03_03_03_03_03_04_04_04_04_04_04_04_04_05_05_05_05_05_05_05_05_06_06_06_06_06_06_06_06_07_07_07_07_07_07_07_07(<8 x i8> %a0) { +; AVX512F-LABEL: 
shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_02_02_02_02_02_02_02_02_03_03_03_03_03_03_03_03_04_04_04_04_04_04_04_04_05_05_05_05_05_05_05_05_06_06_06_06_06_06_06_06_07_07_07_07_07_07_07_07: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] +; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_02_02_02_02_02_02_02_02_03_03_03_03_03_03_03_03_04_04_04_04_04_04_04_04_05_05_05_05_05_05_05_05_06_06_06_06_06_06_06_06_07_07_07_07_07_07_07_07: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_02_02_02_02_02_02_02_02_03_03_03_03_03_03_03_03_04_04_04_04_04_04_04_04_05_05_05_05_05_05_05_05_06_06_06_06_06_06_06_06_07_07_07_07_07_07_07_07: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] +; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] +; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: retq +; +; AVX512VBMI-LABEL: 
shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_02_02_02_02_02_02_02_02_03_03_03_03_03_03_03_03_04_04_04_04_04_04_04_04_05_05_05_05_05_05_05_05_06_06_06_06_06_06_06_06_07_07_07_07_07_07_07_07: +; AVX512VBMI: # %bb.0: +; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] +; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-NEXT: retq + %s = shufflevector <8 x i8> %a0, <8 x i8> poison, <64 x i32> + ret <64 x i8> %s +} + define <64 x i8> @PR54562_ref(<64 x i8> %a0) { ; AVX512F-LABEL: PR54562_ref: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index c977929b21f452..6f9b3e94aa68f6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -11,7 +11,7 @@ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -41,11 +41,11 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0] ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} 
{z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -83,7 +83,7 @@ define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) { ; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512F-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -115,7 +115,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-NEXT: vpermq %zmm2, %zmm1, %zmm2 ; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -153,12 +153,12 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -167,12 +167,12 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpcmpeqd 
%zmm2, %zmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -205,12 +205,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -224,12 +224,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: 
vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -260,15 +260,15 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2)) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16: @@ -281,15 +281,15 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512VL-NEXT: vpcmpeqw %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: 
vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $202, %zmm2, %zmm1, %zmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2)) ; AVX512VL-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16: @@ -318,12 +318,12 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 @@ -338,15 +338,15 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} 
zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm3, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2)) ; AVX512VL-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8: @@ -370,30 +370,30 @@ define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm2 ^ zmm3)) ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 
{%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512VL-NEXT: vpternlogq $202, %zmm3, %zmm2, %zmm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm2 ^ zmm3)) ; AVX512VL-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split: @@ -421,12 +421,12 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512F: # %bb.0: ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512F-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 @@ -436,15 +436,15 @@ define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512VL-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, 
%zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm4 ; AVX512VL-NEXT: vptestmd %zmm4, %zmm4, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $202, %ymm3, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3)) ; AVX512VL-NEXT: retq ; ; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split: @@ -471,12 +471,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -501,8 +501,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512VL-FAST-PERLANE-NEXT: kmovw %edi, %k1 ; AVX512VL-FAST-PERLANE-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} -; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX512VL-FAST-PERLANE-NEXT: vpbroadcastq %xmm1, %ymm1 +; AVX512VL-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; 
AVX512VL-FAST-PERLANE-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX512VL-FAST-PERLANE-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX512VL-FAST-PERLANE-NEXT: vptestmd %ymm1, %ymm1, %k1 ; AVX512VL-FAST-PERLANE-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} @@ -525,8 +525,8 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; VL_BW_DQ-FAST-PERLANE: # %bb.0: ; VL_BW_DQ-FAST-PERLANE-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2d %k0, %ymm0 -; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastq %xmm0, %ymm0 +; VL_BW_DQ-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; VL_BW_DQ-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %ymm0 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovd2m %ymm0, %k0 ; VL_BW_DQ-FAST-PERLANE-NEXT: vpmovm2w %k0, %xmm0 ; VL_BW_DQ-FAST-PERLANE-NEXT: vzeroupper @@ -540,7 +540,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,2,10,0,3,0,2,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -588,7 +588,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,4,5,6,7] ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -630,7 +630,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 
; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -676,7 +676,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 @@ -718,7 +718,7 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,6,1,0,3,7,7,1] ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,0,0,0,0,0,0] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 @@ -764,9 +764,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbq {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -811,7 +811,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512F-NEXT: 
vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -822,7 +822,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax @@ -850,7 +850,7 @@ define i64 @shuf64i1_zero(i64 %a) { ; AVX512F-LABEL: shuf64i1_zero: ; AVX512F: # %bb.0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -866,7 +866,7 @@ define i64 @shuf64i1_zero(i64 %a) { ; AVX512VL-LABEL: shuf64i1_zero: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: kmovw %edi, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax @@ -906,7 +906,7 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -922,7 +922,7 @@ define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { ; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 ; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 
{%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/Inputs/sectcreate-data.txt b/llvm/test/ExecutionEngine/JITLink/Generic/Inputs/sect@create/sectcreate-data.txt similarity index 100% rename from llvm/test/ExecutionEngine/JITLink/Generic/Inputs/sectcreate-data.txt rename to llvm/test/ExecutionEngine/JITLink/Generic/Inputs/sect@create/sectcreate-data.txt diff --git a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test index c09513a7d3707c..08b6372dcf2c73 100644 --- a/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test +++ b/llvm/test/ExecutionEngine/JITLink/Generic/sectcreate.test @@ -1,6 +1,6 @@ # RUN: llc -filetype=obj -o %t.o %S/Inputs/main-ret-0.ll # RUN: llvm-jitlink -noexec \ -# RUN: -sectcreate __data,%S/Inputs/sectcreate-data.txt@foo=0 \ +# RUN: -sectcreate __data,%S/Inputs/sect@create/sectcreate-data.txt@foo=0 \ # RUN: %t.o # # Use -sectcreate to create a section from a data file. 
diff --git a/llvm/test/MC/AArch64/SVE2/aesd.s b/llvm/test/MC/AArch64/SVE2/aesd.s index f0cbc39ce74d76..44eb9b68fd44ee 100644 --- a/llvm/test/MC/AArch64/SVE2/aesd.s +++ b/llvm/test/MC/AArch64/SVE2/aesd.s @@ -1,17 +1,17 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN aesd z0.b, z0.b, z31.b // CHECK-INST: aesd z0.b, z0.b, z31.b // CHECK-ENCODING: [0xe0,0xe7,0x22,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 4522e7e0 diff --git a/llvm/test/MC/AArch64/SVE2/aese.s b/llvm/test/MC/AArch64/SVE2/aese.s index 91af38604e292a..e64f2137ad39a5 100644 --- a/llvm/test/MC/AArch64/SVE2/aese.s +++ b/llvm/test/MC/AArch64/SVE2/aese.s @@ -1,17 +1,17 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s 
--check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN aese z0.b, z0.b, z31.b // CHECK-INST: aese z0.b, z0.b, z31.b // CHECK-ENCODING: [0xe0,0xe3,0x22,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 4522e3e0 diff --git a/llvm/test/MC/AArch64/SVE2/aesimc.s b/llvm/test/MC/AArch64/SVE2/aesimc.s index 8d108d4d7ad32c..c868ed0badf49f 100644 --- a/llvm/test/MC/AArch64/SVE2/aesimc.s +++ b/llvm/test/MC/AArch64/SVE2/aesimc.s @@ -1,23 +1,23 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2-aes - | FileCheck 
%s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN aesimc z0.b, z0.b // CHECK-INST: aesimc z0.b, z0.b // CHECK-ENCODING: [0x00,0xe4,0x20,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 4520e400 aesimc z31.b, z31.b // CHECK-INST: aesimc z31.b, z31.b // CHECK-ENCODING: [0x1f,0xe4,0x20,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 4520e41f diff --git a/llvm/test/MC/AArch64/SVE2/aesmc.s b/llvm/test/MC/AArch64/SVE2/aesmc.s index d3d8ba1dc9fef2..e158d2b1e0b56e 100644 --- a/llvm/test/MC/AArch64/SVE2/aesmc.s +++ b/llvm/test/MC/AArch64/SVE2/aesmc.s @@ -1,23 +1,23 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN aesmc z0.b, z0.b // CHECK-INST: aesmc z0.b, z0.b // CHECK-ENCODING: [0x00,0xe0,0x20,0x45] -// CHECK-ERROR: instruction requires: 
sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 4520e000 aesmc z31.b, z31.b // CHECK-INST: aesmc z31.b, z31.b // CHECK-ENCODING: [0x1f,0xe0,0x20,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 4520e01f diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s index fd070543bf8a27..966bead071fe39 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch-negative.s @@ -6,10 +6,10 @@ tbx z0.b, z1.b, z2.b // CHECK: error: instruction requires: sve2 or sme // CHECK-NEXT: tbx z0.b, z1.b, z2.b -.arch armv9-a+sve-aes -.arch armv9-a+nosve-aes +.arch armv9-a+sve2-aes +.arch armv9-a+nosve2-aes aesd z23.b, z23.b, z13.b -// CHECK: error: instruction requires: sve-aes +// CHECK: error: instruction requires: sve2-aes // CHECK-NEXT: aesd z23.b, z23.b, z13.b .arch armv9-a+sve2-sm4 diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch.s b/llvm/test/MC/AArch64/SVE2/directive-arch.s index 529b40f74801ab..99f6198a60abbc 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch.s @@ -4,7 +4,7 @@ tbx z0.b, z1.b, z2.b // CHECK: tbx z0.b, z1.b, z2.b -.arch armv9-a+sve-aes +.arch armv9-a+sve2-aes aesd z23.b, z23.b, z13.b // CHECK: aesd z23.b, z23.b, z13.b diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s index 594608d34b509f..e967f5aa60bd73 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension-negative.s @@ -6,10 +6,10 @@ tbx z0.b, z1.b, z2.b // CHECK: error: instruction requires: sve2 or sme // CHECK-NEXT: tbx z0.b, z1.b, z2.b -.arch_extension sve-aes -.arch_extension nosve-aes +.arch_extension sve2-aes +.arch_extension nosve2-aes aesd z23.b, z23.b, z13.b -// 
CHECK: error: instruction requires: sve2 sve-aes +// CHECK: error: instruction requires: sve2-aes // CHECK-NEXT: aesd z23.b, z23.b, z13.b .arch_extension sve2-sm4 diff --git a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s index 25dbfdde9d31de..2fdbb525464d90 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s +++ b/llvm/test/MC/AArch64/SVE2/directive-arch_extension.s @@ -4,7 +4,7 @@ tbx z0.b, z1.b, z2.b // CHECK: tbx z0.b, z1.b, z2.b -.arch_extension sve-aes +.arch_extension sve2-aes aesd z23.b, z23.b, z13.b // CHECK: aesd z23.b, z23.b, z13.b diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s index aec059683dcff7..9a8af638b70378 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu-negative.s @@ -6,10 +6,10 @@ tbx z0.b, z1.b, z2.b // CHECK: error: instruction requires: sve2 or sme // CHECK-NEXT: tbx z0.b, z1.b, z2.b -.cpu generic+sve2+sve-aes -.cpu generic+nosve-aes +.cpu generic+sve2-aes +.cpu generic+nosve2-aes aesd z23.b, z23.b, z13.b -// CHECK: error: instruction requires: sve2 sve-aes +// CHECK: error: instruction requires: sve2-aes // CHECK-NEXT: aesd z23.b, z23.b, z13.b .cpu generic+sve2-sm4 diff --git a/llvm/test/MC/AArch64/SVE2/directive-cpu.s b/llvm/test/MC/AArch64/SVE2/directive-cpu.s index a98b8b207ef185..daa5ec510b226a 100644 --- a/llvm/test/MC/AArch64/SVE2/directive-cpu.s +++ b/llvm/test/MC/AArch64/SVE2/directive-cpu.s @@ -4,7 +4,7 @@ tbx z0.b, z1.b, z2.b // CHECK: tbx z0.b, z1.b, z2.b -.cpu generic+sve2+sve-aes +.cpu generic+sve2-aes aesd z23.b, z23.b, z13.b // CHECK: aesd z23.b, z23.b, z13.b diff --git a/llvm/test/MC/AArch64/SVE2/pmullb-128.s b/llvm/test/MC/AArch64/SVE2/pmullb-128.s index 0d562439a6021c..d48c75b3d49997 100644 --- a/llvm/test/MC/AArch64/SVE2/pmullb-128.s +++ b/llvm/test/MC/AArch64/SVE2/pmullb-128.s @@ -1,17 +1,17 @@ -// RUN: llvm-mc 
-triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN pmullb z29.q, z30.d, z31.d // CHECK-INST: pmullb z29.q, z30.d, z31.d // CHECK-ENCODING: [0xdd,0x6b,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 451f6bdd diff --git a/llvm/test/MC/AArch64/SVE2/pmullt-128.s b/llvm/test/MC/AArch64/SVE2/pmullt-128.s index 75b6508458b6df..e1eca8d1d89f80 100644 --- a/llvm/test/MC/AArch64/SVE2/pmullt-128.s +++ b/llvm/test/MC/AArch64/SVE2/pmullt-128.s @@ -1,17 +1,17 @@ -// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2-aes < %s \ // RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST // RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR // RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme < %s 2>&1 \ // RUN: | FileCheck %s --check-prefix=CHECK-ERROR -// RUN: llvm-mc -triple=aarch64 -filetype=obj 
-mattr=+sve2,+sve-aes < %s \ -// RUN: | llvm-objdump -d --mattr=+sve2,+sve-aes - | FileCheck %s --check-prefix=CHECK-INST -// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+sve-aes < %s \ +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2-aes - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2-aes < %s \ // RUN: | llvm-objdump -d --mattr=-sve2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN pmullt z29.q, z30.d, z31.d // CHECK-INST: pmullt z29.q, z30.d, z31.d // CHECK-ENCODING: [0xdd,0x6f,0x1f,0x45] -// CHECK-ERROR: instruction requires: sve2 sve-aes +// CHECK-ERROR: instruction requires: sve2-aes // CHECK-UNKNOWN: 451f6fdd diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt new file mode 100644 index 00000000000000..f372c42982b1b6 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-tf32.txt @@ -0,0 +1,19 @@ +# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck -check-prefix=ATT %s +# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck -check-prefix=INTEL %s + +# ATT: tmmultf32ps %tmm4, %tmm5, %tmm6 +# INTEL: tmmultf32ps tmm6, tmm5, tmm4 +0xc4,0xe2,0x59,0x48,0xf5 + +# ATT: tmmultf32ps %tmm1, %tmm2, %tmm3 +# INTEL: tmmultf32ps tmm3, tmm2, tmm1 +0xc4,0xe2,0x71,0x48,0xda + +# ATT: ttmmultf32ps %tmm4, %tmm5, %tmm6 +# INTEL: ttmmultf32ps tmm6, tmm5, tmm4 +0xc4,0xe2,0x58,0x48,0xf5 + +# ATT: ttmmultf32ps %tmm1, %tmm2, %tmm3 +# INTEL: ttmmultf32ps tmm3, tmm2, tmm1 +0xc4,0xe2,0x70,0x48,0xda + diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s new file mode 100644 index 00000000000000..b413597cd9da71 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-att.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding < %s | FileCheck %s + +// 
CHECK: tmmultf32ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe2,0x59,0x48,0xf5] + tmmultf32ps %tmm4, %tmm5, %tmm6 + +// CHECK: tmmultf32ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] + tmmultf32ps %tmm1, %tmm2, %tmm3 + +// CHECK: ttmmultf32ps %tmm4, %tmm5, %tmm6 +// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] + ttmmultf32ps %tmm4, %tmm5, %tmm6 + +// CHECK: ttmmultf32ps %tmm1, %tmm2, %tmm3 +// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] + ttmmultf32ps %tmm1, %tmm2, %tmm3 diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s new file mode 100644 index 00000000000000..98f55275716eb0 --- /dev/null +++ b/llvm/test/MC/X86/AMX/x86-64-amx-tf32-intel.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s + +// CHECK: tmmultf32ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe2,0x59,0x48,0xf5] + tmmultf32ps tmm6, tmm5, tmm4 + +// CHECK: tmmultf32ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x71,0x48,0xda] + tmmultf32ps tmm3, tmm2, tmm1 + +// CHECK: ttmmultf32ps tmm6, tmm5, tmm4 +// CHECK: encoding: [0xc4,0xe2,0x58,0x48,0xf5] + ttmmultf32ps tmm6, tmm5, tmm4 + +// CHECK: ttmmultf32ps tmm3, tmm2, tmm1 +// CHECK: encoding: [0xc4,0xe2,0x70,0x48,0xda] + ttmmultf32ps tmm3, tmm2, tmm1 diff --git a/llvm/test/MachineVerifier/test_step-vector.mir b/llvm/test/MachineVerifier/test_step-vector.mir new file mode 100644 index 00000000000000..b4a01bb258da10 --- /dev/null +++ b/llvm/test/MachineVerifier/test_step-vector.mir @@ -0,0 +1,29 @@ +# RUN: not --crash llc -verify-machineinstrs -mtriple=arm64 -run-pass none -o /dev/null %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target + +--- +name: g_step_vector +body: | + bb.0: + + %0:_(s32) = G_CONSTANT i32 4 + + ; CHECK: operand must be cimm + %1:_(s32) = G_STEP_VECTOR %0 + + ; CHECK: step must be > 0 + %2:_(s32) = G_STEP_VECTOR i32 -1 + + ; CHECK: Destination 
type must be a scalable vector + %3:_(<4 x s64>) = G_STEP_VECTOR i32 5 + + ; CHECK: Destination element type must be scalar + %4:_() = G_STEP_VECTOR i32 9 + + ; CHECK: step bitwidth differs from result type element bitwidth + %6:_() = G_STEP_VECTOR i32 56 + + %7:_() = G_STEP_VECTOR i128 79 + +... + diff --git a/llvm/test/Other/new-pm-pgo-O0.ll b/llvm/test/Other/new-pm-pgo-O0.ll index d7a6a03b8e44e3..d4f662fb25ace7 100644 --- a/llvm/test/Other/new-pm-pgo-O0.ll +++ b/llvm/test/Other/new-pm-pgo-O0.ll @@ -9,8 +9,9 @@ ; RUN: |FileCheck %s --check-prefixes=USE_POST_LINK,USE ; RUN: opt -debug-pass-manager -passes='lto' -pgo-kind=pgo-instr-use-pipeline -profile-file='%t.profdata' %s 2>&1 \ ; RUN: |FileCheck %s --check-prefixes=USE_POST_LINK,USE +; RUN: opt -debug-pass-manager -passes='default' -pgo-kind=pgo-sample-use-pipeline -profile-file='%S/Inputs/new-pm-pgo.prof' %s 2>&1 \ +; RUN: |FileCheck %s --check-prefixes=SAMPLE_USE -; ; GEN: Running pass: PGOInstrumentationGen ; USE_DEFAULT: Running pass: PGOInstrumentationUse ; USE_PRE_LINK: Running pass: PGOInstrumentationUse @@ -18,6 +19,9 @@ ; USE-NOT: Running pass: PGOIndirectCallPromotion ; USE-NOT: Running pass: PGOMemOPSizeOpt +; SAMPLE_USE: Running pass: AddDiscriminatorsPass +; SAMPLE_USE: Running pass: SampleProfileLoaderPass + define void @foo() { ret void } diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll index d4d0e21b4bd7db..3e2912da576f46 100644 --- a/llvm/test/ThinLTO/X86/memprof-icp.ll +++ b/llvm/test/ThinLTO/X86/memprof-icp.ll @@ -93,6 +93,8 @@ ; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ ; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \ ; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ +; RUN: -r=%t/foo.o,_ZN2B03barEj, \ +; RUN: -r=%t/foo.o,_ZN1B3barEj, \ ; RUN: -r=%t/main.o,_Z3fooR2B0j, \ ; RUN: -r=%t/main.o,_Znwm, \ ; RUN: -r=%t/main.o,_ZdlPvm, \ @@ -113,9 +115,9 @@ ; RUN: -pass-remarks=. 
-save-temps \ ; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \ ; RUN: --check-prefix=STATS-BE --check-prefix=REMARKS-MAIN \ -; RUN: --check-prefix=REMARKS-FOO +; RUN: --check-prefix=REMARKS-FOO --check-prefix=REMARKS-FOO-IMPORT -; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR +; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR --check-prefix=IR-IMPORT ;; Try again but with distributed ThinLTO ; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \ @@ -124,6 +126,8 @@ ; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ ; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \ ; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ +; RUN: -r=%t/foo.o,_ZN2B03barEj, \ +; RUN: -r=%t/foo.o,_ZN1B3barEj, \ ; RUN: -r=%t/main.o,_Z3fooR2B0j, \ ; RUN: -r=%t/main.o,_Znwm, \ ; RUN: -r=%t/main.o,_ZdlPvm, \ @@ -147,8 +151,9 @@ ; RUN: -enable-memprof-indirect-call-support=true \ ; RUN: -summary-file=%t/foo.o.thinlto.bc -memprof-import-summary=%t/foo.o.thinlto.bc \ ; RUN: -enable-import-metadata -stats -pass-remarks=. \ -; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR \ -; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO +; RUN: %t/foo.o -S 2>&1 | FileCheck %s --check-prefix=IR --check-prefix=IR-IMPORT \ +; RUN: --check-prefix=STATS-BE-DISTRIB --check-prefix=REMARKS-FOO \ +; RUN: --check-prefix=REMARKS-FOO-IMPORT ;; Retry with the ICP-disabled object file, and make sure we disable it again ;; so we don't look for the synthesized callsite records when applying imports. @@ -159,6 +164,8 @@ ; RUN: -r=%t/foo.noicp.o,_Z3fooR2B0j,plx \ ; RUN: -r=%t/foo.noicp.o,_ZN2B03barEj.abc,plx \ ; RUN: -r=%t/foo.noicp.o,_Z3xyzR2B0j, \ +; RUN: -r=%t/foo.noicp.o,_ZN2B03barEj, \ +; RUN: -r=%t/foo.noicp.o,_ZN1B3barEj, \ ; RUN: -r=%t/main.o,_Z3fooR2B0j, \ ; RUN: -r=%t/main.o,_Znwm, \ ; RUN: -r=%t/main.o,_ZdlPvm, \ @@ -184,6 +191,74 @@ ;; metadata. 
; RUN: llvm-dis %t.noicp.out.2.4.opt.bc -o - | FileCheck %s --implicit-check-not "_Z3fooR2B0j.memprof" --implicit-check-not "!callsite" +;; Run in-process ThinLTO again, but with importing disabled by setting the +;; instruction limit to 0. Ensure that the existing declarations of B::bar +;; and B0::bar are sufficient to allow for the promotion and cloning. +; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \ +; RUN: -import-instr-limit=0 \ +; RUN: -enable-memprof-indirect-call-support=true \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ +; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \ +; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ +; RUN: -r=%t/foo.o,_ZN2B03barEj, \ +; RUN: -r=%t/foo.o,_ZN1B3barEj, \ +; RUN: -r=%t/main.o,_Z3fooR2B0j, \ +; RUN: -r=%t/main.o,_Znwm, \ +; RUN: -r=%t/main.o,_ZdlPvm, \ +; RUN: -r=%t/main.o,_Z8externalPi, \ +; RUN: -r=%t/main.o,main,plx \ +; RUN: -r=%t/main.o,_ZN2B03barEj,plx \ +; RUN: -r=%t/main.o,_ZN1B3barEj,plx \ +; RUN: -r=%t/main.o,_ZTV1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS2B0,plx \ +; RUN: -r=%t/main.o,_ZTI2B0,plx \ +; RUN: -r=%t/main.o,_ZTI1B,plx \ +; RUN: -r=%t/main.o,_ZTV2B0,plx \ +; RUN: -thinlto-threads=1 \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=. -save-temps \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=STATS \ +; RUN: --check-prefix=STATS-BE-NOIMPORT --check-prefix=REMARKS-MAIN \ +; RUN: --check-prefix=REMARKS-FOO + +; RUN: llvm-dis %t.out.2.4.opt.bc -o - | FileCheck %s --check-prefix=IR --check-prefix=IR-NOIMPORT + +;; Run it gain but with -memprof-require-definition-for-promotion, and confirm +;; that no promotions occur. 
+; RUN: llvm-lto2 run %t/main.o %t/foo.o -enable-memprof-context-disambiguation \ +; RUN: -import-instr-limit=0 \ +; RUN: -memprof-require-definition-for-promotion \ +; RUN: -enable-memprof-indirect-call-support=true \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t/foo.o,_Z3fooR2B0j,plx \ +; RUN: -r=%t/foo.o,_ZN2B03barEj.abc,plx \ +; RUN: -r=%t/foo.o,_Z3xyzR2B0j, \ +; RUN: -r=%t/foo.o,_ZN2B03barEj, \ +; RUN: -r=%t/foo.o,_ZN1B3barEj, \ +; RUN: -r=%t/main.o,_Z3fooR2B0j, \ +; RUN: -r=%t/main.o,_Znwm, \ +; RUN: -r=%t/main.o,_ZdlPvm, \ +; RUN: -r=%t/main.o,_Z8externalPi, \ +; RUN: -r=%t/main.o,main,plx \ +; RUN: -r=%t/main.o,_ZN2B03barEj,plx \ +; RUN: -r=%t/main.o,_ZN1B3barEj,plx \ +; RUN: -r=%t/main.o,_ZTV1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv120__si_class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS1B,plx \ +; RUN: -r=%t/main.o,_ZTVN10__cxxabiv117__class_type_infoE,plx \ +; RUN: -r=%t/main.o,_ZTS2B0,plx \ +; RUN: -r=%t/main.o,_ZTI2B0,plx \ +; RUN: -r=%t/main.o,_ZTI1B,plx \ +; RUN: -r=%t/main.o,_ZTV2B0,plx \ +; RUN: -thinlto-threads=1 \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes \ +; RUN: -pass-remarks=. 
\ +; RUN: -o %t.out 2>&1 | FileCheck %s --implicit-check-not Promote + ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 ; REMARKS-MAIN: call in clone main assigned to call function clone _Z3fooR2B0j.memprof.1 ; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1 @@ -208,30 +283,36 @@ ; REMARKS-FOO: call in clone _Z3fooR2B0j promoted and assigned to call function clone _ZN2B03barEj ; REMARKS-FOO: Promote indirect call to _ZN2B03barEj with count 2 out of 2 ; REMARKS-FOO: call in clone _Z3fooR2B0j.memprof.1 promoted and assigned to call function clone _ZN2B03barEj.memprof.1 -; REMARKS-FOO: created clone _ZN2B03barEj.memprof.1 -; REMARKS-FOO: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold -; REMARKS-FOO: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold -; REMARKS-FOO: created clone _ZN1B3barEj.memprof.1 -; REMARKS-FOO: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold -; REMARKS-FOO: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold +; REMARKS-FOO-IMPORT: created clone _ZN2B03barEj.memprof.1 +; REMARKS-FOO-IMPORT: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold +; REMARKS-FOO-IMPORT: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold +; REMARKS-FOO-IMPORT: created clone _ZN1B3barEj.memprof.1 +; REMARKS-FOO-IMPORT: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold +; REMARKS-FOO-IMPORT: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold ; STATS: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis ; STATS-BE: 8 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE-NOIMPORT: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during 
ThinLTO backend ; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis ; STATS-BE: 8 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend +; STATS-BE-NOIMPORT: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend ; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis ; STATS-BE: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; STATS-BE-NOIMPORT: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend +; IR-NOIMPORT: foo ; IR: define {{.*}} @_Z3fooR2B0j( -; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj -; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect +; IR: %[[R1:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj +; IR: br i1 %[[R1]], label %if.true.direct_targ, label %if.false.orig_indirect ; IR: if.true.direct_targ: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]] +; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD:[0-9]+]] +; IR-NOIMPORT: call {{.*}} @_ZN1B3barEj( ; IR: if.false.orig_indirect: -; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj -; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2 +; IR: %[[R2:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj +; IR: br i1 %[[R2]], label %if.true.direct_targ1, label %if.false.orig_indirect2 ; IR: if.true.direct_targ1: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]] +; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[NOTCOLD]] +; IR-NOIMPORT: call {{.*}} @_ZN2B03barEj( ; IR: if.false.orig_indirect2: ; IR: call {{.*}} %0 @@ -239,20 +320,22 @@ ;; We should still compare against the original versions of bar since that is ;; what is in the vtable. 
However, we should have called the cloned versions ;; that perform cold allocations, which were subsequently inlined. -; IR: %1 = icmp eq ptr %0, @_ZN1B3barEj -; IR: br i1 %1, label %if.true.direct_targ, label %if.false.orig_indirect +; IR: %[[R3:[0-9]+]] = icmp eq ptr %0, @_ZN1B3barEj +; IR: br i1 %[[R3]], label %if.true.direct_targ, label %if.false.orig_indirect ; IR: if.true.direct_targ: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]] +; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD:[0-9]+]] +; IR-NOIMPORT: call {{.*}} @_ZN1B3barEj.memprof.1( ; IR: if.false.orig_indirect: -; IR: %2 = icmp eq ptr %0, @_ZN2B03barEj -; IR: br i1 %2, label %if.true.direct_targ1, label %if.false.orig_indirect2 +; IR: %[[R4:[0-9]+]] = icmp eq ptr %0, @_ZN2B03barEj +; IR: br i1 %[[R4]], label %if.true.direct_targ1, label %if.false.orig_indirect2 ; IR: if.true.direct_targ1: -; IR: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]] +; IR-IMPORT: call {{.*}} @_Znwm(i64 noundef 4) #[[COLD]] +; IR-NOIMPORT: call {{.*}} @_ZN2B03barEj.memprof.1( ; IR: if.false.orig_indirect2: ; IR: call {{.*}} %0 -; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold" -; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold" +; IR-IMPORT: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold" +; IR-IMPORT: attributes #[[COLD]] = {{.*}} "memprof"="cold" ; STATS-BE-DISTRIB: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend ; STATS-BE-DISTRIB: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend @@ -272,6 +355,9 @@ define i32 @_ZN2B03barEj.abc(ptr %this, i32 %s) { ret i32 0 } +declare i32 @_ZN2B03barEj(ptr %this, i32 %s) +declare i32 @_ZN1B3barEj(ptr %this, i32 %s) + define i32 @_Z3fooR2B0j(ptr %b) { entry: %0 = load ptr, ptr %b, align 8 diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/2008-11-24-RAUW-Self.ll b/llvm/test/Transforms/CodeGenPrepare/X86/2008-11-24-RAUW-Self.ll index 
5b501ed980a5e5..214cb33287a9e6 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/2008-11-24-RAUW-Self.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/2008-11-24-RAUW-Self.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-unknown-linux-gnu" -define fastcc i32 @ascii2flt(ptr %str) nounwind { +define fastcc i32 @ascii2flt(ptr %str, i1 %arg) nounwind { entry: br label %bb2.i @@ -45,7 +45,7 @@ bb10.i196.bb7.i197_crit_edge: ; No predecessors! bb7.i197: ; preds = %bb10.i196.bb7.i197_crit_edge, %base2flt.exit.bb7.i197_crit_edge, %bb11.i.bb7.i197_crit_edge %.reg2mem.0 = phi i32 [ 0, %base2flt.exit.bb7.i197_crit_edge ], [ %.reg2mem.0, %bb10.i196.bb7.i197_crit_edge ], [ 0, %bb11.i.bb7.i197_crit_edge ] ; [#uses=1] - br i1 undef, label %bb10.i196.base2flt.exit204_crit_edge, label %bb10.i196 + br i1 %arg, label %bb10.i196.base2flt.exit204_crit_edge, label %bb10.i196 base2flt.exit204: ; preds = %bb10.i196.base2flt.exit204_crit_edge, %base2flt.exit.base2flt.exit204_crit_edge, %bb11.i.base2flt.exit204_crit_edge br i1 false, label %base2flt.exit204.bb8_crit_edge, label %bb diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll b/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll index 5349afc18d84de..5ddd0b9f4107c2 100644 --- a/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll +++ b/llvm/test/Transforms/CodeGenPrepare/X86/extend-sink-hoist.ll @@ -45,11 +45,11 @@ block2: ; Make sure the cast sink logic and OptimizeExtUses don't end up in an infinite ; loop. 
-define i128 @use_ext_source() { +define i128 @use_ext_source(i1 %arg) { block1: %v1 = or i64 undef, undef %v2 = zext i64 %v1 to i128 - br i1 undef, label %block2, label %block3 + br i1 %arg, label %block2, label %block3 block2: %v3 = add i64 %v1, 1 diff --git a/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll b/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll index 69e84e942de65b..3577223c20e1ea 100644 --- a/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll +++ b/llvm/test/Transforms/ConstantHoisting/AArch64/consthoist-unreachable.ll @@ -7,11 +7,11 @@ @c.a = external global i32, align 1 -define void @c() { +define void @c(i1 %arg) { ; CHECK-LABEL: @c( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i16 0, 0 -; CHECK-NEXT: br i1 undef, label [[LBL1_US:%.*]], label [[ENTRY_ENTRY_SPLIT_CRIT_EDGE:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LBL1_US:%.*]], label [[ENTRY_ENTRY_SPLIT_CRIT_EDGE:%.*]] ; CHECK: entry.entry.split_crit_edge: ; CHECK-NEXT: [[CONST:%.*]] = bitcast i32 1232131 to i32 ; CHECK-NEXT: br label [[LBL1:%.*]] @@ -21,9 +21,9 @@ define void @c() { ; CHECK-NEXT: br label [[FOR_COND4:%.*]] ; CHECK: lbl1: ; CHECK-NEXT: store i32 [[CONST]], ptr @c.a, align 1 -; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[FOR_END12:%.*]] +; CHECK-NEXT: br i1 %arg, label [[IF_THEN:%.*]], label [[FOR_END12:%.*]] ; CHECK: if.then: -; CHECK-NEXT: br i1 undef, label [[LBL1]], label [[FOR_COND4]] +; CHECK-NEXT: br i1 %arg, label [[LBL1]], label [[FOR_COND4]] ; CHECK: for.cond4: ; CHECK-NEXT: br label [[FOR_COND4]] ; CHECK: for.body9: @@ -35,7 +35,7 @@ define void @c() { ; entry: %tobool = icmp ne i16 0, 0 - br i1 undef, label %lbl1.us, label %entry.entry.split_crit_edge + br i1 %arg, label %lbl1.us, label %entry.entry.split_crit_edge entry.entry.split_crit_edge: ; preds = %entry br label %lbl1 @@ -46,10 +46,10 @@ lbl1.us: ; preds = %entry lbl1: ; preds = %if.then, %entry.entry.split_crit_edge 
store i32 1232131, ptr @c.a, align 1 - br i1 undef, label %if.then, label %for.end12 + br i1 %arg, label %if.then, label %for.end12 if.then: ; preds = %lbl1 - br i1 undef, label %lbl1, label %for.cond4 + br i1 %arg, label %lbl1, label %for.cond4 for.cond4: ; preds = %for.cond4, %if.then, %lbl1.us br label %for.cond4 diff --git a/llvm/test/Transforms/ConstantHoisting/ARM/same-offset-multi-types.ll b/llvm/test/Transforms/ConstantHoisting/ARM/same-offset-multi-types.ll index 63447cb80c6d50..0a72b8895b09ac 100644 --- a/llvm/test/Transforms/ConstantHoisting/ARM/same-offset-multi-types.ll +++ b/llvm/test/Transforms/ConstantHoisting/ARM/same-offset-multi-types.ll @@ -26,9 +26,9 @@ target triple = "thumbv6m-none--musleabi" @global = external dso_local global %0, align 4 ; Function Attrs: nounwind optsize ssp -define dso_local void @zot() { +define dso_local void @zot(i1 %arg) { bb: - br i1 undef, label %bb2, label %bb1 + br i1 %arg, label %bb2, label %bb1 bb1: ; preds = %bb %tmp = load ptr, ptr getelementptr inbounds (%0, ptr @global, i32 0, i32 2, i32 0), align 4 diff --git a/llvm/test/Transforms/ConstantHoisting/PowerPC/masks.ll b/llvm/test/Transforms/ConstantHoisting/PowerPC/masks.ll index 5787ff19753e58..45abdc4f6e9513 100644 --- a/llvm/test/Transforms/ConstantHoisting/PowerPC/masks.ll +++ b/llvm/test/Transforms/ConstantHoisting/PowerPC/masks.ll @@ -3,18 +3,18 @@ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" ; Here the masks are all contiguous, and should not be hoisted. 
-define i32 @test1() nounwind { +define i32 @test1(i1 %arg) nounwind { entry: ; CHECK-LABEL: @test1 ; CHECK-NOT: bitcast i32 65535 to i32 ; CHECK: and i32 undef, 65535 %conv121 = and i32 undef, 65535 - br i1 undef, label %if.then152, label %if.end167 + br i1 %arg, label %if.then152, label %if.end167 if.then152: ; CHECK: and i32 undef, 65535 %conv153 = and i32 undef, 65535 - br i1 undef, label %if.end167, label %end2 + br i1 %arg, label %if.end167, label %end2 if.end167: ; CHECK: and i32 {{.*}}, 32768 @@ -35,16 +35,16 @@ end2: } ; Here the masks are not contiguous, and should be hoisted. -define i32 @test2() nounwind { +define i32 @test2(i1 %arg) nounwind { entry: ; CHECK-LABEL: @test2 ; CHECK: bitcast i32 65531 to i32 %conv121 = and i32 undef, 65531 - br i1 undef, label %if.then152, label %if.end167 + br i1 %arg, label %if.then152, label %if.end167 if.then152: %conv153 = and i32 undef, 65531 - br i1 undef, label %if.end167, label %end2 + br i1 %arg, label %if.end167, label %end2 if.end167: ; CHECK: add i32 {{.*}}, -32758 diff --git a/llvm/test/Transforms/ConstantHoisting/X86/pr43903-not-all-uses-rebased.ll b/llvm/test/Transforms/ConstantHoisting/X86/pr43903-not-all-uses-rebased.ll index 1fa27aabe35884..8c1b8b6834197e 100644 --- a/llvm/test/Transforms/ConstantHoisting/X86/pr43903-not-all-uses-rebased.ll +++ b/llvm/test/Transforms/ConstantHoisting/X86/pr43903-not-all-uses-rebased.ll @@ -8,12 +8,12 @@ target triple = "x86_64-unknown-linux-gnu" @a = external global [2 x i16], align 1 -define void @c() { +define void @c(i1 %arg) { ; CHECK-LABEL: @c( ; CHECK-NEXT: for.cond: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY2:%.*]], label [[FOR_END4:%.*]] +; CHECK-NEXT: br i1 %arg, label [[FOR_BODY2:%.*]], label [[FOR_END4:%.*]] ; CHECK: for.body2: -; CHECK-NEXT: br i1 undef, label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] ; CHECK: land.rhs: ; CHECK-NEXT: unreachable ; CHECK: land.end: @@ -27,10 +27,10 @@ 
define void @c() { ; CHECK-NEXT: ret void ; for.cond: - br i1 undef, label %for.body2, label %for.end4 + br i1 %arg, label %for.body2, label %for.end4 for.body2: ; preds = %for.cond - br i1 undef, label %land.rhs, label %land.end + br i1 %arg, label %land.rhs, label %land.end land.rhs: ; preds = %for.body2 unreachable diff --git a/llvm/test/Transforms/Coroutines/coro-async-remat.ll b/llvm/test/Transforms/Coroutines/coro-async-remat.ll index fd2a35c0c7f881..808084ef3c8633 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-remat.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-remat.ll @@ -24,7 +24,7 @@ entry: ret ptr undef } -define swifttailcc void @repo(ptr %0, ptr %1, ptr %arg, ptr %2) #1 { +define swifttailcc void @repo(ptr %0, ptr %1, ptr %arg, ptr %2, i1 %arg2) #1 { entry: %swifterror = alloca swifterror ptr, align 8 %3 = call token @llvm.coro.id.async(i32 20, i32 16, i32 1, ptr @repoTU) @@ -33,10 +33,10 @@ entry: br label %6 6: ; preds = %21, %15, %entry - br i1 undef, label %7, label %23 + br i1 %arg2, label %7, label %23 7: ; preds = %6 - br i1 undef, label %8, label %16 + br i1 %arg2, label %8, label %16 8: ; preds = %7 %initializeWithTake35 = bitcast ptr undef to ptr @@ -44,10 +44,10 @@ entry: %10 = call ptr @llvm.coro.async.resume() %11 = bitcast ptr %10 to ptr %12 = call { ptr, ptr } (i32, ptr, ptr, ...) 
@llvm.coro.suspend.async.sl_p0i8p0s_swift.error.4.220.413.429.445.461.672.683ss(i32 256, ptr %10, ptr @__swift_async_resume_project_context, ptr @__swift_suspend_dispatch_5.23, ptr undef, ptr undef, ptr undef, ptr %5, ptr undef, ptr undef) - br i1 undef, label %25, label %13 + br i1 %arg2, label %25, label %13 13: ; preds = %8 - br i1 undef, label %14, label %15 + br i1 %arg2, label %14, label %15 14: ; preds = %13 br label %24 @@ -56,16 +56,16 @@ entry: br label %6 16: ; preds = %7 - br i1 undef, label %26, label %17 + br i1 %arg2, label %26, label %17 17: ; preds = %16 - br i1 undef, label %18, label %22 + br i1 %arg2, label %18, label %22 18: ; preds = %17 - br i1 undef, label %27, label %19 + br i1 %arg2, label %27, label %19 19: ; preds = %18 - br i1 undef, label %20, label %21 + br i1 %arg2, label %20, label %21 20: ; preds = %19 br label %24 diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/2010-09-26-MergeConstantRange.ll b/llvm/test/Transforms/CorrelatedValuePropagation/2010-09-26-MergeConstantRange.ll index fb2ca2b23bf4b9..52d95ddbd1d11f 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/2010-09-26-MergeConstantRange.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/2010-09-26-MergeConstantRange.ll @@ -5,7 +5,7 @@ @g_128 = external global %struct.S2, align 1 @g_106 = external global i16, align 2 -define void @int328(i16 signext %p_82) noreturn nounwind ssp { +define void @int328(i16 signext %p_82, i1 %arg) noreturn nounwind ssp { entry: %tobool3 = icmp eq i16 %p_82, 0 br label %for.cond.outer @@ -32,7 +32,7 @@ for.cond.split.us: ; preds = %for.cond br label %lbl_133.us lbl_133.us: ; preds = %lbl_134.us, %for.cond.split.us - br i1 undef, label %if.else14.us-lcssa.us, label %if.then.us + br i1 %arg, label %if.else14.us-lcssa.us, label %if.then.us lbl_134.us: ; preds = %if.then.us %cmp = icmp eq i16 ptrtoint (ptr @g_128 to i16), 0 @@ -66,7 +66,7 @@ for.cond9.preheader.us-lcssa: ; preds = %lbl_134 br label %for.cond9.preheader 
for.cond9.preheader: ; preds = %for.cond9.preheader.us-lcssa, %for.cond9.preheader.us-lcssa.us - br i1 undef, label %bb.nph, label %for.cond.loopexit + br i1 %arg, label %bb.nph, label %for.cond.loopexit bb.nph: ; preds = %for.cond9.preheader br label %for.cond.loopexit diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll b/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll index f719effac113e9..dcc90d548d698e 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/ashr.ll @@ -133,10 +133,10 @@ define void @test5(i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label %[[LOOP:.*]], label %[[EXIT:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[N]], %[[ENTRY]] ], [ [[SHR:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[A]], 4 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ugt i32 [[A]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) ; CHECK-NEXT: [[SHR]] = lshr i32 [[A]], 1 -; CHECK-NEXT: [[LOOPCOND:%.*]] = icmp ugt i32 [[SHR]], 8 +; CHECK-NEXT: [[LOOPCOND:%.*]] = icmp samesign ugt i32 [[SHR]], 8 ; CHECK-NEXT: br i1 [[LOOPCOND]], label %[[LOOP]], label %[[EXIT]] ; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll index 3c3b1d4bef45bb..a0175f3ebdd731 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/basic.ll @@ -579,7 +579,7 @@ define i1 @umin(i32 %a, i32 %b) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[B]], 20 ; CHECK-NEXT: br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]] ; CHECK: b_guard: -; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp ult i32 [[A]], [[B]] +; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp samesign ult i32 [[A]], [[B]] ; CHECK-NEXT: [[MIN:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]] ; CHECK-NEXT: ret i1 false ; CHECK: out: @@ -612,7 +612,7 @@ define i1 @smin(i32 
%a, i32 %b) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[B]], 20 ; CHECK-NEXT: br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]] ; CHECK: b_guard: -; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp ule i32 [[A]], [[B]] +; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp samesign ule i32 [[A]], [[B]] ; CHECK-NEXT: [[MIN:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]] ; CHECK-NEXT: ret i1 false ; CHECK: out: @@ -645,7 +645,7 @@ define i1 @smax(i32 %a, i32 %b) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[B]], 20 ; CHECK-NEXT: br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]] ; CHECK: b_guard: -; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp uge i32 [[A]], [[B]] +; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp samesign uge i32 [[A]], [[B]] ; CHECK-NEXT: [[MAX:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]] ; CHECK-NEXT: ret i1 false ; CHECK: out: @@ -678,7 +678,7 @@ define i1 @umax(i32 %a, i32 %b) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[B]], 20 ; CHECK-NEXT: br i1 [[CMP2]], label [[B_GUARD:%.*]], label [[OUT]] ; CHECK: b_guard: -; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp uge i32 [[A]], [[B]] +; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp samesign uge i32 [[A]], [[B]] ; CHECK-NEXT: [[MAX:%.*]] = select i1 [[SEL_CMP]], i32 [[A]], i32 [[B]] ; CHECK-NEXT: ret i1 false ; CHECK: out: @@ -824,7 +824,7 @@ define i1 @clamp_low3(i32 noundef %a) { ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[A]], 5 ; CHECK-NEXT: br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]] ; CHECK: a_guard: -; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp ugt i32 [[A]], 5 +; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp samesign ugt i32 [[A]], 5 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[A]], -1 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 [[ADD]], i32 5 ; CHECK-NEXT: ret i1 false @@ -852,7 +852,7 @@ define i1 @clamp_low4(i32 noundef %a) { ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[A]], 5 ; CHECK-NEXT: br i1 [[CMP]], label [[A_GUARD:%.*]], label [[OUT:%.*]] ; CHECK: a_guard: -; CHECK-NEXT: [[SEL_CMP:%.*]] = icmp ule i32 [[A]], 5 +; CHECK-NEXT: 
[[SEL_CMP:%.*]] = icmp samesign ule i32 [[A]], 5 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[A]], -1 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[SEL_CMP]], i32 5, i32 [[ADD]] ; CHECK-NEXT: ret i1 false @@ -1085,10 +1085,10 @@ define void @abs1(i32 %a, ptr %p) { ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A]], 0 ; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[SUB]], i32 [[A]] ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 -; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[ABS]], 19 +; CHECK-NEXT: [[C2:%.*]] = icmp samesign ult i32 [[ABS]], 19 ; CHECK-NEXT: store i1 [[C2]], ptr [[P]], align 1 ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 -; CHECK-NEXT: [[C4:%.*]] = icmp uge i32 [[ABS]], 1 +; CHECK-NEXT: [[C4:%.*]] = icmp samesign uge i32 [[ABS]], 1 ; CHECK-NEXT: store i1 [[C4]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1131,10 +1131,10 @@ define void @abs2(i32 %a, ptr %p) { ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[A]], 0 ; CHECK-NEXT: [[ABS:%.*]] = select i1 [[CMP]], i32 [[A]], i32 [[SUB]] ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 -; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[ABS]], 19 +; CHECK-NEXT: [[C2:%.*]] = icmp samesign ult i32 [[ABS]], 19 ; CHECK-NEXT: store i1 [[C2]], ptr [[P]], align 1 ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 -; CHECK-NEXT: [[C4:%.*]] = icmp uge i32 [[ABS]], 1 +; CHECK-NEXT: [[C4:%.*]] = icmp samesign uge i32 [[ABS]], 1 ; CHECK-NEXT: store i1 [[C4]], ptr [[P]], align 1 ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1458,12 +1458,12 @@ entry: ret i1 %cmp } -define i1 @srem_unknown(i32 %a) { +define i1 @srem_unknown(i32 %a, i1 %arg) { ; CHECK-LABEL: define i1 @srem_unknown -; CHECK-SAME: (i32 [[A:%.*]]) { +; CHECK-SAME: (i32 [[A:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SREM:%.*]] = srem i32 [[A]], 30 -; CHECK-NEXT: br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit1: ; CHECK-NEXT: ret i1 true ; 
CHECK: exit2: @@ -1473,19 +1473,19 @@ entry: %srem = srem i32 %a, 30 %cmp1 = icmp slt i32 %srem, 30 %cmp2 = icmp sgt i32 %srem, -30 - br i1 undef, label %exit1, label %exit2 + br i1 %arg, label %exit1, label %exit2 exit1: ret i1 %cmp1 exit2: ret i1 %cmp2 } -define i1 @sdiv_unknown(i32 %a) { +define i1 @sdiv_unknown(i32 %a, i1 %arg) { ; CHECK-LABEL: define i1 @sdiv_unknown -; CHECK-SAME: (i32 [[A:%.*]]) { +; CHECK-SAME: (i32 [[A:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SREM:%.*]] = sdiv i32 [[A]], 123 -; CHECK-NEXT: br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit1: ; CHECK-NEXT: ret i1 true ; CHECK: exit2: @@ -1495,20 +1495,20 @@ entry: %srem = sdiv i32 %a, 123 %cmp1 = icmp slt i32 %srem, 17459217 %cmp2 = icmp sgt i32 %srem, -17459217 - br i1 undef, label %exit1, label %exit2 + br i1 %arg, label %exit1, label %exit2 exit1: ret i1 %cmp1 exit2: ret i1 %cmp2 } -define i1 @uadd_sat_unknown(i32 %a) { +define i1 @uadd_sat_unknown(i32 %a, i1 %arg) { ; CHECK-LABEL: define i1 @uadd_sat_unknown -; CHECK-SAME: (i32 [[A:%.*]]) { +; CHECK-SAME: (i32 [[A:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.uadd.sat.i32(i32 [[A]], i32 100) ; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[VAL]], 100 -; CHECK-NEXT: br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit1: ; CHECK-NEXT: ret i1 true ; CHECK: exit2: @@ -1518,20 +1518,20 @@ entry: %val = call i32 @llvm.uadd.sat.i32(i32 %a, i32 100) %cmp1 = icmp uge i32 %val, 100 %cmp2 = icmp ugt i32 %val, 100 - br i1 undef, label %exit1, label %exit2 + br i1 %arg, label %exit1, label %exit2 exit1: ret i1 %cmp1 exit2: ret i1 %cmp2 } -define i1 @usub_sat_unknown(i32 %a) { +define i1 @usub_sat_unknown(i32 %a, i1 %arg) { ; CHECK-LABEL: define i1 @usub_sat_unknown -; CHECK-SAME: (i32 [[A:%.*]]) { +; CHECK-SAME: (i32 
[[A:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[A]], i32 100) ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[VAL]], -101 -; CHECK-NEXT: br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit1: ; CHECK-NEXT: ret i1 true ; CHECK: exit2: @@ -1541,20 +1541,20 @@ entry: %val = call i32 @llvm.usub.sat.i32(i32 %a, i32 100) %cmp1 = icmp ule i32 %val, 4294967195 %cmp2 = icmp ult i32 %val, 4294967195 - br i1 undef, label %exit1, label %exit2 + br i1 %arg, label %exit1, label %exit2 exit1: ret i1 %cmp1 exit2: ret i1 %cmp2 } -define i1 @sadd_sat_unknown(i32 %a) { +define i1 @sadd_sat_unknown(i32 %a, i1 %arg) { ; CHECK-LABEL: define i1 @sadd_sat_unknown -; CHECK-SAME: (i32 [[A:%.*]]) { +; CHECK-SAME: (i32 [[A:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.sadd.sat.i32(i32 [[A]], i32 100) ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[VAL]], -2147483548 -; CHECK-NEXT: br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit1: ; CHECK-NEXT: ret i1 true ; CHECK: exit2: @@ -1564,20 +1564,20 @@ entry: %val = call i32 @llvm.sadd.sat.i32(i32 %a, i32 100) %cmp1 = icmp sge i32 %val, -2147483548 %cmp2 = icmp sgt i32 %val, -2147483548 - br i1 undef, label %exit1, label %exit2 + br i1 %arg, label %exit1, label %exit2 exit1: ret i1 %cmp1 exit2: ret i1 %cmp2 } -define i1 @ssub_sat_unknown(i32 %a) { +define i1 @ssub_sat_unknown(i32 %a, i1 %arg) { ; CHECK-LABEL: define i1 @ssub_sat_unknown -; CHECK-SAME: (i32 [[A:%.*]]) { +; CHECK-SAME: (i32 [[A:%.*]], i1 [[ARG:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.ssub.sat.i32(i32 [[A]], i32 100) ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[VAL]], 2147483547 -; CHECK-NEXT: br i1 undef, label [[EXIT1:%.*]], label [[EXIT2:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label 
[[EXIT1:%.*]], label [[EXIT2:%.*]] ; CHECK: exit1: ; CHECK-NEXT: ret i1 true ; CHECK: exit2: @@ -1587,7 +1587,7 @@ entry: %val = call i32 @llvm.ssub.sat.i32(i32 %a, i32 100) %cmp1 = icmp sle i32 %val, 2147483547 %cmp2 = icmp slt i32 %val, 2147483547 - br i1 undef, label %exit1, label %exit2 + br i1 %arg, label %exit1, label %exit2 exit1: ret i1 %cmp1 exit2: @@ -1934,7 +1934,7 @@ define void @select_assume(i32 %a, i32 %b, i1 %c, ptr %p) { ; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[B]], 20 ; CHECK-NEXT: call void @llvm.assume(i1 [[C2]]) ; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i32 [[A]], i32 [[B]] -; CHECK-NEXT: [[C3:%.*]] = icmp ult i32 [[S]], 19 +; CHECK-NEXT: [[C3:%.*]] = icmp samesign ult i32 [[S]], 19 ; CHECK-NEXT: store i1 [[C3]], ptr [[P]], align 1 ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 ; CHECK-NEXT: ret void @@ -1957,10 +1957,10 @@ define void @xor(i8 %a, ptr %p) { ; CHECK-NEXT: [[A_MASK:%.*]] = and i8 [[A]], 15 ; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A_MASK]], -86 ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 -; CHECK-NEXT: [[C2:%.*]] = icmp ugt i8 [[XOR]], -96 +; CHECK-NEXT: [[C2:%.*]] = icmp samesign ugt i8 [[XOR]], -96 ; CHECK-NEXT: store i1 [[C2]], ptr [[P]], align 1 ; CHECK-NEXT: store i1 true, ptr [[P]], align 1 -; CHECK-NEXT: [[C4:%.*]] = icmp ult i8 [[XOR]], -81 +; CHECK-NEXT: [[C4:%.*]] = icmp samesign ult i8 [[XOR]], -81 ; CHECK-NEXT: store i1 [[C4]], ptr [[P]], align 1 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll b/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll index a7a1803bccc263..89ce59b96a5788 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/cond-using-block-value.ll @@ -13,7 +13,7 @@ define void @test_icmp_from_implied_cond(i32 %a, i32 %b) { ; CHECK-NEXT: br i1 [[COND]], label [[L2:%.*]], label [[END]] ; CHECK: l2: ; CHECK-NEXT: call void @use(i1 
true) -; CHECK-NEXT: [[B_CMP2:%.*]] = icmp ult i32 [[B]], 31 +; CHECK-NEXT: [[B_CMP2:%.*]] = icmp samesign ult i32 [[B]], 31 ; CHECK-NEXT: call void @use(i1 [[B_CMP2]]) ; CHECK-NEXT: ret void ; CHECK: end: @@ -74,7 +74,7 @@ define void @test_icmp_from_implied_range(i16 %x, i32 %b) { ; CHECK-NEXT: br i1 [[COND]], label [[L1:%.*]], label [[END:%.*]] ; CHECK: l1: ; CHECK-NEXT: call void @use(i1 true) -; CHECK-NEXT: [[B_CMP2:%.*]] = icmp ult i32 [[B]], 65534 +; CHECK-NEXT: [[B_CMP2:%.*]] = icmp samesign ult i32 [[B]], 65534 ; CHECK-NEXT: call void @use(i1 [[B_CMP2]]) ; CHECK-NEXT: ret void ; CHECK: end: diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/crash.ll b/llvm/test/Transforms/CorrelatedValuePropagation/crash.ll index 8a7a4afb894f45..031cc0e9bb3ae1 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/crash.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/crash.ll @@ -37,13 +37,13 @@ func_29.exit: } ; PR13972 -define void @test3() nounwind { +define void @test3(i1 %arg) nounwind { for.body: br label %return for.cond.i: ; preds = %if.else.i, %for.body.i %e.2.i = phi i32 [ %e.2.i, %if.else.i ], [ -8, %for.body.i ] - br i1 undef, label %return, label %for.body.i + br i1 %arg, label %return, label %for.body.i for.body.i: ; preds = %for.cond.i switch i32 %e.2.i, label %for.cond3.i [ diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/deopt.ll b/llvm/test/Transforms/CorrelatedValuePropagation/deopt.ll index 96cc6c17503123..03815828ba6d50 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/deopt.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/deopt.ll @@ -97,7 +97,7 @@ define void @test3(i1 %c, i1 %c2) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C:%.*]], i64 0, i64 1 ; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 [[SEL]], i64 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[SEL2]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i64 [[SEL2]], 1 ; CHECK-NEXT: br i1 [[CMP]], label 
[[TAKEN:%.*]], label [[UNTAKEN:%.*]] ; CHECK: taken: ; CHECK-NEXT: call void @use() [ "deopt"(i64 2) ] @@ -122,7 +122,7 @@ define void @test4(i1 %c, i1 %c2) { ; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[C2:%.*]], i64 0, i64 1 ; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i64 0, [[SEL]] ; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i64 [[ADD1]], [[SEL2]] -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[ADD2]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i64 [[ADD2]], 1 ; CHECK-NEXT: br i1 [[CMP]], label [[TAKEN:%.*]], label [[UNTAKEN:%.*]] ; CHECK: taken: ; CHECK-NEXT: call void @use() [ "deopt"(i64 2) ] diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll index 57c9f8926b524f..72f09a949a060d 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll @@ -128,7 +128,7 @@ define i1 @test4(i32 %x, i32 %y) #0 { ; CHECK-NEXT: br i1 [[CMP2]], label [[CONT2:%.*]], label [[OUT]] ; CHECK: cont2: ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[X]], [[Y]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[ADD]], 15 +; CHECK-NEXT: [[CMP3:%.*]] = icmp samesign ult i32 [[ADD]], 15 ; CHECK-NEXT: br label [[OUT]] ; CHECK: out: ; CHECK-NEXT: [[RET:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ true, [[CONT1]] ], [ [[CMP3]], [[CONT2]] ] @@ -198,7 +198,7 @@ define i1 @test6(i32 %x, i32 %y) #0 { ; CHECK-NEXT: br i1 [[CMP2]], label [[CONT2:%.*]], label [[OUT]] ; CHECK: cont2: ; CHECK-NEXT: [[SHIFTED:%.*]] = shl nuw nsw i32 [[X]], [[Y]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[SHIFTED]], 65536 +; CHECK-NEXT: [[CMP3:%.*]] = icmp samesign ult i32 [[SHIFTED]], 65536 ; CHECK-NEXT: br label [[OUT]] ; CHECK: out: ; CHECK-NEXT: [[RET:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ true, [[CONT1]] ], [ [[CMP3]], [[CONT2]] ] @@ -1265,7 +1265,7 @@ define void @ashr_sgt(i8 %x) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: call void 
@check1(i1 true) -; CHECK-NEXT: [[C3:%.*]] = icmp ugt i8 [[X]], 8 +; CHECK-NEXT: [[C3:%.*]] = icmp samesign ugt i8 [[X]], 8 ; CHECK-NEXT: call void @check1(i1 [[C3]]) ; CHECK-NEXT: ret void ; CHECK: else: @@ -1291,7 +1291,7 @@ define void @ashr_sge(i8 %x) { ; CHECK-NEXT: br i1 [[C]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: call void @check1(i1 true) -; CHECK-NEXT: [[C3:%.*]] = icmp uge i8 [[X]], 5 +; CHECK-NEXT: [[C3:%.*]] = icmp samesign uge i8 [[X]], 5 ; CHECK-NEXT: call void @check1(i1 [[C3]]) ; CHECK-NEXT: ret void ; CHECK: else: @@ -1374,7 +1374,7 @@ define i1 @pr69928(i64 noundef %arg, i64 noundef %arg1) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[ARG:%.*]], 64424509440 ; CHECK-NEXT: [[AND:%.*]] = and i64 [[ARG1:%.*]], 4294967295 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[ARG]], [[AND]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ult i64 [[ARG]], [[AND]] ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false ; CHECK-NEXT: ret i1 [[SELECT]] ; @@ -1390,7 +1390,7 @@ define i1 @test_select_flip(i64 noundef %arg) { ; CHECK-LABEL: @test_select_flip( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[ARG:%.*]], 1000 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[ARG]], 100 +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ult i64 [[ARG]], 100 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP1]], i1 [[CMP2]], i1 false ; CHECK-NEXT: ret i1 [[SELECT]] ; diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/minmaxabs.ll b/llvm/test/Transforms/CorrelatedValuePropagation/minmaxabs.ll index a13ec50bd053a4..49a17fd4160679 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/minmaxabs.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/minmaxabs.ll @@ -12,7 +12,7 @@ define void @test_umin(i32 %x) { ; CHECK-LABEL: @test_umin( ; CHECK-NEXT: [[M:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 10) ; CHECK-NEXT: call void @use(i1 true) -; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[M]], 10 +; 
CHECK-NEXT: [[C2:%.*]] = icmp samesign ult i32 [[M]], 10 ; CHECK-NEXT: call void @use(i1 [[C2]]) ; CHECK-NEXT: ret void ; @@ -60,7 +60,7 @@ define void @test_smax(i32 %x) { ; CHECK-LABEL: @test_smax( ; CHECK-NEXT: [[M:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 10) ; CHECK-NEXT: call void @use(i1 true) -; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[M]], 10 +; CHECK-NEXT: [[C2:%.*]] = icmp samesign ugt i32 [[M]], 10 ; CHECK-NEXT: call void @use(i1 [[C2]]) ; CHECK-NEXT: ret void ; @@ -77,7 +77,7 @@ define void @test_abs1(ptr %p) { ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P:%.*]], align 4, !range [[RNG0:![0-9]+]] ; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 true) ; CHECK-NEXT: call void @use(i1 true) -; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[A]], 15 +; CHECK-NEXT: [[C2:%.*]] = icmp samesign ult i32 [[A]], 15 ; CHECK-NEXT: call void @use(i1 [[C2]]) ; CHECK-NEXT: ret void ; @@ -110,7 +110,7 @@ define void @test_abs3(i32 %x) { ; CHECK-LABEL: @test_abs3( ; CHECK-NEXT: [[A:%.*]] = call i32 @llvm.abs.i32(i32 [[X:%.*]], i1 true) ; CHECK-NEXT: call void @use(i1 true) -; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[A]], 0 +; CHECK-NEXT: [[C2:%.*]] = icmp samesign ugt i32 [[A]], 0 ; CHECK-NEXT: call void @use(i1 [[C2]]) ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/overflow_predicate.ll b/llvm/test/Transforms/CorrelatedValuePropagation/overflow_predicate.ll index 75f66c5a89e7e2..8591ab73ebfeb8 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/overflow_predicate.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/overflow_predicate.ll @@ -49,7 +49,7 @@ define i1 @uadd_ov_true(i8 %x, ptr %px, ptr %pc) { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[VAL_OV]], 1 ; CHECK-NEXT: br i1 [[OV]], label [[OVERFLOW:%.*]], label [[TRAP:%.*]] ; CHECK: overflow: -; CHECK-NEXT: [[C1:%.*]] = icmp ugt i8 [[X]], -100 +; CHECK-NEXT: [[C1:%.*]] = icmp samesign ugt i8 [[X]], -100 ; CHECK-NEXT: store i1 [[C1]], ptr 
[[PC:%.*]], align 1 ; CHECK-NEXT: ret i1 true ; CHECK: trap: @@ -113,7 +113,7 @@ define i1 @sadd_ov_true(i8 %x, ptr %px, ptr %pc) { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[VAL_OV]], 1 ; CHECK-NEXT: br i1 [[OV]], label [[OVERFLOW:%.*]], label [[TRAP:%.*]] ; CHECK: overflow: -; CHECK-NEXT: [[C1:%.*]] = icmp ugt i8 [[X]], 28 +; CHECK-NEXT: [[C1:%.*]] = icmp samesign ugt i8 [[X]], 28 ; CHECK-NEXT: store i1 [[C1]], ptr [[PC:%.*]], align 1 ; CHECK-NEXT: ret i1 true ; CHECK: trap: @@ -177,7 +177,7 @@ define i1 @usub_ov_true(i8 %x, ptr %px, ptr %pc) { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[VAL_OV]], 1 ; CHECK-NEXT: br i1 [[OV]], label [[OVERFLOW:%.*]], label [[TRAP:%.*]] ; CHECK: overflow: -; CHECK-NEXT: [[C1:%.*]] = icmp ult i8 [[X]], 99 +; CHECK-NEXT: [[C1:%.*]] = icmp samesign ult i8 [[X]], 99 ; CHECK-NEXT: store i1 [[C1]], ptr [[PC:%.*]], align 1 ; CHECK-NEXT: ret i1 true ; CHECK: trap: @@ -241,7 +241,7 @@ define i1 @ssub_ov_true(i8 %x, ptr %px, ptr %pc) { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[VAL_OV]], 1 ; CHECK-NEXT: br i1 [[OV]], label [[OVERFLOW:%.*]], label [[TRAP:%.*]] ; CHECK: overflow: -; CHECK-NEXT: [[C1:%.*]] = icmp ult i8 [[X]], -29 +; CHECK-NEXT: [[C1:%.*]] = icmp samesign ult i8 [[X]], -29 ; CHECK-NEXT: store i1 [[C1]], ptr [[PC:%.*]], align 1 ; CHECK-NEXT: ret i1 true ; CHECK: trap: @@ -273,7 +273,7 @@ define i1 @umul_ov_false(i8 %x, ptr %px, ptr %pc) { ; CHECK-NEXT: [[OV:%.*]] = extractvalue { i8, i1 } [[VAL_OV]], 1 ; CHECK-NEXT: br i1 [[OV]], label [[TRAP:%.*]], label [[NO_OVERFLOW:%.*]] ; CHECK: no_overflow: -; CHECK-NEXT: [[C1:%.*]] = icmp ugt i8 [[X]], 24 +; CHECK-NEXT: [[C1:%.*]] = icmp samesign ugt i8 [[X]], 24 ; CHECK-NEXT: store i1 [[C1]], ptr [[PC:%.*]], align 1 ; CHECK-NEXT: ret i1 false ; CHECK: trap: diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/pr35807.ll b/llvm/test/Transforms/CorrelatedValuePropagation/pr35807.ll index c1898569b8b070..a5b6ff6a759421 100644 --- 
a/llvm/test/Transforms/CorrelatedValuePropagation/pr35807.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/pr35807.ll @@ -3,13 +3,13 @@ target triple = "x86_64-apple-darwin17.4.0" -define void @patatino() { +define void @patatino(i1 %arg) { ; CHECK-LABEL: @patatino( -; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB3:%.*]], label [[BB4:%.*]] ; CHECK: bb3: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb4: -; CHECK-NEXT: br i1 undef, label [[BB40:%.*]], label [[BB22:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB40:%.*]], label [[BB22:%.*]] ; CHECK: bb7: ; CHECK-NEXT: br label [[BB14:%.*]] ; CHECK: bb12: @@ -17,24 +17,24 @@ define void @patatino() { ; CHECK: bb14: ; CHECK-NEXT: [[TMP19:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 [[TMP20]], i64 0 -; CHECK-NEXT: br i1 undef, label [[BB40]], label [[BB7:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB40]], label [[BB7:%.*]] ; CHECK: bb22: ; CHECK-NEXT: br label [[BB24:%.*]] ; CHECK: bb24: ; CHECK-NEXT: br label [[BB32:%.*]] ; CHECK: bb32: -; CHECK-NEXT: br i1 undef, label [[BB40]], label [[BB24]] +; CHECK-NEXT: br i1 %arg, label [[BB40]], label [[BB24]] ; CHECK: bb40: ; CHECK-NEXT: [[TMP41:%.*]] = phi i64 [ 4, [[BB4]] ], [ [[TMP20]], [[BB14]] ], [ undef, [[BB32]] ] ; CHECK-NEXT: ret void ; - br i1 undef, label %bb3, label %bb4 + br i1 %arg, label %bb3, label %bb4 bb3: br label %bb3 bb4: - br i1 undef, label %bb40, label %bb22 + br i1 %arg, label %bb40, label %bb22 bb7: br label %bb14 @@ -49,7 +49,7 @@ bb12: bb14: %tmp19 = icmp sgt i32 undef, undef %tmp20 = select i1 %tmp19, i64 %tmp20, i64 0 - br i1 undef, label %bb40, label %bb7 + br i1 %arg, label %bb40, label %bb7 bb22: br label %bb24 @@ -58,7 +58,7 @@ bb24: br label %bb32 bb32: - br i1 undef, label %bb40, label %bb24 + br i1 %arg, label %bb40, label %bb24 bb40: %tmp41 = phi i64 [ 4, %bb4 ], [ %tmp20, %bb14 ], [ undef, %bb32 ] diff --git 
a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll index ce1b591218d1b1..03d71fa9b52773 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/range.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/range.ll @@ -64,7 +64,7 @@ define i32 @test3(i32 %c) nounwind { ; CHECK: if.then: ; CHECK-NEXT: ret i32 1 ; CHECK: if.end: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i32 [[C]], 3 +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ult i32 [[C]], 3 ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN2:%.*]], label [[IF_END8:%.*]] ; CHECK: if.then2: ; CHECK-NEXT: br i1 true, label [[IF_THEN4:%.*]], label [[IF_END6:%.*]] @@ -989,11 +989,11 @@ define i1 @ctlz_nofold(i16 %x) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: [[CTLZ:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X]], i1 false) -; CHECK-NEXT: [[RES:%.*]] = icmp uge i16 [[CTLZ]], 9 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign uge i16 [[CTLZ]], 9 ; CHECK-NEXT: ret i1 [[RES]] ; CHECK: else: ; CHECK-NEXT: [[CTLZ2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[X]], i1 false) -; CHECK-NEXT: [[RES2:%.*]] = icmp ult i16 [[CTLZ2]], 7 +; CHECK-NEXT: [[RES2:%.*]] = icmp samesign ult i16 [[CTLZ2]], 7 ; CHECK-NEXT: ret i1 [[RES2]] ; %cmp = icmp ult i16 %x, 256 @@ -1038,7 +1038,7 @@ define i1 @cttz_nofold1(i16 %x) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X]], i1 true) -; CHECK-NEXT: [[RES:%.*]] = icmp uge i16 [[CTTZ]], 7 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign uge i16 [[CTTZ]], 7 ; CHECK-NEXT: ret i1 [[RES]] ; CHECK: else: ; CHECK-NEXT: ret i1 false @@ -1061,7 +1061,7 @@ define i1 @cttz_nofold2(i16 %x) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: [[CTTZ:%.*]] = call i16 @llvm.cttz.i16(i16 [[X]], i1 false) -; CHECK-NEXT: [[RES:%.*]] = icmp uge i16 [[CTTZ]], 8 +; CHECK-NEXT: [[RES:%.*]] = icmp 
samesign uge i16 [[CTTZ]], 8 ; CHECK-NEXT: ret i1 [[RES]] ; CHECK: else: ; CHECK-NEXT: ret i1 false @@ -1106,7 +1106,7 @@ define i1 @ctpop_nofold(i16 %x) { ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: [[CTPOP:%.*]] = call i16 @llvm.ctpop.i16(i16 [[X]]) -; CHECK-NEXT: [[RES:%.*]] = icmp ule i16 [[CTPOP]], 7 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign ule i16 [[CTPOP]], 7 ; CHECK-NEXT: ret i1 [[RES]] ; CHECK: else: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/sdiv.ll b/llvm/test/Transforms/CorrelatedValuePropagation/sdiv.ll index d88fe358a0aa2c..13e38902474fa0 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/sdiv.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/sdiv.ll @@ -128,10 +128,10 @@ define void @test5(i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[EXIT:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[DIV1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[A]], 4 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ugt i32 [[A]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) ; CHECK-NEXT: [[DIV1]] = udiv i32 [[A]], 6 -; CHECK-NEXT: [[LOOPCOND:%.*]] = icmp ugt i32 [[DIV1]], 8 +; CHECK-NEXT: [[LOOPCOND:%.*]] = icmp samesign ugt i32 [[DIV1]], 8 ; CHECK-NEXT: br i1 [[LOOPCOND]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/srem.ll b/llvm/test/Transforms/CorrelatedValuePropagation/srem.ll index bc2b0aec269b9d..e7339f4ee45a9f 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/srem.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/srem.ll @@ -41,10 +41,10 @@ define void @test4(i32 %n) { ; CHECK-NEXT: br i1 [[CMP]], label [[LOOP:%.*]], label [[EXIT:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[N]], [[ENTRY:%.*]] ], [ [[REM1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 
[[A]], 4 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ugt i32 [[A]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) ; CHECK-NEXT: [[REM1]] = urem i32 [[A]], 17 -; CHECK-NEXT: [[LOOPCOND:%.*]] = icmp ugt i32 [[REM1]], 8 +; CHECK-NEXT: [[LOOPCOND:%.*]] = icmp samesign ugt i32 [[REM1]], 8 ; CHECK-NEXT: br i1 [[LOOPCOND]], label [[LOOP]], label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/udiv-expansion.ll b/llvm/test/Transforms/CorrelatedValuePropagation/udiv-expansion.ll index a5fc26ebab00f5..8e568c671f94d4 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/udiv-expansion.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/udiv-expansion.ll @@ -92,7 +92,7 @@ define i8 @constant.divisor.v7(i8 %x) { define i8 @constant.divisor.v6to8(i8 %x) { ; CHECK-LABEL: @constant.divisor.v6to8( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 6 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 9 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -109,7 +109,7 @@ define i8 @constant.divisor.v6to8(i8 %x) { define i8 @constant.divisor.v9to11(i8 %x) { ; CHECK-LABEL: @constant.divisor.v9to11( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 9 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 9 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 12 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -126,7 +126,7 @@ define i8 @constant.divisor.v9to11(i8 %x) { define i8 @constant.divisor.v12to14(i8 %x) { ; CHECK-LABEL: @constant.divisor.v12to14( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 12 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 12 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: 
[[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 15 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -143,7 +143,7 @@ define i8 @constant.divisor.v12to14(i8 %x) { define i8 @constant.divisor.v6to11(i8 %x) { ; CHECK-LABEL: @constant.divisor.v6to11( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 6 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 12 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -164,7 +164,7 @@ define i8 @variable.v3(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v3( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -183,7 +183,7 @@ define i8 @variable.v4(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v4( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -217,7 +217,7 @@ define i8 @variable.v5(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v5( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 5 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; 
CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -238,7 +238,7 @@ define i8 @variable.v6(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v6( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -259,7 +259,7 @@ define i8 @variable.v7(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v7( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 7 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -371,7 +371,7 @@ define i8 @known_uge(i8 noundef %x) { ; CHECK-LABEL: @known_uge( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X]], 3 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: ret i8 1 ; diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll b/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll index 8e276d010fdd1b..c6e4265f855e04 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/urem-expansion.ll @@ -102,7 +102,7 @@ define i8 @constant.divisor.v7(i8 %x) { define i8 @constant.divisor.v6to8(i8 %x) { ; CHECK-LABEL: @constant.divisor.v6to8( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 
6 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 9 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -119,7 +119,7 @@ define i8 @constant.divisor.v6to8(i8 %x) { define i8 @constant.divisor.v9to11(i8 %x) { ; CHECK-LABEL: @constant.divisor.v9to11( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 9 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 9 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 12 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -136,7 +136,7 @@ define i8 @constant.divisor.v9to11(i8 %x) { define i8 @constant.divisor.v12to14(i8 %x) { ; CHECK-LABEL: @constant.divisor.v12to14( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 12 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 12 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 15 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -153,7 +153,7 @@ define i8 @constant.divisor.v12to14(i8 %x) { define i8 @constant.divisor.v6to11(i8 %x) { ; CHECK-LABEL: @constant.divisor.v6to11( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 6 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 12 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) @@ -174,7 +174,7 @@ define i8 @variable.v3(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v3( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; 
CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -193,7 +193,7 @@ define i8 @variable.v4(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v4( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -233,7 +233,7 @@ define i8 @variable.v5(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v5( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 5 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -257,7 +257,7 @@ define i8 @variable.v6(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v6( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -281,7 +281,7 @@ define i8 @variable.v7(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v7( ; CHECK-NEXT: [[CMP_X:%.*]] = icmp ult i8 [[X:%.*]], 7 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; 
CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -300,11 +300,11 @@ define i8 @variable.v7(i8 %x, i8 %y) { define i8 @variable.v6to8.v3to4(i8 %x, i8 %y) { ; CHECK-LABEL: @variable.v6to8.v3to4( -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X:%.*]], 6 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X]], 8 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp uge i8 [[Y:%.*]], 3 +; CHECK-NEXT: [[CMP_Y_LOWER:%.*]] = icmp samesign uge i8 [[Y:%.*]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_LOWER]]) ; CHECK-NEXT: [[CMP_Y_UPPER:%.*]] = icmp ule i8 [[Y]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_Y_UPPER]]) @@ -444,7 +444,7 @@ define i8 @known_uge(i8 noundef %x) { ; CHECK-LABEL: @known_uge( ; CHECK-NEXT: [[CMP_X_UPPER:%.*]] = icmp ult i8 [[X:%.*]], 6 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_UPPER]]) -; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp uge i8 [[X]], 3 +; CHECK-NEXT: [[CMP_X_LOWER:%.*]] = icmp samesign uge i8 [[X]], 3 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_X_LOWER]]) ; CHECK-NEXT: [[REM:%.*]] = sub nuw i8 [[X]], 3 ; CHECK-NEXT: ret i8 [[REM]] diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/uscmp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/uscmp.ll index efe4235b344a67..503c715e3bf7c6 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/uscmp.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/uscmp.ll @@ -24,7 +24,7 @@ define i8 @scmp_0(i32 %x, i32 %y) { define i8 @ucmp_1(i32 %x, i32 %y) { ; X is within [4, 8) ; CHECK-LABEL: @ucmp_1( -; CHECK-NEXT: [[COND1:%.*]] = icmp uge i32 [[X:%.*]], 4 +; CHECK-NEXT: [[COND1:%.*]] = icmp samesign uge i32 [[X:%.*]], 4 ; CHECK-NEXT: call void @llvm.assume(i1 [[COND1]]) ; CHECK-NEXT: [[COND2:%.*]] = icmp ult i32 [[X]], 8 ; 
CHECK-NEXT: call void @llvm.assume(i1 [[COND2]]) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll index 37eb4d9c978ec8..7060b4244d9885 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/vectors.ll @@ -51,7 +51,7 @@ define <2 x i1> @cmp_signedness(<2 x i8> %a) { ; CHECK-LABEL: define <2 x i1> @cmp_signedness( ; CHECK-SAME: <2 x i8> [[A:%.*]]) { ; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i8> [[A]] to <2 x i16> -; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i16> [[ZEXT]], splat (i16 5) +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult <2 x i16> [[ZEXT]], splat (i16 5) ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %zext = zext <2 x i8> %a to <2 x i16> diff --git a/llvm/test/Transforms/DeadStoreElimination/overlap.ll b/llvm/test/Transforms/DeadStoreElimination/overlap.ll index d1dedbf5a95237..88271db224e597 100644 --- a/llvm/test/Transforms/DeadStoreElimination/overlap.ll +++ b/llvm/test/Transforms/DeadStoreElimination/overlap.ll @@ -29,14 +29,14 @@ define void @test1() { ret void } -define void @test2() { +define void @test2(i1 %arg) { ; CHECK-LABEL: @test2( ; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 ; CHECK-NEXT: call void @use(ptr [[A]]) ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr [[A]], i32 1 ; CHECK-NEXT: store i8 10, ptr [[A]], align 1 ; CHECK-NEXT: store i8 20, ptr [[PTR2]], align 1 -; CHECK-NEXT: br i1 undef, label [[BB1:%.*]], label [[END:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB1:%.*]], label [[END:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[LV:%.*]] = load i64, ptr [[A]], align 4 ; CHECK-NEXT: br label [[END]] @@ -51,7 +51,7 @@ define void @test2() { store i8 10, ptr %a store i8 20, ptr %ptr2 - br i1 undef, label %bb1, label %end + br i1 %arg, label %bb1, label %end bb1: %lv = load i64, ptr %a diff --git a/llvm/test/Transforms/DeadStoreElimination/simple.ll b/llvm/test/Transforms/DeadStoreElimination/simple.ll index 
ef2c4ef564b24a..af5b77c79acdc6 100644 --- a/llvm/test/Transforms/DeadStoreElimination/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/simple.ll @@ -476,7 +476,7 @@ bb3: } ; Don't remove redundant store in a loop with a may-alias store. -define i32 @test32(i1 %c, ptr %p, i32 %i) { +define i32 @test32(i1 %c, ptr %p, i32 %i, i1 %arg) { ; CHECK-LABEL: @test32( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[P:%.*]], align 4 @@ -484,7 +484,7 @@ define i32 @test32(i1 %c, ptr %p, i32 %i) { ; CHECK: bb1: ; CHECK-NEXT: store i32 [[V]], ptr [[P]], align 4 ; CHECK-NEXT: call void @unknown_func() -; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2:%.*]] +; CHECK-NEXT: br i1 %arg, label [[BB1]], label [[BB2:%.*]] ; CHECK: bb2: ; CHECK-NEXT: ret i32 0 ; @@ -495,7 +495,7 @@ bb1: store i32 %v, ptr %p, align 4 ; Might read and overwrite value at %p call void @unknown_func() - br i1 undef, label %bb1, label %bb2 + br i1 %arg, label %bb1, label %bb2 bb2: ret i32 0 } diff --git a/llvm/test/Transforms/EarlyCSE/X86/preserve_memoryssa.ll b/llvm/test/Transforms/EarlyCSE/X86/preserve_memoryssa.ll index dcaa0b5b399af6..cadb61043c72ab 100644 --- a/llvm/test/Transforms/EarlyCSE/X86/preserve_memoryssa.ll +++ b/llvm/test/Transforms/EarlyCSE/X86/preserve_memoryssa.ll @@ -89,10 +89,10 @@ entry: %struct.gnode.0.1.3.6.9.18.20.79 = type { i32, i32, i32, i32, i32, i32, i32, ptr } @gnodeArray = external global ptr, align 8 -define void @test4_shortest() { +define void @test4_shortest(i1 %arg) { entry: %exl.i = alloca [5 x i32], align 16 - br i1 undef, label %if.then274, label %for.cond404 + br i1 %arg, label %if.then274, label %for.cond404 if.then274: ; preds = %if.end256 %arrayidx.i = getelementptr inbounds [5 x i32], ptr %exl.i, i64 0, i64 1 @@ -118,7 +118,7 @@ for.cond404: ; preds = %if.end256 %arrayidx6.i968 = getelementptr inbounds ptr, ptr %0, i64 undef ; MemoryUse(1) MayAlias %1 = load ptr, ptr %arrayidx6.i968, align 8 - br i1 undef, label %for.cond26.preheader.i974, 
label %if.then20.for.body_crit_edge.i999 + br i1 %arg, label %for.cond26.preheader.i974, label %if.then20.for.body_crit_edge.i999 for.cond26.preheader.i974: ; preds = %if.then20.i996 %arrayidx.i924 = getelementptr inbounds [5 x i32], ptr %exl.i, i64 0, i64 1 diff --git a/llvm/test/Transforms/FixIrreducible/bug45623.ll b/llvm/test/Transforms/FixIrreducible/bug45623.ll index beddc967ebb0b6..58724431ff0ee1 100644 --- a/llvm/test/Transforms/FixIrreducible/bug45623.ll +++ b/llvm/test/Transforms/FixIrreducible/bug45623.ll @@ -3,34 +3,35 @@ ; RUN: opt < %s -passes='fix-irreducible,verify' -S | FileCheck %s ; RUN: opt < %s -passes='verify,fix-irreducible,verify' -S | FileCheck %s -define dso_local void @tre_tnfa_run_backtrack() { +define dso_local void @tre_tnfa_run_backtrack(i1 %arg) { ; CHECK-LABEL: @tre_tnfa_run_backtrack( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARG_INV:%.*]] = xor i1 [[ARG:%.*]], true ; CHECK-NEXT: br label [[RETRY:%.*]] ; CHECK: retry: ; CHECK-NEXT: br label [[IRR_GUARD:%.*]] ; CHECK: while.body248: -; CHECK-NEXT: br i1 undef, label [[IF_THEN250:%.*]], label [[IF_END275:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN250:%.*]], label [[IF_END275:%.*]] ; CHECK: if.then250: ; CHECK-NEXT: br label [[FOR_COND264:%.*]] ; CHECK: for.cond264: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY267:%.*]], label [[BACKTRACK:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_BODY267:%.*]], label [[BACKTRACK:%.*]] ; CHECK: for.body267: ; CHECK-NEXT: br label [[FOR_COND264]] ; CHECK: if.end275: ; CHECK-NEXT: br label [[FOR_COND342:%.*]] ; CHECK: for.cond342: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY345:%.*]], label [[FOR_END580:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_BODY345:%.*]], label [[FOR_END580:%.*]] ; CHECK: for.body345: ; CHECK-NEXT: br label [[FOR_COND342]] ; CHECK: for.end580: ; CHECK-NEXT: br label [[BACKTRACK]] ; CHECK: backtrack: -; CHECK-NEXT: br i1 undef, label [[IF_THEN595:%.*]], label [[IF_ELSE629:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label 
[[IF_THEN595:%.*]], label [[IF_ELSE629:%.*]] ; CHECK: if.then595: ; CHECK-NEXT: br label [[FOR_COND616:%.*]] ; CHECK: for.cond616: -; CHECK-NEXT: br i1 undef, label [[FOR_BODY619:%.*]], label [[FOR_END626:%.*]] +; CHECK-NEXT: br i1 [[ARG]], label [[FOR_BODY619:%.*]], label [[FOR_END626:%.*]] ; CHECK: for.body619: ; CHECK-NEXT: br label [[FOR_COND616]] ; CHECK: for.end626: @@ -38,23 +39,23 @@ define dso_local void @tre_tnfa_run_backtrack() { ; CHECK: if.else629: ; CHECK-NEXT: br label [[RETRY]] ; CHECK: irr.guard: -; CHECK-NEXT: [[GUARD_WHILE_BODY248:%.*]] = phi i1 [ true, [[FOR_END626]] ], [ undef, [[RETRY]] ] +; CHECK-NEXT: [[GUARD_WHILE_BODY248:%.*]] = phi i1 [ true, [[FOR_END626]] ], [ [[ARG_INV]], [[RETRY]] ] ; CHECK-NEXT: br i1 [[GUARD_WHILE_BODY248]], label [[WHILE_BODY248:%.*]], label [[BACKTRACK]] ; entry: br label %retry retry: - br i1 undef, label %backtrack, label %while.body248 + br i1 %arg, label %backtrack, label %while.body248 while.body248: ; preds = %for.end626, %retry - br i1 undef, label %if.then250, label %if.end275 + br i1 %arg, label %if.then250, label %if.end275 if.then250: ; preds = %while.body248 br label %for.cond264 for.cond264: ; preds = %for.body267, %if.then250 - br i1 undef, label %for.body267, label %backtrack + br i1 %arg, label %for.body267, label %backtrack for.body267: ; preds = %for.cond264 br label %for.cond264 @@ -63,7 +64,7 @@ if.end275: ; preds = %while.body248 br label %for.cond342 for.cond342: ; preds = %for.body345, %if.end275 - br i1 undef, label %for.body345, label %for.end580 + br i1 %arg, label %for.body345, label %for.end580 for.body345: ; preds = %for.cond342 br label %for.cond342 @@ -72,13 +73,13 @@ for.end580: ; preds = %for.cond342 br label %backtrack backtrack: ; preds = %for.end580, %for.cond264, %retry - br i1 undef, label %if.then595, label %if.else629 + br i1 %arg, label %if.then595, label %if.else629 if.then595: ; preds = %backtrack br label %for.cond616 for.cond616: ; preds = %for.body619, %if.then595 - br 
i1 undef, label %for.body619, label %for.end626 + br i1 %arg, label %for.body619, label %for.end626 for.body619: ; preds = %for.cond616 br label %for.cond616 diff --git a/llvm/test/Transforms/FixIrreducible/unreachable.ll b/llvm/test/Transforms/FixIrreducible/unreachable.ll index cc9a29b2a8d711..defbefb3ba8121 100644 --- a/llvm/test/Transforms/FixIrreducible/unreachable.ll +++ b/llvm/test/Transforms/FixIrreducible/unreachable.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: @unreachable( ; CHECK: entry: ; CHECK-NOT: irr.guard: -define void @unreachable(i32 %n) { +define void @unreachable(i32 %n, i1 %arg) { entry: br label %loop.body @@ -17,7 +17,7 @@ unreachable.block: br label %inner.block inner.block: - br i1 undef, label %loop.exit, label %loop.latch + br i1 %arg, label %loop.exit, label %loop.latch loop.latch: br label %loop.body diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 05c8bdaf66e7aa..6dd2399093b668 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -148,14 +148,15 @@ define ptr @test5(i1 %c) { } ; Local analysis, but going through a self recursive phi -define ptr @test6a() { -; COMMON-LABEL: define nonnull ptr @test6a() { +define ptr @test6a(i1 %arg) { +; COMMON-LABEL: define nonnull ptr @test6a( +; COMMON-SAME: i1 [[ARG:%.*]]) { ; COMMON-NEXT: entry: ; COMMON-NEXT: [[RET:%.*]] = call ptr @ret_nonnull() ; COMMON-NEXT: br label [[LOOP:%.*]] ; COMMON: loop: ; COMMON-NEXT: [[PHI:%.*]] = phi ptr [ [[RET]], [[ENTRY:%.*]] ], [ [[PHI]], [[LOOP]] ] -; COMMON-NEXT: br i1 undef, label [[LOOP]], label [[EXIT:%.*]] +; COMMON-NEXT: br i1 [[ARG]], label [[LOOP]], label [[EXIT:%.*]] ; COMMON: exit: ; COMMON-NEXT: ret ptr [[PHI]] ; @@ -164,7 +165,7 @@ entry: br label %loop loop: %phi = phi ptr [%ret, %entry], [%phi, %loop] - br i1 undef, label %loop, label %exit + br i1 %arg, label %loop, label %exit exit: ret ptr %phi } diff --git 
a/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll b/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll index d96460efe34620..a2fd239c9ce305 100644 --- a/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll +++ b/llvm/test/Transforms/FunctionSpecialization/bug55000-read-uninitialized-value.ll @@ -7,7 +7,7 @@ declare hidden { i8, ptr } @getType(ptr) align 2 ; CHECK-LABEL: @foo.specialized.1 ; CHECK-LABEL: @foo.specialized.2 -define internal void @foo(ptr %TLI, ptr %DL, ptr %Ty, ptr %ValueVTs, ptr %Offsets, i64 %StartingOffset) { +define internal void @foo(ptr %TLI, ptr %DL, ptr %Ty, ptr %ValueVTs, ptr %Offsets, i64 %StartingOffset, i1 %arg) { entry: %VT = alloca i64, align 8 br i1 false, label %if.then, label %if.end4 @@ -21,7 +21,7 @@ if.end4: ; preds = %entry for.body: ; preds = %if.end4 %add13 = add i64 %StartingOffset, undef - call void @foo(ptr %TLI, ptr %DL, ptr undef, ptr %ValueVTs, ptr %Offsets, i64 %add13) + call void @foo(ptr %TLI, ptr %DL, ptr undef, ptr %ValueVTs, ptr %Offsets, i64 %add13, i1 %arg) unreachable for.cond16: ; preds = %for.cond34, %if.end4 @@ -29,14 +29,14 @@ for.cond16: ; preds = %for.cond34, %if.end br label %for.cond34 for.cond34: ; preds = %for.body37, %for.cond16 - br i1 undef, label %for.body37, label %for.cond16 + br i1 %arg, label %for.body37, label %for.cond16 for.body37: ; preds = %for.cond34 %tobool39 = icmp ne ptr %Offsets, null br label %for.cond34 } -define hidden { ptr, i32 } @bar(ptr %this) { +define hidden { ptr, i32 } @bar(ptr %this, i1 %arg) { entry: %Offsets = alloca i64, align 8 %cmp26 = call zeroext i1 @compare(ptr undef) @@ -50,11 +50,11 @@ for.body28: ; preds = %entry br i1 %call33, label %if.then34, label %if.end106 if.then34: ; preds = %for.body28 - call void @foo(ptr %this, ptr undef, ptr undef, ptr undef, ptr null, i64 0) + call void @foo(ptr %this, ptr undef, ptr undef, ptr undef, ptr null, i64 0, i1 %arg) unreachable 
if.end106: ; preds = %for.body28 - call void @foo(ptr %this, ptr undef, ptr undef, ptr undef, ptr %Offsets, i64 0) + call void @foo(ptr %this, ptr undef, ptr undef, ptr undef, ptr %Offsets, i64 0, i1 %arg) unreachable } diff --git a/llvm/test/Transforms/IRCE/pr57335.ll b/llvm/test/Transforms/IRCE/pr57335.ll index dbc61e0e9a255e..eb9fc129b93a33 100644 --- a/llvm/test/Transforms/IRCE/pr57335.ll +++ b/llvm/test/Transforms/IRCE/pr57335.ll @@ -2,7 +2,7 @@ ; RUN: opt -passes='require,irce' -S < %s 2>&1 | FileCheck %s ; Make sure we don't crash. -define void @test() { +define void @test(i1 %arg) { ; CHECK-LABEL: test bb: %tmp = icmp ult i32 0, undef @@ -41,7 +41,7 @@ bb17: ; preds = %bb12 bb20: ; preds = %bb17 %tmp21 = add nuw nsw i32 %tmp7, 2 - br i1 undef, label %bb22, label %bb2 + br i1 %arg, label %bb22, label %bb2 bb22: ; preds = %bb20 %tmp23 = phi i32 [ %tmp18, %bb20 ] diff --git a/llvm/test/Transforms/Inline/arg-attr-propagation.ll b/llvm/test/Transforms/Inline/arg-attr-propagation.ll index 7b096539e7e1b1..fad6c7ced2edd2 100644 --- a/llvm/test/Transforms/Inline/arg-attr-propagation.ll +++ b/llvm/test/Transforms/Inline/arg-attr-propagation.ll @@ -76,3 +76,29 @@ define i32 @caller3(ptr dereferenceable(33) %t1) { ret i32 %t2 } +; Make sure that we don't propagate a pointer-only attribute to a vector of pointers. 
+ +declare void @helper4(<4 x ptr> %ptr) + +define void @callee4(ptr readonly %ptr, <4 x i64> %idx) { +; CHECK-LABEL: define {{[^@]+}}@callee4 +; CHECK-SAME: (ptr readonly [[PTR:%.*]], <4 x i64> [[IDX:%.*]]) { +; CHECK-NEXT: [[PTRS:%.*]] = getelementptr inbounds i8, ptr [[PTR]], <4 x i64> [[IDX]] +; CHECK-NEXT: call void @helper4(<4 x ptr> [[PTRS]]) +; CHECK-NEXT: ret void +; + %ptrs = getelementptr inbounds i8, ptr %ptr, <4 x i64> %idx + call void @helper4(<4 x ptr> %ptrs) + ret void +} + +define void @caller4(ptr readonly %ptr, <4 x i64> %idx) { +; CHECK-LABEL: define {{[^@]+}}@caller4 +; CHECK-SAME: (ptr readonly [[PTR:%.*]], <4 x i64> [[IDX:%.*]]) { +; CHECK-NEXT: [[PTRS_I:%.*]] = getelementptr inbounds i8, ptr [[PTR]], <4 x i64> [[IDX]] +; CHECK-NEXT: call void @helper4(<4 x ptr> [[PTRS_I]]) +; CHECK-NEXT: ret void +; + call void @callee4(ptr readonly %ptr, <4 x i64> %idx) + ret void +} diff --git a/llvm/test/Transforms/InstCombine/load-cmp.ll b/llvm/test/Transforms/InstCombine/load-cmp.ll index 8e39fe33cded88..12be81b8f815d0 100644 --- a/llvm/test/Transforms/InstCombine/load-cmp.ll +++ b/llvm/test/Transforms/InstCombine/load-cmp.ll @@ -312,9 +312,7 @@ define i1 @test10_struct_arr_i64(i64 %x) { define i1 @test10_struct_arr_noinbounds_i16(i16 %x) { ; CHECK-LABEL: @test10_struct_arr_noinbounds_i16( -; CHECK-NEXT: [[TMP1:%.*]] = sext i16 [[X:%.*]] to i32 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[TMP1]], 268435455 -; CHECK-NEXT: [[R:%.*]] = icmp ne i32 [[TMP2]], 1 +; CHECK-NEXT: [[R:%.*]] = icmp ne i16 [[X:%.*]], 1 ; CHECK-NEXT: ret i1 [[R]] ; %p = getelementptr [4 x %Foo], ptr @GStructArr, i32 0, i16 %x, i32 2 diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll index 247a02f0bcc14a..dc9daf5265d370 100644 --- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll +++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart ; RUN: opt -passes=instcombine -S < %s | FileCheck %s target datalayout = "e-m:e-p:64:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -20,7 +20,7 @@ define i32 @test_load_cast_combine_noalias(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves no-alias metadata. ; CHECK-LABEL: @test_load_cast_combine_noalias( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !alias.scope !3, !noalias !3 +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META3]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -48,7 +48,7 @@ define i32 @test_load_cast_combine_invariant(ptr %ptr) { ; Ensure (cast (load (...))) -> (load (cast (...))) preserves invariant metadata. ; CHECK-LABEL: @test_load_cast_combine_invariant( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !invariant.load !6 +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !invariant.load [[META6:![0-9]+]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -62,7 +62,7 @@ define i32 @test_load_cast_combine_nontemporal(ptr %ptr) { ; metadata. ; CHECK-LABEL: @test_load_cast_combine_nontemporal( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !nontemporal !7 +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !nontemporal [[META7:![0-9]+]] ; CHECK-NEXT: ret i32 [[L1]] ; entry: @@ -76,7 +76,7 @@ define ptr @test_load_cast_combine_align(ptr %ptr) { ; metadata. ; CHECK-LABEL: @test_load_cast_combine_align( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !align !8 +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !align [[META8:![0-9]+]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -89,7 +89,7 @@ define ptr @test_load_cast_combine_deref(ptr %ptr) { ; metadata. 
; CHECK-LABEL: @test_load_cast_combine_deref( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable !8 +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable [[META8]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -102,7 +102,7 @@ define ptr @test_load_cast_combine_deref_or_null(ptr %ptr) { ; dereferenceable_or_null metadata. ; CHECK-LABEL: @test_load_cast_combine_deref_or_null( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable_or_null !8 +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !dereferenceable_or_null [[META8]] ; CHECK-NEXT: ret ptr [[L]] ; entry: @@ -151,7 +151,7 @@ exit: define void @test_load_cast_combine_nonnull(ptr %ptr) { ; CHECK-LABEL: @test_load_cast_combine_nonnull( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !nonnull !6 +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !nonnull [[META6]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 336 ; CHECK-NEXT: store ptr [[P]], ptr [[GEP]], align 8 ; CHECK-NEXT: ret void @@ -165,7 +165,7 @@ entry: define i32 @test_load_cast_combine_noundef(ptr %ptr) { ; CHECK-LABEL: @test_load_cast_combine_noundef( -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !noundef !6 +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !noundef [[META6]] ; CHECK-NEXT: ret i32 [[L1]] ; %l = load float, ptr %ptr, !noundef !{} @@ -186,6 +186,96 @@ entry: ret i32 %c } +; FIXME: Should preserve metadata on loads, except !noundef and !invariant.load. 
+define ptr @preserve_load_metadata_after_select_transform1(i1 %c, ptr dereferenceable(8) %a, ptr dereferenceable(8) %b) { +; CHECK-LABEL: @preserve_load_metadata_after_select_transform1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B_VAL:%.*]] = load ptr, ptr [[B:%.*]], align 1 +; CHECK-NEXT: [[A_VAL:%.*]] = load ptr, ptr [[A:%.*]], align 1 +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[C:%.*]], ptr [[B_VAL]], ptr [[A_VAL]] +; CHECK-NEXT: ret ptr [[L_SEL]] +; +entry: + %ptr.sel = select i1 %c, ptr %b, ptr %a + %l.sel = load ptr, ptr %ptr.sel, align 1, !tbaa !0, !llvm.access.group !7, !dereferenceable !9, !noundef !{}, !invariant.load !7 + ret ptr %l.sel +} + +; FIXME: Should preserve metadata on loads. +define double @preserve_load_metadata_after_select_transform2(ptr %a, ptr %b) { +; CHECK-LABEL: @preserve_load_metadata_after_select_transform2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] +; CHECK-NEXT: ret double [[L_SEL]] +; +entry: + %l.a = load double, ptr %a, align 8, !tbaa !0, !llvm.access.group !7 + %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !7 + %cmp.i = fcmp fast olt double %l.a, %l.b + %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a + %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !7 + ret double %l.sel +} + +; FIXME: Should preserve metadata on loads. 
+define double @preserve_load_metadata_after_select_transform_metadata_missing_1(ptr %a, ptr %b) { +; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] +; CHECK-NEXT: ret double [[L_SEL]] +; +entry: + %l.a = load double, ptr %a, align 8, !llvm.access.group !7 + %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !7 + %cmp.i = fcmp fast olt double %l.a, %l.b + %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a + %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !7 + ret double %l.sel +} + +define double @preserve_load_metadata_after_select_transform_metadata_missing_2(ptr %a, ptr %b) { +; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] +; CHECK-NEXT: ret double [[L_SEL]] +; +entry: + %l.a = load double, ptr %a, align 8, !llvm.access.group !7 + %l.b = load double, ptr %b, align 8, !llvm.access.group !7 + %cmp.i = fcmp fast olt double %l.a, %l.b + %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a + %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !12 + ret double %l.sel +} + +; FIXME: Should preserve metadata on loads. 
+define double @preserve_load_metadata_after_select_transform_metadata_missing_3(ptr %a, ptr %b) { +; CHECK-LABEL: @preserve_load_metadata_after_select_transform_metadata_missing_3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[L_A:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[L_B:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[L_A]], [[L_B]] +; CHECK-NEXT: [[L_SEL:%.*]] = select i1 [[CMP_I]], double [[L_B]], double [[L_A]] +; CHECK-NEXT: ret double [[L_SEL]] +; +entry: + %l.a = load double, ptr %a, align 8, !tbaa !0, !llvm.access.group !7 + %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !7 + %cmp.i = fcmp fast olt double %l.a, %l.b + %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a + %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !12 + ret double %l.sel +} + !0 = !{!1, !1, i64 0} !1 = !{!"scalar type", !2} !2 = !{!"root"} @@ -198,3 +288,16 @@ entry: !9 = !{i64 8} !10 = distinct !{} !11 = !{i32 5, i32 6} +!12 = !{} +;. +; CHECK: [[TBAA0]] = !{[[LOOP1]], [[LOOP1]], i64 0} +; CHECK: [[LOOP1]] = !{!"scalar type", [[META2:![0-9]+]]} +; CHECK: [[META2]] = !{!"root"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = distinct !{[[META5]]} +; CHECK: [[META6]] = !{} +; CHECK: [[META7]] = !{i32 1} +; CHECK: [[META8]] = !{i64 8} +; CHECK: [[ACC_GRP9]] = distinct !{} +;. 
diff --git a/llvm/test/Transforms/InstCombine/sext-and.ll b/llvm/test/Transforms/InstCombine/sext-and.ll new file mode 100644 index 00000000000000..e08bbbd83d8845 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/sext-and.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +declare void @use(i8) + +define i1 @fold_sext_to_and(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -127 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 1 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -2147483647 + %3 = icmp eq i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and1(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -127 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP1]], 1 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -2147483647 + %3 = icmp ne i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and2(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -126 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP1]], 2 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 1073741826 + %3 = icmp eq i32 %2, 2 + ret i1 %3 +} + +define i1 @fold_sext_to_and3(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and3( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -126 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP1]], 2 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 1073741826 + %3 = icmp ne i32 %2, 2 + ret i1 %3 +} + +define i1 @fold_sext_to_and_multi_use(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_multi_use( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] 
to i32 +; CHECK-NEXT: call void @use(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X]], -127 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 1 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + call void @use(i32 %1) + %2 = and i32 %1, -2147483647 + %3 = icmp eq i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_multi_use1(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_multi_use1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32 +; CHECK-NEXT: call void @use(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X]], -127 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], 1 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + call void @use(i32 %1) + %2 = and i32 %1, -2147483647 + %3 = icmp ne i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_multi_use2(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_multi_use2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32 +; CHECK-NEXT: call void @use(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X]], -126 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 2 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + call void @use(i32 %1) + %2 = and i32 %1, 1073741826 + %3 = icmp eq i32 %2, 2 + ret i1 %3 +} + +define i1 @fold_sext_to_and_multi_use3(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_multi_use3( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sext i8 [[X]] to i32 +; CHECK-NEXT: call void @use(i32 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[X]], -126 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i8 [[TMP2]], 2 +; CHECK-NEXT: ret i1 [[TMP3]] +; + %1 = sext i8 %x to i32 + call void @use(i32 %1) + %2 = and i32 %1, 1073741826 + %3 = icmp ne i32 %2, 2 + ret i1 %3 +} + +; Negative tests + +define i1 @fold_sext_to_and_wrong(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 
-2147483647 + %3 = icmp eq i32 %2, -1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong2(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -2147483647 + %3 = icmp eq i32 %2, 128 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong3(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong3( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 128 + %3 = icmp eq i32 %2, -2147483648 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong4(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong4( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 128 + %3 = icmp eq i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong5(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong5( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -256 + %3 = icmp eq i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong6(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong6( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -2147483647 + %3 = icmp ne i32 %2, -1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong7(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong7( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -2147483647 + %3 = icmp ne i32 %2, 128 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong8(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong8( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 128 + %3 = icmp ne i32 %2, -2147483648 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong9(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong9( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 true 
+; + %1 = sext i8 %x to i32 + %2 = and i32 %1, 128 + %3 = icmp ne i32 %2, 1 + ret i1 %3 +} + +define i1 @fold_sext_to_and_wrong10(i8 %x) { +; CHECK-LABEL: define i1 @fold_sext_to_and_wrong10( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %1 = sext i8 %x to i32 + %2 = and i32 %1, -256 + %3 = icmp ne i32 %2, 1 + ret i1 %3 +} diff --git a/llvm/test/Transforms/InstCombine/vec_shuffle.ll b/llvm/test/Transforms/InstCombine/vec_shuffle.ll index 163d9c9557b239..9fb68b5399c845 100644 --- a/llvm/test/Transforms/InstCombine/vec_shuffle.ll +++ b/llvm/test/Transforms/InstCombine/vec_shuffle.ll @@ -2411,6 +2411,18 @@ define <4 x i32> @shuf_same_length_vec_select(<4 x i1> %cond) { ret <4 x i32> %shuf } +; Make sure we do not fold in this case. +define <4 x i8> @shuf_cmp_may_be_poison(<4 x i8> %x, <4 x i8> %y, i1 %cmp) { +; CHECK-LABEL: @shuf_cmp_may_be_poison( +; CHECK-NEXT: [[Y:%.*]] = select i1 [[CMP:%.*]], <4 x i8> [[Y1:%.*]], <4 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i8> [[Y]], <4 x i8> , <4 x i32> +; CHECK-NEXT: ret <4 x i8> [[TMP1]] +; + %sel = select i1 %cmp, <4 x i8> %y, <4 x i8> + %shuf = shufflevector <4 x i8> %sel, <4 x i8> , <4 x i32> + ret <4 x i8> %shuf +} + declare i1 @cond() declare <4 x i32> @value() diff --git a/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll b/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll index 79fd9a22ba5cc9..d076035b269e46 100644 --- a/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll +++ b/llvm/test/Transforms/InstSimplify/cmp-alloca-offsets.ll @@ -230,4 +230,96 @@ define i1 @mixed_alloca_size4() { ret i1 %res } +define i1 @zst_alloca_start() { +; CHECK-LABEL: @zst_alloca_start( +; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[A2:%.*]] = alloca {}, align 8 +; CHECK-NEXT: call void @escape(ptr [[A]], ptr [[A2]]) +; CHECK-NEXT: ret i1 false +; + %a = alloca i64 + %a2 = alloca {} + %gep = getelementptr i8, ptr %a, i64 0 + %cmp = icmp eq ptr %gep, %a2 + call void @escape(ptr 
%a, ptr %a2) + ret i1 %cmp +} + +define i1 @zst_alloca_middle() { +; CHECK-LABEL: @zst_alloca_middle( +; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[A2:%.*]] = alloca {}, align 8 +; CHECK-NEXT: call void @escape(ptr [[A]], ptr [[A2]]) +; CHECK-NEXT: ret i1 false +; + %a = alloca i64 + %a2 = alloca {} + %gep = getelementptr i8, ptr %a, i64 4 + %cmp = icmp eq ptr %gep, %a2 + call void @escape(ptr %a, ptr %a2) + ret i1 %cmp +} + +define i1 @zst_alloca_end() { +; CHECK-LABEL: @zst_alloca_end( +; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[A2:%.*]] = alloca {}, align 8 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEP]], [[A2]] +; CHECK-NEXT: call void @escape(ptr [[A]], ptr [[A2]]) +; CHECK-NEXT: ret i1 [[CMP]] +; + %a = alloca i64 + %a2 = alloca {} + %gep = getelementptr i8, ptr %a, i64 8 + %cmp = icmp eq ptr %gep, %a2 + call void @escape(ptr %a, ptr %a2) + ret i1 %cmp +} + +@gz = external global {} + +define i1 @zst_global_start() { +; CHECK-LABEL: @zst_global_start( +; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @escape(ptr [[A]], ptr @gz) +; CHECK-NEXT: ret i1 false +; + %a = alloca i64 + %gep = getelementptr i8, ptr %a, i64 0 + %cmp = icmp eq ptr %gep, @gz + call void @escape(ptr %a, ptr @gz) + ret i1 %cmp +} + +define i1 @zst_global_middle() { +; CHECK-LABEL: @zst_global_middle( +; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 +; CHECK-NEXT: call void @escape(ptr [[A]], ptr @gz) +; CHECK-NEXT: ret i1 false +; + %a = alloca i64 + %gep = getelementptr i8, ptr %a, i64 4 + %cmp = icmp eq ptr %gep, @gz + call void @escape(ptr %a, ptr @gz) + ret i1 %cmp +} + +define i1 @zst_global_end() { +; CHECK-LABEL: @zst_global_end( +; CHECK-NEXT: [[A:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEP]], @gz +; CHECK-NEXT: call void @escape(ptr [[A]], ptr @gz) +; CHECK-NEXT: ret 
i1 [[CMP]] +; + %a = alloca i64 + %gep = getelementptr i8, ptr %a, i64 8 + %cmp = icmp eq ptr %gep, @gz + call void @escape(ptr %a, ptr @gz) + ret i1 %cmp +} + +declare void @escape(ptr, ptr) + attributes #0 = { null_pointer_is_valid } diff --git a/llvm/test/Transforms/LoopStrengthReduce/Power/incomplete-phi.ll b/llvm/test/Transforms/LoopStrengthReduce/Power/incomplete-phi.ll index c57761c1a01f14..53aac1d9cf7f8d 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/Power/incomplete-phi.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/Power/incomplete-phi.ll @@ -21,12 +21,10 @@ define void @foo(ptr %arg) { ; CHECK-LABEL: define void @foo( ; CHECK-SAME: ptr [[ARG:%.*]]) { ; CHECK-NEXT: bb: -; CHECK-NEXT: [[I:%.*]] = getelementptr [0 x %0], ptr [[ARG]], i64 0, i64 -1 -; CHECK-NEXT: [[I2:%.*]] = getelementptr i8, ptr [[I]], i64 4 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG]], i64 396 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: -; CHECK-NEXT: [[LSR_IV7:%.*]] = phi ptr [ [[SCEVGEP8:%.*]], [[BB18:%.*]] ], [ [[I2]], [[BB:%.*]] ] +; CHECK-NEXT: [[LSR_IV7:%.*]] = phi ptr [ [[SCEVGEP8:%.*]], [[BB18:%.*]] ], [ [[ARG]], [[BB:%.*]] ] ; CHECK-NEXT: [[LSR_IV5:%.*]] = phi i64 [ [[LSR_IV_NEXT6:%.*]], [[BB18]] ], [ 4, [[BB]] ] ; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], [[BB18]] ], [ [[SCEVGEP]], [[BB]] ] ; CHECK-NEXT: br i1 true, label [[BB22_PREHEADER:%.*]], label [[BB9_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll index 6f34dc843ae1ee..3c53befa67e230 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll @@ -113,7 +113,7 @@ for.end: ; preds = %for.body define void @ptr_of_ptr_addrec(ptr %ptrptr, i32 %length) { ; CHECK-LABEL: @ptr_of_ptr_addrec( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[START_PTRPTR:%.*]] = getelementptr ptr, ptr [[PTRPTR:%.*]] +; CHECK-NEXT: 
[[START_PTRPTR1:%.*]] = getelementptr inbounds ptr, ptr [[START_PTRPTR:%.*]] ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LENGTH:%.*]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 3 @@ -121,7 +121,7 @@ define void @ptr_of_ptr_addrec(ptr %ptrptr, i32 %length) { ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[START_PTRPTR]], i64 [[TMP3]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IT_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START_PTRPTR]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[IT_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START_PTRPTR1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[IT_04]], align 8 ; CHECK-NEXT: tail call void @foo(ptr [[TMP4]]) ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr ptr, ptr [[IT_04]], i64 1 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll index 66d6a2fc567e89..a5df23296440d3 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/massv-calls.ll @@ -1244,6 +1244,52 @@ for.end: ret void } +define void @atan2_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @atan2_f64_intrinsic( +; CHECK: __atan2d2{{.*}}<2 x double> +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.atan2.f64(double %conv, double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @atan2_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @atan2_f32_intrinsic( +; CHECK: 
__atan2f4{{.*}}<4 x float> +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.atan2.f32(float %conv, float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + define void @sinh_f64(ptr nocapture %varray) { ; CHECK-LABEL: @sinh_f64( ; CHECK: __sinhd2{{.*}}<2 x double> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll new file mode 100644 index 00000000000000..27923f82411d00 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN + +define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { +; NO-ZVFBFMIN-LABEL: define void @fadd( +; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-ZVFBFMIN-NEXT: [[ENTRY:.*]]: +; NO-ZVFBFMIN-NEXT: br label %[[LOOP:.*]] +; NO-ZVFBFMIN: [[LOOP]]: +; NO-ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; NO-ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]] +; NO-ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]] +; NO-ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2 +; NO-ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2 +; NO-ZVFBFMIN-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]] +; 
NO-ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2 +; NO-ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; NO-ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; NO-ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; NO-ZVFBFMIN: [[EXIT]]: +; NO-ZVFBFMIN-NEXT: ret void +; +; ZVFBFMIN-LABEL: define void @fadd( +; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; ZVFBFMIN-NEXT: [[ENTRY:.*]]: +; ZVFBFMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; ZVFBFMIN-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]] +; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; ZVFBFMIN: [[VECTOR_PH]]: +; ZVFBFMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; ZVFBFMIN-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]] +; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; ZVFBFMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP12]], 8 +; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]] +; ZVFBFMIN: [[VECTOR_BODY]]: +; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFBFMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]] +; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]] +; ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0 +; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 2 +; ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0 +; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP4]], align 2 +; ZVFBFMIN-NEXT: [[TMP11:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; ZVFBFMIN-NEXT: store [[TMP11]], ptr [[TMP3]], align 2 +; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP5]] +; ZVFBFMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVFBFMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ZVFBFMIN: [[MIDDLE_BLOCK]]: +; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; ZVFBFMIN: [[SCALAR_PH]]: +; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; ZVFBFMIN-NEXT: br label %[[LOOP:.*]] +; ZVFBFMIN: [[LOOP]]: +; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]] +; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]] +; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2 +; ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2 +; ZVFBFMIN-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]] +; ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2 +; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; ZVFBFMIN: [[EXIT]]: +; ZVFBFMIN-NEXT: ret void +; +entry: + br label %loop +loop: + %i = phi i64 [0, %entry], [%i.next, %loop] + %a.gep = getelementptr bfloat, ptr %a, i64 %i + %b.gep = getelementptr bfloat, ptr %b, i64 %i + %x = load bfloat, ptr %a.gep + %y = load bfloat, ptr %b.gep + %z = fadd bfloat %x, %y + store bfloat %z, ptr %a.gep + %i.next = add i64 %i, 1 + %done = icmp eq i64 %i.next, %n + br i1 %done, label %exit, label %loop +exit: + ret void +} + +define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) { +; NO-ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv( +; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) 
#[[ATTR0]] { +; NO-ZVFBFMIN-NEXT: [[ENTRY:.*]]: +; NO-ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; NO-ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NO-ZVFBFMIN: [[VECTOR_PH]]: +; NO-ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; NO-ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; NO-ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]] +; NO-ZVFBFMIN: [[VECTOR_BODY]]: +; NO-ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; NO-ZVFBFMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]] +; NO-ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]] +; NO-ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP0]] +; NO-ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0 +; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, ptr [[TMP4]], align 2 +; NO-ZVFBFMIN-NEXT: [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0 +; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x bfloat>, ptr [[TMP5]], align 2 +; NO-ZVFBFMIN-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP3]], i32 0 +; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP6]], align 4 +; NO-ZVFBFMIN-NEXT: [[TMP7:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float> +; NO-ZVFBFMIN-NEXT: [[TMP8:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float> +; NO-ZVFBFMIN-NEXT: [[TMP9:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP7]], <8 x float> [[TMP8]], <8 x float> [[WIDE_LOAD2]]) +; NO-ZVFBFMIN-NEXT: store <8 x float> [[TMP9]], ptr [[TMP6]], align 4 +; NO-ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; NO-ZVFBFMIN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-ZVFBFMIN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-ZVFBFMIN: 
[[MIDDLE_BLOCK]]: +; NO-ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; NO-ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; NO-ZVFBFMIN: [[SCALAR_PH]]: +; NO-ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; NO-ZVFBFMIN-NEXT: br label %[[LOOP:.*]] +; NO-ZVFBFMIN: [[LOOP]]: +; NO-ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; NO-ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]] +; NO-ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]] +; NO-ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]] +; NO-ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2 +; NO-ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2 +; NO-ZVFBFMIN-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP]], align 4 +; NO-ZVFBFMIN-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float +; NO-ZVFBFMIN-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float +; NO-ZVFBFMIN-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]]) +; NO-ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4 +; NO-ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; NO-ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; NO-ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; NO-ZVFBFMIN: [[EXIT]]: +; NO-ZVFBFMIN-NEXT: ret void +; +; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv( +; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; ZVFBFMIN-NEXT: [[ENTRY:.*]]: +; ZVFBFMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; ZVFBFMIN-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; 
ZVFBFMIN: [[VECTOR_PH]]: +; ZVFBFMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; ZVFBFMIN-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; ZVFBFMIN-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]] +; ZVFBFMIN: [[VECTOR_BODY]]: +; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFBFMIN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; ZVFBFMIN-NEXT: [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]] +; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]] +; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]] +; ZVFBFMIN-NEXT: [[TMP10:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 0 +; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 2 +; ZVFBFMIN-NEXT: [[TMP11:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 0 +; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP11]], align 2 +; ZVFBFMIN-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP9]], i32 0 +; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP12]], align 4 +; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext [[WIDE_LOAD]] to +; ZVFBFMIN-NEXT: [[TMP14:%.*]] = fpext [[WIDE_LOAD1]] to +; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP13]], [[TMP14]], [[WIDE_LOAD2]]) +; ZVFBFMIN-NEXT: store [[TMP15]], ptr [[TMP12]], align 4 +; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; ZVFBFMIN: [[MIDDLE_BLOCK]]: +; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; ZVFBFMIN: 
[[SCALAR_PH]]: +; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; ZVFBFMIN-NEXT: br label %[[LOOP:.*]] +; ZVFBFMIN: [[LOOP]]: +; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]] +; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]] +; ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]] +; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2 +; ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2 +; ZVFBFMIN-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP]], align 4 +; ZVFBFMIN-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float +; ZVFBFMIN-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float +; ZVFBFMIN-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]]) +; ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4 +; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; ZVFBFMIN: [[EXIT]]: +; ZVFBFMIN-NEXT: ret void +; +entry: + br label %loop +loop: + %i = phi i64 [0, %entry], [%i.next, %loop] + %a.gep = getelementptr bfloat, ptr %a, i64 %i + %b.gep = getelementptr bfloat, ptr %b, i64 %i + %c.gep = getelementptr float, ptr %c, i64 %i + %x = load bfloat, ptr %a.gep + %y = load bfloat, ptr %b.gep + %z = load float, ptr %c.gep + %x.ext = fpext bfloat %x to float + %y.ext = fpext bfloat %y to float + %fmuladd = call float @llvm.fmuladd.f32(float %x.ext, float %y.ext, float %z) + store float %fmuladd, ptr %c.gep + %i.next = add i64 %i, 1 + %done = icmp eq i64 %i.next, %n + br i1 %done, label %exit, label %loop +exit: + ret void +} +;. 
+; NO-ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; NO-ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. +; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; ZVFBFMIN: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; ZVFBFMIN: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll new file mode 100644 index 00000000000000..2b267f6a2a9778 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN +; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN + +define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) { +; NO-ZVFHMIN-LABEL: define void @fadd( +; NO-ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; NO-ZVFHMIN-NEXT: [[ENTRY:.*]]: +; NO-ZVFHMIN-NEXT: br label %[[LOOP:.*]] +; NO-ZVFHMIN: [[LOOP]]: +; NO-ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; NO-ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]] +; NO-ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]] +; NO-ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2 +; NO-ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2 +; NO-ZVFHMIN-NEXT: 
[[Z:%.*]] = fadd half [[X]], [[Y]] +; NO-ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2 +; NO-ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; NO-ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; NO-ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]] +; NO-ZVFHMIN: [[EXIT]]: +; NO-ZVFHMIN-NEXT: ret void +; +; ZVFHMIN-LABEL: define void @fadd( +; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; ZVFHMIN-NEXT: [[ENTRY:.*]]: +; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; ZVFHMIN-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]] +; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; ZVFHMIN: [[VECTOR_PH]]: +; ZVFHMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; ZVFHMIN-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8 +; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]] +; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; ZVFHMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP12]], 8 +; ZVFHMIN-NEXT: br label %[[VECTOR_BODY:.*]] +; ZVFHMIN: [[VECTOR_BODY]]: +; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; ZVFHMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; ZVFHMIN-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[TMP0]] +; ZVFHMIN-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[TMP0]] +; ZVFHMIN-NEXT: [[TMP3:%.*]] = getelementptr half, ptr [[TMP1]], i32 0 +; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 2 +; ZVFHMIN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[TMP2]], i32 0 +; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP4]], align 2 +; ZVFHMIN-NEXT: [[TMP11:%.*]] = fadd [[WIDE_LOAD]], [[WIDE_LOAD1]] +; ZVFHMIN-NEXT: store [[TMP11]], ptr [[TMP3]], align 2 +; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], [[TMP5]] +; ZVFHMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVFHMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; ZVFHMIN: [[MIDDLE_BLOCK]]: +; ZVFHMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; ZVFHMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; ZVFHMIN: [[SCALAR_PH]]: +; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; ZVFHMIN-NEXT: br label %[[LOOP:.*]] +; ZVFHMIN: [[LOOP]]: +; ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ] +; ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]] +; ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]] +; ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2 +; ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2 +; ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]] +; ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2 +; ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1 +; ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] +; ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; ZVFHMIN: [[EXIT]]: +; ZVFHMIN-NEXT: ret void +; +entry: + br label %loop +loop: + %i = phi i64 [0, %entry], [%i.next, %loop] + %a.gep = getelementptr half, ptr %a, i64 %i + %b.gep = getelementptr half, ptr %b, i64 %i + %x = load half, ptr %a.gep + %y = load half, ptr %b.gep + %z = fadd half %x, %y + store half %z, ptr %a.gep + %i.next = add i64 %i, 1 + %done = icmp eq i64 %i.next, %n + br i1 %done, label %exit, label %loop +exit: + ret void +} +;. +; ZVFHMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; ZVFHMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; ZVFHMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; ZVFHMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index a38835f5613fd8..d68556fca4774f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -1,4 +1,3 @@ -; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py ; This is the loop in c++ being vectorize in this file with ;vector.reverse ; #pragma clang loop vectorize_width(4, scalable) @@ -195,12 +194,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Vectorizing: innermost loop. -; CHECK-EMPTY: +; CHECK: LV: Loop does not require scalar epilogue ; entry: %cmp7 = icmp sgt i32 %n, 0 @@ -414,11 +408,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK: IR %indvars.iv.next = add nsw i64 %indvars.iv, -1 ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Interleaving disabled by the pass manager -; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Vectorizing: innermost loop. 
+; CHECK: LV: Loop does not require scalar epilogue ; entry: %cmp7 = icmp sgt i32 %n, 0 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll index 8e7cd7f6d530dd..01a2a757dea5dd 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll @@ -224,9 +224,66 @@ for.end: ret float %add } +; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) +define half @fadd_fast_half_zvfh(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfh" { +; CHECK-LABEL: @fadd_fast_half_zvfh +; CHECK: vector.body: +; CHECK: %[[LOAD1:.*]] = load +; CHECK: %[[LOAD2:.*]] = load +; CHECK: %[[FADD1:.*]] = fadd fast %[[LOAD1]] +; CHECK: %[[FADD2:.*]] = fadd fast %[[LOAD2]] +; CHECK: middle.block: +; CHECK: %[[RDX:.*]] = fadd fast %[[FADD2]], %[[FADD1]] +; CHECK: call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, %[[RDX]]) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi half [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv + %0 = load half, ptr %arrayidx, align 4 + %add = fadd fast half %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret half %add +} + ; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2) -define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) { +define half @fadd_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfhmin" { +; CHECK-LABEL: @fadd_fast_half_zvfhmin +; CHECK: vector.body: +; CHECK: %[[LOAD1:.*]] = load <16 x half> +; CHECK: %[[LOAD2:.*]] = load <16 x half> +; CHECK: %[[FADD1:.*]] = fadd fast <16 x half> %[[LOAD1]] +; CHECK: %[[FADD2:.*]] = fadd fast <16 x half> %[[LOAD2]] +; CHECK: middle.block: +; CHECK: %[[RDX:.*]] = fadd fast <16 x half> %[[FADD2]], %[[FADD1]] +; CHECK: call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> %[[RDX]]) +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi half [ 0.000000e+00, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv + %0 = load half, ptr %arrayidx, align 4 + %add = fadd fast half %0, %sum.07 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0 + +for.end: + ret half %add +} + +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2) +define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfbfmin" { ; CHECK-LABEL: @fadd_fast_bfloat ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load <16 x bfloat> @@ -427,6 +484,110 @@ for.end: ret float %muladd } +; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2) +define half @fmuladd_f16_zvfh(ptr %a, ptr %b, i64 %n) "target-features"="+zvfh" { +; CHECK-LABEL: @fmuladd_f16_zvfh( +; CHECK: vector.body: +; CHECK: [[WIDE_LOAD:%.*]] = load +; CHECK: [[WIDE_LOAD2:%.*]] = load +; CHECK: [[WIDE_LOAD3:%.*]] = load +; CHECK: [[WIDE_LOAD4:%.*]] = load +; CHECK: [[MULADD1:%.*]] = call reassoc @llvm.fmuladd.nxv8f16( [[WIDE_LOAD]], [[WIDE_LOAD3]], +; CHECK: [[MULADD2:%.*]] = call reassoc @llvm.fmuladd.nxv8f16( [[WIDE_LOAD2]], [[WIDE_LOAD4]], +; CHECK: middle.block: +; CHECK: [[BIN_RDX:%.*]] = fadd reassoc [[MULADD2]], [[MULADD1]] +; CHECK: call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half 0xH8000, [[BIN_RDX]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi half [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv + %0 = load half, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds half, ptr %b, i64 %iv + %1 = load half, ptr %arrayidx2, align 4 + %muladd = tail call reassoc half @llvm.fmuladd.f16(half %0, half %1, half %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret half %muladd +} + + +; We can't scalably vectorize reductions of f16 with zvfhmin or bf16 with zvfbfmin, so make sure we use fixed-length vectors instead. + +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2) +define half @fmuladd_f16_zvfhmin(ptr %a, ptr %b, i64 %n) "target-features"="+zvfhmin" { +; CHECK-LABEL: @fmuladd_f16_zvfhmin( +; CHECK: vector.body: +; CHECK: [[WIDE_LOAD:%.*]] = load <16 x half> +; CHECK: [[WIDE_LOAD2:%.*]] = load <16 x half> +; CHECK: [[WIDE_LOAD3:%.*]] = load <16 x half> +; CHECK: [[WIDE_LOAD4:%.*]] = load <16 x half> +; CHECK: [[MULADD1:%.*]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD]], <16 x half> [[WIDE_LOAD3]], +; CHECK: [[MULADD2:%.*]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD2]], <16 x half> [[WIDE_LOAD4]], +; CHECK: middle.block: +; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <16 x half> [[MULADD2]], [[MULADD1]] +; CHECK: call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[BIN_RDX]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi half [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv + %0 = load half, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds half, ptr %b, i64 %iv + %1 = load half, ptr %arrayidx2, align 4 + %muladd = tail call reassoc half @llvm.fmuladd.f16(half %0, half %1, half %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret half %muladd +} + +; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop. 
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2) +define bfloat @fmuladd_bf16(ptr %a, ptr %b, i64 %n) "target-features"="+zvfbfmin" { +; CHECK-LABEL: @fmuladd_bf16( +; CHECK: vector.body: +; CHECK: [[WIDE_LOAD:%.*]] = load <16 x bfloat> +; CHECK: [[WIDE_LOAD2:%.*]] = load <16 x bfloat> +; CHECK: [[WIDE_LOAD3:%.*]] = load <16 x bfloat> +; CHECK: [[WIDE_LOAD4:%.*]] = load <16 x bfloat> +; CHECK: [[MULADD1:%.*]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD]], <16 x bfloat> [[WIDE_LOAD3]], +; CHECK: [[MULADD2:%.*]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD2]], <16 x bfloat> [[WIDE_LOAD4]], +; CHECK: middle.block: +; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <16 x bfloat> [[MULADD2]], [[MULADD1]] +; CHECK: call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> [[BIN_RDX]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %muladd, %for.body ] + %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv + %0 = load bfloat, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds bfloat, ptr %b, i64 %iv + %1 = load bfloat, ptr %arrayidx2, align 4 + %muladd = tail call reassoc bfloat @llvm.fmuladd.bf16(bfloat %0, bfloat %1, bfloat %sum.07) + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret bfloat %muladd +} + declare float @llvm.fmuladd.f32(float, float, float) attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-invariant-cond-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-invariant-cond-cost.ll new file mode 100644 index 00000000000000..8df8e0725e3fd3 --- /dev/null +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/select-invariant-cond-cost.ll @@ -0,0 +1,49 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Test for https://github.com/llvm/llvm-project/issues/114860. +define void @test_invariant_cond_for_select(ptr %dst, i8 %x) #0 { +; CHECK-LABEL: define void @test_invariant_cond_for_select( +; CHECK-SAME: ptr [[DST:%.*]], i8 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: [[C_2:%.*]] = icmp sgt i64 [[IV]], 0 +; CHECK-NEXT: [[C_2_EXT:%.*]] = zext i1 [[C_2]] to i64 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_1]], i64 [[C_2_EXT]], i64 0 +; CHECK-NEXT: [[SEL_TRUNC:%.*]] = trunc i64 [[SEL]] to i8 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 [[SEL_TRUNC]], ptr [[GEP]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 4 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 14 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %c.1 = icmp eq i8 %x, 0 + %c.2 = icmp sgt i64 %iv, 0 + %c.2.ext = zext i1 %c.2 to i64 + %sel = select i1 %c.1, i64 %c.2.ext, i64 0 + %sel.trunc = trunc i64 %sel to i8 + %gep = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %sel.trunc, ptr %gep, align 1 + %iv.next = add i64 %iv, 4 + %ec = icmp ult i64 %iv, 14 + br i1 %ec, label %loop, label %exit + +exit: + ret void +} + +attributes #0 = { "target-features"="+64bit,+v" } + diff --git 
a/llvm/test/Transforms/LoopVectorize/pr45259.ll b/llvm/test/Transforms/LoopVectorize/pr45259.ll index 42bd14e4b752bc..6baed089fb6b6a 100644 --- a/llvm/test/Transforms/LoopVectorize/pr45259.ll +++ b/llvm/test/Transforms/LoopVectorize/pr45259.ll @@ -14,12 +14,14 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: br i1 [[C]], label [[FOR_PREHEADER:%.*]], label [[BB6]] ; CHECK: for.preheader: ; CHECK-NEXT: [[T1_0_LCSSA:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] +; CHECK-NEXT: [[T1_0_LCSSA4:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] +; CHECK-NEXT: [[T1_0_LCSSA1:%.*]] = phi ptr [ [[T1_0]], [[BB6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[ARR1]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 0, [[TMP0]] ; CHECK-NEXT: [[T1_0_LCSSA3:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[T1_0_LCSSA3]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA]] to i64 +; CHECK-NEXT: [[T1_0_LCSSA2:%.*]] = ptrtoint ptr [[T1_0_LCSSA4]] to i64 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: @@ -65,7 +67,7 @@ define i8 @widget(ptr %arr, i8 %t9) { ; CHECK-NEXT: [[T3_I:%.*]] = icmp slt i8 [[IV_NEXT]], [[T9]] ; CHECK-NEXT: [[T3_I8:%.*]] = zext i1 [[T3_I]] to i8 ; CHECK-NEXT: store i8 [[T3_I8]], ptr [[PTR]], align 1 -; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[T1_0_LCSSA]], [[PTR]] +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[T1_0_LCSSA1]], [[PTR]] ; CHECK-NEXT: br i1 [[EC]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: [[IV_NEXT_LCSSA:%.*]] = phi i8 [ [[IV_NEXT]], [[FOR_BODY]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll new file mode 100644 index 00000000000000..791cab0c074db6 --- 
/dev/null +++ b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes="loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s + +define void @uitofp_preserve_nneg(ptr %result, i32 %size, float %y) { +; CHECK-LABEL: @uitofp_preserve_nneg( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[FOR_BODY_PREHEADER4:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x float> poison, float [[Y:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT2]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX1]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = uitofp nneg <4 x i32> [[VEC_IND]] to <4 x float> +; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP0]], [[BROADCAST_SPLAT3]] +; CHECK-NEXT: [[INDEX:%.*]] = zext nneg i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[RESULT:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX1]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[FOR_BODY_PREHEADER4]] +; CHECK: scalar.ph: +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i32 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER4]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[TMP4]] to float +; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[CONV]], [[Y]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = zext nneg i32 [[TMP4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[RESULT]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[TMP5]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[TMP4]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], 256 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_EXIT]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.exit: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %conv = uitofp nneg i32 %iv to float + %val = fmul float %conv, %y + %idxprom = zext nneg i32 %iv to i64 + %arrayidx = getelementptr inbounds float, ptr %result, i64 %idxprom + store float %val, ptr %arrayidx, align 4 + %inc = add nuw nsw i32 %iv, 1 + %cmp = icmp slt i32 %inc, 256 + br i1 %cmp, label %for.body, label %for.exit + +for.exit: + ret void +} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll new file mode 100644 index 00000000000000..816ed6e831153b --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/X86/preserve-access-group.ll @@ -0,0 +1,251 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes='default' -S %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; FIXME: !llvm.access.group should be preserved, loop should be vectorized. 
+; End-to-end test for https://github.com/llvm/llvm-project/issues/115595. +define void @test(i32 noundef %nface, i32 noundef %ncell, ptr noalias noundef %face_cell, ptr noalias noundef %x, ptr noalias noundef %y) #0 { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 noundef [[NFACE:%.*]], i32 noundef [[NCELL:%.*]], ptr noalias nocapture noundef readonly [[FACE_CELL:%.*]], ptr noalias nocapture noundef readonly [[X:%.*]], ptr noalias nocapture noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[NFACE]], 0 +; CHECK-NEXT: br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NFACE]] to i64 +; CHECK-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[TMP0]] +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NFACE]], 4 +; CHECK-NEXT: br i1 [[TMP1]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:.*]], label %[[FOR_BODY_PREHEADER_NEW:.*]] +; CHECK: [[FOR_BODY_PREHEADER_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_EPIL:.*]] +; CHECK: [[FOR_BODY_EPIL]]: +; CHECK-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_EPIL:%.*]], %[[FOR_BODY_EPIL]] ], [ [[INDVARS_IV_UNR]], %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_NEXT:%.*]], %[[FOR_BODY_EPIL]] ], [ 0, %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds 
i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_EPIL]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX_EPIL]], align 4, !tbaa [[TBAA0:![0-9]+]], !llvm.access.group [[ACC_GRP4:![0-9]+]] +; CHECK-NEXT: [[GEP_EPIL:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_EPIL]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_EPIL]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[IDXPROM3_EPIL:%.*]] = sext i32 [[TMP2]] to i64 +; CHECK-NEXT: [[ARRAYIDX4_EPIL:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_EPIL]] +; CHECK-NEXT: [[IDXPROM5_EPIL:%.*]] = sext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_EPIL]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX4_EPIL]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX6_EPIL]], align 8 +; CHECK-NEXT: [[CMP_I_EPIL:%.*]] = fcmp fast olt double [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[CMP_I_EPIL]], double [[TMP5]], double [[TMP4]] +; CHECK-NEXT: store double [[TMP6]], ptr [[ARRAYIDX4_EPIL]], align 8, !tbaa [[TBAA5:![0-9]+]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 +; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY_EPIL]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[FOR_COND_CLEANUP]]: +; CHECK-NEXT: ret void +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_3]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load 
i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[GEP]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3]] +; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP9:%.*]] = load double, ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = load double, ptr [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt double [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = select i1 [[CMP_I]], double [[TMP10]], double [[TMP9]] +; CHECK-NEXT: store double [[TMP11]], ptr [[ARRAYIDX4]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = or disjoint i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX_1]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[GEP_1]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[IDXPROM3_1:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_1]] +; CHECK-NEXT: [[IDXPROM5_1:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_1]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX4_1]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX6_1]], align 8 
+; CHECK-NEXT: [[CMP_I_1:%.*]] = fcmp fast olt double [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = select i1 [[CMP_I_1]], double [[TMP15]], double [[TMP14]] +; CHECK-NEXT: store double [[TMP16]], ptr [[ARRAYIDX4_1]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or disjoint i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT_1]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX_2]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT_1]] +; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[GEP_2]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[IDXPROM3_2:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[ARRAYIDX4_2:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_2]] +; CHECK-NEXT: [[IDXPROM5_2:%.*]] = sext i32 [[TMP18]] to i64 +; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_2]] +; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[ARRAYIDX4_2]], align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load double, ptr [[ARRAYIDX6_2]], align 8 +; CHECK-NEXT: [[CMP_I_2:%.*]] = fcmp fast olt double [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[CMP_I_2]], double [[TMP20]], double [[TMP19]] +; CHECK-NEXT: store double [[TMP21]], ptr [[ARRAYIDX4_2]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or disjoint i64 [[INDVARS_IV]], 3 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, ptr [[FACE_CELL]], i64 [[INDVARS_IV_NEXT_2]] +; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX_3]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, ptr [[INVARIANT_GEP]], i64 [[INDVARS_IV_NEXT_2]] +; CHECK-NEXT: [[TMP23:%.*]] 
= load i32, ptr [[GEP_3]], align 4, !tbaa [[TBAA0]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[IDXPROM3_3:%.*]] = sext i32 [[TMP22]] to i64 +; CHECK-NEXT: [[ARRAYIDX4_3:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[IDXPROM3_3]] +; CHECK-NEXT: [[IDXPROM5_3:%.*]] = sext i32 [[TMP23]] to i64 +; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[IDXPROM5_3]] +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[ARRAYIDX4_3]], align 8 +; CHECK-NEXT: [[TMP25:%.*]] = load double, ptr [[ARRAYIDX6_3]], align 8 +; CHECK-NEXT: [[CMP_I_3:%.*]] = fcmp fast olt double [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[CMP_I_3]], double [[TMP25]], double [[TMP24]] +; CHECK-NEXT: store double [[TMP26]], ptr [[ARRAYIDX4_3]], align 8, !tbaa [[TBAA5]], !llvm.access.group [[ACC_GRP4]] +; CHECK-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 +; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label %[[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; +entry: + %nface.addr = alloca i32, align 4 + %ncell.addr = alloca i32, align 4 + %face_cell.addr = alloca ptr, align 8 + %x.addr = alloca ptr, align 8 + %y.addr = alloca ptr, align 8 + %il = alloca i32, align 4 + %ir = alloca i32, align 4 + %iface = alloca i32, align 4 + store i32 %nface, ptr %nface.addr, align 4, !tbaa !6 + store i32 %ncell, ptr %ncell.addr, align 4, !tbaa !6 + store ptr %face_cell, ptr %face_cell.addr, align 8, !tbaa !10 + store ptr %x, ptr %x.addr, align 8, !tbaa !10 + store ptr %y, ptr %y.addr, align 8, !tbaa !10 + call void @llvm.lifetime.start.p0(i64 4, ptr %il) #3 + call void @llvm.lifetime.start.p0(i64 4, ptr %ir) #3 + call void @llvm.lifetime.start.p0(i64 4, ptr %iface) #3 + store i32 0, ptr %iface, align 4, !tbaa !6 + br label %for.cond + +for.cond: + %0 = load i32, ptr 
%iface, align 4, !tbaa !6, !llvm.access.group !12 + %1 = load i32, ptr %nface.addr, align 4, !tbaa !6, !llvm.access.group !12 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: + call void @llvm.lifetime.end.p0(i64 4, ptr %iface) #3, !llvm.access.group !12 + br label %for.end + +for.body: + %2 = load ptr, ptr %face_cell.addr, align 8, !tbaa !10, !llvm.access.group !12 + %3 = load i32, ptr %iface, align 4, !tbaa !6, !llvm.access.group !12 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, ptr %2, i64 %idxprom + %4 = load i32, ptr %arrayidx, align 4, !tbaa !6, !llvm.access.group !12 + store i32 %4, ptr %il, align 4, !tbaa !6, !llvm.access.group !12 + %5 = load ptr, ptr %face_cell.addr, align 8, !tbaa !10, !llvm.access.group !12 + %6 = load i32, ptr %iface, align 4, !tbaa !6, !llvm.access.group !12 + %7 = load i32, ptr %nface.addr, align 4, !tbaa !6, !llvm.access.group !12 + %add = add nsw i32 %6, %7 + %idxprom1 = sext i32 %add to i64 + %arrayidx2 = getelementptr inbounds i32, ptr %5, i64 %idxprom1 + %8 = load i32, ptr %arrayidx2, align 4, !tbaa !6, !llvm.access.group !12 + store i32 %8, ptr %ir, align 4, !tbaa !6, !llvm.access.group !12 + %9 = load ptr, ptr %y.addr, align 8, !tbaa !10, !llvm.access.group !12 + %10 = load i32, ptr %il, align 4, !tbaa !6, !llvm.access.group !12 + %idxprom3 = sext i32 %10 to i64 + %arrayidx4 = getelementptr inbounds double, ptr %9, i64 %idxprom3 + %11 = load ptr, ptr %x.addr, align 8, !tbaa !10, !llvm.access.group !12 + %12 = load i32, ptr %ir, align 4, !tbaa !6, !llvm.access.group !12 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds double, ptr %11, i64 %idxprom5 + %call = call noundef nonnull align 8 dereferenceable(8) ptr @max(ptr noundef nonnull align 8 dereferenceable(8) %arrayidx4, ptr noundef nonnull align 8 dereferenceable(8) %arrayidx6), !llvm.access.group !12 + %13 = load double, ptr %call, align 8, !tbaa !13, !llvm.access.group 
!12 + %14 = load ptr, ptr %y.addr, align 8, !tbaa !10, !llvm.access.group !12 + %15 = load i32, ptr %il, align 4, !tbaa !6, !llvm.access.group !12 + %idxprom7 = sext i32 %15 to i64 + %arrayidx8 = getelementptr inbounds double, ptr %14, i64 %idxprom7 + store double %13, ptr %arrayidx8, align 8, !tbaa !13, !llvm.access.group !12 + br label %for.inc + +for.inc: + %16 = load i32, ptr %iface, align 4, !tbaa !6, !llvm.access.group !12 + %inc = add nsw i32 %16, 1 + store i32 %inc, ptr %iface, align 4, !tbaa !6, !llvm.access.group !12 + br label %for.cond, !llvm.loop !15 + +for.end: + call void @llvm.lifetime.end.p0(i64 4, ptr %ir) #3 + call void @llvm.lifetime.end.p0(i64 4, ptr %il) #3 + ret void +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +define linkonce_odr noundef nonnull align 8 dereferenceable(8) ptr @max(ptr noundef nonnull align 8 dereferenceable(8) %__a, ptr noundef nonnull align 8 dereferenceable(8) %__b) #2 { +entry: + %retval = alloca ptr, align 8 + %__a.addr = alloca ptr, align 8 + %__b.addr = alloca ptr, align 8 + store ptr %__a, ptr %__a.addr, align 8, !tbaa !10 + store ptr %__b, ptr %__b.addr, align 8, !tbaa !10 + %0 = load ptr, ptr %__a.addr, align 8, !tbaa !10 + %1 = load double, ptr %0, align 8, !tbaa !13 + %2 = load ptr, ptr %__b.addr, align 8, !tbaa !10 + %3 = load double, ptr %2, align 8, !tbaa !13 + %cmp = fcmp fast olt double %1, %3 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %4 = load ptr, ptr %__b.addr, align 8, !tbaa !10 + store ptr %4, ptr %retval, align 8 + br label %return + +if.end: + %5 = load ptr, ptr %__a.addr, align 8, !tbaa !10 + store ptr %5, ptr %retval, align 8 + br label %return + +return: + %6 = load ptr, ptr %retval, align 8 + ret ptr %6 +} + +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +attributes #0 = { mustprogress "target-cpu" = "skylake-avx512" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } + +!6 = !{!7, !7, i64 
0} +!7 = !{!"int", !8, i64 0} +!8 = !{!"omnipotent char", !9, i64 0} +!9 = !{!"Simple C++ TBAA"} +!10 = !{!11, !11, i64 0} +!11 = !{!"any pointer", !8, i64 0} +!12 = distinct !{} +!13 = !{!14, !14, i64 0} +!14 = !{!"double", !8, i64 0} +!15 = distinct !{!15, !16, !17, !18} +!16 = !{!"llvm.loop.mustprogress"} +!17 = !{!"llvm.loop.parallel_accesses", !12} +!18 = !{!"llvm.loop.vectorize.enable", i1 true} + +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0} +; CHECK: [[META1]] = !{!"int", [[META2:![0-9]+]], i64 0} +; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0} +; CHECK: [[META3]] = !{!"Simple C++ TBAA"} +; CHECK: [[ACC_GRP4]] = distinct !{} +; CHECK: [[TBAA5]] = !{[[META6:![0-9]+]], [[META6]], i64 0} +; CHECK: [[META6]] = !{!"double", [[META2]], i64 0} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META8:![0-9]+]]} +; CHECK: [[META8]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META10:![0-9]+]], [[META11:![0-9]+]], [[META12:![0-9]+]]} +; CHECK: [[META10]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META11]] = !{!"llvm.loop.parallel_accesses", [[ACC_GRP4]]} +; CHECK: [[META12]] = !{!"llvm.loop.vectorize.enable", i1 true} +;. diff --git a/llvm/test/Transforms/Reassociate/preserve-debugloc.ll b/llvm/test/Transforms/Reassociate/preserve-debugloc.ll new file mode 100644 index 00000000000000..ff1f8ac73410a4 --- /dev/null +++ b/llvm/test/Transforms/Reassociate/preserve-debugloc.ll @@ -0,0 +1,38 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;; Tests that we preserve DebugLocs through reassociation of sub instructions. 
+; RUN: opt < %s -passes=reassociate -S | FileCheck %s + +define void @foo(i64 %0) { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: i64 [[TMP0:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DOTNEG:%.*]] = sub i64 0, [[TMP0]], !dbg [[DBG3:![0-9]+]] +; CHECK-NEXT: [[ADD_I_I:%.*]] = add i64 [[DOTNEG]], 1 +; CHECK-NEXT: store i64 [[ADD_I_I]], ptr null, align 8 +; CHECK-NEXT: ret void +; +entry: + %sub5.i.i = sub i64 1, %0, !dbg !4 + %add.i.i = add i64 %sub5.i.i, 0 + store i64 %add.i.i, ptr null, align 8 + ret void +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 20.0.0git", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.cpp", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 212, column: 25, scope: !5) +!5 = distinct !DISubprogram(name: "foo", scope: !0, file: !1, line: 161, type: !6, scopeLine: 162, unit: !0, retainedNodes: !2) +!6 = distinct !DISubroutineType(types: !2) +;. +; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +; CHECK: [[META1]] = !DIFile(filename: "test.cpp", directory: {{.*}}) +; CHECK: [[DBG3]] = !DILocation(line: 212, column: 25, scope: [[META4:![0-9]+]]) +; CHECK: [[META4]] = distinct !DISubprogram(name: "foo", scope: [[META0]], file: [[META1]], line: 161, type: [[META5:![0-9]+]], scopeLine: 162, spFlags: DISPFlagDefinition, unit: [[META0]], retainedNodes: [[META6:![0-9]+]]) +; CHECK: [[META5]] = distinct !DISubroutineType(types: [[META6]]) +; CHECK: [[META6]] = !{} +;. 
diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/base-atomicrmw.ll b/llvm/test/Transforms/RewriteStatepointsForGC/base-atomicrmw.ll new file mode 100644 index 00000000000000..9d33a5760ce619 --- /dev/null +++ b/llvm/test/Transforms/RewriteStatepointsForGC/base-atomicrmw.ll @@ -0,0 +1,14 @@ +; RUN: opt < %s -passes=rewrite-statepoints-for-gc -S 2>&1 | FileCheck %s + +define ptr addrspace(1) @test(ptr %a, ptr addrspace(1) %b) gc "statepoint-example" { +; CHECK-LABEL: @test +; CHECK-NEXT: [[RES:%.*]] = atomicrmw xchg ptr %a, ptr addrspace(1) %b seq_cst +; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(void ()) @foo, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[RES]]) ] +; CHECK-NEXT: [[RES_RELOCATED:%.*]] = call coldcc ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[STATEPOINT_TOKEN]], i32 0, i32 0) +; CHECK-NEXT: ret ptr addrspace(1) [[RES_RELOCATED]] + %res = atomicrmw xchg ptr %a, ptr addrspace(1) %b seq_cst + call void @foo() + ret ptr addrspace(1) %res +} + +declare void @foo() diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index 40ce26f8c4ed48..e1fc7e056e0978 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -801,6 +801,103 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +declare float @atan2f(float,float) readonly nounwind willreturn +define <4 x float> @atan2_4x(ptr %a, ptr %b) { +; CHECK-LABEL: @atan2_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; CHECK-NEXT: 
[[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @atan2_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %bb = load <4 x float>, ptr %b, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %vecextb = extractelement <4 x float> %bb, i32 0 + %1 = tail call fast float @atan2f(float %vecext, float %vecextb) + %vecins = insertelement <4 x float> poison, float %1, 
i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %vecextb.1 = extractelement <4 x float> %bb, i32 1 + %2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %vecextb.2 = extractelement <4 x float> %bb, i32 2 + %3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %vecextb.3 = extractelement <4 x float> %bb, i32 3 + %4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} +define <4 x float> @int_atan2_4x(ptr %a, ptr %b) { +; CHECK-LABEL: @int_atan2_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_atan2_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float 
[[VECEXT_1]], float [[VECEXTB_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %bb = load <4 x float>, ptr %b, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %vecextb = extractelement <4 x float> %bb, i32 0 + %1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb) + %vecins = insertelement <4 x float> poison, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %vecextb.1 = extractelement <4 x float> %bb, i32 1 + %2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %vecextb.2 = extractelement <4 x float> %bb, i32 2 + %3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %vecextb.3 = extractelement <4 x float> %bb, i32 3 + %4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @sinhf(float) readonly nounwind willreturn define <4 x float> @sinh_4x(ptr %a) { ; CHECK-LABEL: @sinh_4x( diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 285b85ed69be6b..058b68b8f6c78a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -801,6 +801,103 @@ entry: %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 ret <4 x float> %vecins.3 } +declare float @atan2f(float,float) readonly nounwind willreturn +define <4 x float> @atan2_4x(ptr %a, ptr %b) { +; CHECK-LABEL: @atan2_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @atan2_4x( +; NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; 
NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %bb = load <4 x float>, ptr %b, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %vecextb = extractelement <4 x float> %bb, i32 0 + %1 = tail call fast float @atan2f(float %vecext, float %vecextb) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %vecextb.1 = extractelement <4 x float> %bb, i32 1 + %2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %vecextb.2 = extractelement <4 x float> %bb, i32 2 + %3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %vecextb.3 = extractelement <4 x float> %bb, i32 3 + %4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} +define <4 x float> @int_atan2_4x(ptr %a, ptr %b) { +; CHECK-LABEL: @int_atan2_4x( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]]) +; CHECK-NEXT: ret <4 x float> [[TMP1]] +; +; NOACCELERATE-LABEL: @int_atan2_4x( +; 
NOACCELERATE-NEXT: entry: +; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16 +; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16 +; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 +; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0 +; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]]) +; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 +; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 +; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] +; +entry: + %0 = load <4 x float>, ptr %a, align 16 + %bb = load <4 x float>, ptr %b, align 16 + %vecext = extractelement <4 x float> %0, i32 0 + %vecextb = extractelement <4 x float> %bb, i32 0 + %1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb) + %vecins = insertelement <4 x float> undef, float %1, i32 0 + %vecext.1 = extractelement <4 x float> %0, i32 1 + %vecextb.1 = extractelement <4 x float> %bb, i32 1 + %2 = tail call fast float 
@llvm.atan2.f32(float %vecext.1, float %vecextb.1) + %vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1 + %vecext.2 = extractelement <4 x float> %0, i32 2 + %vecextb.2 = extractelement <4 x float> %bb, i32 2 + %3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2) + %vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2 + %vecext.3 = extractelement <4 x float> %0, i32 3 + %vecextb.3 = extractelement <4 x float> %bb, i32 3 + %4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3) + %vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3 + ret <4 x float> %vecins.3 +} declare float @sinhf(float) readonly nounwind willreturn define <4 x float> @sinh_4x(ptr %a) { ; CHECK-LABEL: @sinh_4x( diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll index 3d00ddf89aaa3b..b312688b7932dc 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec.ll @@ -94,3 +94,43 @@ entry: %23 = fcmp ogt <8 x float> zeroinitializer, %19 ret void } + +define void @test3(float %0) { +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY_LR_PH:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP1]], <2 x float> zeroinitializer, i64 2) +; CHECK-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP2]], [[FOR_BODY_LR_PH]] ], [ [[TMP10:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr null, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fcmp olt <2 x float> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x 
i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> poison, <2 x i1> splat (i1 true), i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i1> @llvm.vector.insert.v4i1.v2i1(<4 x i1> [[TMP6]], <2 x i1> [[TMP5]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP7]], <4 x float> [[TMP9]], <4 x float> [[TMP2]] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; +entry: + br label %for.body.lr.ph + +for.body.lr.ph: + br i1 false, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.body, %for.body.lr.ph + %1 = phi <2 x float> [ zeroinitializer, %for.body.lr.ph ], [ %5, %for.body ] + %2 = phi <2 x float> [ zeroinitializer, %for.body.lr.ph ], [ %6, %for.body ] + ret void + +for.body: + %3 = load <2 x float>, ptr null, align 4 + %4 = fcmp olt <2 x float> zeroinitializer, %3 + %5 = select <2 x i1> , <2 x float> %3, <2 x float> zeroinitializer + %6 = select <2 x i1> %4, <2 x float> %3, <2 x float> zeroinitializer + br label %for.cond.cleanup +} diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll index e56dbd75963f7a..49aeea9f8a8491 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll @@ -6,11 +6,7 @@ define void @store_load(ptr %ptr) { ; CHECK-SAME: ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 -; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: store float [[LD0]], ptr [[PTR0]], align 4 -; CHECK-NEXT: store float [[LD1]], ptr 
[[PTR1]], align 4 ; CHECK-NEXT: store <2 x float> [[VECL]], ptr [[PTR0]], align 4 ; CHECK-NEXT: ret void ; @@ -31,14 +27,8 @@ define void @store_fpext_load(ptr %ptr) { ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[PTRD0:%.*]] = getelementptr double, ptr [[PTR]], i32 0 ; CHECK-NEXT: [[PTRD1:%.*]] = getelementptr double, ptr [[PTR]], i32 1 -; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[FPEXT0:%.*]] = fpext float [[LD0]] to double -; CHECK-NEXT: [[FPEXT1:%.*]] = fpext float [[LD1]] to double ; CHECK-NEXT: [[VCAST:%.*]] = fpext <2 x float> [[VECL]] to <2 x double> -; CHECK-NEXT: store double [[FPEXT0]], ptr [[PTRD0]], align 8 -; CHECK-NEXT: store double [[FPEXT1]], ptr [[PTRD1]], align 8 ; CHECK-NEXT: store <2 x double> [[VCAST]], ptr [[PTRD0]], align 8 ; CHECK-NEXT: ret void ; @@ -62,20 +52,10 @@ define void @store_fcmp_zext_load(ptr %ptr) { ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[PTRB0:%.*]] = getelementptr i32, ptr [[PTR]], i32 0 ; CHECK-NEXT: [[PTRB1:%.*]] = getelementptr i32, ptr [[PTR]], i32 1 -; CHECK-NEXT: [[LDB0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LDB1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LDA0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LDA1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[FCMP0:%.*]] = fcmp ogt float [[LDA0]], [[LDB0]] -; CHECK-NEXT: [[FCMP1:%.*]] = fcmp ogt float [[LDA1]], [[LDB1]] ; CHECK-NEXT: [[VCMP:%.*]] = fcmp ogt <2 x float> [[VECL]], [[VECL1]] -; CHECK-NEXT: [[ZEXT0:%.*]] = zext i1 [[FCMP0]] to i32 -; CHECK-NEXT: [[ZEXT1:%.*]] = zext i1 [[FCMP1]] to i32 ; CHECK-NEXT: [[VCAST:%.*]] = 
zext <2 x i1> [[VCMP]] to <2 x i32> -; CHECK-NEXT: store i32 [[ZEXT0]], ptr [[PTRB0]], align 4 -; CHECK-NEXT: store i32 [[ZEXT1]], ptr [[PTRB1]], align 4 ; CHECK-NEXT: store <2 x i32> [[VCAST]], ptr [[PTRB0]], align 4 ; CHECK-NEXT: ret void ; @@ -101,17 +81,9 @@ define void @store_fadd_load(ptr %ptr) { ; CHECK-SAME: ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 -; CHECK-NEXT: [[LDA0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LDA1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LDB0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LDB1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL1:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[FADD0:%.*]] = fadd float [[LDA0]], [[LDB0]] -; CHECK-NEXT: [[FADD1:%.*]] = fadd float [[LDA1]], [[LDB1]] ; CHECK-NEXT: [[VEC:%.*]] = fadd <2 x float> [[VECL]], [[VECL1]] -; CHECK-NEXT: store float [[FADD0]], ptr [[PTR0]], align 4 -; CHECK-NEXT: store float [[FADD1]], ptr [[PTR1]], align 4 ; CHECK-NEXT: store <2 x float> [[VEC]], ptr [[PTR0]], align 4 ; CHECK-NEXT: ret void ; @@ -133,14 +105,8 @@ define void @store_fneg_load(ptr %ptr) { ; CHECK-SAME: ptr [[PTR:%.*]]) { ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 -; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[PTR1]], align 4 ; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 -; CHECK-NEXT: [[FNEG0:%.*]] = fneg float [[LD0]] -; CHECK-NEXT: [[FNEG1:%.*]] = fneg float [[LD1]] ; CHECK-NEXT: [[VEC:%.*]] = fneg <2 x float> [[VECL]] -; CHECK-NEXT: store float [[FNEG0]], ptr [[PTR0]], align 4 -; CHECK-NEXT: store float [[FNEG1]], ptr [[PTR1]], align 4 ; CHECK-NEXT: store <2 x float> 
[[VEC]], ptr [[PTR0]], align 4 ; CHECK-NEXT: ret void ; @@ -155,3 +121,25 @@ define void @store_fneg_load(ptr %ptr) { ret void } +define float @scalars_with_external_uses_not_dead(ptr %ptr) { +; CHECK-LABEL: define float @scalars_with_external_uses_not_dead( +; CHECK-SAME: ptr [[PTR:%.*]]) { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr float, ptr [[PTR]], i32 0 +; CHECK-NEXT: [[PTR1:%.*]] = getelementptr float, ptr [[PTR]], i32 1 +; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[PTR0]], align 4 +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[PTR1]], align 4 +; CHECK-NEXT: [[VECL:%.*]] = load <2 x float>, ptr [[PTR0]], align 4 +; CHECK-NEXT: store <2 x float> [[VECL]], ptr [[PTR0]], align 4 +; CHECK-NEXT: [[USER:%.*]] = fneg float [[LD1]] +; CHECK-NEXT: ret float [[LD0]] +; + %ptr0 = getelementptr float, ptr %ptr, i32 0 + %ptr1 = getelementptr float, ptr %ptr, i32 1 + %ld0 = load float, ptr %ptr0 + %ld1 = load float, ptr %ptr1 + store float %ld0, ptr %ptr0 + store float %ld1, ptr %ptr1 + %user = fneg float %ld1 + ret float %ld0 +} + diff --git a/llvm/test/Transforms/VectorCombine/X86/pr115575.ll b/llvm/test/Transforms/VectorCombine/X86/pr115575.ll new file mode 100644 index 00000000000000..dd8e27727dd3c0 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/pr115575.ll @@ -0,0 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- | FileCheck %s + +define <2 x i8> @PR115575(i8 %x) { +; CHECK-LABEL: define <2 x i8> @PR115575( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[INS:%.*]] = insertelement <2 x i8> poison, i8 [[X]], i32 3 +; CHECK-NEXT: [[BO:%.*]] = sdiv <2 x i8> [[INS]], +; CHECK-NEXT: ret <2 x i8> [[BO]] +; + %ins = insertelement <2 x i8> poison, i8 %x, i32 3 + %bo = sdiv <2 x i8> %ins, + ret <2 x i8> %bo +} diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/05-dwarf-incorrect-lexical-scope-variable.test 
b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/05-dwarf-incorrect-lexical-scope-variable.test index 5453a46fb542dd..e1ac7588f1d8c4 100644 --- a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/05-dwarf-incorrect-lexical-scope-variable.test +++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/05-dwarf-incorrect-lexical-scope-variable.test @@ -50,7 +50,7 @@ ; ONE-NEXT: [003] 3 {Variable} 'Var_1' -> 'int' ; ONE-NEXT: [002] 11 {Function} extern not_inlined 'test' -> 'int' ; ONE-NEXT: [003] 12 {Variable} 'A' -> 'int' -; ONE-NEXT: [003] 14 {InlinedFunction} not_inlined 'InlineFunction' -> 'int' +; ONE-NEXT: [003] 13 {InlinedFunction} not_inlined 'InlineFunction' -> 'int' ; ONE-NEXT: [004] {Block} ; ONE-NEXT: [005] {Variable} 'Var_2' -> 'int' ; ONE-NEXT: [004] {Parameter} 'Param' -> 'int' diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/Inputs/ThreadLocalStorage.ll b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/Inputs/ThreadLocalStorage.ll new file mode 100644 index 00000000000000..45b7574f1843e9 --- /dev/null +++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/Inputs/ThreadLocalStorage.ll @@ -0,0 +1,42 @@ +source_filename = "ThreadLocalStorage.cpp" +target triple = "x86_64-pc-linux-gnu" + +@TGlobal = dso_local thread_local global i32 0, align 4, !dbg !0 +@NGlobal = dso_local global i32 1, align 4, !dbg !5 +@_ZZ4testvE6TLocal = internal thread_local global i32 0, align 4, !dbg !8 + +define dso_local void @_Z4testv() !dbg !10 { +entry: + %NLocal = alloca i32, align 4 + %0 = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @TGlobal), !dbg !22 + store i32 1, ptr %0, align 4 + #dbg_declare(ptr %NLocal, !24, !DIExpression(), !25) + store i32 0, ptr %NLocal, align 4, !dbg !25 + store i32 2, ptr @NGlobal, align 4 + ret void +} + +declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!14, !15} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: 
"TGlobal", scope: !2, file: !3, line: 1, type: !7, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, emissionKind: FullDebug, globals: !4) +!3 = !DIFile(filename: "ThreadLocalStorage.cpp", directory: "") +!4 = !{!0, !5, !8} +!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression()) +!6 = distinct !DIGlobalVariable(name: "NGlobal", scope: !2, file: !3, line: 2, type: !7, isLocal: false, isDefinition: true) +!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression()) +!9 = distinct !DIGlobalVariable(name: "TLocal", scope: !10, file: !3, line: 4, type: !7, isLocal: true, isDefinition: true) +!10 = distinct !DISubprogram(name: "test", scope: !3, file: !3, line: 3, type: !11, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !2, retainedNodes: !13) +!11 = !DISubroutineType(types: !12) +!12 = !{null} +!13 = !{} +!14 = !{i32 7, !"Dwarf Version", i32 5} +!15 = !{i32 2, !"Debug Info Version", i32 3} +!22 = !DILocation(line: 5, scope: !10) +!24 = !DILocalVariable(name: "NLocal", scope: !10, file: !3, line: 7, type: !7) +!25 = !DILocation(line: 7, scope: !10) diff --git a/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/crash-thread-local-storage.test b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/crash-thread-local-storage.test new file mode 100644 index 00000000000000..5f2b3f9e824ab7 --- /dev/null +++ b/llvm/test/tools/llvm-debuginfo-analyzer/DWARF/crash-thread-local-storage.test @@ -0,0 +1,51 @@ +; REQUIRES: x86-registered-target + +; For the given test case: + +; // ThreadLocalStorage.cpp +; 1 thread_local int TGlobal = 0; +; 2 int NGlobal = 1; +; 3 void test() { +; 4 thread_local int TLocal = 0; +; 5 TGlobal = 1; +; 6 +; 7 int NLocal = 0; +; 8 NGlobal = 2; +; 9 } + +; The llvm-debuginfo-analyzer crashes when producing a logical view for +; the object file generated using the following commands: +; +; clang++ 
-Xclang -disable-O0-optnone -Xclang -disable-llvm-passes +; -fno-discard-value-names -emit-llvm -S -g -O0 +; ThreadLocalStorage.cpp -o ThreadLocalStorage.ll +; llc --filetype=obj ThreadLocalStorage.ll -o ThreadLocalStorage.o +; +; llvm-debuginfo-analyzer --attribute=location --print=symbols +; ThreadLocalStorage.o + +; RUN: llc --filetype=obj \ +; RUN: %p/Inputs/ThreadLocalStorage.ll -o %t.ThreadLocalStorage.o + +; RUN: llvm-debuginfo-analyzer --attribute=location \ +; RUN: --print=symbols \ +; RUN: %t.ThreadLocalStorage.o 2>&1 | \ +; RUN: FileCheck --strict-whitespace %s + +; CHECK: Logical View: +; CHECK: {File} '{{.*}}threadlocalstorage.o' +; CHECK-EMPTY: +; CHECK: {CompileUnit} 'threadlocalstorage.cpp' +; CHECK: 1 {Variable} extern 'TGlobal' -> 'int' +; CHECK: {Location} +; CHECK: {Entry} const_u 0, gnu_push_tls_address +; CHECK: 2 {Variable} extern 'NGlobal' -> 'int' +; CHECK: {Location} +; CHECK: {Entry} addrx 0 +; CHECK: 3 {Function} extern not_inlined 'test' -> 'void' +; CHECK: 4 {Variable} 'TLocal' -> 'int' +; CHECK: {Location} +; CHECK: {Entry} const_u 0, gnu_push_tls_address +; CHECK: 7 {Variable} 'NLocal' -> 'int' +; CHECK: {Location} +; CHECK: {Entry} fbreg -4 diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 540c43889da758..dd275b73a0c7ec 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -24,6 +24,7 @@ #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/ExecutionEngine/ObjectCache.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/Debugging/DebuggerSupport.h" #include "llvm/ExecutionEngine/Orc/EPCDynamicLibrarySearchGenerator.h" diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index cc144cda18e1a3..f6b631834b1e34 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -15,6 +15,7 @@ 
#include "llvm-jitlink.h" #include "llvm/BinaryFormat/Magic.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX, LLVM_ENABLE_THREADS +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/COFFPlatform.h" #include "llvm/ExecutionEngine/Orc/COFFVCRuntimeSupport.h" #include "llvm/ExecutionEngine/Orc/DebugObjectManagerPlugin.h" @@ -1740,7 +1741,7 @@ static Error addSectCreates(Session &S, StringRef SCArg(*SCItr); - auto [SectAndFileName, ExtraSymbolsString] = SCArg.split('@'); + auto [SectAndFileName, ExtraSymbolsString] = SCArg.rsplit('@'); auto [SectName, FileName] = SectAndFileName.rsplit(','); if (SectName.empty()) return make_error("In -sectcreate=" + SCArg + diff --git a/llvm/tools/llvm-pdbutil/PdbYaml.h b/llvm/tools/llvm-pdbutil/PdbYaml.h index 4382e91e209737..21658e9d0e75e4 100644 --- a/llvm/tools/llvm-pdbutil/PdbYaml.h +++ b/llvm/tools/llvm-pdbutil/PdbYaml.h @@ -111,16 +111,16 @@ struct PdbObject { } } -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbObject) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::MSFHeaders) -LLVM_YAML_DECLARE_MAPPING_TRAITS(msf::SuperBlock) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::StreamBlockList) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbInfoStream) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiStream) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbTpiStream) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbPublicsStream) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::NamedStreamMapping) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbModiStream) -LLVM_YAML_DECLARE_MAPPING_TRAITS(pdb::yaml::PdbDbiModuleInfo) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbObject) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::MSFHeaders) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(msf::SuperBlock) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::StreamBlockList) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbInfoStream) 
+LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbDbiStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbTpiStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbPublicsStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::NamedStreamMapping) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbModiStream) +LLVM_YAML_DECLARE_MAPPING_TRAITS_PRIVATE(pdb::yaml::PdbDbiModuleInfo) #endif // LLVM_TOOLS_LLVMPDBDUMP_PDBYAML_H diff --git a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp index 39a49eb1799936..a907dfcf2cec5b 100644 --- a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp @@ -9,6 +9,7 @@ #include "OrcTestCommon.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Config/llvm-config.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/Shared/OrcError.h" #include "llvm/Testing/Support/Error.h" diff --git a/llvm/unittests/ExecutionEngine/Orc/ExecutionSessionWrapperFunctionCallsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ExecutionSessionWrapperFunctionCallsTest.cpp index 1b79e12ee168c8..a25da8a727de54 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ExecutionSessionWrapperFunctionCallsTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ExecutionSessionWrapperFunctionCallsTest.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h" #include "llvm/Support/MSVCErrorWorkarounds.h" diff --git a/llvm/unittests/ExecutionEngine/Orc/LookupAndRecordAddrsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LookupAndRecordAddrsTest.cpp index 05ea919d4131c9..f62151de77a4bc 100644 --- a/llvm/unittests/ExecutionEngine/Orc/LookupAndRecordAddrsTest.cpp 
+++ b/llvm/unittests/ExecutionEngine/Orc/LookupAndRecordAddrsTest.cpp @@ -6,9 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "OrcTestCommon.h" - #include "llvm/ExecutionEngine/Orc/LookupAndRecordAddrs.h" +#include "OrcTestCommon.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/Support/MSVCErrorWorkarounds.h" #include "llvm/Testing/Support/Error.h" diff --git a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp index 20db572417d5f6..083a924ce9aa16 100644 --- a/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp +++ b/llvm/unittests/ExecutionEngine/Orc/ReOptimizeLayerTest.cpp @@ -1,6 +1,7 @@ #include "llvm/ExecutionEngine/Orc/ReOptimizeLayer.h" #include "OrcTestCommon.h" #include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h" +#include "llvm/ExecutionEngine/Orc/AbsoluteSymbols.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 4bdb6ee5cf1825..d69b2d6b13b1a6 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1297,7 +1297,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_SIMD, AArch64::AEK_FP16, AArch64::AEK_FP16FML, AArch64::AEK_PROFILE, AArch64::AEK_RAS, AArch64::AEK_SVE, - AArch64::AEK_SVE2, AArch64::AEK_ALIAS_SVE2AES, + AArch64::AEK_SVE2, AArch64::AEK_SVE2AES, AArch64::AEK_SVE2SM4, AArch64::AEK_SVE2SHA3, AArch64::AEK_SVE2BITPERM, AArch64::AEK_RCPC, AArch64::AEK_RAND, AArch64::AEK_MTE, @@ -1334,7 +1334,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_FPRCVT, AArch64::AEK_CMPBR, AArch64::AEK_LSUI, AArch64::AEK_OCCMO, AArch64::AEK_PCDPHINT, AArch64::AEK_POPS, - 
AArch64::AEK_SVEAES}; + }; std::vector Features; @@ -1369,7 +1369,6 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sve-bfscale")); EXPECT_TRUE(llvm::is_contained(Features, "+sve-f16f32mm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2")); - EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3")); @@ -1539,7 +1538,6 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sve-bfscale", "nosve-bfscale", "+sve-bfscale", "-sve-bfscale"}, {"sve-f16f32mm", "nosve-f16f32mm", "+sve-f16f32mm", "-sve-f16f32mm"}, {"sve2", "nosve2", "+sve2", "-sve2"}, - {"sve-aes", "nosve-aes", "+sve-aes", "-sve-aes"}, {"sve2-aes", "nosve2-aes", "+sve2-aes", "-sve2-aes"}, {"sve2-sm4", "nosve2-sm4", "+sve2-sm4", "-sve2-sm4"}, {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"}, @@ -1842,11 +1840,7 @@ AArch64ExtensionDependenciesBaseArchTestParams {}, {"sve", "sve-f16f32mm"}}, - // aes -> {sve-aes} - {AArch64::ARMV8A, {"noaes", "sve-aes"}, {"aes", "sve-aes"}, {}}, - {AArch64::ARMV8A, {"sve-aes", "noaes"}, {}, {"aes", "sve-aes"}}, - - // sve2 -> {sve2p1, sve2-bitperm, sve2-sha3, sve2-sm4, sve2-aes} + // sve2 -> {sve2p1, sve2-bitperm, sve2-sha3, sve2-sm4} {AArch64::ARMV8A, {"nosve2", "sve2p1"}, {"sve2", "sve2p1"}, {}}, {AArch64::ARMV8A, {"sve2p1", "nosve2"}, {}, {"sve2", "sve2p1"}}, {AArch64::ARMV8A, @@ -1861,8 +1855,6 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}}, {AArch64::ARMV8A, {"sve2-sm4", "nosve2"}, {}, {"sve2", "sve2-sm4"}}, - {AArch64::ARMV8A, {"nosve2", "sve2-aes"}, {"sve2", "sve2-aes"}, {}}, - {AArch64::ARMV8A, {"sve2-aes", "nosve2"}, {}, {"sve2", "sve2-aes"}}, // sve-b16b16 -> {sme-b16b16} {AArch64::ARMV9_4A, @@ -1963,23 
+1955,16 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"norcpc", "rcpc3"}, {"rcpc", "rcpc3"}, {}}, {AArch64::ARMV8A, {"rcpc3", "norcpc"}, {}, {"rcpc", "rcpc3"}}, - // sve-aes -> {ssve-aes, sve2-aes} + // sve2-aes -> ssve-aes {AArch64::ARMV9_6A, - {"nosve-aes", "ssve-aes"}, - {"sve-aes", "ssve-aes"}, + {"nosve2-aes", "ssve-aes"}, + {"sve2-aes", "ssve-aes"}, {}}, {AArch64::ARMV9_6A, - {"ssve-aes", "nosve-aes"}, + {"ssve-aes", "nosve2-aes"}, {}, - {"ssve-aes", "sve-aes"}}, - {AArch64::ARMV9_6A, - {"nosve-aes", "sve2-aes"}, - {"sve2-aes", "sve-aes"}, - {}}, - {AArch64::ARMV9_6A, - {"sve2-aes", "nosve-aes"}, - {}, - {"sve2-aes", "sve-aes"}}}; + {"ssve-aes", "sve2-aes"}}, +}; INSTANTIATE_TEST_SUITE_P( AArch64ExtensionDependenciesBaseArch, diff --git a/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp b/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp index a510a57099aba4..c864b06e991dc3 100644 --- a/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp +++ b/llvm/unittests/Transforms/Instrumentation/MemProfUseTest.cpp @@ -101,4 +101,123 @@ declare !dbg !19 void @_Z2f3v() EXPECT_THAT(CallSites[2], Pair(FieldsAre(2U, 9U), IndexedMemProfRecord::getGUID("_Z2f3v"))); } + +TEST(MemProf, ExtractDirectCallsFromIRInline) { + // The following IR is generated from: + // + // void f1(); + // static inline void f2() { + // // For an interesting line number. + // f1(); + // } + // static inline void f3() { + // /****/ f2(); // For an interesting column number. + // } + // + // void g1(); + // void g2(); + // static inline void g3() { + // /**/ g1(); // For an interesting column number. + // g2(); + // } + // + // void foo() { + // f3(); + // /***/ g3(); // For an interesting column number. 
+ // } + StringRef IR = R"IR( +define dso_local void @_Z3foov() local_unnamed_addr !dbg !10 { +entry: + tail call void @_Z2f1v(), !dbg !13 + tail call void @_Z2g1v(), !dbg !18 + tail call void @_Z2g2v(), !dbg !21 + ret void, !dbg !22 +} + +declare !dbg !23 void @_Z2f1v() local_unnamed_addr + +declare !dbg !24 void @_Z2g1v() local_unnamed_addr + +declare !dbg !25 void @_Z2g2v() local_unnamed_addr + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} +!llvm.ident = !{!9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "foobar.cc", directory: "/") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 1, !"MemProfProfileFilename", !"memprof.profraw"} +!6 = !{i32 8, !"PIC Level", i32 2} +!7 = !{i32 7, !"PIE Level", i32 2} +!8 = !{i32 7, !"uwtable", i32 2} +!9 = !{!"clang"} +!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!11 = !DISubroutineType(types: !12) +!12 = !{} +!13 = !DILocation(line: 4, column: 3, scope: !14, inlinedAt: !15) +!14 = distinct !DISubprogram(name: "f2", linkageName: "_ZL2f2v", scope: !1, file: !1, line: 2, type: !11, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!15 = distinct !DILocation(line: 7, column: 10, scope: !16, inlinedAt: !17) +!16 = distinct !DISubprogram(name: "f3", linkageName: "_ZL2f3v", scope: !1, file: !1, line: 6, type: !11, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | 
DISPFlagOptimized, unit: !0) +!17 = distinct !DILocation(line: 18, column: 3, scope: !10) +!18 = !DILocation(line: 13, column: 8, scope: !19, inlinedAt: !20) +!19 = distinct !DISubprogram(name: "g3", linkageName: "_ZL2g3v", scope: !1, file: !1, line: 12, type: !11, scopeLine: 12, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0) +!20 = distinct !DILocation(line: 19, column: 9, scope: !10) +!21 = !DILocation(line: 14, column: 3, scope: !19, inlinedAt: !20) +!22 = !DILocation(line: 20, column: 1, scope: !10) +!23 = !DISubprogram(name: "f1", linkageName: "_Z2f1v", scope: !1, file: !1, line: 1, type: !11, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!24 = !DISubprogram(name: "g1", linkageName: "_Z2g1v", scope: !1, file: !1, line: 10, type: !11, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!25 = !DISubprogram(name: "g2", linkageName: "_Z2g2v", scope: !1, file: !1, line: 11, type: !11, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +)IR"; + + LLVMContext Ctx; + SMDiagnostic Err; + std::unique_ptr M = parseAssemblyString(IR, Err, Ctx); + ASSERT_TRUE(M); + + auto Calls = extractCallsFromIR(*M); + + // Expect exactly 4 callers. + ASSERT_THAT(Calls, SizeIs(4)); + + // Verify each key-value pair. 
+ + auto FooIt = Calls.find(IndexedMemProfRecord::getGUID("_Z3foov")); + ASSERT_NE(FooIt, Calls.end()); + const auto &[FooCallerGUID, FooCallSites] = *FooIt; + EXPECT_EQ(FooCallerGUID, IndexedMemProfRecord::getGUID("_Z3foov")); + ASSERT_THAT(FooCallSites, SizeIs(2)); + EXPECT_THAT(FooCallSites[0], Pair(FieldsAre(1U, 3U), + IndexedMemProfRecord::getGUID("_ZL2f3v"))); + EXPECT_THAT(FooCallSites[1], Pair(FieldsAre(2U, 9U), + IndexedMemProfRecord::getGUID("_ZL2g3v"))); + + auto F2It = Calls.find(IndexedMemProfRecord::getGUID("_ZL2f2v")); + ASSERT_NE(F2It, Calls.end()); + const auto &[F2CallerGUID, F2CallSites] = *F2It; + EXPECT_EQ(F2CallerGUID, IndexedMemProfRecord::getGUID("_ZL2f2v")); + ASSERT_THAT(F2CallSites, SizeIs(1)); + EXPECT_THAT(F2CallSites[0], + Pair(FieldsAre(2U, 3U), IndexedMemProfRecord::getGUID("_Z2f1v"))); + + auto F3It = Calls.find(IndexedMemProfRecord::getGUID("_ZL2f3v")); + ASSERT_NE(F3It, Calls.end()); + const auto &[F3CallerGUID, F3CallSites] = *F3It; + EXPECT_EQ(F3CallerGUID, IndexedMemProfRecord::getGUID("_ZL2f3v")); + ASSERT_THAT(F3CallSites, SizeIs(1)); + EXPECT_THAT(F3CallSites[0], Pair(FieldsAre(1U, 10U), + IndexedMemProfRecord::getGUID("_ZL2f2v"))); + + auto G3It = Calls.find(IndexedMemProfRecord::getGUID("_ZL2g3v")); + ASSERT_NE(G3It, Calls.end()); + const auto &[G3CallerGUID, G3CallSites] = *G3It; + EXPECT_EQ(G3CallerGUID, IndexedMemProfRecord::getGUID("_ZL2g3v")); + ASSERT_THAT(G3CallSites, SizeIs(2)); + EXPECT_THAT(G3CallSites[0], + Pair(FieldsAre(1U, 8U), IndexedMemProfRecord::getGUID("_Z2g1v"))); + EXPECT_THAT(G3CallSites[1], + Pair(FieldsAre(2U, 3U), IndexedMemProfRecord::getGUID("_Z2g2v"))); +} } // namespace diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 4a8615cc086b08..3179cfc676ab67 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -731,10 +731,10 @@ TEST(VPBasicBlockTest, print) { } 
LLVMContext C; - auto *ScalarHeader = BasicBlock::Create(C, ""); - // FIXME: This looks wrong. - auto ScalarHeaderVPBB = std::make_unique(ScalarHeader); - VPlan Plan(VPBB0, TC, VPBB1, ScalarHeaderVPBB.get()); + auto *ScalarHeader = BasicBlock::Create(C, "scalar.header"); + auto * ScalarHeaderVPBB = new VPIRBasicBlock(ScalarHeader); + VPBlockUtils::connectBlocks(VPBB2, ScalarHeaderVPBB); + VPlan Plan(VPBB0, TC, VPBB1, ScalarHeaderVPBB); std::string FullDump; raw_string_ostream OS(FullDump); Plan.printDOT(OS); @@ -761,6 +761,11 @@ compound=true "bb2:\l" + " EMIT vp\<%5\> = mul vp\<%3\>, vp\<%2\>\l" + " EMIT ret vp\<%5\>\l" + + "Successor(s): ir-bb\\l" + ] + N2 -> N3 [ label=""] + N3 [label = + "ir-bb\:\l" + "No successors\l" ] } @@ -782,7 +787,7 @@ Successor(s): bb2 const char *ExpectedBlock2Str = R"(bb2: EMIT vp<%5> = mul vp<%3>, vp<%2> EMIT ret vp<%5> -No successors +Successor(s): ir-bb )"; std::string Block2Dump; raw_string_ostream OS2(Block2Dump); diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp index f17c62dd1fd9d4..c8186d6e69523f 100644 --- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp @@ -1838,7 +1838,7 @@ MVT::SimpleValueType SDNodeInfo::getKnownType(unsigned ResNo) const { static unsigned GetNumNodeResults(const Record *Operator, CodeGenDAGPatterns &CDP) { - if (Operator->getName() == "set" || Operator->getName() == "implicit") + if (Operator->getName() == "set") return 0; // All return nothing. 
if (Operator->isSubClassOf("Intrinsic")) @@ -2945,8 +2945,7 @@ TreePatternNodePtr TreePattern::ParseTreePattern(const Init *TheInit, !Operator->isSubClassOf("Instruction") && !Operator->isSubClassOf("SDNodeXForm") && !Operator->isSubClassOf("Intrinsic") && - !Operator->isSubClassOf("ComplexPattern") && - Operator->getName() != "set" && Operator->getName() != "implicit") + !Operator->isSubClassOf("ComplexPattern") && Operator->getName() != "set") error("Unrecognized node '" + Operator->getName() + "'!"); // Check to see if this is something that is illegal in an input pattern. @@ -3456,21 +3455,6 @@ void CodeGenDAGPatterns::FindPatternInputsAndOutputs( return; } - if (Pat->getOperator()->getName() == "implicit") { - for (unsigned i = 0, e = Pat->getNumChildren(); i != e; ++i) { - TreePatternNode &Dest = Pat->getChild(i); - if (!Dest.isLeaf()) - I.error("implicitly defined value should be a register!"); - - const DefInit *Val = dyn_cast(Dest.getLeafValue()); - if (!Val || !Val->getDef()->isSubClassOf("Register")) - I.error("implicitly defined value should be a register!"); - if (Val) - InstImpResults.push_back(Val->getDef()); - } - return; - } - if (Pat->getOperator()->getName() != "set") { // If this is not a set, verify that the children nodes are not void typed, // and recurse. diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 09c1ee4fd0f3c2..6a793c09394d02 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -902,8 +902,6 @@ void MatcherGen::EmitResultInstructionAsOperand( // If this is the root instruction of a pattern that has physical registers in // its result pattern, add output VTs for them. For example, X86 has: // (set AL, (mul ...)) - // This also handles implicit results like: - // (implicit EFLAGS) if (isRoot && !Pattern.getDstRegs().empty()) { // If the root came from an implicit def in the instruction handling stuff, // don't re-add it. 
@@ -1038,7 +1036,7 @@ void MatcherGen::EmitResultCode() { // unsigned NumSrcResults = Pattern.getSrcPattern().getNumTypes(); - // If the pattern also has (implicit) results, count them as well. + // If the pattern also has implicit results, count them as well. if (!Pattern.getDstRegs().empty()) { // If the root came from an implicit def in the instruction handling stuff, // don't re-add it. diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 1e5d28f8ce95b7..a3344718cb3626 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -1015,6 +1015,15 @@ Error GlobalISelEmitter::importChildMatcher( return Error::success(); } } + } else if (auto *ChildDefInit = dyn_cast(SrcChild.getLeafValue())) { + auto *ChildRec = ChildDefInit->getDef(); + if (ChildRec->isSubClassOf("ValueType") && !SrcChild.hasName()) { + // An unnamed ValueType as in (sext_inreg GPR:$foo, i8). GISel represents + // this as a literal constant with the scalar size. 
+ MVT::SimpleValueType VT = llvm::getValueType(ChildRec); + OM.addPredicate(MVT(VT).getScalarSizeInBits()); + return Error::success(); + } } // Immediate arguments have no meaningful type to check as they don't have @@ -2109,9 +2118,9 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { return failedImport( "Cannot infer register class for SUBREG_TO_REG operand #0"); MatchedRC = *MaybeRegClass; - } else if (MatchedRC.get()->isSubClassOf("RegisterOperand")) - MatchedRC = MatchedRC.get()->getValueAsDef("RegClass"); - else if (!MatchedRC.get()->isSubClassOf("RegisterClass")) + } else if (cast(MatchedRC)->isSubClassOf("RegisterOperand")) + MatchedRC = cast(MatchedRC)->getValueAsDef("RegClass"); + else if (!cast(MatchedRC)->isSubClassOf("RegisterClass")) return failedImport("Dst MI def isn't a register class" + to_string(Dst)); OperandMatcher &OM = InsnMatcher.getOperand(OpIdx); @@ -2121,10 +2130,10 @@ Expected GlobalISelEmitter::runOnPattern(const PatternToMatch &P) { // GIM_CheckIsSameOperand predicates by the defineOperand method. 
OM.setSymbolicName(getMangledRootDefName(DstIOperand.Name)); M.defineOperand(OM.getSymbolicName(), OM); - if (MatchedRC.is()) - MatchedRC = &Target.getRegisterClass(MatchedRC.get()); + if (auto *R = dyn_cast(MatchedRC)) + MatchedRC = &Target.getRegisterClass(R); OM.addPredicate( - *MatchedRC.get()); + *cast(MatchedRC)); ++OpIdx; } diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index 047126a58a89d1..4792cc1567df8c 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -131,6 +131,8 @@ copy("Headers") { "amxfp16intrin.h", "amxfp8intrin.h", "amxintrin.h", + "amxtf32intrin.h", + "amxtf32transposeintrin.h", "amxtransposeintrin.h", "arm64intr.h", "arm_acle.h", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index dc62280d12c666..13a3a15b858775 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -810,12 +810,10 @@ if (current_toolchain == default_toolchain) { "__tuple/tuple_like_no_subrange.h", "__tuple/tuple_size.h", "__tuple/tuple_types.h", - "__type_traits/add_const.h", - "__type_traits/add_cv.h", + "__type_traits/add_cv_quals.h", "__type_traits/add_lvalue_reference.h", "__type_traits/add_pointer.h", "__type_traits/add_rvalue_reference.h", - "__type_traits/add_volatile.h", "__type_traits/aligned_storage.h", "__type_traits/aligned_union.h", "__type_traits/alignment_of.h", diff --git a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn index 25436c3cdb47d5..b18a8eb5995bb7 100644 --- a/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/ExecutionEngine/Orc/BUILD.gn @@ -15,6 +15,7 @@ static_library("Orc") { "//llvm/lib/WindowsDriver", ] sources = [ + "AbsoluteSymbols.cpp", 
"COFFPlatform.cpp", "COFFVCRuntimeSupport.cpp", "CompileOnDemandLayer.cpp", diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 92fd6e99338ae2..1dd9b9a440ecc8 100644 --- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -1158,7 +1158,7 @@ def AffineLinearizeIndexOp : Affine_Op<"linearize_index", let assemblyFormat = [{ (`disjoint` $disjoint^)? ` ` - `[` $multi_index `]` `by` ` ` + `[` $multi_index `]` `by` custom($dynamic_basis, $static_basis, "::mlir::AsmParser::Delimiter::Paren") attr-dict `:` type($linear_index) }]; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td index 12c430df208925..352e2ec91bdbea 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMInterfaces.td @@ -114,6 +114,33 @@ def ExactFlagInterface : OpInterface<"ExactFlagInterface"> { ]; } +def NonNegFlagInterface : OpInterface<"NonNegFlagInterface"> { + let description = [{ + This interface defines an LLVM operation with an nneg flag and + provides a uniform API for accessing it. + }]; + + let cppNamespace = "::mlir::LLVM"; + + let methods = [ + InterfaceMethod<[{ + Get the nneg flag for the operation. + }], "bool", "getNonNeg", (ins), [{}], [{ + return $_op.getProperties().nonNeg; + }]>, + InterfaceMethod<[{ + Set the nneg flag for the operation. + }], "void", "setNonNeg", (ins "bool":$nonNeg), [{}], [{ + $_op.getProperties().nonNeg = nonNeg; + }]>, + StaticInterfaceMethod<[{ + Get the attribute name of the nonNeg property. + }], "StringRef", "getNonNegName", (ins), [{}], [{ + return "nonNeg"; + }]>, + ]; +} + def BranchWeightOpInterface : OpInterface<"BranchWeightOpInterface"> { let description = [{ An interface for operations that can carry branch weights metadata. 
It diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 315af2594047a5..81e9f69f15acf6 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -508,6 +508,40 @@ class LLVM_CastOp traits = []> : + LLVM_Op], traits)>, + LLVM_Builder<"$res = builder.Create" # instName # "($arg, $_resultType, /*Name=*/\"\", op.getNonNeg());"> { + let arguments = (ins type:$arg, UnitAttr:$nonNeg); + let results = (outs resultType:$res); + let builders = [LLVM_OneResultOpBuilder]; + let assemblyFormat = "(`nneg` $nonNeg^)? $arg attr-dict `:` type($arg) `to` type($res)"; + string llvmInstName = instName; + string mlirBuilder = [{ + auto op = $_builder.create<$_qualCppClassName>( + $_location, $_resultType, $arg); + moduleImport.setNonNegFlag(inst, op); + $res = op; + }]; +} + +class LLVM_CastOpWithOverflowFlag traits = []> : + LLVM_Op], traits)>, + LLVM_Builder<"$res = builder.Create" # instName # "($arg, $_resultType, /*Name=*/\"\", op.hasNoUnsignedWrap(), op.hasNoSignedWrap());"> { + let arguments = (ins type:$arg, EnumProperty<"IntegerOverflowFlags", "", "IntegerOverflowFlags::none">:$overflowFlags); + let results = (outs resultType:$res); + let builders = [LLVM_OneResultOpBuilder]; + let assemblyFormat = "$arg `` custom($overflowFlags) attr-dict `:` type($arg) `to` type($res)"; + string llvmInstName = instName; + string mlirBuilder = [{ + auto op = $_builder.create<$_qualCppClassName>( + $_location, $_resultType, $arg); + moduleImport.setIntegerOverflowFlags(inst, op); + $res = op; + }]; +} + def LLVM_BitcastOp : LLVM_CastOp<"bitcast", "BitCast", LLVM_AnyNonAggregate, LLVM_AnyNonAggregate, [DeclareOpInterfaceMethods]> { let hasFolder = 1; @@ -531,19 +565,19 @@ def LLVM_SExtOp : LLVM_CastOp<"sext", "SExt", LLVM_ScalarOrVectorOf> { let hasVerifier = 1; } -def LLVM_ZExtOp : LLVM_CastOp<"zext", "ZExt", +def LLVM_ZExtOp : LLVM_CastOpWithNNegFlag<"zext", "ZExt", 
LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf> { let hasFolder = 1; let hasVerifier = 1; } -def LLVM_TruncOp : LLVM_CastOp<"trunc", "Trunc", +def LLVM_TruncOp : LLVM_CastOpWithOverflowFlag<"trunc", "Trunc", LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_SIToFPOp : LLVM_CastOp<"sitofp", "SIToFP", LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; -def LLVM_UIToFPOp : LLVM_CastOp<"uitofp", "UIToFP", +def LLVM_UIToFPOp : LLVM_CastOpWithNNegFlag<"uitofp", "UIToFP", LLVM_ScalarOrVectorOf, LLVM_ScalarOrVectorOf>; def LLVM_FPToSIOp : LLVM_CastOp<"fptosi", "FPToSI", diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index a8662a3d6f63be..adfaea5a9d074d 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1820,6 +1820,10 @@ void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns); /// linalg.fill(%cst, tensor.extract_slice(%init)). void populateSwapExtractSliceWithFillPatterns(RewritePatternSet &patterns); +/// Add patterns to make explicit broadcasts and transforms in the +/// input operands of a genericOp. +void populateDecomposeProjectedPermutationPatterns(RewritePatternSet &patterns); + /// Patterns to apply `splitReduction` below. 
void populateSplitReductionPattern( RewritePatternSet &patterns, diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h index db7b64fda57d7b..75cb096130ca6e 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.h @@ -15,6 +15,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/Support/MathExtras.h" @@ -45,9 +46,9 @@ class MeshSharding { SmallVector partial_axes; ReductionKind partial_type; SmallVector static_halo_sizes; - SmallVector static_sharded_dims_sizes; + SmallVector static_sharded_dims_offsets; SmallVector dynamic_halo_sizes; - SmallVector dynamic_sharded_dims_sizes; + SmallVector dynamic_sharded_dims_offsets; public: MeshSharding() = default; @@ -57,21 +58,21 @@ class MeshSharding { ArrayRef partial_axes_ = {}, ReductionKind partial_type_ = ReductionKind::Sum, ArrayRef static_halo_sizes_ = {}, - ArrayRef static_sharded_dims_sizes_ = {}, + ArrayRef static_sharded_dims_offsets_ = {}, ArrayRef dynamic_halo_sizes_ = {}, - ArrayRef dynamic_sharded_dims_sizes_ = {}); + ArrayRef dynamic_sharded_dims_offsets_ = {}); ::mlir::FlatSymbolRefAttr getMeshAttr() const { return mesh; } ::llvm::StringRef getMesh() const { return mesh.getValue(); } ArrayRef getSplitAxes() const { return split_axes; } ArrayRef getPartialAxes() const { return partial_axes; } ReductionKind getPartialType() const { return partial_type; } ArrayRef getStaticHaloSizes() const { return static_halo_sizes; } - ArrayRef getStaticShardedDimsSizes() const { - return static_sharded_dims_sizes; + ArrayRef getStaticShardedDimsOffsets() const { + return static_sharded_dims_offsets; } ArrayRef getDynamicHaloSizes() const { return dynamic_halo_sizes; } - ArrayRef 
getDynamicShardedDimsSizes() const { - return dynamic_sharded_dims_sizes; + ArrayRef getDynamicShardedDimsOffsets() const { + return dynamic_sharded_dims_offsets; } operator bool() const { return (!mesh) == false; } bool operator==(Value rhs) const; @@ -80,6 +81,8 @@ class MeshSharding { bool operator!=(const MeshSharding &rhs) const; bool equalSplitAndPartialAxes(const MeshSharding &rhs) const; bool equalHaloAndShardSizes(const MeshSharding &rhs) const; + bool equalHaloSizes(const MeshSharding &rhs) const; + bool equalShardSizes(const MeshSharding &rhs) const; }; } // namespace mesh diff --git a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td index 8f696bbc1a0f6e..19498fe5a32d69 100644 --- a/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td +++ b/mlir/include/mlir/Dialect/Mesh/IR/MeshOps.td @@ -11,6 +11,7 @@ include "mlir/Dialect/Mesh/IR/MeshBase.td" include "mlir/Dialect/Shape/IR/ShapeBase.td" +include "mlir/Interfaces/DestinationStyleOpInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/BuiltinTypes.td" @@ -189,23 +190,27 @@ def Mesh_ShardingOp : Mesh_Op<"sharding", [ `generic`: is not an allowed value inside a shard attribute. 5. [Optional] Sizes of halos to be added for each sharded tensor dimension. - `halo_sizes`is provided as a flattened 1d array of i64s, 2 values for each sharded dimension. - `halo_sizes` = [1, 2] means that the first sharded dimension gets an additional - halo of size 1 at the start of the first dimension and a halo size is 2 at its end. - `halo_sizes` = [1, 2, 2, 3] defines halos for the first 2 sharded dimensions - e.g. the first sharded dimension gets [1,2] halos and the seconds gets [2,3] halos. - `?` indicates dynamic halo sizes. + `halo_sizes` is provided as a flattened 1d array of i64s, 2 values for each + sharded dimension. 
`halo_sizes = [1, 2]` means that the first sharded dimension + gets an additional halo of size 1 at the start of the first dimension and a halo + size of 2 at its end. `halo_sizes = [1, 2, 2, 3]` defines halos for the first 2 + sharded dimensions e.g. the first sharded dimension gets `[1,2]` halos and the + second gets `[2,3]` halos. `?` indicates dynamic halo sizes. + + 6. [Optional] Offsets for each shard and sharded tensor dimension. + `sharded_dims_offsets` is provided as a flattened 1d array of i64s. For each + sharded tensor dimension the offsets (starting index) of all shards in that + dimension and an additional value for the end of the last shard are provided. + For a 1d sharding this means that position `i` has the exclusive prefix sum for + shard `i`, and since only contiguous sharding is supported, its inclusive prefix + sum is at position 'i+1'. - 6. [Optional] Sizes of sharded dimensions of each shard. - `sharded_dims_sizes`is provided as a flattened 1d array of i64s: for each device of the - device-mesh one value for each sharded tensor dimension. Assuming a 3d-tensor of shape 32x32x32 with the first 2 dimensions being sharded, - `sharded_dims_sizes` = [16, 8, 16, 24] means that the first device of - the device-mesh will get a shard of shape 16x8x32 and the second device will get a - shard of shape 16x24x32. - `?` indicates dynamic shard dimensions. + `sharded_dims_offsets` = [0, 24, 32, 0, 20, 32] means that the first device of + the device-mesh will get a shard of shape 24x20x32 and the second device will get + a shard of shape 8x12x32. `?` indicates dynamic shard dimensions. - `halo_sizes` and `sharded_dims_sizes` are mutually exclusive. + `halo_sizes` and `sharded_dims_offsets` are mutually exclusive. Examples: @@ -240,7 +245,7 @@ def Mesh_ShardingOp : Mesh_Op<"sharding", [ // The tensor is sharded on its second dimension along axis 0 of @mesh1d_4 // and it has pre-defined shard sizes. 
The shards of the devices will have // the following shapes: [4x2, 4x3, 4x4, 4x5] - %sharding4 = mesh.sharding @mesh1d_4 split_axes = [[] split_axes = [0]] sharded_dims_sizes = [2, 3, 4, 5] + %sharding4 = mesh.sharding @mesh1d_4 split_axes = [[], [0]] sharded_dims_offsets = [0, 2, 5, 9, 14] %sharded2 = mesh.shard %arg0 to %sharding4 : tensor<4x14xf32> ``` }]; @@ -250,8 +255,8 @@ def Mesh_ShardingOp : Mesh_Op<"sharding", [ Mesh_MeshAxesArrayAttr:$split_axes, OptionalAttr:$partial_axes, OptionalAttr:$partial_type, - DefaultValuedAttr:$static_sharded_dims_sizes, - Variadic:$dynamic_sharded_dims_sizes, + DefaultValuedAttr:$static_sharded_dims_offsets, + Variadic:$dynamic_sharded_dims_offsets, DefaultValuedAttr:$static_halo_sizes, Variadic:$dynamic_halo_sizes ); @@ -263,7 +268,7 @@ def Mesh_ShardingOp : Mesh_Op<"sharding", [ `split_axes` `=` $split_axes (`partial` `=` $partial_type $partial_axes^)? (`halo_sizes` `=` custom($dynamic_halo_sizes, $static_halo_sizes)^)? - (`sharded_dims_sizes` `=` custom($dynamic_sharded_dims_sizes, $static_sharded_dims_sizes)^)? + (`sharded_dims_offsets` `=` custom($dynamic_sharded_dims_offsets, $static_sharded_dims_offsets)^)? 
attr-dict `:` type($result) }]; let builders = [ @@ -272,16 +277,17 @@ def Mesh_ShardingOp : Mesh_Op<"sharding", [ "ArrayRef":$partial_axes, "mesh::ReductionKind":$partial_type, CArg<"ArrayRef", "{}">:$static_halo_sizes, - CArg<"ArrayRef", "{}">:$static_sharded_dims_sizes)>, + CArg<"ArrayRef", "{}">:$static_sharded_dims_offsets)>, OpBuilder<(ins "FlatSymbolRefAttr":$mesh, "ArrayRef":$split_axes)>, OpBuilder<(ins "FlatSymbolRefAttr":$mesh, "ArrayRef":$split_axes, "::mlir::ArrayRef<::mlir::OpFoldResult>":$halo_sizes, - "::mlir::ArrayRef<::mlir::OpFoldResult>":$sharded_dims_sizes)>, + "::mlir::ArrayRef<::mlir::OpFoldResult>":$sharded_dims_offsets)>, OpBuilder<(ins "mlir::mesh::MeshSharding":$from)> ]; let hasVerifier = 1; + let hasCanonicalizer = 1; } def Mesh_ShardShapeOp : Mesh_Op<"shard_shape", [Pure]> { @@ -1052,37 +1058,54 @@ def Mesh_ShiftOp : Mesh_CollectiveCommunicationOpBase<"shift", [ } def Mesh_UpdateHaloOp : Mesh_Op<"update_halo", [ - DeclareOpInterfaceMethods + DestinationStyleOpInterface, + TypesMatchWith< + "result has same type as destination", + "result", "destination", "$_self">, + DeclareOpInterfaceMethods, + AttrSizedOperandSegments ]> { let summary = "Update halo data."; let description = [{ This operation updates halo regions of shards, e.g. if their sharding - specified halos and the actual tensor data might have changed + specified halos and the actual tensor/memref data might have changed on the remote devices. Changes might be caused by mutating operations and/or if the new halo regions are larger than the existing ones. + Source and destination might have different halo sizes. + Assumes all devices hold tensors with same-sized halo data as specified - by `dynamic/static_halo_sizes`. + by `source_halo_sizes/static_source_halo_sizes` and + `destination_halo_sizes/static_destination_halo_sizes` in source shard + and destination/result shard. `split_axes` specifies for each tensor axis along which mesh axes its halo data is updated. 
- Optionally resizes to new halo sizes `target_halo_sizes`. }]; let arguments = (ins - AnyNon0RankedMemRef:$input, + AnyTypeOf<[AnyNon0RankedMemRef, AnyNon0RankedTensor]>:$source, + AnyTypeOf<[AnyNon0RankedMemRef, AnyNon0RankedTensor]>:$destination, FlatSymbolRefAttr:$mesh, Mesh_MeshAxesArrayAttr:$split_axes, - Variadic:$dynamic_halo_sizes, - DefaultValuedAttr:$static_halo_sizes, - DefaultValuedAttr:$target_halo_sizes + Variadic:$source_halo_sizes, + DefaultValuedAttr:$static_source_halo_sizes, + Variadic:$destination_halo_sizes, + DefaultValuedAttr:$static_destination_halo_sizes + ); + let results = (outs + AnyTypeOf<[AnyNon0RankedMemRef, AnyNon0RankedTensor]>:$result ); let assemblyFormat = [{ - $input `on` $mesh + $source `into` $destination + `on` $mesh `split_axes` `=` $split_axes - (`halo_sizes` `=` custom($dynamic_halo_sizes, $static_halo_sizes)^)? - (`target_halo_sizes` `=` $target_halo_sizes^)? - attr-dict `:` type($input) + (`source_halo_sizes` `=` custom($source_halo_sizes, $static_source_halo_sizes)^)? + (`destination_halo_sizes` `=` custom($destination_halo_sizes, $static_destination_halo_sizes)^)? + attr-dict `:` type($source) `->` type($result) + }]; + let extraClassDeclaration = [{ + MutableOperandRange getDpsInitsMutable() { return getDestinationMutable(); } }]; } #endif // MLIR_DIALECT_MESH_IR_MESHOPS_TD diff --git a/mlir/include/mlir/IR/Dominance.h b/mlir/include/mlir/IR/Dominance.h index 2536ce585b3fdd..66b9456533ae04 100644 --- a/mlir/include/mlir/IR/Dominance.h +++ b/mlir/include/mlir/IR/Dominance.h @@ -141,8 +141,8 @@ class DominanceInfo : public detail::DominanceInfoBase { /// are in the same block and A properly dominates B within the block, or if /// the block that contains A properly dominates the block that contains B. In /// an SSACFG region, Operation A dominates Operation B in the same block if A - /// preceeds B. In a Graph region, all operations in a block dominate all - /// other operations in the same block. + /// preceeds B. 
In a Graph region, all operations in a block properly dominate + /// all operations in the same block. /// /// The `enclosingOpOk` flag says whether we should return true if the B op /// is enclosed by a region on A. @@ -176,9 +176,14 @@ class DominanceInfo : public detail::DominanceInfoBase { /// Return true if the specified block A properly dominates block B, i.e.: if /// block A contains block B, or if the region which contains block A also /// contains block B or some parent of block B and block A dominates that - /// block in that kind of region. In an SSACFG region, block A dominates - /// block B if all control flow paths from the entry block to block B flow - /// through block A. In a Graph region, all blocks dominate all other blocks. + /// block in that kind of region. + /// + /// In an SSACFG region, block A dominates block B if all control flow paths + /// from the entry block to block B flow through block A. + /// + /// Graph regions have only a single block. To be consistent with "proper + /// dominance" of ops, the single block is considered to properly dominate + /// itself in a graph region. bool properlyDominates(Block *a, Block *b) const { return super::properlyDominates(a, b); } @@ -197,20 +202,20 @@ class PostDominanceInfo : public detail::DominanceInfoBase { using super::super; /// Return true if operation A properly postdominates operation B. - bool properlyPostDominates(Operation *a, Operation *b); + bool properlyPostDominates(Operation *a, Operation *b) const; /// Return true if operation A postdominates operation B. - bool postDominates(Operation *a, Operation *b) { + bool postDominates(Operation *a, Operation *b) const { return a == b || properlyPostDominates(a, b); } /// Return true if the specified block A properly postdominates block B. 
- bool properlyPostDominates(Block *a, Block *b) { + bool properlyPostDominates(Block *a, Block *b) const { return super::properlyDominates(a, b); } /// Return true if the specified block A postdominates block B. - bool postDominates(Block *a, Block *b) { + bool postDominates(Block *a, Block *b) const { return a == b || properlyPostDominates(a, b); } }; diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 6c3a500f20e3a9..30164843f63675 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -192,6 +192,11 @@ class ModuleImport { /// implement the exact flag interface. void setExactFlag(llvm::Instruction *inst, Operation *op) const; + /// Sets the nneg flag attribute for the imported operation `op` given + /// the original instruction `inst`. Asserts if the operation does not + /// implement the nneg flag interface. + void setNonNegFlag(llvm::Instruction *inst, Operation *op) const; + /// Sets the fastmath flags attribute for the imported operation `op` given /// the original instruction `inst`. Asserts if the operation does not /// implement the fastmath interface. 
diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index 5c977055e95dc8..41f208216374fe 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -33,7 +33,7 @@ class GreedyRewriteConfig; #define GEN_PASS_DECL_CANONICALIZER #define GEN_PASS_DECL_CONTROLFLOWSINK -#define GEN_PASS_DECL_CSEPASS +#define GEN_PASS_DECL_CSE #define GEN_PASS_DECL_INLINER #define GEN_PASS_DECL_LOOPINVARIANTCODEMOTION #define GEN_PASS_DECL_MEM2REG diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 8da1ab16a4514b..7c27021902de31 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -58,7 +58,7 @@ PYBIND11_MODULE(_mlir, m) { // Registration decorators. m.def( "register_dialect", - [](py::object pyClass) { + [](py::type pyClass) { std::string dialectNamespace = pyClass.attr("DIALECT_NAMESPACE").cast(); PyGlobals::get().registerDialectImpl(dialectNamespace, pyClass); @@ -68,9 +68,9 @@ PYBIND11_MODULE(_mlir, m) { "Class decorator for registering a custom Dialect wrapper"); m.def( "register_operation", - [](const py::object &dialectClass, bool replace) -> py::cpp_function { + [](const py::type &dialectClass, bool replace) -> py::cpp_function { return py::cpp_function( - [dialectClass, replace](py::object opClass) -> py::object { + [dialectClass, replace](py::type opClass) -> py::type { std::string operationName = opClass.attr("OPERATION_NAME").cast(); PyGlobals::get().registerOperationImpl(operationName, opClass, diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 4e7758bf46d9cf..ce91424e7a577e 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -524,8 +524,11 @@ LLVMTypeConverter::getMemRefAddressSpace(BaseMemRefType type) const { return failure(); if (!(*converted)) // Conversion to 
default is 0. return 0; - if (auto explicitSpace = llvm::dyn_cast_if_present(*converted)) - return explicitSpace.getInt(); + if (auto explicitSpace = dyn_cast_if_present(*converted)) { + if (explicitSpace.getType().isIndex() || + explicitSpace.getType().isSignlessInteger()) + return explicitSpace.getInt(); + } return failure(); } diff --git a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp index e48ca5180b706f..a6408391b1330c 100644 --- a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp @@ -74,6 +74,12 @@ std::tuple AllocationOpLLVMLowering::allocateBufferManuallyAlign( MemRefType memRefType = getMemRefResultType(op); // Allocate the underlying buffer. Type elementPtrType = this->getElementPtrType(memRefType); + if (!elementPtrType) { + emitError(loc, "conversion of memref memory space ") + << memRefType.getMemorySpace() + << " to integer address space " + "failed. Consider adding memory space conversions."; + } LLVM::LLVMFuncOp allocFuncOp = getNotalignedAllocFn( getTypeConverter(), op->getParentWithTrait(), getIndexType()); diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index ca55c44856d191..3d38de4bf1068e 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -4734,11 +4734,29 @@ struct DropLinearizeUnitComponentsIfDisjointOrZero final return success(); } }; + +/// Rewrite `affine.linearize_index [%%x] by (%b)`, into `%x`. +/// +/// By definition, that operation is `affine.apply affine_map<()[s0] -> (s0)>,` +/// which is the identity. 
+struct DropLinearizeOneBasisElement final + : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(affine::AffineLinearizeIndexOp op, + PatternRewriter &rewriter) const override { + if (op.getStaticBasis().size() != 1 || op.getMultiIndex().size() != 1) + return failure(); + rewriter.replaceOp(op, op.getMultiIndex().front()); + return success(); + } +}; } // namespace void affine::AffineLinearizeIndexOp::getCanonicalizationPatterns( RewritePatternSet &patterns, MLIRContext *context) { - patterns.add(context); + patterns.add(context); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp index 8682294c8a6972..f3413c1c30fadc 100644 --- a/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp +++ b/mlir/lib/Dialect/Arith/IR/InferIntRangeInterfaceImpls.cpp @@ -42,6 +42,12 @@ void arith::ConstantOp::inferResultRanges(ArrayRef argRanges, } if (auto arrayCstAttr = llvm::dyn_cast_or_null(getValue())) { + if (arrayCstAttr.isSplat()) { + setResultRange(getResult(), ConstantIntRanges::constant( + arrayCstAttr.getSplatValue())); + return; + } + std::optional result; for (const APInt &val : arrayCstAttr) { auto range = ConstantIntRanges::constant(val); diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index d7c63cdd8198d7..3594b084138124 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -38,6 +38,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms TilingInterfaceImpl.cpp Transforms.cpp TransposeConv2D.cpp + DecomposeGenericByUnfoldingPermutation.cpp Vectorization.cpp WinogradConv2D.cpp diff --git a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp 
b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp new file mode 100644 index 00000000000000..83c4b5bdf10976 --- /dev/null +++ b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp @@ -0,0 +1,249 @@ +//===- DecomposeGenericByUnfoldingPermutation.cpp -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include +#include +#include + +using namespace mlir; +using namespace mlir::linalg; + +namespace { + +/// This pattern decomposes the input operand(s) of a linalg.generic that has +/// a `transpose`, `broadcast`, or a mixture of two, into explicit transpose +/// and broadcast. Having them folded into the linalg.generic is a good +/// optimization but sometimes we may want to unwrap, i.e., `unfold` them as +/// explicit transpose and broadcast. This rewrite pattern helps do it for +/// each input operand. This is useful for instance when trying to recognize +/// named ops. +/// +/// The transpose, broadcast, or mixture of both, are expressed in the affine +/// map of the operand. Technically it is essentially `projected permutation`. +/// +/// Example +/// +/// ```mlir +/// +/// #projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)> +/// #identity = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> +/// ... 
+/// %res = linalg.generic +/// { indexing_maps = [#projection, #identity, #identity], +/// iterator_types = ["parallel", "parallel", "parallel", +/// "parallel", "parallel"]} +/// ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>) +/// outs(%z : tensor<5x9x7x8x10xf32>) { +/// ^bb0(%in: f32, %in_1: f32, %out: f32): +/// %div = arith.divf %in, %in_1 : f32 +/// linalg.yield %div : f32 +/// } -> tensor<5x9x7x8x10xf32> +/// ``` +/// +/// In the above IR operand `%x` map is a projected-permutation. This can be +/// unfolded as: +/// +/// ```mlir +/// ... +/// %x_trans = linalg.transpose +/// ins(%x : tensor<7x8x9xf32>) +/// outs(%e1 : tensor<9x7x8xf32>) permutation = [2, 0, 1] +/// ... +/// %x_trans_bc = linalg.broadcast +/// ins(%x_trans : tensor<9x7x8xf32>) +/// outs(%e2 : tensor<5x9x7x8x10xf32>) dimensions = [0, 4] +/// %2 = linalg.div +/// ins(%x_trans_bc, %y : +/// tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>) +/// outs(%arg2 : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> +/// +/// Note that linalg.generic has been 'specialized' to linalg.div. +/// +/// To unfold it, it is more optimal to transpose first and then do the +/// broadcast. However, if transpose is done first, the permutation map needs +/// to be expressed in terms of reduced dimension as broadcast hasn't happened +/// yet. Also, the broadcast dimensions in a linalg.generic come from other +/// operands (those not broadcasted along that particular dimension). We work +/// this out by computing the convex-polyhedron shape of the linalg.generic +/// iteration space from shapes of all the operands, both inputs and outputs. +/// +struct DecomposeProjectedPermutation : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GenericOp genericOp, + PatternRewriter &rewriter) const override; +}; + +/// For the given `map`, determine what dimensions are transposed and what +/// dimensions are broadcasted. 
+/// Returns : +/// transpose-permutation, broadcast-dimensions` (empty if not needed) +/// +std::pair, SmallVector> +computeTransposeBroadcast(AffineMap &map) { + assert(map.isProjectedPermutation(false) && "not a projection"); + + // As the map is a projection it likely operates on a smaller set of + // dimensions as far as the transpose is concerned (rest are broadcast). + int64_t minorSize = map.getNumResults(); + + SmallVector minorResult; + for (int64_t i = 0; i < minorSize; ++i) { + auto expr = cast(map.getResults()[i]); + minorResult.push_back(expr.getPosition()); + } + + // If dims are not monotonically increasing then transpose is present. + SmallVector sortedResMap(minorResult); + std::sort(sortedResMap.begin(), sortedResMap.end()); + bool hasTranspose = !std::equal(minorResult.begin(), minorResult.end(), + sortedResMap.begin(), sortedResMap.end()); + + // Walk the sorted map result to determine which dimensions are broadcasted. + SmallVector broadcast; + for (int64_t i = 0, j = 0; i < map.getNumInputs(); ++i) { + if (j < minorSize && sortedResMap[j] == i) { + j++; + continue; + } + broadcast.push_back(i); + } + + SmallVector permutation; + if (hasTranspose) { + // Consider an operand `x : tensor<7x8x9>` of a genericOp that has + // affine map `affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>` + // `x`s access is both transposed and broadcast. But when specifying + // the `linalg.transpose(x : tensor<7x8x9>)` the dimensions need to be + // specified as `affine_map<(d0,d1,d2) -> (d1, d2, d0)>` instead of + // referring to d3, d4. Therefore, re-base the transpose dimensions so + // that they start from d0. + permutation.resize(minorSize); + std::map minorMap; + for (int64_t i = 0; i < minorSize; ++i) + minorMap.insert({sortedResMap[i], i}); + + // Re-map the dimensions. + SmallVector remappedResult(minorSize); + for (int64_t i = 0; i < minorSize; ++i) + remappedResult[i] = minorMap[minorResult[i]]; + + /// Calculate the permutation for the transpose.
+ for (unsigned i = 0; i < minorSize; ++i) { + permutation[remappedResult[i]] = i; + } + } + return {permutation, broadcast}; +} + +LogicalResult DecomposeProjectedPermutation::matchAndRewrite( + GenericOp op, PatternRewriter &rewriter) const { + if (!op.hasPureTensorSemantics() || op.isSingleInputOutput() || + op.isSingleYieldOp() || !op.isAllParallelLoops()) + return failure(); + + // If the map of an operand is not a `projected permutation` then + // it cannot be decomposed to mere transpose and broadcast. + // The requirement that all maps be `projected permutation` may be + // over-restrictive but since we need to determine shape of the + // iteration space as well, reject if any map violates assumption. + for (auto &opOperand : op->getOpOperands()) { + auto map = op.getMatchingIndexingMap(&opOperand); + if (!map.isProjectedPermutation(false)) + return failure(); + } + + // Decomposing linalg.generic involves creating `tensor.empty` + // which can have dynamic shapes but then we would have to work + // out which operand can supply that runtime-value (tensor.dim). + // Leaving it as a future TODO. + if (llvm::any_of(op->getOpOperands(), [](OpOperand &oper) { + auto opType = cast(oper.get().getType()); + return ShapedType::isDynamicShape(opType.getShape()); + })) + return failure(); + + auto outputShape = op.getStaticLoopRanges(); + + auto loc = op.getLoc(); + bool isChanged = false; + SmallVector newInitValues = op.getDpsInputs(); + SmallVector newMap = op.getIndexingMapsArray(); + + // Walk over each input operand and unfold if it is transposed, broadcast + // or mix of two via operand's affine-map. + for (int64_t i = 0; i < op.getNumDpsInputs(); ++i) { + auto &map = newMap[i]; + auto inputRTType = cast(newInitValues[i].getType()); + auto elType = inputRTType.getElementType(); + + /// Nothing to do if map is already an identity. 
+ if (map.isIdentity()) + continue; + + auto [permutation, broadcastedDims] = computeTransposeBroadcast(map); + + // Does it need transpose? + if (!permutation.empty()) { + /// linalg.transpose permutes the dimensions of input using + /// rule: dim(result, i) = dim(input, permutation[i]) + SmallVector transposedShape(map.getNumResults()); + for (int64_t i = 0; i < map.getNumResults(); ++i) + transposedShape[i] = inputRTType.getShape()[permutation[i]]; + + Value emptyTensor = + rewriter.create(loc, transposedShape, elType); + + auto transposeOp = rewriter.create(loc, newInitValues[i], + emptyTensor, permutation); + newInitValues[i] = transposeOp->getResult(0); + isChanged = true; + } + + // Does it require broadcast? + if (!broadcastedDims.empty()) { + assert(broadcastedDims.size() && "should have non size broadcast"); + Value emptyTensor = rewriter.create( + loc, outputShape, inputRTType.getElementType()); + + auto broadcastOp = rewriter.create( + loc, newInitValues[i], emptyTensor, broadcastedDims); + + newInitValues[i] = broadcastOp->getResult(0); + isChanged = true; + } + newMap[i] = rewriter.getMultiDimIdentityMap(map.getNumDims()); + } + + if (isChanged) { + SmallVector operands = op->getOperands(); + ValueRange operandsRef(operands); + + auto newOp = rewriter.create( + /*location=*/op.getLoc(), + /*resultTensorTypes=*/op->getResultTypes(), + /*inputs=*/newInitValues, + /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()), + /*indexingMaps=*/newMap, + /*iteratorTypes=*/op.getIteratorTypesArray()); + + newOp.getRegion().takeBody(op->getRegion(0)); + rewriter.replaceOp(op, newOp->getResults()); + } + return success(); +} + +} // namespace + +void mlir::linalg::populateDecomposeProjectedPermutationPatterns( + RewritePatternSet &patterns) { + patterns.insert(patterns.getContext()); +} diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp index dfafffce9d9b60..748e2a1377930d 100644 --- 
a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp @@ -347,6 +347,7 @@ struct LinalgSpecializeGenericOpsPass void LinalgSpecializeGenericOpsPass::runOnOperation() { RewritePatternSet patterns(&getContext()); populateLinalgGenericOpsSpecializationPatterns(patterns); + populateDecomposeProjectedPermutationPatterns(patterns); if (failed(applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) signalPassFailure(); diff --git a/mlir/lib/Dialect/Mesh/IR/CMakeLists.txt b/mlir/lib/Dialect/Mesh/IR/CMakeLists.txt index 45ac9edb280bc9..3fea4d67430e08 100644 --- a/mlir/lib/Dialect/Mesh/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Mesh/IR/CMakeLists.txt @@ -13,4 +13,5 @@ add_mlir_dialect_library(MLIRMeshDialect MLIRIR MLIRSupport MLIRViewLikeInterface + MLIRDestinationStyleOpInterface ) diff --git a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp index 19e9212157ae47..c5570d8ee8a443 100644 --- a/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp +++ b/mlir/lib/Dialect/Mesh/IR/MeshOps.cpp @@ -192,33 +192,34 @@ template static void shardShape(const InShape &inShape, const MeshShape &meshShape, const SplitAxes &splitAxes, OutShape &outShape, - ArrayRef shardedDimsSizes = {}, + ArrayRef shardedDimsOffsets = {}, ArrayRef haloSizes = {}) { std::copy(llvm::adl_begin(inShape), llvm::adl_end(inShape), llvm::adl_begin(outShape)); - if (!shardedDimsSizes.empty()) { + if (!shardedDimsOffsets.empty()) { + auto isDynShape = ShapedType::isDynamicShape(meshShape); + uint64_t pos = 1; for (auto [tensorAxis, innerSplitAxes] : llvm::enumerate(splitAxes)) { - if (innerSplitAxes.empty()) { -#ifndef NDEBUG - for (auto dimSz : shardedDimsSizes) { - auto inAxis = dimSz % inShape.size(); - assert(inShape[inAxis] == dimSz || dimSz == ShapedType::kDynamic || - inShape[inAxis] == ShapedType::kDynamic); - } -#endif // NDEBUG - } else { - // find sharded dims in sharded_dims_sizes with same static size on - // all devices. 
Use kDynamic for dimensions with dynamic or non-uniform - // sizes in sharded_dims_sizes. - auto sz = shardedDimsSizes[tensorAxis]; - bool same = true; - for (size_t i = tensorAxis + inShape.size(); - i < shardedDimsSizes.size(); i += inShape.size()) { - if (shardedDimsSizes[i] != sz) { - same = false; - break; + if (!innerSplitAxes.empty()) { + auto sz = shardedDimsOffsets[pos]; + bool same = !isDynShape; + if (same) { + // Find sharded dims in shardedDimsOffsets with same static size on + // all devices. Use kDynamic for dimensions with dynamic or + // non-uniform offs in shardedDimsOffsets. + uint64_t numShards = 0; + for (auto i : innerSplitAxes.asArrayRef()) { + numShards += meshShape[i]; + } + for (size_t i = 1; i < numShards; ++i) { + if (shardedDimsOffsets[pos + i] - shardedDimsOffsets[pos + i - 1] != + sz) { + same = false; + break; + } } + pos += numShards + 1; } outShape[tensorAxis] = same ? sz : ShapedType::kDynamic; } @@ -255,7 +256,7 @@ ShapedType mesh::shardShapedType(ShapedType shape, MeshOp mesh, using Dim = std::decay_t; SmallVector resShapeArr(shape.getShape().size()); shardShape(shape.getShape(), mesh.getShape(), sharding.getSplitAxes(), - resShapeArr, sharding.getStaticShardedDimsSizes(), + resShapeArr, sharding.getStaticShardedDimsOffsets(), sharding.getStaticHaloSizes()); return shape.clone(resShapeArr); } @@ -432,13 +433,14 @@ void ShardingOp::build(::mlir::OpBuilder &b, ::mlir::OperationState &odsState, ArrayRef partial_axes, mesh::ReductionKind partial_type, ArrayRef static_halo_sizes, - ArrayRef static_sharded_dims_sizes) { + ArrayRef static_sharded_dims_offsets) { return build( b, odsState, mesh, MeshAxesArrayAttr::get(b.getContext(), split_axes), ::mlir::DenseI16ArrayAttr::get(b.getContext(), partial_axes), ::mlir::mesh::ReductionKindAttr::get(b.getContext(), partial_type), ::mlir::DenseI64ArrayAttr::get(b.getContext(), static_halo_sizes), {}, - ::mlir::DenseI64ArrayAttr::get(b.getContext(), static_sharded_dims_sizes), + 
::mlir::DenseI64ArrayAttr::get(b.getContext(), + static_sharded_dims_offsets), {}); } @@ -455,11 +457,11 @@ void ShardingOp::build( ::mlir::OpBuilder &b, ::mlir::OperationState &odsState, FlatSymbolRefAttr mesh, ArrayRef split_axes, ::mlir::ArrayRef<::mlir::OpFoldResult> halo_sizes, - ::mlir::ArrayRef<::mlir::OpFoldResult> sharded_dims_sizes) { + ::mlir::ArrayRef<::mlir::OpFoldResult> sharded_dims_offsets) { mlir::SmallVector staticHalos, staticDims; mlir::SmallVector dynamicHalos, dynamicDims; dispatchIndexOpFoldResults(halo_sizes, dynamicHalos, staticHalos); - dispatchIndexOpFoldResults(sharded_dims_sizes, dynamicDims, staticDims); + dispatchIndexOpFoldResults(sharded_dims_offsets, dynamicDims, staticDims); return build( b, odsState, mesh, MeshAxesArrayAttr::get(b.getContext(), split_axes), {}, ::mlir::mesh::ReductionKindAttr::get(b.getContext(), ReductionKind::Sum), @@ -477,10 +479,10 @@ void ShardingOp::build(::mlir::OpBuilder &b, ::mlir::OperationState &odsState, : b.getDenseI16ArrayAttr(from.getPartialAxes()), ::mlir::mesh::ReductionKindAttr::get(b.getContext(), from.getPartialType()), - from.getStaticShardedDimsSizes().empty() + from.getStaticShardedDimsOffsets().empty() ? DenseI64ArrayAttr() - : b.getDenseI64ArrayAttr(from.getStaticShardedDimsSizes()), - from.getDynamicShardedDimsSizes(), + : b.getDenseI64ArrayAttr(from.getStaticShardedDimsOffsets()), + from.getDynamicShardedDimsOffsets(), from.getStaticHaloSizes().empty() ? 
DenseI64ArrayAttr() : b.getDenseI64ArrayAttr(from.getStaticHaloSizes()), @@ -509,8 +511,8 @@ LogicalResult ShardingOp::verify() { failed(checkMeshAxis(getPartialAxes().value()))) return failure(); - if (!getStaticHaloSizes().empty() && !getStaticShardedDimsSizes().empty()) { - return emitOpError("halo sizes and shard shapes are mutually exclusive"); + if (!getStaticHaloSizes().empty() && !getStaticShardedDimsOffsets().empty()) { + return emitOpError("halo sizes and shard offsets are mutually exclusive"); } if (!getStaticHaloSizes().empty()) { @@ -539,13 +541,81 @@ LogicalResult ShardingOp::verifySymbolUses(SymbolTableCollection &symbolTable) { return failure(); } if (mlir::ShapedType::isDynamicShape(mesh->getShape()) && - getStaticShardedDimsSizes().size() > 0) { - return emitError() << "sharded dims sizes are not allowed for " + getStaticShardedDimsOffsets().size() > 0) { + return emitError() << "sharded dims offsets are not allowed for " "devices meshes with dynamic shape."; } + + auto shardedDimsOffsets = getStaticShardedDimsOffsets(); + if (!shardedDimsOffsets.empty()) { + auto meshShape = mesh.value().getShape(); + assert(!ShapedType::isDynamicShape(meshShape)); + uint64_t pos = 0; + for (auto [tensorAxis, innerSplitAxes] : llvm::enumerate(getSplitAxes())) { + if (!innerSplitAxes.empty()) { + int64_t numShards = 0, off = 0; + for (auto i : innerSplitAxes.asArrayRef()) { + numShards += meshShape[i]; + } + for (int64_t i = 0; i <= numShards; ++i) { + if (shardedDimsOffsets.size() <= pos + i) { + return emitError() << "sharded dims offsets has wrong size."; + } + if (!ShapedType::isDynamic(shardedDimsOffsets[pos + i])) { + if (shardedDimsOffsets[pos + i] < off) { + return emitError() + << "sharded dims offsets must be non-decreasing."; + } + off = shardedDimsOffsets[pos + i]; + } + } + pos += numShards + 1; + } + } + } return success(); } +namespace { +// Sharding annotations "halo sizes" and "sharded dims offsets" +// are a mix of attributes and dynamic values. 
This canonicalization moves +// constant values to the respective attribute lists and so minimizes the number +// of values. +class FoldDynamicLists final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ShardingOp op, + PatternRewriter &b) const override { + auto mixedHalos = + getMixedValues(op.getStaticHaloSizes(), op.getDynamicHaloSizes(), b); + auto mixedOffs = getMixedValues(op.getStaticShardedDimsOffsets(), + op.getDynamicShardedDimsOffsets(), b); + + // No constant operands were folded, just return; + if (failed(foldDynamicIndexList(mixedHalos, /*onlyNonNegative=*/true)) && + failed(foldDynamicIndexList(mixedOffs, /*onlyNonNegative=*/true))) { + return failure(); + } + + auto halos = decomposeMixedValues(mixedHalos); + auto offs = decomposeMixedValues(mixedOffs); + + op.setStaticHaloSizes(halos.first); + op.getDynamicHaloSizesMutable().assign(halos.second); + op.setStaticShardedDimsOffsets(offs.first); + op.getDynamicShardedDimsOffsetsMutable().assign(offs.second); + + return success(); + } +}; +} // namespace + +void ShardingOp::getCanonicalizationPatterns(mlir::RewritePatternSet &results, + mlir::MLIRContext *context) { + results.add(context); +} + //===----------------------------------------------------------------------===// // MeshSharding //===----------------------------------------------------------------------===// @@ -555,7 +625,12 @@ bool MeshSharding::equalSplitAndPartialAxes(const MeshSharding &rhs) const { return false; } - if (!getPartialAxes().empty() && getPartialType() != rhs.getPartialType()) { + if (getPartialAxes().size() != rhs.getPartialAxes().size() || + (!getPartialAxes().empty() && getPartialType() != rhs.getPartialType()) || + !llvm::equal( + llvm::make_range(getPartialAxes().begin(), getPartialAxes().end()), + llvm::make_range(rhs.getPartialAxes().begin(), + rhs.getPartialAxes().end()))) { return false; } @@ -576,6 +651,31 @@ bool 
MeshSharding::equalSplitAndPartialAxes(const MeshSharding &rhs) const { } bool MeshSharding::equalHaloAndShardSizes(const MeshSharding &rhs) const { + return equalShardSizes(rhs) && equalHaloSizes(rhs); +} + +bool MeshSharding::equalShardSizes(const MeshSharding &rhs) const { + if (rhs.getStaticShardedDimsOffsets().size() != + getStaticShardedDimsOffsets().size() || + !llvm::equal(llvm::make_range(getStaticShardedDimsOffsets().begin(), + getStaticShardedDimsOffsets().end()), + llvm::make_range(rhs.getStaticShardedDimsOffsets().begin(), + rhs.getStaticShardedDimsOffsets().end()))) { + return false; + } + if (rhs.getDynamicShardedDimsOffsets().size() != + getDynamicShardedDimsOffsets().size() || + !llvm::equal( + llvm::make_range(getDynamicShardedDimsOffsets().begin(), + getDynamicShardedDimsOffsets().end()), + llvm::make_range(rhs.getDynamicShardedDimsOffsets().begin(), + rhs.getDynamicShardedDimsOffsets().end()))) { + return false; + } + return true; +} + +bool MeshSharding::equalHaloSizes(const MeshSharding &rhs) const { if (rhs.getStaticHaloSizes().size() != getStaticHaloSizes().size() || !llvm::equal(llvm::make_range(getStaticHaloSizes().begin(), getStaticHaloSizes().end()), @@ -583,28 +683,13 @@ bool MeshSharding::equalHaloAndShardSizes(const MeshSharding &rhs) const { rhs.getStaticHaloSizes().end()))) { return false; } - if (rhs.getStaticShardedDimsSizes().size() != getDynamicHaloSizes().size() || - !llvm::equal(llvm::make_range(getStaticShardedDimsSizes().begin(), - getStaticShardedDimsSizes().end()), - llvm::make_range(rhs.getStaticShardedDimsSizes().begin(), - rhs.getStaticShardedDimsSizes().end()))) { - return false; - } - if (rhs.getDynamicHaloSizes().size() != getStaticShardedDimsSizes().size() || + if (rhs.getDynamicHaloSizes().size() != getDynamicHaloSizes().size() || !llvm::equal(llvm::make_range(getDynamicHaloSizes().begin(), getDynamicHaloSizes().end()), llvm::make_range(rhs.getDynamicHaloSizes().begin(), rhs.getDynamicHaloSizes().end()))) { return 
false; } - if (rhs.getDynamicShardedDimsSizes().size() != - getDynamicShardedDimsSizes().size() || - !llvm::equal(llvm::make_range(getDynamicShardedDimsSizes().begin(), - getDynamicShardedDimsSizes().end()), - llvm::make_range(rhs.getDynamicShardedDimsSizes().begin(), - rhs.getDynamicShardedDimsSizes().end()))) { - return false; - } return true; } @@ -629,9 +714,9 @@ MeshSharding::MeshSharding(Value rhs) { shardingOp.getPartialAxes().value_or(ArrayRef()), shardingOp.getPartialType().value_or(ReductionKind::Sum), shardingOp.getStaticHaloSizes(), - shardingOp.getStaticShardedDimsSizes(), + shardingOp.getStaticShardedDimsOffsets(), SmallVector(shardingOp.getDynamicHaloSizes()), - SmallVector(shardingOp.getDynamicShardedDimsSizes())); + SmallVector(shardingOp.getDynamicShardedDimsOffsets())); } MeshSharding MeshSharding::get(::mlir::FlatSymbolRefAttr mesh_, @@ -639,9 +724,9 @@ MeshSharding MeshSharding::get(::mlir::FlatSymbolRefAttr mesh_, ArrayRef partial_axes_, ReductionKind partial_type_, ArrayRef static_halo_sizes_, - ArrayRef static_sharded_dims_sizes_, + ArrayRef static_sharded_dims_offsets_, ArrayRef dynamic_halo_sizes_, - ArrayRef dynamic_sharded_dims_sizes_) { + ArrayRef dynamic_sharded_dims_offsets_) { MeshSharding res; res.mesh = mesh_; res.split_axes.resize(split_axes_.size()); @@ -658,9 +743,9 @@ MeshSharding MeshSharding::get(::mlir::FlatSymbolRefAttr mesh_, clone(partial_axes_, res.partial_axes); res.partial_type = partial_type_; clone(static_halo_sizes_, res.static_halo_sizes); - clone(static_sharded_dims_sizes_, res.static_sharded_dims_sizes); + clone(static_sharded_dims_offsets_, res.static_sharded_dims_offsets); clone(dynamic_halo_sizes_, res.dynamic_halo_sizes); - clone(dynamic_sharded_dims_sizes_, res.dynamic_sharded_dims_sizes); + clone(dynamic_sharded_dims_offsets_, res.dynamic_sharded_dims_offsets); return res; } diff --git a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp index 
fdfed39972fd52..b4d088cbd7088d 100644 --- a/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp +++ b/mlir/lib/Dialect/Mesh/Transforms/Spmdization.cpp @@ -126,7 +126,7 @@ static MeshSharding targetShardingInSplitLastAxis(MLIRContext *ctx, } // Split a replicated tensor along a mesh axis. -// e.g. [[0, 1]] -> [[0, 1, 2]]. +// E.g. [[0, 1]] -> [[0, 1, 2]]. // Returns the spmdized target value with its sharding. static std::tuple, MeshSharding> splitLastAxisInResharding(ImplicitLocOpBuilder &builder, @@ -429,6 +429,85 @@ tryMoveLastSplitAxisInResharding(ImplicitLocOpBuilder &builder, MeshOp mesh, return std::nullopt; } +// Detect a change in the halo size (only) and create necessary operations if +// needed. Changed halo sizes require copying the "core" of the source tensor +// into the "core" of the destination tensor followed by an update halo +// operation. +static std::optional, MeshSharding>> +tryUpdateHaloInResharding(ImplicitLocOpBuilder &builder, MeshOp mesh, + MeshSharding sourceSharding, + MeshSharding targetSharding, + ShapedType sourceUnshardedShape, + TypedValue sourceShard) { + // Currently handles only cases where halo sizes differ but everything else + // stays the same (from source to destination sharding).
+ if (!sourceSharding.equalSplitAndPartialAxes(targetSharding) || + !sourceSharding.getPartialAxes().empty() || + !targetSharding.getPartialAxes().empty() || + !sourceSharding.getStaticShardedDimsOffsets().empty() || + !targetSharding.getStaticShardedDimsOffsets().empty() || + sourceSharding.equalHaloSizes(targetSharding)) { + return std::nullopt; + } + + auto srcHaloSizes = sourceSharding.getStaticHaloSizes(); + auto tgtHaloSizes = targetSharding.getStaticHaloSizes(); + assert(srcHaloSizes.empty() || srcHaloSizes.size() == tgtHaloSizes.size()); + assert(((srcHaloSizes.empty() || !ShapedType::isDynamicShape(srcHaloSizes)) && + !ShapedType::isDynamicShape(tgtHaloSizes) && + sourceShard.getType().hasStaticShape()) && + "dynamic shapes/halos are not supported yet for mesh-spmdization"); + auto rank = sourceShard.getType().getRank(); + auto splitAxes = sourceSharding.getSplitAxes(); + SmallVector srcCoreOffs(rank, 0), tgtCoreOffs(rank, 0), + strides(rank, 1), outShape(sourceShard.getType().getShape()), + coreShape(sourceShard.getType().getShape()); + + // Determine "core" of source and destination. + // The core is the local part of the shard excluding halo regions. + for (auto i = 0u; i < rank; ++i) { + if (i < splitAxes.size() && !splitAxes[i].empty()) { + if (!srcHaloSizes.empty()) { + coreShape[i] -= srcHaloSizes[i * 2] + srcHaloSizes[i * 2 + 1]; + srcCoreOffs[i] = srcHaloSizes[i * 2]; + } + tgtCoreOffs[i] = tgtHaloSizes[i * 2]; + outShape[i] = + coreShape[i] + tgtHaloSizes[i * 2] + tgtHaloSizes[i * 2 + 1]; + } + } + + // Extract core from source and copy into destination core. 
+ auto noVals = ValueRange{}; + auto initVal = builder.create( + sourceShard.getLoc(), outShape, sourceShard.getType().getElementType()); + auto core = builder.create( + sourceShard.getLoc(), + RankedTensorType::get(coreShape, sourceShard.getType().getElementType()), + sourceShard, noVals, noVals, noVals, srcCoreOffs, coreShape, strides); + auto initOprnd = builder.create( + sourceShard.getLoc(), core, initVal, noVals, noVals, noVals, tgtCoreOffs, + coreShape, strides); + + // Finally update the halo. + auto updateHaloResult = + builder + .create( + sourceShard.getLoc(), + RankedTensorType::get(outShape, + sourceShard.getType().getElementType()), + sourceShard, initOprnd, mesh.getSymName(), + MeshAxesArrayAttr::get(builder.getContext(), + sourceSharding.getSplitAxes()), + sourceSharding.getDynamicHaloSizes(), + sourceSharding.getStaticHaloSizes(), + targetSharding.getDynamicHaloSizes(), + targetSharding.getStaticHaloSizes()) + .getResult(); + return std::make_tuple(cast>(updateHaloResult), + targetSharding); +} + // Handles only resharding on a 1D mesh. // Currently the sharded tensor axes must be exactly divisible by the single // mesh axis size. 
@@ -454,10 +533,10 @@ reshardOn1DMesh(ImplicitLocOpBuilder &builder, MeshOp mesh, TypedValue targetShard; MeshSharding actualTargetSharding; - if (reducedSourceSharding.getStaticHaloSizes().empty() && - targetSharding.getStaticHaloSizes().empty() && - reducedSourceSharding.getStaticShardedDimsSizes().empty() && - targetSharding.getStaticShardedDimsSizes().empty()) { + if (reducedSourceSharding.getStaticShardedDimsOffsets().empty() && + targetSharding.getStaticShardedDimsOffsets().empty() && + reducedSourceSharding.getStaticHaloSizes().empty() && + targetSharding.getStaticHaloSizes().empty()) { if (auto tryRes = tryMoveLastSplitAxisInResharding( builder, mesh, reducedSourceSharding, targetSharding, sourceUnshardedValue.getType(), reducedSourceShard)) { @@ -483,6 +562,19 @@ TypedValue reshard(ImplicitLocOpBuilder &builder, MeshOp mesh, MeshSharding targetSharding, TypedValue sourceUnshardedValue, TypedValue sourceShard) { + // If source and destination sharding are the same, no need to do anything. + if (sourceSharding == targetSharding) { + return sourceShard; + } + + // Tries to handle the case where the resharding is needed because the halo + // sizes are different. Supports arbitrary mesh dimensionality. + if (auto tryRes = tryUpdateHaloInResharding( + builder, mesh, sourceSharding, targetSharding, + sourceUnshardedValue.getType(), sourceShard)) { + return std::get<0>(tryRes.value()); // targetShard + } + // Resort to handling only 1D meshes since the general case is complicated if // it needs to be communication efficient in terms of minimizing the data // transfered between devices. @@ -636,8 +728,8 @@ spmdizeOperation(ShardOp shardOp, IRMapping &spmdizationMap, targetSpmdValue = spmdizationMap.lookup(shardOp.getSrc()); } else { // Insert resharding. 
- TypedValue srcSpmdValue = cast>( - spmdizationMap.lookup(srcShardOp.getSrc())); + TypedValue srcSpmdValue = + cast>(spmdizationMap.lookup(srcShardOp)); targetSpmdValue = reshard(builder, srcShardOp, shardOp, srcSpmdValue, symbolTableCollection); } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 228c2d034ad4ad..a1de0831653e64 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -2221,13 +2221,13 @@ LogicalResult DeclareReductionOp::verifyRegions() { void TaskOp::build(OpBuilder &builder, OperationState &state, const TaskOperands &clauses) { MLIRContext *ctx = builder.getContext(); - // TODO Store clauses in op: privateVars, privateSyms. TaskOp::build(builder, state, clauses.allocateVars, clauses.allocatorVars, makeArrayAttr(ctx, clauses.dependKinds), clauses.dependVars, clauses.final, clauses.ifExpr, clauses.inReductionVars, makeDenseBoolArrayAttr(ctx, clauses.inReductionByref), makeArrayAttr(ctx, clauses.inReductionSyms), clauses.mergeable, - clauses.priority, /*private_vars=*/{}, /*private_syms=*/nullptr, + clauses.priority, /*private_vars=*/clauses.privateVars, + /*private_syms=*/makeArrayAttr(ctx, clauses.privateSyms), clauses.untied); } diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp index 58841f29698e0d..76ddaa2df5a9d9 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp @@ -1452,8 +1452,10 @@ void vector::populateVectorNarrowTypeRewritePatterns( RewriteAlignedSubByteIntExt, RewriteAlignedSubByteIntTrunc>(patterns.getContext(), benefit.getBenefit() + 1); - patterns.add>( - patterns.getContext(), benefit.getBenefit() + 1); + patterns + .add, + RewriteAlignedSubByteIntExt>( + patterns.getContext(), benefit.getBenefit() + 1); } void 
vector::populateVectorTransposeNarrowTypeRewritePatterns( diff --git a/mlir/lib/IR/Dominance.cpp b/mlir/lib/IR/Dominance.cpp index 2b138ae223546e..62477a823acaaf 100644 --- a/mlir/lib/IR/Dominance.cpp +++ b/mlir/lib/IR/Dominance.cpp @@ -34,7 +34,8 @@ DominanceInfoBase::~DominanceInfoBase() { delete entry.second.getPointer(); } -template void DominanceInfoBase::invalidate() { +template +void DominanceInfoBase::invalidate() { for (auto entry : dominanceInfos) delete entry.second.getPointer(); dominanceInfos.clear(); @@ -217,9 +218,10 @@ template bool DominanceInfoBase::properlyDominates(Block *a, Block *b) const { assert(a && b && "null blocks not allowed"); - // A block dominates itself but does not properly dominate itself. + // A block dominates, but does not properly dominate, itself unless this + // is a graph region. if (a == b) - return false; + return !hasSSADominance(a); // If both blocks are not in the same region, `a` properly dominates `b` if // `b` is defined in an operation region that (recursively) ends up being @@ -269,7 +271,7 @@ bool DominanceInfo::properlyDominatesImpl(Operation *a, Operation *b, Block *aBlock = a->getBlock(), *bBlock = b->getBlock(); assert(aBlock && bBlock && "operations must be in a block"); - // An instruction dominates, but does not properlyDominate, itself unless this + // An operation dominates, but does not properly dominate, itself unless this // is a graph region. if (a == b) return !hasSSADominance(aBlock); @@ -325,7 +327,8 @@ bool DominanceInfo::properlyDominates(Value a, Operation *b) const { //===----------------------------------------------------------------------===// /// Returns true if statement 'a' properly postdominates statement b. 
-bool PostDominanceInfo::properlyPostDominates(Operation *a, Operation *b) { +bool PostDominanceInfo::properlyPostDominates(Operation *a, + Operation *b) const { auto *aBlock = a->getBlock(), *bBlock = b->getBlock(); assert(aBlock && bBlock && "operations must be in a block"); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 12145f7a2217df..71d88d3a62f2b9 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -689,6 +689,12 @@ void ModuleImport::setExactFlag(llvm::Instruction *inst, Operation *op) const { iface.setIsExact(inst->isExact()); } +void ModuleImport::setNonNegFlag(llvm::Instruction *inst, Operation *op) const { + auto iface = cast(op); + + iface.setNonNeg(inst->hasNonNeg()); +} + void ModuleImport::setFastmathFlagsAttr(llvm::Instruction *inst, Operation *op) const { auto iface = cast(op); diff --git a/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi b/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi index 42694747e5f24f..03449b70b7fa38 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/__init__.pyi @@ -8,5 +8,5 @@ class _Globals: def append_dialect_search_prefix(self, module_name: str) -> None: ... def _check_dialect_module_loaded(self, dialect_namespace: str) -> bool: ... -def register_dialect(dialect_class: type) -> object: ... -def register_operation(dialect_class: type, *, replace: bool = ...) -> object: ... +def register_dialect(dialect_class: type) -> type: ... +def register_operation(dialect_class: type, *, replace: bool = ...) -> type: ... 
diff --git a/mlir/test/Analysis/test-dominance.mlir b/mlir/test/Analysis/test-dominance.mlir index 3c53193db7f72f..a926a8271200a3 100644 --- a/mlir/test/Analysis/test-dominance.mlir +++ b/mlir/test/Analysis/test-dominance.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-print-dominance))" -split-input-file 2>&1 | FileCheck %s +// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(test-print-dominance))" -split-input-file | FileCheck %s // CHECK-LABEL: Testing : func_condBranch func.func @func_condBranch(%cond : i1) { @@ -10,40 +10,117 @@ func.func @func_condBranch(%cond : i1) { ^exit: return } -// CHECK-LABEL: --- DominanceInfo --- -// CHECK-NEXT: Nearest(0, 0) = 0 -// CHECK-NEXT: Nearest(0, 1) = 0 -// CHECK-NEXT: Nearest(0, 2) = 0 -// CHECK-NEXT: Nearest(0, 3) = 0 + +// CHECK: --- DominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 0 +// CHECK: Nearest(0, 2) = 0 +// CHECK: Nearest(0, 3) = 0 +// CHECK: Nearest(0, 4) = 4 // CHECK: Nearest(1, 0) = 0 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-NEXT: Nearest(1, 2) = 0 -// CHECK-NEXT: Nearest(1, 3) = 0 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 0 +// CHECK: Nearest(1, 3) = 0 +// CHECK: Nearest(1, 4) = 4 // CHECK: Nearest(2, 0) = 0 -// CHECK-NEXT: Nearest(2, 1) = 0 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 0 +// CHECK: Nearest(2, 1) = 0 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 0 +// CHECK: Nearest(2, 4) = 4 // CHECK: Nearest(3, 0) = 0 -// CHECK-NEXT: Nearest(3, 1) = 0 -// CHECK-NEXT: Nearest(3, 2) = 0 -// CHECK-NEXT: Nearest(3, 3) = 3 -// CHECK-LABEL: --- PostDominanceInfo --- -// CHECK-NEXT: Nearest(0, 0) = 0 -// CHECK-NEXT: Nearest(0, 1) = 3 -// CHECK-NEXT: Nearest(0, 2) = 3 -// CHECK-NEXT: Nearest(0, 3) = 3 +// CHECK: Nearest(3, 1) = 0 +// CHECK: Nearest(3, 2) = 0 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(4, 0) = 4 +// CHECK: Nearest(4, 1) = 4 +// CHECK: Nearest(4, 
2) = 4 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 + +// CHECK: --- PostDominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 3 +// CHECK: Nearest(0, 2) = 3 +// CHECK: Nearest(0, 3) = 3 +// CHECK: Nearest(0, 4) = 4 // CHECK: Nearest(1, 0) = 3 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-NEXT: Nearest(1, 2) = 3 -// CHECK-NEXT: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 3 +// CHECK: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 4) = 4 // CHECK: Nearest(2, 0) = 3 -// CHECK-NEXT: Nearest(2, 1) = 3 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 1) = 3 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 4) = 4 // CHECK: Nearest(3, 0) = 3 -// CHECK-NEXT: Nearest(3, 1) = 3 -// CHECK-NEXT: Nearest(3, 2) = 3 -// CHECK-NEXT: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 1) = 3 +// CHECK: Nearest(3, 2) = 3 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(4, 0) = 4 +// CHECK: Nearest(4, 1) = 4 +// CHECK: Nearest(4, 2) = 4 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 + +// CHECK: --- Block Dominance relationship --- +// CHECK: dominates(0, 0) = 1 (properly = 0) +// CHECK: dominates(0, 1) = 1 (properly = 1) +// CHECK: dominates(0, 2) = 1 (properly = 1) +// CHECK: dominates(0, 3) = 1 (properly = 1) +// CHECK: dominates(0, 4) = 0 (properly = 0) +// CHECK: dominates(1, 0) = 0 (properly = 0) +// CHECK: dominates(1, 1) = 1 (properly = 0) +// CHECK: dominates(1, 2) = 0 (properly = 0) +// CHECK: dominates(1, 3) = 0 (properly = 0) +// CHECK: dominates(1, 4) = 0 (properly = 0) +// CHECK: dominates(2, 0) = 0 (properly = 0) +// CHECK: dominates(2, 1) = 0 (properly = 0) +// CHECK: dominates(2, 2) = 1 (properly = 0) +// CHECK: dominates(2, 3) = 0 (properly = 0) +// CHECK: dominates(2, 4) = 0 (properly = 0) +// CHECK: dominates(3, 0) = 0 (properly = 0) +// CHECK: dominates(3, 1) = 0 (properly = 0) +// CHECK: dominates(3, 2) = 0 
(properly = 0) +// CHECK: dominates(3, 3) = 1 (properly = 0) +// CHECK: dominates(3, 4) = 0 (properly = 0) +// CHECK: dominates(4, 0) = 1 (properly = 1) +// CHECK: dominates(4, 1) = 1 (properly = 1) +// CHECK: dominates(4, 2) = 1 (properly = 1) +// CHECK: dominates(4, 3) = 1 (properly = 1) +// CHECK: dominates(4, 4) = 1 (properly = 1) + +// CHECK: --- Block PostDominance relationship --- +// CHECK: postdominates(0, 0) = 1 (properly = 0) +// CHECK: postdominates(0, 1) = 0 (properly = 0) +// CHECK: postdominates(0, 2) = 0 (properly = 0) +// CHECK: postdominates(0, 3) = 0 (properly = 0) +// CHECK: postdominates(0, 4) = 0 (properly = 0) +// CHECK: postdominates(1, 0) = 0 (properly = 0) +// CHECK: postdominates(1, 1) = 1 (properly = 0) +// CHECK: postdominates(1, 2) = 0 (properly = 0) +// CHECK: postdominates(1, 3) = 0 (properly = 0) +// CHECK: postdominates(1, 4) = 0 (properly = 0) +// CHECK: postdominates(2, 0) = 0 (properly = 0) +// CHECK: postdominates(2, 1) = 0 (properly = 0) +// CHECK: postdominates(2, 2) = 1 (properly = 0) +// CHECK: postdominates(2, 3) = 0 (properly = 0) +// CHECK: postdominates(2, 4) = 0 (properly = 0) +// CHECK: postdominates(3, 0) = 1 (properly = 1) +// CHECK: postdominates(3, 1) = 1 (properly = 1) +// CHECK: postdominates(3, 2) = 1 (properly = 1) +// CHECK: postdominates(3, 3) = 1 (properly = 0) +// CHECK: postdominates(3, 4) = 0 (properly = 0) +// CHECK: postdominates(4, 0) = 1 (properly = 1) +// CHECK: postdominates(4, 1) = 1 (properly = 1) +// CHECK: postdominates(4, 2) = 1 (properly = 1) +// CHECK: postdominates(4, 3) = 1 (properly = 1) +// CHECK: postdominates(4, 4) = 1 (properly = 1) + +// CHECK: module attributes {test.block_ids = array} +// CHECK: func.func @func_condBranch({{.*}}) attributes {test.block_ids = array} // ----- @@ -60,32 +137,117 @@ func.func @func_loop(%arg0 : i32, %arg1 : i32) { ^exit: return } -// CHECK-LABEL: --- DominanceInfo --- + +// CHECK: --- DominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 
1) = 0 +// CHECK: Nearest(0, 2) = 0 +// CHECK: Nearest(0, 3) = 0 +// CHECK: Nearest(0, 4) = 4 // CHECK: Nearest(1, 0) = 0 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-NEXT: Nearest(1, 2) = 1 -// CHECK-NEXT: Nearest(1, 3) = 1 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 1 +// CHECK: Nearest(1, 3) = 1 +// CHECK: Nearest(1, 4) = 4 // CHECK: Nearest(2, 0) = 0 -// CHECK-NEXT: Nearest(2, 1) = 1 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 1 +// CHECK: Nearest(2, 1) = 1 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 1 +// CHECK: Nearest(2, 4) = 4 // CHECK: Nearest(3, 0) = 0 -// CHECK-NEXT: Nearest(3, 1) = 1 -// CHECK-NEXT: Nearest(3, 2) = 1 -// CHECK-NEXT: Nearest(3, 3) = 3 -// CHECK-LABEL: --- PostDominanceInfo --- +// CHECK: Nearest(3, 1) = 1 +// CHECK: Nearest(3, 2) = 1 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(4, 0) = 4 +// CHECK: Nearest(4, 1) = 4 +// CHECK: Nearest(4, 2) = 4 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 + +// CHECK: --- PostDominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 1 +// CHECK: Nearest(0, 2) = 1 +// CHECK: Nearest(0, 3) = 3 +// CHECK: Nearest(0, 4) = 4 // CHECK: Nearest(1, 0) = 1 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-NEXT: Nearest(1, 2) = 1 -// CHECK-NEXT: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 1 +// CHECK: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 4) = 4 // CHECK: Nearest(2, 0) = 1 -// CHECK-NEXT: Nearest(2, 1) = 1 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 1) = 1 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 4) = 4 // CHECK: Nearest(3, 0) = 3 -// CHECK-NEXT: Nearest(3, 1) = 3 -// CHECK-NEXT: Nearest(3, 2) = 3 -// CHECK-NEXT: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 1) = 3 +// CHECK: Nearest(3, 2) = 3 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(4, 0) = 4 +// CHECK: Nearest(4, 1) = 4 
+// CHECK: Nearest(4, 2) = 4 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 + +// CHECK: --- Block Dominance relationship --- +// CHECK: dominates(0, 0) = 1 (properly = 0) +// CHECK: dominates(0, 1) = 1 (properly = 1) +// CHECK: dominates(0, 2) = 1 (properly = 1) +// CHECK: dominates(0, 3) = 1 (properly = 1) +// CHECK: dominates(0, 4) = 0 (properly = 0) +// CHECK: dominates(1, 0) = 0 (properly = 0) +// CHECK: dominates(1, 1) = 1 (properly = 0) +// CHECK: dominates(1, 2) = 1 (properly = 1) +// CHECK: dominates(1, 3) = 1 (properly = 1) +// CHECK: dominates(1, 4) = 0 (properly = 0) +// CHECK: dominates(2, 0) = 0 (properly = 0) +// CHECK: dominates(2, 1) = 0 (properly = 0) +// CHECK: dominates(2, 2) = 1 (properly = 0) +// CHECK: dominates(2, 3) = 0 (properly = 0) +// CHECK: dominates(2, 4) = 0 (properly = 0) +// CHECK: dominates(3, 0) = 0 (properly = 0) +// CHECK: dominates(3, 1) = 0 (properly = 0) +// CHECK: dominates(3, 2) = 0 (properly = 0) +// CHECK: dominates(3, 3) = 1 (properly = 0) +// CHECK: dominates(3, 4) = 0 (properly = 0) +// CHECK: dominates(4, 0) = 1 (properly = 1) +// CHECK: dominates(4, 1) = 1 (properly = 1) +// CHECK: dominates(4, 2) = 1 (properly = 1) +// CHECK: dominates(4, 3) = 1 (properly = 1) +// CHECK: dominates(4, 4) = 1 (properly = 1) + +// CHECK: --- Block PostDominance relationship --- +// CHECK: postdominates(0, 0) = 1 (properly = 0) +// CHECK: postdominates(0, 1) = 0 (properly = 0) +// CHECK: postdominates(0, 2) = 0 (properly = 0) +// CHECK: postdominates(0, 3) = 0 (properly = 0) +// CHECK: postdominates(0, 4) = 0 (properly = 0) +// CHECK: postdominates(1, 0) = 1 (properly = 1) +// CHECK: postdominates(1, 1) = 1 (properly = 0) +// CHECK: postdominates(1, 2) = 1 (properly = 1) +// CHECK: postdominates(1, 3) = 0 (properly = 0) +// CHECK: postdominates(1, 4) = 0 (properly = 0) +// CHECK: postdominates(2, 0) = 0 (properly = 0) +// CHECK: postdominates(2, 1) = 0 (properly = 0) +// CHECK: postdominates(2, 2) = 1 (properly = 0) +// 
CHECK: postdominates(2, 3) = 0 (properly = 0) +// CHECK: postdominates(2, 4) = 0 (properly = 0) +// CHECK: postdominates(3, 0) = 1 (properly = 1) +// CHECK: postdominates(3, 1) = 1 (properly = 1) +// CHECK: postdominates(3, 2) = 1 (properly = 1) +// CHECK: postdominates(3, 3) = 1 (properly = 0) +// CHECK: postdominates(3, 4) = 0 (properly = 0) +// CHECK: postdominates(4, 0) = 1 (properly = 1) +// CHECK: postdominates(4, 1) = 1 (properly = 1) +// CHECK: postdominates(4, 2) = 1 (properly = 1) +// CHECK: postdominates(4, 3) = 1 (properly = 1) +// CHECK: postdominates(4, 4) = 1 (properly = 1) + +// CHECK: module attributes {test.block_ids = array} +// CHECK: func.func @func_loop({{.*}}) attributes {test.block_ids = array} // ----- @@ -95,16 +257,57 @@ func.func @nested_region(%arg0 : index, %arg1 : index, %arg2 : index) { return } -// CHECK-LABEL: --- DominanceInfo --- -// CHECK-NEXT: Nearest(0, 0) = 0 -// CHECK-NEXT: Nearest(0, 1) = 1 +// CHECK: --- DominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 1 +// CHECK: Nearest(0, 2) = 2 // CHECK: Nearest(1, 0) = 1 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-LABEL: --- PostDominanceInfo --- -// CHECK-NEXT: Nearest(0, 0) = 0 -// CHECK-NEXT: Nearest(0, 1) = 1 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 2 +// CHECK: Nearest(2, 0) = 2 +// CHECK: Nearest(2, 1) = 2 +// CHECK: Nearest(2, 2) = 2 + +// CHECK: --- PostDominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 1 +// CHECK: Nearest(0, 2) = 2 // CHECK: Nearest(1, 0) = 1 -// CHECK-NEXT: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 2 +// CHECK: Nearest(2, 0) = 2 +// CHECK: Nearest(2, 1) = 2 +// CHECK: Nearest(2, 2) = 2 + +// CHECK: --- Block Dominance relationship --- +// CHECK: dominates(0, 0) = 1 (properly = 0) +// CHECK: dominates(0, 1) = 0 (properly = 0) +// CHECK: dominates(0, 2) = 0 (properly = 0) +// CHECK: dominates(1, 0) = 1 (properly = 1) +// CHECK: dominates(1, 1) = 1 (properly = 0) 
+// CHECK: dominates(1, 2) = 0 (properly = 0) +// CHECK: dominates(2, 0) = 1 (properly = 1) +// CHECK: dominates(2, 1) = 1 (properly = 1) +// CHECK: dominates(2, 2) = 1 (properly = 1) + +// CHECK: --- Block PostDominance relationship --- +// CHECK: postdominates(0, 0) = 1 (properly = 0) +// CHECK: postdominates(0, 1) = 0 (properly = 0) +// CHECK: postdominates(0, 2) = 0 (properly = 0) +// CHECK: postdominates(1, 0) = 1 (properly = 1) +// CHECK: postdominates(1, 1) = 1 (properly = 0) +// CHECK: postdominates(1, 2) = 0 (properly = 0) +// CHECK: postdominates(2, 0) = 1 (properly = 1) +// CHECK: postdominates(2, 1) = 1 (properly = 1) +// CHECK: postdominates(2, 2) = 1 (properly = 1) + +// CHECK: module attributes {test.block_ids = array} { +// CHECK: func.func @nested_region({{.*}}) attributes {test.block_ids = array} { +// CHECK: scf.for {{.*}} { +// CHECK: } {test.block_ids = array} +// CHECK: return +// CHECK: } +// CHECK: } // ----- @@ -117,32 +320,126 @@ func.func @nested_region2(%arg0 : index, %arg1 : index, %arg2 : index) { } return } -// CHECK-LABEL: --- DominanceInfo --- + +// CHECK: --- DominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 1 +// CHECK: Nearest(0, 2) = 2 +// CHECK: Nearest(0, 3) = 3 +// CHECK: Nearest(0, 4) = 4 // CHECK: Nearest(1, 0) = 1 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-NEXT: Nearest(1, 2) = 2 -// CHECK-NEXT: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 2 +// CHECK: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 4) = 4 // CHECK: Nearest(2, 0) = 2 -// CHECK-NEXT: Nearest(2, 1) = 2 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 1) = 2 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 4) = 4 // CHECK: Nearest(3, 0) = 3 -// CHECK-NEXT: Nearest(3, 1) = 3 -// CHECK-NEXT: Nearest(3, 2) = 3 -// CHECK-NEXT: Nearest(3, 3) = 3 -// CHECK-LABEL: --- PostDominanceInfo --- -// CHECK-NEXT: Nearest(0, 0) = 0 -// CHECK-NEXT: Nearest(0, 
1) = 1 -// CHECK-NEXT: Nearest(0, 2) = 2 -// CHECK-NEXT: Nearest(0, 3) = 3 +// CHECK: Nearest(3, 1) = 3 +// CHECK: Nearest(3, 2) = 3 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(4, 0) = 4 +// CHECK: Nearest(4, 1) = 4 +// CHECK: Nearest(4, 2) = 4 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 + +// CHECK: --- PostDominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 1 +// CHECK: Nearest(0, 2) = 2 +// CHECK: Nearest(0, 3) = 3 +// CHECK: Nearest(0, 4) = 4 // CHECK: Nearest(1, 0) = 1 -// CHECK-NEXT: Nearest(1, 1) = 1 -// CHECK-NEXT: Nearest(1, 2) = 2 -// CHECK-NEXT: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 2 +// CHECK: Nearest(1, 3) = 3 +// CHECK: Nearest(1, 4) = 4 // CHECK: Nearest(2, 0) = 2 -// CHECK-NEXT: Nearest(2, 1) = 2 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 1) = 2 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 3 +// CHECK: Nearest(2, 4) = 4 +// CHECK: Nearest(3, 0) = 3 +// CHECK: Nearest(3, 1) = 3 +// CHECK: Nearest(3, 2) = 3 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(4, 0) = 4 +// CHECK: Nearest(4, 1) = 4 +// CHECK: Nearest(4, 2) = 4 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 + +// CHECK: --- Block Dominance relationship --- +// CHECK: dominates(0, 0) = 1 (properly = 0) +// CHECK: dominates(0, 1) = 0 (properly = 0) +// CHECK: dominates(0, 2) = 0 (properly = 0) +// CHECK: dominates(0, 3) = 0 (properly = 0) +// CHECK: dominates(0, 4) = 0 (properly = 0) +// CHECK: dominates(1, 0) = 1 (properly = 1) +// CHECK: dominates(1, 1) = 1 (properly = 0) +// CHECK: dominates(1, 2) = 0 (properly = 0) +// CHECK: dominates(1, 3) = 0 (properly = 0) +// CHECK: dominates(1, 4) = 0 (properly = 0) +// CHECK: dominates(2, 0) = 1 (properly = 1) +// CHECK: dominates(2, 1) = 1 (properly = 1) +// CHECK: dominates(2, 2) = 1 (properly = 0) +// CHECK: dominates(2, 3) = 0 (properly = 0) +// 
CHECK: dominates(2, 4) = 0 (properly = 0) +// CHECK: dominates(3, 0) = 1 (properly = 1) +// CHECK: dominates(3, 1) = 1 (properly = 1) +// CHECK: dominates(3, 2) = 1 (properly = 1) +// CHECK: dominates(3, 3) = 1 (properly = 0) +// CHECK: dominates(3, 4) = 0 (properly = 0) +// CHECK: dominates(4, 0) = 1 (properly = 1) +// CHECK: dominates(4, 1) = 1 (properly = 1) +// CHECK: dominates(4, 2) = 1 (properly = 1) +// CHECK: dominates(4, 3) = 1 (properly = 1) +// CHECK: dominates(4, 4) = 1 (properly = 1) + +// CHECK: --- Block PostDominance relationship --- +// CHECK: postdominates(0, 0) = 1 (properly = 0) +// CHECK: postdominates(0, 1) = 0 (properly = 0) +// CHECK: postdominates(0, 2) = 0 (properly = 0) +// CHECK: postdominates(0, 3) = 0 (properly = 0) +// CHECK: postdominates(0, 4) = 0 (properly = 0) +// CHECK: postdominates(1, 0) = 1 (properly = 1) +// CHECK: postdominates(1, 1) = 1 (properly = 0) +// CHECK: postdominates(1, 2) = 0 (properly = 0) +// CHECK: postdominates(1, 3) = 0 (properly = 0) +// CHECK: postdominates(1, 4) = 0 (properly = 0) +// CHECK: postdominates(2, 0) = 1 (properly = 1) +// CHECK: postdominates(2, 1) = 1 (properly = 1) +// CHECK: postdominates(2, 2) = 1 (properly = 0) +// CHECK: postdominates(2, 3) = 0 (properly = 0) +// CHECK: postdominates(2, 4) = 0 (properly = 0) +// CHECK: postdominates(3, 0) = 1 (properly = 1) +// CHECK: postdominates(3, 1) = 1 (properly = 1) +// CHECK: postdominates(3, 2) = 1 (properly = 1) +// CHECK: postdominates(3, 3) = 1 (properly = 0) +// CHECK: postdominates(3, 4) = 0 (properly = 0) +// CHECK: postdominates(4, 0) = 1 (properly = 1) +// CHECK: postdominates(4, 1) = 1 (properly = 1) +// CHECK: postdominates(4, 2) = 1 (properly = 1) +// CHECK: postdominates(4, 3) = 1 (properly = 1) +// CHECK: postdominates(4, 4) = 1 (properly = 1) + +// CHECK: module attributes {test.block_ids = array} { +// CHECK: func.func @nested_region2({{.*}}) attributes {test.block_ids = array} { +// CHECK: scf.for {{.*}} { +// CHECK: scf.for 
{{.*}} { +// CHECK: scf.for {{.*}} { +// CHECK: } {test.block_ids = array} +// CHECK: } {test.block_ids = array} +// CHECK: } {test.block_ids = array} +// CHECK: return +// CHECK: } +// CHECK: } // ----- @@ -167,141 +464,219 @@ func.func @func_loop_nested_region( ^exit: return } -// CHECK-LABEL: --- DominanceInfo --- + +// CHECK: --- DominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 0 +// CHECK: Nearest(0, 2) = 0 +// CHECK: Nearest(0, 3) = 0 +// CHECK: Nearest(0, 4) = 0 +// CHECK: Nearest(0, 5) = 0 +// CHECK: Nearest(0, 6) = 6 +// CHECK: Nearest(1, 0) = 0 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 1 +// CHECK: Nearest(1, 3) = 1 +// CHECK: Nearest(1, 4) = 1 +// CHECK: Nearest(1, 5) = 1 +// CHECK: Nearest(1, 6) = 6 // CHECK: Nearest(2, 0) = 0 -// CHECK-NEXT: Nearest(2, 1) = 1 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 2 -// CHECK-NEXT: Nearest(2, 4) = 2 -// CHECK-NEXT: Nearest(2, 5) = 1 +// CHECK: Nearest(2, 1) = 1 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 2 +// CHECK: Nearest(2, 4) = 2 +// CHECK: Nearest(2, 5) = 1 +// CHECK: Nearest(2, 6) = 6 // CHECK: Nearest(3, 0) = 0 -// CHECK-NEXT: Nearest(3, 1) = 1 -// CHECK-NEXT: Nearest(3, 2) = 2 -// CHECK-NEXT: Nearest(3, 3) = 3 -// CHECK-NEXT: Nearest(3, 4) = 4 -// CHECK-NEXT: Nearest(3, 5) = 1 +// CHECK: Nearest(3, 1) = 1 +// CHECK: Nearest(3, 2) = 2 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(3, 5) = 1 +// CHECK: Nearest(3, 6) = 6 // CHECK: Nearest(4, 0) = 0 -// CHECK-NEXT: Nearest(4, 1) = 1 -// CHECK-NEXT: Nearest(4, 2) = 2 -// CHECK-NEXT: Nearest(4, 3) = 4 -// CHECK-NEXT: Nearest(4, 4) = 4 -// CHECK-NEXT: Nearest(4, 5) = 1 -// CHECK-LABEL: --- PostDominanceInfo --- +// CHECK: Nearest(4, 1) = 1 +// CHECK: Nearest(4, 2) = 2 +// CHECK: Nearest(4, 3) = 4 +// CHECK: Nearest(4, 4) = 4 +// CHECK: Nearest(4, 5) = 1 +// CHECK: Nearest(4, 6) = 6 +// CHECK: Nearest(5, 0) = 0 +// CHECK: Nearest(5, 1) = 1 +// CHECK: Nearest(5, 
2) = 1 +// CHECK: Nearest(5, 3) = 1 +// CHECK: Nearest(5, 4) = 1 +// CHECK: Nearest(5, 5) = 5 +// CHECK: Nearest(5, 6) = 6 +// CHECK: Nearest(6, 0) = 6 +// CHECK: Nearest(6, 1) = 6 +// CHECK: Nearest(6, 2) = 6 +// CHECK: Nearest(6, 3) = 6 +// CHECK: Nearest(6, 4) = 6 +// CHECK: Nearest(6, 5) = 6 +// CHECK: Nearest(6, 6) = 6 + +// CHECK: --- PostDominanceInfo --- +// CHECK: Nearest(0, 0) = 0 +// CHECK: Nearest(0, 1) = 1 +// CHECK: Nearest(0, 2) = 1 +// CHECK: Nearest(0, 3) = 1 +// CHECK: Nearest(0, 4) = 1 +// CHECK: Nearest(0, 5) = 5 +// CHECK: Nearest(0, 6) = 6 +// CHECK: Nearest(1, 0) = 1 +// CHECK: Nearest(1, 1) = 1 +// CHECK: Nearest(1, 2) = 1 +// CHECK: Nearest(1, 3) = 1 +// CHECK: Nearest(1, 4) = 1 +// CHECK: Nearest(1, 5) = 5 +// CHECK: Nearest(1, 6) = 6 // CHECK: Nearest(2, 0) = 1 -// CHECK-NEXT: Nearest(2, 1) = 1 -// CHECK-NEXT: Nearest(2, 2) = 2 -// CHECK-NEXT: Nearest(2, 3) = 2 -// CHECK-NEXT: Nearest(2, 4) = 2 -// CHECK-NEXT: Nearest(2, 5) = 5 +// CHECK: Nearest(2, 1) = 1 +// CHECK: Nearest(2, 2) = 2 +// CHECK: Nearest(2, 3) = 2 +// CHECK: Nearest(2, 4) = 2 +// CHECK: Nearest(2, 5) = 5 +// CHECK: Nearest(2, 6) = 6 // CHECK: Nearest(3, 0) = 1 -// CHECK-NEXT: Nearest(3, 1) = 1 -// CHECK-NEXT: Nearest(3, 2) = 2 -// CHECK-NEXT: Nearest(3, 3) = 3 -// CHECK-NEXT: Nearest(3, 4) = 4 -// CHECK-NEXT: Nearest(3, 5) = 5 +// CHECK: Nearest(3, 1) = 1 +// CHECK: Nearest(3, 2) = 2 +// CHECK: Nearest(3, 3) = 3 +// CHECK: Nearest(3, 4) = 4 +// CHECK: Nearest(3, 5) = 5 +// CHECK: Nearest(3, 6) = 6 // CHECK: Nearest(4, 0) = 1 -// CHECK-NEXT: Nearest(4, 1) = 1 -// CHECK-NEXT: Nearest(4, 2) = 2 -// CHECK-NEXT: Nearest(4, 3) = 4 -// CHECK-NEXT: Nearest(4, 4) = 4 -// CHECK-NEXT: Nearest(4, 5) = 5 -// CHECK-LABEL: --- Block Dominance relationship --- -// CHECK-NEXT: dominates(0, 0) = true -// CHECK-NEXT: dominates(0, 1) = true -// CHECK-NEXT: dominates(0, 2) = true -// CHECK-NEXT: dominates(0, 3) = true -// CHECK-NEXT: dominates(0, 4) = true -// CHECK-NEXT: dominates(0, 5) = 
true -// CHECK-NEXT: dominates(0, 6) = false -// CHECK-NEXT: dominates(1, 0) = false -// CHECK-NEXT: dominates(1, 1) = true -// CHECK-NEXT: dominates(1, 2) = true -// CHECK-NEXT: dominates(1, 3) = true -// CHECK-NEXT: dominates(1, 4) = true -// CHECK-NEXT: dominates(1, 5) = true -// CHECK-NEXT: dominates(1, 6) = false -// CHECK-NEXT: dominates(2, 0) = false -// CHECK-NEXT: dominates(2, 1) = false -// CHECK-NEXT: dominates(2, 2) = true -// CHECK-NEXT: dominates(2, 3) = true -// CHECK-NEXT: dominates(2, 4) = true -// CHECK-NEXT: dominates(2, 5) = false -// CHECK-NEXT: dominates(2, 6) = false -// CHECK-NEXT: dominates(3, 0) = false -// CHECK-NEXT: dominates(3, 1) = false -// CHECK-NEXT: dominates(3, 2) = false -// CHECK-NEXT: dominates(3, 3) = true -// CHECK-NEXT: dominates(3, 4) = false -// CHECK-NEXT: dominates(3, 5) = false -// CHECK-NEXT: dominates(3, 6) = false -// CHECK-NEXT: dominates(4, 0) = false -// CHECK-NEXT: dominates(4, 1) = false -// CHECK-NEXT: dominates(4, 2) = false -// CHECK-NEXT: dominates(4, 3) = true -// CHECK-NEXT: dominates(4, 4) = true -// CHECK-NEXT: dominates(4, 5) = false -// CHECK-NEXT: dominates(4, 6) = false -// CHECK-NEXT: dominates(5, 0) = false -// CHECK-NEXT: dominates(5, 1) = false -// CHECK-NEXT: dominates(5, 2) = false -// CHECK-NEXT: dominates(5, 3) = false -// CHECK-NEXT: dominates(5, 4) = false -// CHECK-NEXT: dominates(5, 5) = true -// CHECK-NEXT: dominates(5, 6) = false -// CHECK-NEXT: dominates(6, 0) = true -// CHECK-NEXT: dominates(6, 1) = true -// CHECK-NEXT: dominates(6, 2) = true -// CHECK-NEXT: dominates(6, 3) = true -// CHECK-NEXT: dominates(6, 4) = true -// CHECK-NEXT: dominates(6, 5) = true -// CHECK-NEXT: dominates(6, 6) = true -// CHECK-LABEL: --- Block PostDominance relationship --- -// CHECK-NEXT: postdominates(0, 0) = true -// CHECK-NEXT: postdominates(0, 1) = false -// CHECK-NEXT: postdominates(0, 2) = false -// CHECK-NEXT: postdominates(0, 3) = false -// CHECK-NEXT: postdominates(0, 4) = false -// CHECK-NEXT: 
postdominates(0, 5) = false -// CHECK-NEXT: postdominates(0, 6) = false -// CHECK-NEXT: postdominates(1, 0) = true -// CHECK-NEXT: postdominates(1, 1) = true -// CHECK-NEXT: postdominates(1, 2) = true -// CHECK-NEXT: postdominates(1, 3) = true -// CHECK-NEXT: postdominates(1, 4) = true -// CHECK-NEXT: postdominates(1, 5) = false -// CHECK-NEXT: postdominates(1, 6) = false -// CHECK-NEXT: postdominates(2, 0) = false -// CHECK-NEXT: postdominates(2, 1) = false -// CHECK-NEXT: postdominates(2, 2) = true -// CHECK-NEXT: postdominates(2, 3) = true -// CHECK-NEXT: postdominates(2, 4) = true -// CHECK-NEXT: postdominates(2, 5) = false -// CHECK-NEXT: postdominates(2, 6) = false -// CHECK-NEXT: postdominates(3, 0) = false -// CHECK-NEXT: postdominates(3, 1) = false -// CHECK-NEXT: postdominates(3, 2) = false -// CHECK-NEXT: postdominates(3, 3) = true -// CHECK-NEXT: postdominates(3, 4) = false -// CHECK-NEXT: postdominates(3, 5) = false -// CHECK-NEXT: postdominates(3, 6) = false -// CHECK-NEXT: postdominates(4, 0) = false -// CHECK-NEXT: postdominates(4, 1) = false -// CHECK-NEXT: postdominates(4, 2) = false -// CHECK-NEXT: postdominates(4, 3) = true -// CHECK-NEXT: postdominates(4, 4) = true -// CHECK-NEXT: postdominates(4, 5) = false -// CHECK-NEXT: postdominates(4, 6) = false -// CHECK-NEXT: postdominates(5, 0) = true -// CHECK-NEXT: postdominates(5, 1) = true -// CHECK-NEXT: postdominates(5, 2) = true -// CHECK-NEXT: postdominates(5, 3) = true -// CHECK-NEXT: postdominates(5, 4) = true -// CHECK-NEXT: postdominates(5, 5) = true -// CHECK-NEXT: postdominates(5, 6) = false -// CHECK-NEXT: postdominates(6, 0) = true -// CHECK-NEXT: postdominates(6, 1) = true -// CHECK-NEXT: postdominates(6, 2) = true -// CHECK-NEXT: postdominates(6, 3) = true -// CHECK-NEXT: postdominates(6, 4) = true -// CHECK-NEXT: postdominates(6, 5) = true -// CHECK-NEXT: postdominates(6, 6) = true +// CHECK: Nearest(4, 1) = 1 +// CHECK: Nearest(4, 2) = 2 +// CHECK: Nearest(4, 3) = 4 +// CHECK: 
Nearest(4, 4) = 4 +// CHECK: Nearest(4, 5) = 5 +// CHECK: Nearest(4, 6) = 6 +// CHECK: Nearest(5, 0) = 5 +// CHECK: Nearest(5, 1) = 5 +// CHECK: Nearest(5, 2) = 5 +// CHECK: Nearest(5, 3) = 5 +// CHECK: Nearest(5, 4) = 5 +// CHECK: Nearest(5, 5) = 5 +// CHECK: Nearest(5, 6) = 6 +// CHECK: Nearest(6, 0) = 6 +// CHECK: Nearest(6, 1) = 6 +// CHECK: Nearest(6, 2) = 6 +// CHECK: Nearest(6, 3) = 6 +// CHECK: Nearest(6, 4) = 6 +// CHECK: Nearest(6, 5) = 6 +// CHECK: Nearest(6, 6) = 6 + +// CHECK: --- Block Dominance relationship --- +// CHECK: dominates(0, 0) = 1 (properly = 0) +// CHECK: dominates(0, 1) = 1 (properly = 1) +// CHECK: dominates(0, 2) = 1 (properly = 1) +// CHECK: dominates(0, 3) = 1 (properly = 1) +// CHECK: dominates(0, 4) = 1 (properly = 1) +// CHECK: dominates(0, 5) = 1 (properly = 1) +// CHECK: dominates(0, 6) = 0 (properly = 0) +// CHECK: dominates(1, 0) = 0 (properly = 0) +// CHECK: dominates(1, 1) = 1 (properly = 0) +// CHECK: dominates(1, 2) = 1 (properly = 1) +// CHECK: dominates(1, 3) = 1 (properly = 1) +// CHECK: dominates(1, 4) = 1 (properly = 1) +// CHECK: dominates(1, 5) = 1 (properly = 1) +// CHECK: dominates(1, 6) = 0 (properly = 0) +// CHECK: dominates(2, 0) = 0 (properly = 0) +// CHECK: dominates(2, 1) = 0 (properly = 0) +// CHECK: dominates(2, 2) = 1 (properly = 0) +// CHECK: dominates(2, 3) = 1 (properly = 1) +// CHECK: dominates(2, 4) = 1 (properly = 1) +// CHECK: dominates(2, 5) = 0 (properly = 0) +// CHECK: dominates(2, 6) = 0 (properly = 0) +// CHECK: dominates(3, 0) = 0 (properly = 0) +// CHECK: dominates(3, 1) = 0 (properly = 0) +// CHECK: dominates(3, 2) = 0 (properly = 0) +// CHECK: dominates(3, 3) = 1 (properly = 0) +// CHECK: dominates(3, 4) = 0 (properly = 0) +// CHECK: dominates(3, 5) = 0 (properly = 0) +// CHECK: dominates(3, 6) = 0 (properly = 0) +// CHECK: dominates(4, 0) = 0 (properly = 0) +// CHECK: dominates(4, 1) = 0 (properly = 0) +// CHECK: dominates(4, 2) = 0 (properly = 0) +// CHECK: dominates(4, 3) = 1 (properly 
= 1) +// CHECK: dominates(4, 4) = 1 (properly = 0) +// CHECK: dominates(4, 5) = 0 (properly = 0) +// CHECK: dominates(4, 6) = 0 (properly = 0) +// CHECK: dominates(5, 0) = 0 (properly = 0) +// CHECK: dominates(5, 1) = 0 (properly = 0) +// CHECK: dominates(5, 2) = 0 (properly = 0) +// CHECK: dominates(5, 3) = 0 (properly = 0) +// CHECK: dominates(5, 4) = 0 (properly = 0) +// CHECK: dominates(5, 5) = 1 (properly = 0) +// CHECK: dominates(5, 6) = 0 (properly = 0) +// CHECK: dominates(6, 0) = 1 (properly = 1) +// CHECK: dominates(6, 1) = 1 (properly = 1) +// CHECK: dominates(6, 2) = 1 (properly = 1) +// CHECK: dominates(6, 3) = 1 (properly = 1) +// CHECK: dominates(6, 4) = 1 (properly = 1) +// CHECK: dominates(6, 5) = 1 (properly = 1) +// CHECK: dominates(6, 6) = 1 (properly = 1) + +// CHECK: --- Block PostDominance relationship --- +// CHECK: postdominates(0, 0) = 1 (properly = 0) +// CHECK: postdominates(0, 1) = 0 (properly = 0) +// CHECK: postdominates(0, 2) = 0 (properly = 0) +// CHECK: postdominates(0, 3) = 0 (properly = 0) +// CHECK: postdominates(0, 4) = 0 (properly = 0) +// CHECK: postdominates(0, 5) = 0 (properly = 0) +// CHECK: postdominates(0, 6) = 0 (properly = 0) +// CHECK: postdominates(1, 0) = 1 (properly = 1) +// CHECK: postdominates(1, 1) = 1 (properly = 0) +// CHECK: postdominates(1, 2) = 1 (properly = 1) +// CHECK: postdominates(1, 3) = 1 (properly = 1) +// CHECK: postdominates(1, 4) = 1 (properly = 1) +// CHECK: postdominates(1, 5) = 0 (properly = 0) +// CHECK: postdominates(1, 6) = 0 (properly = 0) +// CHECK: postdominates(2, 0) = 0 (properly = 0) +// CHECK: postdominates(2, 1) = 0 (properly = 0) +// CHECK: postdominates(2, 2) = 1 (properly = 0) +// CHECK: postdominates(2, 3) = 1 (properly = 1) +// CHECK: postdominates(2, 4) = 1 (properly = 1) +// CHECK: postdominates(2, 5) = 0 (properly = 0) +// CHECK: postdominates(2, 6) = 0 (properly = 0) +// CHECK: postdominates(3, 0) = 0 (properly = 0) +// CHECK: postdominates(3, 1) = 0 (properly = 0) +// 
CHECK: postdominates(3, 2) = 0 (properly = 0) +// CHECK: postdominates(3, 3) = 1 (properly = 0) +// CHECK: postdominates(3, 4) = 0 (properly = 0) +// CHECK: postdominates(3, 5) = 0 (properly = 0) +// CHECK: postdominates(3, 6) = 0 (properly = 0) +// CHECK: postdominates(4, 0) = 0 (properly = 0) +// CHECK: postdominates(4, 1) = 0 (properly = 0) +// CHECK: postdominates(4, 2) = 0 (properly = 0) +// CHECK: postdominates(4, 3) = 1 (properly = 1) +// CHECK: postdominates(4, 4) = 1 (properly = 0) +// CHECK: postdominates(4, 5) = 0 (properly = 0) +// CHECK: postdominates(4, 6) = 0 (properly = 0) +// CHECK: postdominates(5, 0) = 1 (properly = 1) +// CHECK: postdominates(5, 1) = 1 (properly = 1) +// CHECK: postdominates(5, 2) = 1 (properly = 1) +// CHECK: postdominates(5, 3) = 1 (properly = 1) +// CHECK: postdominates(5, 4) = 1 (properly = 1) +// CHECK: postdominates(5, 5) = 1 (properly = 0) +// CHECK: postdominates(5, 6) = 0 (properly = 0) +// CHECK: postdominates(6, 0) = 1 (properly = 1) +// CHECK: postdominates(6, 1) = 1 (properly = 1) +// CHECK: postdominates(6, 2) = 1 (properly = 1) +// CHECK: postdominates(6, 3) = 1 (properly = 1) +// CHECK: postdominates(6, 4) = 1 (properly = 1) +// CHECK: postdominates(6, 5) = 1 (properly = 1) +// CHECK: postdominates(6, 6) = 1 (properly = 1) + +// CHECK: module attributes {test.block_ids = array} { +// CHECK: func.func @func_loop_nested_region({{.*}}) attributes {test.block_ids = array} { +// CHECK: ^{{.*}} +// CHECK: ^{{.*}} +// CHECK: scf.for {{.*}} { +// CHECK: scf.for {{.*}} { +// CHECK: } {test.block_ids = array} +// CHECK: } {test.block_ids = array} +// CHECK: ^{{.*}} +// CHECK: } +// CHECK: } diff --git a/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir b/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir new file mode 100644 index 00000000000000..7e94677ebbdd7e --- /dev/null +++ b/mlir/test/Conversion/MemRefToLLVM/invalid-uint.mlir @@ -0,0 +1,8 @@ +// RUN: mlir-opt %s -finalize-memref-to-llvm -verify-diagnostics + +// 
CHECK-LABEL: @invalid_int_conversion +func.func @invalid_int_conversion() { + // expected-error@+1 {{conversion of memref memory space 1 : ui64 to integer address space failed. Consider adding memory space conversions.}} + %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xf32, 1 : ui64> + return +} diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index f6007aa16c1266..fa179744094c67 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -1566,3 +1566,14 @@ func.func @linearize_all_zero_unit_basis() -> index { %ret = affine.linearize_index [%c0, %c0] by (1, 1) : index return %ret : index } + +// ----- + +// CHECK-LABEL: @linearize_one_element_basis +// CHECK-SAME: (%[[arg0:.+]]: index, %[[arg1:.+]]: index) +// CHECK-NOT: affine.linearize_index +// CHECK: return %[[arg0]] +func.func @linearize_one_element_basis(%arg0: index, %arg1: index) -> index { + %ret = affine.linearize_index [%arg0] by (%arg1) : index + return %ret : index +} diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 682780c5f0a7df..aa558bad2299ce 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -325,6 +325,36 @@ func.func @casts(%arg0: i32, %arg1: i64, %arg2: vector<4xi32>, llvm.return } +// CHECK-LABEL: @nneg_casts +// CHECK-SAME: (%[[I32:.*]]: i32, %[[I64:.*]]: i64, %[[V4I32:.*]]: vector<4xi32>, %[[V4I64:.*]]: vector<4xi64>, %[[PTR:.*]]: !llvm.ptr) +func.func @nneg_casts(%arg0: i32, %arg1: i64, %arg2: vector<4xi32>, + %arg3: vector<4xi64>, %arg4: !llvm.ptr) { +// CHECK: = llvm.zext nneg %[[I32]] : i32 to i64 + %0 = llvm.zext nneg %arg0 : i32 to i64 +// CHECK: = llvm.zext nneg %[[V4I32]] : vector<4xi32> to vector<4xi64> + %4 = llvm.zext nneg %arg2 : vector<4xi32> to vector<4xi64> +// CHECK: = llvm.uitofp nneg %[[I32]] : i32 to f32 + %7 = llvm.uitofp nneg %arg0 : i32 to f32 + llvm.return 
+} + +// CHECK-LABEL: @casts_overflow +// CHECK-SAME: (%[[I32:.*]]: i32, %[[I64:.*]]: i64, %[[V4I32:.*]]: vector<4xi32>, %[[V4I64:.*]]: vector<4xi64>, %[[PTR:.*]]: !llvm.ptr) +func.func @casts_overflow(%arg0: i32, %arg1: i64, %arg2: vector<4xi32>, + %arg3: vector<4xi64>, %arg4: !llvm.ptr) { +// CHECK: = llvm.trunc %[[I64]] overflow : i64 to i56 + %0 = llvm.trunc %arg1 overflow : i64 to i56 +// CHECK: = llvm.trunc %[[I64]] overflow : i64 to i56 + %1 = llvm.trunc %arg1 overflow : i64 to i56 +// CHECK: = llvm.trunc %[[I64]] overflow : i64 to i56 + %2 = llvm.trunc %arg1 overflow : i64 to i56 +// CHECK: = llvm.trunc %[[I64]] overflow : i64 to i56 + %3 = llvm.trunc %arg1 overflow : i64 to i56 +// CHECK: = llvm.trunc %[[V4I64]] overflow : vector<4xi64> to vector<4xi56> + %4 = llvm.trunc %arg3 overflow : vector<4xi64> to vector<4xi56> + llvm.return +} + // CHECK-LABEL: @vect func.func @vect(%arg0: vector<4xf32>, %arg1: i32, %arg2: f32, %arg3: !llvm.vec<2 x ptr>) { // CHECK: = llvm.extractelement {{.*}} : vector<4xf32> diff --git a/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir b/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir new file mode 100644 index 00000000000000..38e406a13ec087 --- /dev/null +++ b/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir @@ -0,0 +1,71 @@ +// RUN: mlir-opt %s -split-input-file --linalg-specialize-generic-ops | FileCheck %s + +#projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)> +#identity = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> + +func.func @transpose_and_broadcast(%x : tensor<7x8x9xf32>, %y: tensor<5x9x7x8x10xf32>, %z : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> { + %res = linalg.generic + { indexing_maps = [#projection, #identity, #identity], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} + ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>) outs(%z : 
tensor<5x9x7x8x10xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %div = arith.divf %in, %in_1 : f32 + linalg.yield %div : f32 + } -> tensor<5x9x7x8x10xf32> + return %res : tensor<5x9x7x8x10xf32> +} + +// CHECK-LABEL: transpose_and_broadcast +// CHECK-SAME: %[[X:.+]]: tensor<7x8x9xf32>, %[[Y:.+]]: tensor<5x9x7x8x10xf32>, %[[Z:.+]]: tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> { +// CHECK: %[[E0:.+]] = tensor.empty() : tensor<9x7x8xf32> +// CHECK: %[[X_trans:.+]] = linalg.transpose ins(%[[X]] : tensor<7x8x9xf32>) outs(%[[E0]] : tensor<9x7x8xf32>) permutation = [2, 0, 1] +// CHECK: %[[E1:.+]] = tensor.empty() : tensor<5x9x7x8x10xf32> +// CHECK: %[[X_trans_bc:.+]] = linalg.broadcast ins(%[[X_trans]] : tensor<9x7x8xf32>) outs(%[[E1]] : tensor<5x9x7x8x10xf32>) dimensions = [0, 4] +// CHECK: {{.*}} = linalg.div ins(%[[X_trans_bc]], %[[Y]] : tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>) outs(%[[Z]] : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> +// CHECK-NOT: linalg.generic + +// ----- + +#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#transposed = affine_map<(d0, d1, d2) -> (d2, d0, d1)> + +func.func @transpose_only(%x : tensor<32x2x16xf32>, %y: tensor<2x16x32xf32>, %z : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { + %res = linalg.generic + { indexing_maps = [#transposed, #identity, #identity], iterator_types = ["parallel", "parallel", "parallel"]} + ins(%x, %y : tensor<32x2x16xf32>, tensor<2x16x32xf32>) + outs(%z : tensor<2x16x32xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %div = arith.divf %in, %in_1 : f32 + linalg.yield %div : f32 + } -> tensor<2x16x32xf32> + return %res : tensor<2x16x32xf32> +} + +// CHECK-LABEL: transpose_only +// CHECK-SAME: %[[X:.+]]: tensor<32x2x16xf32>, %[[Y:.+]]: tensor<2x16x32xf32>, %[[Z:.+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { +// CHECK: %[[E0:.+]] = tensor.empty() : tensor<2x16x32xf32> +// CHECK: %[[X_trans:.+]] = linalg.transpose ins(%[[X]] : tensor<32x2x16xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) 
permutation = [1, 2, 0] +// CHECK: {{.*}} = linalg.div ins(%[[X_trans]], %[[Y]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%[[Z]] : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> +// CHECK-NOT: linalg.generic + +// ----- + +#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#broadcast = affine_map<(d0, d1, d2) -> (d0, d2)> +func.func @broadcast_only(%x : tensor<2x16x32xf32>, %y: tensor<2x32xf32>, %z : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { + %res = linalg.generic + { indexing_maps = [#identity, #broadcast, #identity], iterator_types = ["parallel", "parallel", "parallel"]} + ins(%x, %y : tensor<2x16x32xf32>, tensor<2x32xf32>) + outs(%z : tensor<2x16x32xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %div = arith.divf %in, %in_1 : f32 + linalg.yield %div : f32 + } -> tensor<2x16x32xf32> + return %res : tensor<2x16x32xf32> +} + +// CHECK-LABEL: broadcast_only +// CHECK-SAME: %[[X:.+]]: tensor<2x16x32xf32>, %[[Y:.+]]: tensor<2x32xf32>, %[[Z:.+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { +// CHECK: %[[E0:.+]] = tensor.empty() : tensor<2x16x32xf32> +// CHECK: %[[X_bc:.+]] = linalg.broadcast ins(%[[Y]] : tensor<2x32xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) dimensions = [1] +// CHECK: {{.*}} = linalg.div ins(%[[X]], %[[X_bc]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%arg2 : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> +// CHECK-NOT: linalg.generic diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index f616f6795bf9dc..7038a6ff744e4e 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -1,6 +1,123 @@ // RUN: mlir-opt %s | mlir-opt | FileCheck %s // RUN: mlir-opt %s --mlir-print-op-generic | mlir-opt | FileCheck %s +// CHECK: #[[$MAP:.*]] = affine_map<(d0, d1)[s0] -> (d0 + s0, d1)> + +// CHECK-LABEL: func @alloc() { +func.func @alloc() { +^bb0: + // Test simple alloc. 
+ // CHECK: %{{.*}} = memref.alloc() : memref<1024x64xf32, 1> + %0 = memref.alloc() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + + %c0 = "arith.constant"() {value = 0: index} : () -> index + %c1 = "arith.constant"() {value = 1: index} : () -> index + + // Test alloc with dynamic dimensions. + // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}) : memref + %1 = memref.alloc(%c0, %c1) : memref (d0, d1)>, 1> + + // Test alloc with no dynamic dimensions and one symbol. + // CHECK: %{{.*}} = memref.alloc()[%{{.*}}] : memref<2x4xf32, #[[$MAP]], 1> + %2 = memref.alloc()[%c0] : memref<2x4xf32, affine_map<(d0, d1)[s0] -> ((d0 + s0), d1)>, 1> + + // Test alloc with dynamic dimensions and one symbol. + // CHECK: %{{.*}} = memref.alloc(%{{.*}})[%{{.*}}] : memref<2x?xf32, #[[$MAP]], 1> + %3 = memref.alloc(%c1)[%c0] : memref<2x?xf32, affine_map<(d0, d1)[s0] -> (d0 + s0, d1)>, 1> + + // Alloc with no mappings. + // b/116054838 Parser crash while parsing ill-formed AllocOp + // CHECK: %{{.*}} = memref.alloc() : memref<2xi32> + %4 = memref.alloc() : memref<2 x i32> + + // CHECK: return + return +} + +// CHECK-LABEL: func @alloca() { +func.func @alloca() { +^bb0: + // Test simple alloc. + // CHECK: %{{.*}} = memref.alloca() : memref<1024x64xf32, 1> + %0 = memref.alloca() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + + %c0 = "arith.constant"() {value = 0: index} : () -> index + %c1 = "arith.constant"() {value = 1: index} : () -> index + + // Test alloca with dynamic dimensions. + // CHECK: %{{.*}} = memref.alloca(%{{.*}}, %{{.*}}) : memref + %1 = memref.alloca(%c0, %c1) : memref (d0, d1)>, 1> + + // Test alloca with no dynamic dimensions and one symbol. + // CHECK: %{{.*}} = memref.alloca()[%{{.*}}] : memref<2x4xf32, #[[$MAP]], 1> + %2 = memref.alloca()[%c0] : memref<2x4xf32, affine_map<(d0, d1)[s0] -> ((d0 + s0), d1)>, 1> + + // Test alloca with dynamic dimensions and one symbol. 
+ // CHECK: %{{.*}} = memref.alloca(%{{.*}})[%{{.*}}] : memref<2x?xf32, #[[$MAP]], 1> + %3 = memref.alloca(%c1)[%c0] : memref<2x?xf32, affine_map<(d0, d1)[s0] -> (d0 + s0, d1)>, 1> + + // Alloca with no mappings, but with alignment. + // CHECK: %{{.*}} = memref.alloca() {alignment = 64 : i64} : memref<2xi32> + %4 = memref.alloca() {alignment = 64} : memref<2 x i32> + + return +} + +// CHECK-LABEL: func @dealloc() { +func.func @dealloc() { +^bb0: + // CHECK: %{{.*}} = memref.alloc() : memref<1024x64xf32> + %0 = memref.alloc() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 0> + + // CHECK: memref.dealloc %{{.*}} : memref<1024x64xf32> + memref.dealloc %0 : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 0> + return +} + +// CHECK-LABEL: func @load_store +func.func @load_store() { +^bb0: + // CHECK: %{{.*}} = memref.alloc() : memref<1024x64xf32, 1> + %0 = memref.alloc() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + + %1 = arith.constant 0 : index + %2 = arith.constant 1 : index + + // CHECK: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x64xf32, 1> + %3 = memref.load %0[%1, %2] : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + + // CHECK: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x64xf32, 1> + memref.store %3, %0[%1, %2] : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> + + return +} + +// CHECK-LABEL: func @dma_ops() +func.func @dma_ops() { + %c0 = arith.constant 0 : index + %stride = arith.constant 32 : index + %elt_per_stride = arith.constant 16 : index + + %A = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0> + %Ah = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 1> + %tag = memref.alloc() : memref<1 x f32> + + %num_elements = arith.constant 256 : index + + memref.dma_start %A[%c0], %Ah[%c0], %num_elements, %tag[%c0] : memref<256 x f32>, memref<256 x f32, 1>, memref<1 x f32> + memref.dma_wait %tag[%c0], %num_elements : memref<1 x f32> + // CHECK: 
dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}[%{{.*}}] : memref<256xf32>, memref<256xf32, 1>, memref<1xf32> + // CHECK-NEXT: dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32> + + // DMA with strides + memref.dma_start %A[%c0], %Ah[%c0], %num_elements, %tag[%c0], %stride, %elt_per_stride : memref<256 x f32>, memref<256 x f32, 1>, memref<1 x f32> + memref.dma_wait %tag[%c0], %num_elements : memref<1 x f32> + // CHECK-NEXT: dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}} : memref<256xf32>, memref<256xf32, 1>, memref<1xf32> + // CHECK-NEXT: dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32> + + return +} + // CHECK-LABEL: func @memref_reinterpret_cast func.func @memref_reinterpret_cast(%in: memref) -> memref<10x?xf32, strided<[?, 1], offset: ?>> { @@ -90,6 +207,87 @@ func.func @memref_alloca_scope() { return } +// CHECK-LABEL: func @memref_cast(%arg0 +func.func @memref_cast(%arg0: memref<4xf32>, %arg1 : memref, %arg2 : memref<64x16x4xf32, strided<[64, 4, 1], offset: 0>>) { + // CHECK: memref.cast %{{.*}} : memref<4xf32> to memref + %0 = memref.cast %arg0 : memref<4xf32> to memref + + // CHECK: memref.cast %{{.*}} : memref to memref<4xf32> + %1 = memref.cast %arg1 : memref to memref<4xf32> + + // CHECK: memref.cast %{{.*}} : memref<64x16x4xf32, strided<[64, 4, 1]>> to memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> + %2 = memref.cast %arg2 : memref<64x16x4xf32, strided<[64, 4, 1], offset: 0>> to memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> + + // CHECK: memref.cast {{%.*}} : memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> to memref<64x16x4xf32, strided<[64, 4, 1]>> + %3 = memref.cast %2 : memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> to memref<64x16x4xf32, strided<[64, 4, 1], offset: 0>> + + // CHECK: memref.cast %{{.*}} : memref<4xf32> to memref<*xf32> + %4 = memref.cast %1 : memref<4xf32> to memref<*xf32> + + // CHECK: memref.cast %{{.*}} : memref<*xf32> to memref<4xf32> + %5 = memref.cast %4 
: memref<*xf32> to memref<4xf32> + return +} + +// Check that unranked memrefs with non-default memory space roundtrip +// properly. +// CHECK-LABEL: @unranked_memref_roundtrip(memref<*xf32, 4>) +func.func private @unranked_memref_roundtrip(memref<*xf32, 4>) + +// CHECK-LABEL: func @load_store_prefetch +func.func @load_store_prefetch(memref<4x4xi32>, index) { +^bb0(%0: memref<4x4xi32>, %1: index): + // CHECK: %0 = memref.load %arg0[%arg1, %arg1] : memref<4x4xi32> + %2 = "memref.load"(%0, %1, %1) : (memref<4x4xi32>, index, index)->i32 + + // CHECK: %{{.*}} = memref.load %arg0[%arg1, %arg1] : memref<4x4xi32> + %3 = memref.load %0[%1, %1] : memref<4x4xi32> + + // CHECK: memref.prefetch %arg0[%arg1, %arg1], write, locality<1>, data : memref<4x4xi32> + memref.prefetch %0[%1, %1], write, locality<1>, data : memref<4x4xi32> + + // CHECK: memref.prefetch %arg0[%arg1, %arg1], read, locality<3>, instr : memref<4x4xi32> + memref.prefetch %0[%1, %1], read, locality<3>, instr : memref<4x4xi32> + + return +} + +// Test with zero-dimensional operands using no index in load/store. +// CHECK-LABEL: func @zero_dim_no_idx +func.func @zero_dim_no_idx(%arg0 : memref, %arg1 : memref, %arg2 : memref) { + %0 = memref.load %arg0[] : memref + memref.store %0, %arg1[] : memref + return + // CHECK: %0 = memref.load %{{.*}}[] : memref + // CHECK: memref.store %{{.*}}, %{{.*}}[] : memref +} + +// CHECK-LABEL: func @memref_view(%arg0 +func.func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) { + %0 = memref.alloc() : memref<2048xi8> + // Test two dynamic sizes and dynamic offset. + // CHECK: memref.view {{.*}} : memref<2048xi8> to memref + %1 = memref.view %0[%arg2][%arg0, %arg1] : memref<2048xi8> to memref + + // Test one dynamic size and dynamic offset. + // CHECK: memref.view {{.*}} : memref<2048xi8> to memref<4x?xf32> + %3 = memref.view %0[%arg2][%arg1] : memref<2048xi8> to memref<4x?xf32> + + // Test static sizes and static offset. 
+ // CHECK: memref.view {{.*}} : memref<2048xi8> to memref<64x4xf32> + %c0 = arith.constant 0: index + %5 = memref.view %0[%c0][] : memref<2048xi8> to memref<64x4xf32> + return +} + +// CHECK-LABEL: func @assume_alignment +// CHECK-SAME: %[[MEMREF:.*]]: memref<4x4xf16> +func.func @assume_alignment(%0: memref<4x4xf16>) { + // CHECK: memref.assume_alignment %[[MEMREF]], 16 : memref<4x4xf16> + memref.assume_alignment %0, 16 : memref<4x4xf16> + return +} + // CHECK-LABEL: func @expand_collapse_shape_static func.func @expand_collapse_shape_static( %arg0: memref<3x4x5xf32>, diff --git a/mlir/test/Dialect/Mesh/canonicalization.mlir b/mlir/test/Dialect/Mesh/canonicalization.mlir index ea2bd29056ec78..f0112d689805d3 100644 --- a/mlir/test/Dialect/Mesh/canonicalization.mlir +++ b/mlir/test/Dialect/Mesh/canonicalization.mlir @@ -191,3 +191,20 @@ func.func @send_empty_mesh_axes( // CHECK: return %[[ARG]] return %0 : tensor<4xf32> } + +mesh.mesh @mesh4x4(shape = 4x4) +// CHECK-LABEL: func @test_halo_sizes +func.func @test_halo_sizes() -> !mesh.sharding { + %c2_i64 = arith.constant 2 : i64 + // CHECK: mesh.sharding @mesh4x4 split_axes = [[0], [1]] halo_sizes = [1, 2, 2, 22] : !mesh.sharding + %sharding = mesh.sharding @mesh4x4 split_axes = [[0], [1]] halo_sizes = [1, %c2_i64, %c2_i64, 22] : !mesh.sharding + return %sharding : !mesh.sharding +} + +// CHECK-LABEL: func @test_shard_offs +func.func @test_shard_offs() -> !mesh.sharding { + %c2_i64 = arith.constant 2 : i64 + // CHECK: mesh.sharding @mesh4x4 split_axes = [[0], [1]] sharded_dims_offsets = [0, 1, 2, 3, 4, 0, 2, 3, 4, 22] : !mesh.sharding + %sharding = mesh.sharding @mesh4x4 split_axes = [[0], [1]] sharded_dims_offsets = [0, 1, %c2_i64, 3, 4, 0, %c2_i64, 3, 4, 22] : !mesh.sharding + return %sharding : !mesh.sharding +} \ No newline at end of file diff --git a/mlir/test/Dialect/Mesh/invalid.mlir b/mlir/test/Dialect/Mesh/invalid.mlir index 3827df90e6962f..29b900a8da4a60 100644 --- a/mlir/test/Dialect/Mesh/invalid.mlir +++
b/mlir/test/Dialect/Mesh/invalid.mlir @@ -89,8 +89,8 @@ func.func @sharding_attribute_invalid_halo(%arg0 : tensor<4x8xf32>) { // ----- func.func @sharding_attribute_invalid_sizes(%arg0 : tensor<4x8xf32>) { - // expected-error@+1 {{halo sizes and shard shapes are mutually exclusive}} - %s = mesh.sharding @mesh0 split_axes = [[0]] halo_sizes = [1, 2] sharded_dims_sizes = [2, 2] : !mesh.sharding + // expected-error@+1 {{halo sizes and shard offsets are mutually exclusive}} + %s = mesh.sharding @mesh0 split_axes = [[0]] halo_sizes = [1, 2] sharded_dims_offsets = [0, 2, 2] : !mesh.sharding %0 = mesh.shard %arg0 to %s : tensor<4x8xf32> return } @@ -99,8 +99,28 @@ func.func @sharding_attribute_invalid_sizes(%arg0 : tensor<4x8xf32>) { mesh.mesh @mesh_dyn(shape = ?x?) func.func @sharding_dyn_mesh_and_sizes(%arg0 : tensor<4x8xf32>) { - // expected-error@+1 {{sharded dims sizes are not allowed for devices meshes with dynamic shape}} - %s = mesh.sharding @mesh_dyn split_axes = [[0]] sharded_dims_sizes = [2, 2] : !mesh.sharding + // expected-error@+1 {{sharded dims offsets are not allowed for devices meshes with dynamic shape}} + %s = mesh.sharding @mesh_dyn split_axes = [[0]] sharded_dims_offsets = [0, 2, 2] : !mesh.sharding + %0 = mesh.shard %arg0 to %s : tensor<4x8xf32> + return +} + +// ----- + +mesh.mesh @mesh0(shape = 2x4) +func.func @sharding_sizes_count(%arg0 : tensor<4x8xf32>) { + // expected-error@+1 {{sharded dims offsets has wrong size}} + %s = mesh.sharding @mesh0 split_axes = [[0], [1]] sharded_dims_offsets = [0, 2, 4, 0, 2, 4, 6] : !mesh.sharding + %0 = mesh.shard %arg0 to %s : tensor<4x8xf32> + return +} + +// ----- + +mesh.mesh @mesh0(shape = 4) +func.func @sharding_sizes_decreasing(%arg0 : tensor<4x8xf32>) { + // expected-error@+1 {{sharded dims offsets must be non-decreasing}} + %s = mesh.sharding @mesh0 split_axes = [[0]] sharded_dims_offsets = [0, 2, 3, 2] : !mesh.sharding %0 = mesh.shard %arg0 to %s : tensor<4x8xf32> return } diff --git 
a/mlir/test/Dialect/Mesh/ops.mlir b/mlir/test/Dialect/Mesh/ops.mlir index 5ead7babe2c084..d8df01c3d6520d 100644 --- a/mlir/test/Dialect/Mesh/ops.mlir +++ b/mlir/test/Dialect/Mesh/ops.mlir @@ -144,10 +144,10 @@ func.func @mesh_shard_halo_sizes() -> () { func.func @mesh_shard_dims_sizes() -> () { // CHECK: %[[C3:.*]] = arith.constant 3 : i64 %c3 = arith.constant 3 : i64 - // CHECK: mesh.sharding @mesh4 split_axes = {{\[\[}}0]] sharded_dims_sizes = [1, 4, 2] : !mesh.sharding - %sharding1 = mesh.sharding @mesh4 split_axes = [[0]] sharded_dims_sizes = [1, 4, 2] : !mesh.sharding - // CHECK: mesh.sharding @mesh4 split_axes = {{\[\[}}0]] sharded_dims_sizes = [4, %[[C3]], 1] : !mesh.sharding - %sharding2 = mesh.sharding @mesh4 split_axes = [[0]] sharded_dims_sizes = [4, %c3, 1] : !mesh.sharding + // CHECK: mesh.sharding @mesh4 split_axes = {{\[\[}}0]] sharded_dims_offsets = [0, 1, 4, 6] : !mesh.sharding + %sharding1 = mesh.sharding @mesh4 split_axes = [[0]] sharded_dims_offsets = [0, 1, 4, 6] : !mesh.sharding + // CHECK: mesh.sharding @mesh4 split_axes = {{\[\[}}0]] sharded_dims_offsets = [0, 2, %[[C3]], 5] : !mesh.sharding + %sharding2 = mesh.sharding @mesh4 split_axes = [[0]] sharded_dims_offsets = [0, 2, %c3, 5] : !mesh.sharding return } @@ -615,18 +615,16 @@ func.func @update_halo( // CHECK-SAME: %[[ARG:.*]]: memref<12x12xi8> %arg0 : memref<12x12xi8>) { // CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : i64 - // CHECK-NEXT: mesh.update_halo %[[ARG]] on @mesh0 + // CHECK-NEXT: %[[UH1:.*]] = mesh.update_halo %[[ARG]] into %[[ARG]] on @mesh0 // CHECK-SAME: split_axes = {{\[\[}}0]] - // CHECK-SAME: halo_sizes = [2, %c2_i64] : memref<12x12xi8> + // CHECK-SAME: halo_sizes = [2, %c2_i64] : memref<12x12xi8> -> memref<12x12xi8> %c2 = arith.constant 2 : i64 - mesh.update_halo %arg0 on @mesh0 split_axes = [[0]] - halo_sizes = [2, %c2] : memref<12x12xi8> - // CHECK-NEXT: mesh.update_halo %[[ARG]] on @mesh0 + %uh1 = mesh.update_halo %arg0 into %arg0 on @mesh0 split_axes = [[0]] + 
source_halo_sizes = [2, %c2] : memref<12x12xi8> -> memref<12x12xi8> + // CHECK-NEXT: %[[UH2:.*]] = mesh.update_halo %[[ARG]] into %[[UH1]] on @mesh0 // CHECK-SAME: split_axes = {{\[\[}}0], [1]] - // CHECK-SAME: halo_sizes = [2, 2, %[[C2]], 2] - // CHECK-SAME: target_halo_sizes = [3, 3, 2, 2] : memref<12x12xi8> - mesh.update_halo %arg0 on @mesh0 split_axes = [[0], [1]] - halo_sizes = [2, 2, %c2, 2] target_halo_sizes = [3, 3, 2, 2] - : memref<12x12xi8> + // CHECK-SAME: halo_sizes = [2, 2, %[[C2]], 2] : memref<12x12xi8> -> memref<12x12xi8> + %uh2 = mesh.update_halo %arg0 into %uh1 on @mesh0 split_axes = [[0], [1]] + source_halo_sizes = [2, 2, %c2, 2] : memref<12x12xi8> -> memref<12x12xi8> return } diff --git a/mlir/test/Dialect/Mesh/spmdization.mlir b/mlir/test/Dialect/Mesh/spmdization.mlir index 8b0c4053b0dc7e..22ddb72569835d 100644 --- a/mlir/test/Dialect/Mesh/spmdization.mlir +++ b/mlir/test/Dialect/Mesh/spmdization.mlir @@ -219,3 +219,34 @@ func.func @ew_chain_with_halo( // CHECK-NEXT: return %[[TMP3]] : tensor<5x16xf32> return %sharding_annotated_6 : tensor<8x16xf32> } + +// CHECK-LABEL: func @test_shard_update_halo +// CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<300x1200xi64> +func.func @test_shard_update_halo(%arg0: tensor<1200x1200xi64>) -> tensor<1200x1200xi64> { + %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] : !mesh.sharding + // CHECK: %[[T:.*]] = tensor.empty() : tensor<304x1200xi64> + // CHECK: %[[inserted_slice:.*]] = tensor.insert_slice %[[IN1]] into %[[T]][2, 0] [300, 1200] [1, 1] : tensor<300x1200xi64> into tensor<304x1200xi64> + // CHECK: %[[UH:.*]] = mesh.update_halo %[[IN1]] into %[[inserted_slice]] on @mesh_1d_4 split_axes = {{\[\[0]]}} destination_halo_sizes = [2, 2] : tensor<300x1200xi64> -> tensor<304x1200xi64> + %sharding_annotated = mesh.shard %arg0 to %sharding : tensor<1200x1200xi64> + %sharding_0 = mesh.sharding @mesh_1d_4 split_axes = [[0]] halo_sizes = [2, 2] : !mesh.sharding + %sharding_annotated_1 = mesh.shard 
%sharding_annotated to %sharding_0 : tensor<1200x1200xi64> + %sharding_annotated_3 = mesh.shard %sharding_annotated_1 to %sharding_0 annotate_for_users : tensor<1200x1200xi64> + // CHECK: return %[[UH]] : tensor<304x1200xi64> + return %sharding_annotated_3 : tensor<1200x1200xi64> +} + +mesh.mesh @mesh4x4(shape = 4x4) +// CHECK-LABEL: func @test_shard_update_halo2d +// CHECK-SAME: %[[IN1:[A-Za-z0-9_]+]]: tensor<300x300xi64> +func.func @test_shard_update_halo2d(%arg0: tensor<1200x1200xi64>) -> tensor<1200x1200xi64> { + %sharding = mesh.sharding @mesh4x4 split_axes = [[0], [1]] : !mesh.sharding + // CHECK: %[[T:.*]] = tensor.empty() : tensor<303x307xi64> + // CHECK: %[[inserted_slice:.*]] = tensor.insert_slice %[[IN1]] into %[[T]][1, 3] [300, 300] [1, 1] : tensor<300x300xi64> into tensor<303x307xi64> + // CHECK: %[[UH:.*]] = mesh.update_halo %[[IN1]] into %[[inserted_slice]] on @mesh4x4 split_axes = {{\[\[}}0], [1]] destination_halo_sizes = [1, 2, 3, 4] : tensor<300x300xi64> -> tensor<303x307xi64> + %sharding_annotated = mesh.shard %arg0 to %sharding : tensor<1200x1200xi64> + %sharding_0 = mesh.sharding @mesh4x4 split_axes = [[0], [1]] halo_sizes = [1, 2, 3, 4] : !mesh.sharding + %sharding_annotated_1 = mesh.shard %sharding_annotated to %sharding_0 : tensor<1200x1200xi64> + %sharding_annotated_3 = mesh.shard %sharding_annotated_1 to %sharding_0 annotate_for_users : tensor<1200x1200xi64> + // CHECK: return %[[UH]] : tensor<303x307xi64> + return %sharding_annotated_3 : tensor<1200x1200xi64> +} \ No newline at end of file diff --git a/mlir/test/Dialect/Tensor/mesh-spmdization.mlir b/mlir/test/Dialect/Tensor/mesh-spmdization.mlir index 611acb5b41445b..5443eea83aa2d8 100644 --- a/mlir/test/Dialect/Tensor/mesh-spmdization.mlir +++ b/mlir/test/Dialect/Tensor/mesh-spmdization.mlir @@ -4,12 +4,12 @@ mesh.mesh @mesh_1d_4(shape = 4) -// CHECK-LABEL: func @tensor_empty_static_sharded_dims_sizes -func.func @tensor_empty_static_sharded_dims_sizes() -> () { +// CHECK-LABEL: func 
@tensor_empty_static_sharded_dims_offsets +func.func @tensor_empty_static_sharded_dims_offsets() -> () { %b = tensor.empty() : tensor<8x16xf32> - %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_sizes = [1, 3, 3, 1] : !mesh.sharding + %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding %sharded= mesh.shard %b to %sharding : tensor<8x16xf32> - // CHECK: %[[sharding:.*]] = mesh.sharding @mesh_1d_4 split_axes = {{\[\[}}0]] sharded_dims_sizes = [1, 3, 3, 1] : !mesh.sharding + // CHECK: %[[sharding:.*]] = mesh.sharding @mesh_1d_4 split_axes = {{\[\[}}0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding // CHECK: %[[proc_linear_idx:.*]] = mesh.process_linear_index on @mesh_1d_4 : index // CHECK: %[[V0:.*]]:2 = mesh.shard_shape 8x16 %[[sharding]] %[[proc_linear_idx]] : index, index // CHECK: tensor.empty(%[[V0]]#0) : tensor @@ -17,13 +17,13 @@ func.func @tensor_empty_static_sharded_dims_sizes() -> () { return } -// CHECK-LABEL: func @tensor_empty_dynamic_sharded_dims_sizes +// CHECK-LABEL: func @tensor_empty_dynamic_sharded_dims_offsets // CHECK-SAME: %[[A0:.*]]: index -func.func @tensor_empty_dynamic_sharded_dims_sizes(%arg0 : index) -> () { +func.func @tensor_empty_dynamic_sharded_dims_offsets(%arg0 : index) -> () { %b = tensor.empty(%arg0) : tensor<8x?xf32> - %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_sizes = [1, 3, 3, 1] : !mesh.sharding + %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding %sharded= mesh.shard %b to %sharding : tensor<8x?xf32> - // CHECK: %[[sharding:.*]] = mesh.sharding @mesh_1d_4 split_axes = {{\[\[}}0]] sharded_dims_sizes = [1, 3, 3, 1] : !mesh.sharding + // CHECK: %[[sharding:.*]] = mesh.sharding @mesh_1d_4 split_axes = {{\[\[}}0]] sharded_dims_offsets = [0, 1, 4, 7, 8] : !mesh.sharding // CHECK: %[[proc_linear_idx:.*]] = mesh.process_linear_index on @mesh_1d_4 : index // 
CHECK: %[[V0:.*]]:2 = mesh.shard_shape 8x? %[[sharding]] %[[proc_linear_idx]] : index, index // CHECK: tensor.empty(%[[V0]]#0, %[[A0]]) : tensor @@ -33,9 +33,9 @@ func.func @tensor_empty_dynamic_sharded_dims_sizes(%arg0 : index) -> () { // CHECK-LABEL: func @tensor_empty_same_static_dims_sizes func.func @tensor_empty_same_static_dims_sizes() -> () { - %b = tensor.empty() : tensor<8x16xf32> - %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_sizes = [4, 4, 4, 4] : !mesh.sharding - %sharded= mesh.shard %b to %sharding : tensor<8x16xf32> + %b = tensor.empty() : tensor<16x16xf32> + %sharding = mesh.sharding @mesh_1d_4 split_axes = [[0]] sharded_dims_offsets = [0, 4, 8, 12, 16] : !mesh.sharding + %sharded= mesh.shard %b to %sharding : tensor<16x16xf32> // CHECK-NEXT: tensor.empty() : tensor<4x16xf32> return diff --git a/mlir/test/Dialect/Vector/int-range-interface.mlir b/mlir/test/Dialect/Vector/int-range-interface.mlir index 09dfe932a52323..0263193b204015 100644 --- a/mlir/test/Dialect/Vector/int-range-interface.mlir +++ b/mlir/test/Dialect/Vector/int-range-interface.mlir @@ -17,6 +17,13 @@ func.func @constant_splat() -> vector<8xi32> { func.return %1 : vector<8xi32> } +// CHECK-LABEL: func @float_constant_splat +// Don't crash on splat floats. 
+func.func @float_constant_splat() -> vector<8xf32> { + %0 = arith.constant dense<3.0> : vector<8xf32> + func.return %0: vector<8xf32> +} + // CHECK-LABEL: func @vector_splat // CHECK: test.reflect_bounds {smax = 5 : index, smin = 4 : index, umax = 5 : index, umin = 4 : index} func.func @vector_splat() -> vector<4xindex> { diff --git a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir index 84aaa9c61200b9..210025e30d7db5 100644 --- a/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir +++ b/mlir/test/Dialect/Vector/vector-rewrite-narrow-types.mlir @@ -193,36 +193,8 @@ func.func @f3ext(%a: vector<5xi8>) -> vector<8xi17> { return %1 : vector<8xi17> } -// CHECK-LABEL: func.func @aligned_extsi( -func.func @aligned_extsi(%a: vector<8xi4>) -> vector<8xi32> { -// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { -// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> -// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> -// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> -// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> -// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> -// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> -// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> - %0 = arith.extsi %a : vector<8xi4> to vector<8xi32> - return %0 : vector<8xi32> -} - -// CHECK-LABEL: func.func @aligned_extsi_2d( -func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { -// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { -// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> -// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> -// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> -// CHECK: %[[LOW:.*]] 
= arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> -// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> -// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> -// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> - %0 = arith.extsi %a : vector<8x32xi4> to vector<8x32xi32> - return %0 : vector<8x32xi32> -} - -// CHECK-LABEL: func.func @aligned_extsi_base_case( -func.func @aligned_extsi_base_case(%a: vector<8xi4>) -> vector<8xi8> { +// CHECK-LABEL: func.func @aligned_extsi_i4_to_i8( +func.func @aligned_extsi_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> @@ -234,60 +206,61 @@ func.func @aligned_extsi_base_case(%a: vector<8xi4>) -> vector<8xi8> { return %0 : vector<8xi8> } -// CHECK-LABEL: func.func @aligned_sitofp( -func.func @aligned_sitofp(%a: vector<8xi4>) -> vector<8xf32> { -// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { +// CHECK-LABEL: func.func @aligned_extsi_i4_to_i32( +func.func @aligned_extsi_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> // CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> // CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> // CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> // CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> -// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8xi8> to vector<8xf32> - %0 = arith.sitofp %a : vector<8xi4> to vector<8xf32> - return %0 : vector<8xf32> +// CHECK: 
%[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8xi8> to vector<8xi32> + %0 = arith.extsi %a : vector<8xi4> to vector<8xi32> + return %0 : vector<8xi32> } -// CHECK-LABEL: func.func @aligned_sitofp_2d( -func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> vector<8x32xf32> { -// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xf32> { +// CHECK-LABEL: func.func @aligned_extsi_2d( +func.func @aligned_extsi_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xi32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> // CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> // CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> // CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> // CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> -// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xf32> - %0 = arith.sitofp %a : vector<8x32xi4> to vector<8x32xf32> - return %0 : vector<8x32xf32> +// CHECK: %[[I32:.*]] = arith.extsi %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xi32> + %0 = arith.extsi %a : vector<8x32xi4> to vector<8x32xi32> + return %0 : vector<8x32xi32> } -// CHECK-LABEL: func.func @aligned_trunci( -func.func @aligned_trunci(%a: vector<8xi32>) -> vector<8xi4> { -// CHECK-SAME: %[[IN:.*]]: vector<8xi32>) -> vector<8xi4> { + +// CHECK-LABEL: func.func @aligned_trunci_i8_to_i4( +func.func @aligned_trunci_i8_to_i4(%a: vector<8xi8>) -> vector<8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi8>) -> vector<8xi4> { // CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8> // CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> -// CHECK: %[[I8:.*]] = arith.trunci %[[IN]] : vector<8xi32> to vector<8xi8> -// CHECK: %[[LOW:.*]], 
%[[HIGH:.*]] = vector.deinterleave %[[I8]] : vector<8xi8> -> vector<4xi8> +// CHECK: %[[LOW:.*]], %[[HIGH:.*]] = vector.deinterleave %[[IN]] : vector<8xi8> -> vector<4xi8> // CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8> // CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8> // CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8> // CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4> - %0 = arith.trunci %a : vector<8xi32> to vector<8xi4> + %0 = arith.trunci %a : vector<8xi8> to vector<8xi4> return %0 : vector<8xi4> } -// CHECK-LABEL: func.func @aligned_trunci_base_case( -func.func @aligned_trunci_base_case(%a: vector<8xi8>) -> vector<8xi4> { -// CHECK-SAME: %[[IN:.*]]: vector<8xi8>) -> vector<8xi4> { +// CHECK-LABEL: func.func @aligned_trunci_i32_to_i4( +func.func @aligned_trunci_i32_to_i4(%a: vector<8xi32>) -> vector<8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi32>) -> vector<8xi4> { // CHECK-DAG: %[[LOW_MASK:.*]] = arith.constant dense<15> : vector<4xi8> // CHECK-DAG: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> -// CHECK: %[[LOW:.*]], %[[HIGH:.*]] = vector.deinterleave %[[IN]] : vector<8xi8> -> vector<4xi8> +// CHECK: %[[I8:.*]] = arith.trunci %[[IN]] : vector<8xi32> to vector<8xi8> +// CHECK: %[[LOW:.*]], %[[HIGH:.*]] = vector.deinterleave %[[I8]] : vector<8xi8> -> vector<4xi8> // CHECK: %[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[LOW_MASK]] : vector<4xi8> // CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[I4_BITS]] : vector<4xi8> // CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<4xi8> // CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<4xi8> to vector<8xi4> - %0 = arith.trunci %a : vector<8xi8> to vector<8xi4> + %0 = arith.trunci %a : vector<8xi32> to vector<8xi4> return %0 : vector<8xi4> } @@ -314,33 +287,26 @@ func.func @aligned_trunci_nd(%a: vector<3x8x32xi32>) -> vector<3x8x32xi4> { // CHECK: 
%[[ZEROED_LOW:.*]] = arith.andi %[[LOW]], %[[I4_MASK]] : vector<3x8x16xi8> // CHECK: %[[SHL_HIGH:.*]] = arith.shli %[[HIGH]], %[[LEFT_SHIFT_BITS]] : vector<3x8x16xi8> // CHECK: %[[MERGED:.*]] = arith.ori %[[ZEROED_LOW]], %[[SHL_HIGH]] : vector<3x8x16xi8> - // CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<3x8x16xi8> to vector<3x8x32xi4> + // CHECK: %[[I4:.*]] = vector.bitcast %[[MERGED]] : vector<3x8x16xi8> to vector<3x8x32xi4> %0 = arith.trunci %a : vector<3x8x32xi32> to vector<3x8x32xi4> return %0 : vector<3x8x32xi4> } -// CHECK-LABEL: func.func @i4_transpose( -func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> { -// CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> { -// CHECK: %[[EXT:.*]] = vector.interleave -// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> -// CHECK: vector.deinterleave %[[TRANS]] : vector<16x8xi8> -> vector<16x4xi8> - %0 = vector.transpose %a, [1, 0] : vector<8x16xi4> to vector<16x8xi4> - return %0 : vector<16x8xi4> -} - -// CHECK-LABEL: func.func @i7_transpose( -func.func @i7_transpose(%a: vector<8x16xi7>) -> vector<16x8xi7> { -// CHECK-SAME: %[[IN:.*]]: vector<8x16xi7>) -> vector<16x8xi7> { -// CHECK: %[[EXT:.*]] = arith.extsi %[[IN]] : vector<8x16xi7> to vector<8x16xi8> -// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> -// CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi7> - %0 = vector.transpose %a, [1, 0] : vector<8x16xi7> to vector<16x8xi7> - return %0 : vector<16x8xi7> +// CHECK-LABEL: func.func @aligned_extui_i4_to_i8( +func.func @aligned_extui_i4_to_i8(%a: vector<8xi4>) -> vector<8xi8> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// 
CHECK: %[[LOW:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrui %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> + %0 = arith.extui %a : vector<8xi4> to vector<8xi8> + return %0 : vector<8xi8> } -// CHECK-LABEL: func.func @aligned_extui( -func.func @aligned_extui(%a: vector<8xi4>) -> vector<8xi32> { +// CHECK-LABEL: func.func @aligned_extui_i4_to_i32( +func.func @aligned_extui_i4_to_i32(%a: vector<8xi4>) -> vector<8xi32> { // CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> // CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<4xi8> @@ -367,19 +333,83 @@ func.func @aligned_extui_2d(%a: vector<8x32xi4>) -> vector<8x32xi32> { return %0 : vector<8x32xi32> } -// CHECK-LABEL: func.func @aligned_extui_base_case( -func.func @aligned_extui_base_case(%a: vector<8xi4>) -> vector<8xi8> { -// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xi8> { +// CHECK-LABEL: func.func @aligned_sitofp( +func.func @aligned_sitofp(%a: vector<8xi4>) -> vector<8xf32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> +// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8xi8> to vector<8xf32> + %0 = arith.sitofp %a : vector<8xi4> to vector<8xf32> + return %0 : vector<8xf32> +} + +// CHECK-LABEL: func.func @aligned_sitofp_2d( +func.func @aligned_sitofp_2d(%a: vector<8x32xi4>) -> 
vector<8x32xf32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> +// CHECK: %[[SHL_LOW:.*]] = arith.shli %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[LOW:.*]] = arith.shrsi %[[SHL_LOW]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[HIGH:.*]] = arith.shrsi %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> +// CHECK: %[[F32:.*]] = arith.sitofp %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xf32> + %0 = arith.sitofp %a : vector<8x32xi4> to vector<8x32xf32> + return %0 : vector<8x32xf32> +} + +// CHECK-LABEL: func.func @aligned_uitofp( +func.func @aligned_uitofp(%a: vector<8xi4>) -> vector<8xf32> { +// CHECK-SAME: %[[IN:.*]]: vector<8xi4>) -> vector<8xf32> { // CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<4xi8> // CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<4xi8> // CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8xi4> to vector<4xi8> // CHECK: %[[LOW:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<4xi8> // CHECK: %[[HIGH:.*]] = arith.shrui %[[BITCAST]], %[[I4_BITS]] : vector<4xi8> // CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<4xi8> - %0 = arith.extui %a : vector<8xi4> to vector<8xi8> - return %0 : vector<8xi8> +// CHECK: %[[F32:.*]] = arith.uitofp %[[INTERLEAVE]] : vector<8xi8> to vector<8xf32> + %0 = arith.uitofp %a : vector<8xi4> to vector<8xf32> + return %0 : vector<8xf32> +} + +// CHECK-LABEL: func.func @aligned_uitofp_2d( +func.func @aligned_uitofp_2d(%a: vector<8x32xi4>) -> vector<8x32xf32> { +// CHECK-SAME: %[[IN:.*]]: vector<8x32xi4>) -> vector<8x32xf32> { +// CHECK: %[[I4_BITS:.*]] = arith.constant dense<4> : vector<8x16xi8> +// CHECK: %[[LOWBITS_MASK:.*]] = arith.constant dense<15> : vector<8x16xi8> 
+// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[IN]] : vector<8x32xi4> to vector<8x16xi8> +// CHECK: %[[LOW:.*]] = arith.andi %[[BITCAST]], %[[LOWBITS_MASK]] : vector<8x16xi8> +// CHECK: %[[HIGH:.*]] = arith.shrui %[[BITCAST]], %[[I4_BITS]] : vector<8x16xi8> +// CHECK: %[[INTERLEAVE:.*]] = vector.interleave %[[LOW]], %[[HIGH]] : vector<8x16xi8> +// CHECK: %[[F32:.*]] = arith.uitofp %[[INTERLEAVE]] : vector<8x32xi8> to vector<8x32xf32> + %0 = arith.uitofp %a : vector<8x32xi4> to vector<8x32xf32> + return %0 : vector<8x32xf32> +} + +// CHECK-LABEL: func.func @i4_transpose( +func.func @i4_transpose(%a: vector<8x16xi4>) -> vector<16x8xi4> { +// CHECK-SAME: %[[IN:.*]]: vector<8x16xi4>) -> vector<16x8xi4> { +// CHECK: %[[EXT:.*]] = vector.interleave +// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> +// CHECK: vector.deinterleave %[[TRANS]] : vector<16x8xi8> -> vector<16x4xi8> + %0 = vector.transpose %a, [1, 0] : vector<8x16xi4> to vector<16x8xi4> + return %0 : vector<16x8xi4> } +// CHECK-LABEL: func.func @i7_transpose( +func.func @i7_transpose(%a: vector<8x16xi7>) -> vector<16x8xi7> { +// CHECK-SAME: %[[IN:.*]]: vector<8x16xi7>) -> vector<16x8xi7> { +// CHECK: %[[EXT:.*]] = arith.extsi %[[IN]] : vector<8x16xi7> to vector<8x16xi8> +// CHECK: %[[TRANS:.*]] = vector.transpose %[[EXT]], [1, 0] : vector<8x16xi8> to vector<16x8xi8> +// CHECK: %[[TRUNC:.*]] = arith.trunci %[[TRANS]] : vector<16x8xi8> to vector<16x8xi7> + %0 = vector.transpose %a, [1, 0] : vector<8x16xi7> to vector<16x8xi7> + return %0 : vector<16x8xi7> +} + + module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) { %f = transform.structured.match ops{["func.func"]} in %module_op diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir index 9858bcd1c05e7b..5d1583111541ca 100644 --- a/mlir/test/IR/core-ops.mlir +++ b/mlir/test/IR/core-ops.mlir @@ -171,34 +171,6 @@ 
func.func @affine_apply() { return } -// CHECK-LABEL: func @load_store_prefetch -func.func @load_store_prefetch(memref<4x4xi32>, index) { -^bb0(%0: memref<4x4xi32>, %1: index): - // CHECK: %0 = memref.load %arg0[%arg1, %arg1] : memref<4x4xi32> - %2 = "memref.load"(%0, %1, %1) : (memref<4x4xi32>, index, index)->i32 - - // CHECK: %{{.*}} = memref.load %arg0[%arg1, %arg1] : memref<4x4xi32> - %3 = memref.load %0[%1, %1] : memref<4x4xi32> - - // CHECK: memref.prefetch %arg0[%arg1, %arg1], write, locality<1>, data : memref<4x4xi32> - memref.prefetch %0[%1, %1], write, locality<1>, data : memref<4x4xi32> - - // CHECK: memref.prefetch %arg0[%arg1, %arg1], read, locality<3>, instr : memref<4x4xi32> - memref.prefetch %0[%1, %1], read, locality<3>, instr : memref<4x4xi32> - - return -} - -// Test with zero-dimensional operands using no index in load/store. -// CHECK-LABEL: func @zero_dim_no_idx -func.func @zero_dim_no_idx(%arg0 : memref, %arg1 : memref, %arg2 : memref) { - %0 = memref.load %arg0[] : memref - memref.store %0, %arg1[] : memref - return - // CHECK: %0 = memref.load %{{.*}}[] : memref - // CHECK: memref.store %{{.*}}, %{{.*}}[] : memref -} - // CHECK-LABEL: func @return_op(%arg0: i32) -> i32 { func.func @return_op(%a : i32) -> i32 { // CHECK: return %arg0 : i32 @@ -232,51 +204,6 @@ func.func @calls(%arg0: i32) { return } -// CHECK-LABEL: func @memref_cast(%arg0 -func.func @memref_cast(%arg0: memref<4xf32>, %arg1 : memref, %arg2 : memref<64x16x4xf32, strided<[64, 4, 1], offset: 0>>) { - // CHECK: memref.cast %{{.*}} : memref<4xf32> to memref - %0 = memref.cast %arg0 : memref<4xf32> to memref - - // CHECK: memref.cast %{{.*}} : memref to memref<4xf32> - %1 = memref.cast %arg1 : memref to memref<4xf32> - - // CHECK: memref.cast %{{.*}} : memref<64x16x4xf32, strided<[64, 4, 1]>> to memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> - %2 = memref.cast %arg2 : memref<64x16x4xf32, strided<[64, 4, 1], offset: 0>> to memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> - - 
// CHECK: memref.cast {{%.*}} : memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> to memref<64x16x4xf32, strided<[64, 4, 1]>> - %3 = memref.cast %2 : memref<64x16x4xf32, strided<[?, ?, ?], offset: ?>> to memref<64x16x4xf32, strided<[64, 4, 1], offset: 0>> - - // CHECK: memref.cast %{{.*}} : memref<4xf32> to memref<*xf32> - %4 = memref.cast %1 : memref<4xf32> to memref<*xf32> - - // CHECK: memref.cast %{{.*}} : memref<*xf32> to memref<4xf32> - %5 = memref.cast %4 : memref<*xf32> to memref<4xf32> - return -} - -// Check that unranked memrefs with non-default memory space roundtrip -// properly. -// CHECK-LABEL: @unranked_memref_roundtrip(memref<*xf32, 4>) -func.func private @unranked_memref_roundtrip(memref<*xf32, 4>) - -// CHECK-LABEL: func @memref_view(%arg0 -func.func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) { - %0 = memref.alloc() : memref<2048xi8> - // Test two dynamic sizes and dynamic offset. - // CHECK: memref.view {{.*}} : memref<2048xi8> to memref - %1 = memref.view %0[%arg2][%arg0, %arg1] : memref<2048xi8> to memref - - // Test one dynamic size and dynamic offset. - // CHECK: memref.view {{.*}} : memref<2048xi8> to memref<4x?xf32> - %3 = memref.view %0[%arg2][%arg1] : memref<2048xi8> to memref<4x?xf32> - - // Test static sizes and static offset. 
- // CHECK: memref.view {{.*}} : memref<2048xi8> to memref<64x4xf32> - %c0 = arith.constant 0: index - %5 = memref.view %0[%c0][] : memref<2048xi8> to memref<64x4xf32> - return -} - // CHECK-LABEL: func @test_dimop // CHECK-SAME: %[[ARG:.*]]: tensor<4x4x?xf32> func.func @test_dimop(%arg0: tensor<4x4x?xf32>) { @@ -288,11 +215,3 @@ func.func @test_dimop(%arg0: tensor<4x4x?xf32>) { %1 = affine.apply affine_map<(d0) -> (d0)>(%0) return } - -// CHECK-LABEL: func @assume_alignment -// CHECK-SAME: %[[MEMREF:.*]]: memref<4x4xf16> -func.func @assume_alignment(%0: memref<4x4xf16>) { - // CHECK: memref.assume_alignment %[[MEMREF]], 16 : memref<4x4xf16> - memref.assume_alignment %0, 16 : memref<4x4xf16> - return -} diff --git a/mlir/test/IR/memory-ops.mlir b/mlir/test/IR/memory-ops.mlir deleted file mode 100644 index c1cfc3bfa0dbf7..00000000000000 --- a/mlir/test/IR/memory-ops.mlir +++ /dev/null @@ -1,118 +0,0 @@ -// RUN: mlir-opt %s | FileCheck %s - -// CHECK: #[[$MAP:.*]] = affine_map<(d0, d1)[s0] -> (d0 + s0, d1)> - -// CHECK-LABEL: func @alloc() { -func.func @alloc() { -^bb0: - // Test simple alloc. - // CHECK: %{{.*}} = memref.alloc() : memref<1024x64xf32, 1> - %0 = memref.alloc() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> - - %c0 = "arith.constant"() {value = 0: index} : () -> index - %c1 = "arith.constant"() {value = 1: index} : () -> index - - // Test alloc with dynamic dimensions. - // CHECK: %{{.*}} = memref.alloc(%{{.*}}, %{{.*}}) : memref - %1 = memref.alloc(%c0, %c1) : memref (d0, d1)>, 1> - - // Test alloc with no dynamic dimensions and one symbol. - // CHECK: %{{.*}} = memref.alloc()[%{{.*}}] : memref<2x4xf32, #[[$MAP]], 1> - %2 = memref.alloc()[%c0] : memref<2x4xf32, affine_map<(d0, d1)[s0] -> ((d0 + s0), d1)>, 1> - - // Test alloc with dynamic dimensions and one symbol. 
- // CHECK: %{{.*}} = memref.alloc(%{{.*}})[%{{.*}}] : memref<2x?xf32, #[[$MAP]], 1> - %3 = memref.alloc(%c1)[%c0] : memref<2x?xf32, affine_map<(d0, d1)[s0] -> (d0 + s0, d1)>, 1> - - // Alloc with no mappings. - // b/116054838 Parser crash while parsing ill-formed AllocOp - // CHECK: %{{.*}} = memref.alloc() : memref<2xi32> - %4 = memref.alloc() : memref<2 x i32> - - // CHECK: return - return -} - -// CHECK-LABEL: func @alloca() { -func.func @alloca() { -^bb0: - // Test simple alloc. - // CHECK: %{{.*}} = memref.alloca() : memref<1024x64xf32, 1> - %0 = memref.alloca() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> - - %c0 = "arith.constant"() {value = 0: index} : () -> index - %c1 = "arith.constant"() {value = 1: index} : () -> index - - // Test alloca with dynamic dimensions. - // CHECK: %{{.*}} = memref.alloca(%{{.*}}, %{{.*}}) : memref - %1 = memref.alloca(%c0, %c1) : memref (d0, d1)>, 1> - - // Test alloca with no dynamic dimensions and one symbol. - // CHECK: %{{.*}} = memref.alloca()[%{{.*}}] : memref<2x4xf32, #[[$MAP]], 1> - %2 = memref.alloca()[%c0] : memref<2x4xf32, affine_map<(d0, d1)[s0] -> ((d0 + s0), d1)>, 1> - - // Test alloca with dynamic dimensions and one symbol. - // CHECK: %{{.*}} = memref.alloca(%{{.*}})[%{{.*}}] : memref<2x?xf32, #[[$MAP]], 1> - %3 = memref.alloca(%c1)[%c0] : memref<2x?xf32, affine_map<(d0, d1)[s0] -> (d0 + s0, d1)>, 1> - - // Alloca with no mappings, but with alignment. 
- // CHECK: %{{.*}} = memref.alloca() {alignment = 64 : i64} : memref<2xi32> - %4 = memref.alloca() {alignment = 64} : memref<2 x i32> - - return -} - -// CHECK-LABEL: func @dealloc() { -func.func @dealloc() { -^bb0: - // CHECK: %{{.*}} = memref.alloc() : memref<1024x64xf32> - %0 = memref.alloc() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 0> - - // CHECK: memref.dealloc %{{.*}} : memref<1024x64xf32> - memref.dealloc %0 : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 0> - return -} - -// CHECK-LABEL: func @load_store -func.func @load_store() { -^bb0: - // CHECK: %{{.*}} = memref.alloc() : memref<1024x64xf32, 1> - %0 = memref.alloc() : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> - - %1 = arith.constant 0 : index - %2 = arith.constant 1 : index - - // CHECK: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x64xf32, 1> - %3 = memref.load %0[%1, %2] : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> - - // CHECK: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<1024x64xf32, 1> - memref.store %3, %0[%1, %2] : memref<1024x64xf32, affine_map<(d0, d1) -> (d0, d1)>, 1> - - return -} - -// CHECK-LABEL: func @dma_ops() -func.func @dma_ops() { - %c0 = arith.constant 0 : index - %stride = arith.constant 32 : index - %elt_per_stride = arith.constant 16 : index - - %A = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 0> - %Ah = memref.alloc() : memref<256 x f32, affine_map<(d0) -> (d0)>, 1> - %tag = memref.alloc() : memref<1 x f32> - - %num_elements = arith.constant 256 : index - - memref.dma_start %A[%c0], %Ah[%c0], %num_elements, %tag[%c0] : memref<256 x f32>, memref<256 x f32, 1>, memref<1 x f32> - memref.dma_wait %tag[%c0], %num_elements : memref<1 x f32> - // CHECK: dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}[%{{.*}}] : memref<256xf32>, memref<256xf32, 1>, memref<1xf32> - // CHECK-NEXT: dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32> - - // DMA with strides - memref.dma_start 
%A[%c0], %Ah[%c0], %num_elements, %tag[%c0], %stride, %elt_per_stride : memref<256 x f32>, memref<256 x f32, 1>, memref<1 x f32> - memref.dma_wait %tag[%c0], %num_elements : memref<1 x f32> - // CHECK-NEXT: dma_start %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}}[%{{.*}}], %{{.*}}, %{{.*}} : memref<256xf32>, memref<256xf32, 1>, memref<1xf32> - // CHECK-NEXT: dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xf32> - - return -} diff --git a/mlir/test/Target/LLVMIR/Import/nneg.ll b/mlir/test/Target/LLVMIR/Import/nneg.ll new file mode 100644 index 00000000000000..07756b9f706bdb --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/nneg.ll @@ -0,0 +1,10 @@ +; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s + +; CHECK-LABEL: @nnegflag_inst +define void @nnegflag_inst(i32 %arg1) { + ; CHECK: llvm.zext nneg %{{.*}} : i32 to i64 + %1 = zext nneg i32 %arg1 to i64 + ; CHECK: llvm.uitofp nneg %{{.*}} : i32 to f32 + %2 = uitofp nneg i32 %arg1 to float + ret void +} diff --git a/mlir/test/Target/LLVMIR/Import/nsw_nuw.ll b/mlir/test/Target/LLVMIR/Import/nsw_nuw.ll index d08098a5e5dfe0..4af799da36dc08 100644 --- a/mlir/test/Target/LLVMIR/Import/nsw_nuw.ll +++ b/mlir/test/Target/LLVMIR/Import/nsw_nuw.ll @@ -10,5 +10,7 @@ define void @intflag_inst(i64 %arg1, i64 %arg2) { %3 = mul nsw nuw i64 %arg1, %arg2 ; CHECK: llvm.shl %{{.*}}, %{{.*}} overflow : i64 %4 = shl nuw nsw i64 %arg1, %arg2 + ; CHECK: llvm.trunc %{{.*}} overflow : i64 to i32 + %5 = trunc nsw i64 %arg1 to i32 ret void } diff --git a/mlir/test/Target/LLVMIR/nneg.mlir b/mlir/test/Target/LLVMIR/nneg.mlir new file mode 100644 index 00000000000000..8afa765a510e24 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nneg.mlir @@ -0,0 +1,10 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// CHECK-LABEL: define void @nnegflag_func +llvm.func @nnegflag_func(%arg0: i32) { + // CHECK: %{{.*}} = zext nneg i32 %{{.*}} to i64 + %0 = llvm.zext nneg %arg0 : i32 to i64 + // CHECK: %{{.*}} = uitofp nneg i32 
%{{.*}} to float + %1 = llvm.uitofp nneg %arg0 : i32 to f32 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nsw_nuw.mlir b/mlir/test/Target/LLVMIR/nsw_nuw.mlir index 6843c2ef0299c7..584aa05a04f7cf 100644 --- a/mlir/test/Target/LLVMIR/nsw_nuw.mlir +++ b/mlir/test/Target/LLVMIR/nsw_nuw.mlir @@ -10,5 +10,7 @@ llvm.func @intflags_func(%arg0: i64, %arg1: i64) { %2 = llvm.mul %arg0, %arg1 overflow : i64 // CHECK: %{{.*}} = shl nuw nsw i64 %{{.*}}, %{{.*}} %3 = llvm.shl %arg0, %arg1 overflow : i64 + // CHECK: %{{.*}} = trunc nuw i64 %{{.*}} to i32 + %4 = llvm.trunc %arg1 overflow : i64 to i32 llvm.return } diff --git a/mlir/test/Transforms/cse.mlir b/mlir/test/Transforms/cse.mlir index 11a33102684733..b447094874d017 100644 --- a/mlir/test/Transforms/cse.mlir +++ b/mlir/test/Transforms/cse.mlir @@ -1,7 +1,4 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(cse))' | FileCheck %s - -// CHECK-DAG: #[[$MAP:.*]] = affine_map<(d0) -> (d0 mod 2)> -#map0 = affine_map<(d0) -> (d0 mod 2)> +// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline='builtin.module(func.func(cse))' -split-input-file | FileCheck %s // CHECK-LABEL: @simple_constant func.func @simple_constant() -> (i32, i32) { @@ -13,6 +10,11 @@ func.func @simple_constant() -> (i32, i32) { return %0, %1 : i32, i32 } +// ----- + +// CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 mod 2)> +#map0 = affine_map<(d0) -> (d0 mod 2)> + // CHECK-LABEL: @basic func.func @basic() -> (index, index) { // CHECK: %[[VAR_c0:[0-9a-zA-Z_]+]] = arith.constant 0 : index @@ -27,6 +29,8 @@ func.func @basic() -> (index, index) { return %0, %1 : index, index } +// ----- + // CHECK-LABEL: @many func.func @many(f32, f32) -> (f32) { ^bb0(%a : f32, %b : f32): @@ -52,6 +56,8 @@ func.func @many(f32, f32) -> (f32) { return %l : f32 } +// ----- + /// Check that operations are not eliminated if they have different operands. 
// CHECK-LABEL: @different_ops func.func @different_ops() -> (i32, i32) { @@ -64,6 +70,8 @@ func.func @different_ops() -> (i32, i32) { return %0, %1 : i32, i32 } +// ----- + /// Check that operations are not eliminated if they have different result /// types. // CHECK-LABEL: @different_results @@ -77,6 +85,8 @@ func.func @different_results(%arg0: tensor<*xf32>) -> (tensor, tensor<4 return %0, %1 : tensor, tensor<4x?xf32> } +// ----- + /// Check that operations are not eliminated if they have different attributes. // CHECK-LABEL: @different_attributes func.func @different_attributes(index, index) -> (i1, i1, i1) { @@ -93,6 +103,8 @@ func.func @different_attributes(index, index) -> (i1, i1, i1) { return %0, %1, %2 : i1, i1, i1 } +// ----- + /// Check that operations with side effects are not eliminated. // CHECK-LABEL: @side_effect func.func @side_effect() -> (memref<2x1xf32>, memref<2x1xf32>) { @@ -106,6 +118,8 @@ func.func @side_effect() -> (memref<2x1xf32>, memref<2x1xf32>) { return %0, %1 : memref<2x1xf32>, memref<2x1xf32> } +// ----- + /// Check that operation definitions are properly propagated down the dominance /// tree. // CHECK-LABEL: @down_propagate_for @@ -122,6 +136,8 @@ func.func @down_propagate_for() { return } +// ----- + // CHECK-LABEL: @down_propagate func.func @down_propagate() -> i32 { // CHECK-NEXT: %[[VAR_c1_i32:[0-9a-zA-Z_]+]] = arith.constant 1 : i32 @@ -142,6 +158,8 @@ func.func @down_propagate() -> i32 { return %arg : i32 } +// ----- + /// Check that operation definitions are NOT propagated up the dominance tree. 
// CHECK-LABEL: @up_propagate_for func.func @up_propagate_for() -> i32 { @@ -159,6 +177,8 @@ func.func @up_propagate_for() -> i32 { return %1 : i32 } +// ----- + // CHECK-LABEL: func @up_propagate func.func @up_propagate() -> i32 { // CHECK-NEXT: %[[VAR_c0_i32:[0-9a-zA-Z_]+]] = arith.constant 0 : i32 @@ -188,6 +208,8 @@ func.func @up_propagate() -> i32 { return %add : i32 } +// ----- + /// The same test as above except that we are testing on a cfg embedded within /// an operation region. // CHECK-LABEL: func @up_propagate_region @@ -221,6 +243,8 @@ func.func @up_propagate_region() -> i32 { return %0 : i32 } +// ----- + /// This test checks that nested regions that are isolated from above are /// properly handled. // CHECK-LABEL: @nested_isolated @@ -248,6 +272,8 @@ func.func @nested_isolated() -> i32 { return %0 : i32 } +// ----- + /// This test is checking that CSE gracefully handles values in graph regions /// where the use occurs before the def, and one of the defs could be CSE'd with /// the other. @@ -269,6 +295,8 @@ func.func @use_before_def() { return } +// ----- + /// This test is checking that CSE is removing duplicated read op that follow /// other. // CHECK-LABEL: @remove_direct_duplicated_read_op @@ -281,6 +309,8 @@ func.func @remove_direct_duplicated_read_op() -> i32 { return %2 : i32 } +// ----- + /// This test is checking that CSE is removing duplicated read op that follow /// other. // CHECK-LABEL: @remove_multiple_duplicated_read_op @@ -300,6 +330,8 @@ func.func @remove_multiple_duplicated_read_op() -> i64 { return %6 : i64 } +// ----- + /// This test is checking that CSE is not removing duplicated read op that /// have write op in between. // CHECK-LABEL: @dont_remove_duplicated_read_op_with_sideeffecting @@ -314,6 +346,8 @@ func.func @dont_remove_duplicated_read_op_with_sideeffecting() -> i32 { return %2 : i32 } +// ----- + // Check that an operation with a single region can CSE. 
func.func @cse_single_block_ops(%a : tensor, %b : tensor) -> (tensor, tensor) { @@ -332,6 +366,8 @@ func.func @cse_single_block_ops(%a : tensor, %b : tensor) // CHECK-NOT: test.cse_of_single_block_op // CHECK: return %[[OP]], %[[OP]] +// ----- + // Operations with different number of bbArgs dont CSE. func.func @no_cse_varied_bbargs(%a : tensor, %b : tensor) -> (tensor, tensor) { @@ -350,6 +386,8 @@ func.func @no_cse_varied_bbargs(%a : tensor, %b : tensor) // CHECK: %[[OP1:.+]] = test.cse_of_single_block_op // CHECK: return %[[OP0]], %[[OP1]] +// ----- + // Operations with different regions dont CSE func.func @no_cse_region_difference_simple(%a : tensor, %b : tensor) -> (tensor, tensor) { @@ -368,6 +406,8 @@ func.func @no_cse_region_difference_simple(%a : tensor, %b : tensor, %b : tensor, %c : f32, %d : i1) -> (tensor, tensor) { @@ -392,6 +432,8 @@ func.func @cse_single_block_ops_identical_bodies(%a : tensor, %b : tens // CHECK-NOT: test.cse_of_single_block_op // CHECK: return %[[OP]], %[[OP]] +// ----- + // Operation with non-identical regions dont CSE. 
func.func @no_cse_single_block_ops_different_bodies(%a : tensor, %b : tensor, %c : f32, %d : i1) -> (tensor, tensor) { @@ -416,6 +458,8 @@ func.func @no_cse_single_block_ops_different_bodies(%a : tensor, %b : t // CHECK: %[[OP1:.+]] = test.cse_of_single_block_op // CHECK: return %[[OP0]], %[[OP1]] +// ----- + func.func @failing_issue_59135(%arg0: tensor<2x2xi1>, %arg1: f32, %arg2 : tensor<2xi1>) -> (tensor<2xi1>, tensor<2xi1>) { %false_2 = arith.constant false %true_5 = arith.constant true @@ -438,6 +482,8 @@ func.func @failing_issue_59135(%arg0: tensor<2x2xi1>, %arg1: f32, %arg2 : tensor // CHECK: test.region_yield %[[TRUE]] // CHECK: return %[[OP]], %[[OP]] +// ----- + func.func @cse_multiple_regions(%c: i1, %t: tensor<5xf32>) -> (tensor<5xf32>, tensor<5xf32>) { %r1 = scf.if %c -> (tensor<5xf32>) { %0 = tensor.empty() : tensor<5xf32> @@ -463,6 +509,8 @@ func.func @cse_multiple_regions(%c: i1, %t: tensor<5xf32>) -> (tensor<5xf32>, te // CHECK-NOT: scf.if // CHECK: return %[[if]], %[[if]] +// ----- + // CHECK-LABEL: @cse_recursive_effects_success func.func @cse_recursive_effects_success() -> (i32, i32, i32) { // CHECK-NEXT: %[[READ_VALUE:.*]] = "test.op_with_memread"() : () -> i32 @@ -492,6 +540,8 @@ func.func @cse_recursive_effects_success() -> (i32, i32, i32) { return %0, %2, %1 : i32, i32, i32 } +// ----- + // CHECK-LABEL: @cse_recursive_effects_failure func.func @cse_recursive_effects_failure() -> (i32, i32, i32) { // CHECK-NEXT: %[[READ_VALUE:.*]] = "test.op_with_memread"() : () -> i32 diff --git a/mlir/test/lib/IR/TestDominance.cpp b/mlir/test/lib/IR/TestDominance.cpp index fab80bdacb032d..b34149b3e2cbdf 100644 --- a/mlir/test/lib/IR/TestDominance.cpp +++ b/mlir/test/lib/IR/TestDominance.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/Builders.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/SymbolTable.h" #include "mlir/Pass/Pass.h" @@ -24,24 +25,46 @@ static bool 
dominatesOrPostDominates(DominanceInfo &dominanceInfo, Block *a, Block *b) { return dominanceInfo.dominates(a, b); } - static bool dominatesOrPostDominates(PostDominanceInfo &dominanceInfo, Block *a, Block *b) { return dominanceInfo.postDominates(a, b); } +static bool properlyDominatesOrPostDominates(DominanceInfo &dominanceInfo, + Block *a, Block *b) { + return dominanceInfo.properlyDominates(a, b); +} +static bool properlyDominatesOrPostDominates(PostDominanceInfo &dominanceInfo, + Block *a, Block *b) { + return dominanceInfo.properlyPostDominates(a, b); +} namespace { /// Helper class to print dominance information. class DominanceTest { public: + static constexpr StringRef kBlockIdsAttrName = "test.block_ids"; + /// Constructs a new test instance using the given operation. DominanceTest(Operation *operation) : operation(operation) { - // Create unique ids for each block. + Builder b(operation->getContext()); + + // Helper function that annotates the IR with block IDs. + auto annotateBlockId = [&](Operation *op, int64_t blockId) { + auto idAttr = op->getAttrOfType(kBlockIdsAttrName); + SmallVector ids; + if (idAttr) + ids = llvm::to_vector(idAttr.asArrayRef()); + ids.push_back(blockId); + op->setAttr(kBlockIdsAttrName, b.getDenseI64ArrayAttr(ids)); + }; + + // Create unique IDs for each block. 
operation->walk([&](Operation *nested) { if (blockIds.count(nested->getBlock()) > 0) return; blockIds.insert({nested->getBlock(), blockIds.size()}); + annotateBlockId(nested->getBlock()->getParentOp(), blockIds.size() - 1); }); } @@ -61,26 +84,28 @@ class DominanceTest { if (!visited.insert(nestedBlock).second) return; if (printCommonDominatorInfo) { - llvm::errs() << "Nearest(" << blockIds[block] << ", " + llvm::outs() << "Nearest(" << blockIds[block] << ", " << blockIds[nestedBlock] << ") = "; Block *dom = dominanceInfo.findNearestCommonDominator(block, nestedBlock); if (dom) - llvm::errs() << blockIds[dom]; + llvm::outs() << blockIds[dom]; else - llvm::errs() << ""; - llvm::errs() << "\n"; + llvm::outs() << ""; + llvm::outs() << "\n"; } else { if (std::is_same::value) - llvm::errs() << "dominates("; - else - llvm::errs() << "postdominates("; - llvm::errs() << blockIds[block] << ", " << blockIds[nestedBlock] - << ") = "; - if (dominatesOrPostDominates(dominanceInfo, block, nestedBlock)) - llvm::errs() << "true\n"; + llvm::outs() << "dominates("; else - llvm::errs() << "false\n"; + llvm::outs() << "postdominates("; + llvm::outs() << blockIds[block] << ", " << blockIds[nestedBlock] + << ") = " + << std::to_string(dominatesOrPostDominates( + dominanceInfo, block, nestedBlock)) + << " (properly = " + << std::to_string(properlyDominatesOrPostDominates( + dominanceInfo, block, nestedBlock)) + << ")\n"; } }); }); @@ -101,24 +126,24 @@ struct TestDominancePass } void runOnOperation() override { - llvm::errs() << "Testing : " << getOperation().getName() << "\n"; + llvm::outs() << "Testing : " << getOperation().getName() << "\n"; DominanceTest dominanceTest(getOperation()); // Print dominance information. 
- llvm::errs() << "--- DominanceInfo ---\n"; + llvm::outs() << "--- DominanceInfo ---\n"; dominanceTest.printDominance(getAnalysis(), /*printCommonDominatorInfo=*/true); - llvm::errs() << "--- PostDominanceInfo ---\n"; + llvm::outs() << "--- PostDominanceInfo ---\n"; dominanceTest.printDominance(getAnalysis(), /*printCommonDominatorInfo=*/true); // Print dominance relationship between blocks. - llvm::errs() << "--- Block Dominance relationship ---\n"; + llvm::outs() << "--- Block Dominance relationship ---\n"; dominanceTest.printDominance(getAnalysis(), /*printCommonDominatorInfo=*/false); - llvm::errs() << "--- Block PostDominance relationship ---\n"; + llvm::outs() << "--- Block PostDominance relationship ---\n"; dominanceTest.printDominance(getAnalysis(), /*printCommonDominatorInfo=*/false); } diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 37669d9033fe30..7a6d1cf2e9c6de 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2203,6 +2203,13 @@ libc_math_function( ], ) +libc_math_function( + name = "exp10m1f", + additional_deps = [ + ":explogxf", + ], +) + libc_math_function( name = "exp2", additional_deps = [ diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 20447d59c1f92a..0628947540ca73 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -32,6 +32,8 @@ exports_files([ "include/llvm/IR/Intrinsics.td", "include/llvm/Option/OptParser.td", "utils/lit/lit.py", + # This one is needed for building and vendoring out lldb from off tree. + "utils/lldbDataFormatters.py", ]) # It may be tempting to add compiler flags here, but that should be avoided. 
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index e2b24fd253a676..b119fbe18de5fb 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3236,6 +3236,7 @@ td_library( includes = ["include"], deps = [ ":BuiltinDialectTdFiles", + ":DestinationStyleOpInterfaceTdFiles", ":InferTypeOpInterfaceTdFiles", ":SideEffectInterfacesTdFiles", ], @@ -3370,6 +3371,7 @@ cc_library( deps = [ ":ArithDialect", ":BytecodeOpInterface", + ":DestinationStyleOpInterface", ":DialectUtils", ":IR", ":InferTypeOpInterface", @@ -10095,34 +10097,6 @@ cc_binary( ], ) -cc_binary( - name = "mlir-spirv-cpu-runner", - srcs = ["tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp"], - deps = [ - ":ArithDialect", - ":BuiltinToLLVMIRTranslation", - ":ExecutionEngineUtils", - ":FuncDialect", - ":FuncToLLVM", - ":GPUDialect", - ":GPUToSPIRV", - ":GPUTransforms", - ":LLVMDialect", - ":LLVMToLLVMIRTranslation", - ":MemRefDialect", - ":MlirJitRunner", - ":Pass", - ":SPIRVConversion", - ":SPIRVDialect", - ":SPIRVToLLVM", - ":SPIRVTransforms", - ":ToLLVMIRTranslation", - "//llvm:Core", - "//llvm:Linker", - "//llvm:Support", - ], -) - cc_library( name = "TableGen", srcs = glob(["lib/TableGen/*.cpp"]),