diff --git a/rvv-intrinsic-generator/rvv_intrinsic_gen/constants.py b/rvv-intrinsic-generator/rvv_intrinsic_gen/constants.py index 15915a6f6..46217d902 100644 --- a/rvv-intrinsic-generator/rvv_intrinsic_gen/constants.py +++ b/rvv-intrinsic-generator/rvv_intrinsic_gen/constants.py @@ -27,8 +27,10 @@ WFSEWS = [16, 32] NSEWS = [16, 32, 64] TYPES = ["float", "int", "uint"] +BTYPES = ["float", "int", "uint", "bfloat"] ITYPES = ["int", "uint"] FTYPES = ["float"] +BFTYPES = ["bfloat"] MTYPES = ["bool"] MLENS = [1, 2, 4, 8, 16, 32, 64] REF_DOC_URL = "../rvv-intrinsic-api.md" diff --git a/rvv-intrinsic-generator/rvv_intrinsic_gen/inst.py b/rvv-intrinsic-generator/rvv_intrinsic_gen/inst.py index bb9e3dba0..1cd09d542 100644 --- a/rvv-intrinsic-generator/rvv_intrinsic_gen/inst.py +++ b/rvv-intrinsic-generator/rvv_intrinsic_gen/inst.py @@ -43,7 +43,7 @@ from templates import mask_load_store_template from templates import permute_template from constants import LMULS,WLMULS,NCVTLMULS,SEWS,WSEWS,FSEWS,WFSEWS,NSEWS,\ - TYPES,ITYPES,FTYPES,MTYPES,MLENS,REF_DOC_URL + TYPES,BTYPES,ITYPES,FTYPES,BFTYPES,MTYPES,MLENS,REF_DOC_URL from generator import CompatibleHeaderGenerator @@ -68,11 +68,11 @@ def gen(g): g.function_group(load_template, "Vector Unit-Stride Load Functions", REF_DOC_URL + "#74-vector-unit-stride-operations", ["vle"], - TYPES, SEWS, LMULS, decorators.has_masking_maskedoff_policy) + BTYPES, SEWS, LMULS, decorators.has_masking_maskedoff_policy) g.function_group(store_template, "Vector Unit-Stride Store Functions", REF_DOC_URL + "#74-vector-unit-stride-operations", ["vse"], - TYPES, SEWS, LMULS, decorators.has_masking_no_maskedoff) + BTYPES, SEWS, LMULS, decorators.has_masking_no_maskedoff) g.function_group(load_template, "Vector Strided Load Functions", REF_DOC_URL + "#75-vector-strided-loadstore-operations", @@ -408,6 +408,27 @@ def gen(g): "Narrowing Floating-Point/Integer Type-Convert Functions", REF_DOC_URL + 
"#1419-narrowing-floating-pointinteger-type-convert-operations", ["ncvt"], "", NSEWS, NCVTLMULS, decorators.has_masking_maskedoff_policy) + + #################################################################### + g.start_group("Vector BFloat16 Functions (still on the draft status)") + + g.function_group( + mac_template, "Vector BFloat16 Widening Multiply-Add Functions", + REF_DOC_URL + "#1420-vector-bf16-widening-multiply-add-operations", + ["wmacc"], BFTYPES, WFSEWS, WLMULS, + decorators.has_masking_no_maskedoff_policy) + + g.function_group( + cvt_op_template, "Widening BFloat16/FP32 Type-Convert Functions", + REF_DOC_URL + + "#1421-widening-bf16-fp32-type-convert-operations", ["wcvtbf16"], + "", WSEWS, WLMULS, decorators.has_masking_maskedoff_policy) + + g.function_group( + cvt_op_template, + "Narrowing FP32/BFloat16 Type-Convert Functions", REF_DOC_URL + + "#1422-narrowing-fp32-bf16-type-convert-operations", ["ncvtbf16"], + "", NSEWS, NCVTLMULS, decorators.has_masking_maskedoff_policy) #################################################################### g.start_group("Vector Reduction Functions") diff --git a/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/cvt_op_template.py b/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/cvt_op_template.py index 1d6580682..a912ca39b 100644 --- a/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/cvt_op_template.py +++ b/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/cvt_op_template.py @@ -40,7 +40,8 @@ def render(G, op_list, type_list, sew_list, lmul_list, decorator_list): convert_set = [["int", "x", "float", "f"], ["int", "x", "int", "x"], ["uint", "x", "uint", "x"], ["uint", "xu", "float", "f"], ["float", "f", "int", "x"], ["float", "f", "uint", "xu"], - ["float", "f", "float", "f"]] + ["float", "f", "float", "f"], ["bfloat", "bf", "float", "f"], + ["float", "f", "bfloat", "bf"]] for args in prod( OP=op_list, SEW=sew_list, TYPES=convert_set, LMUL=lmul_list): op = args["OP"] @@ -54,11 +55,19 @@ def render(G, 
op_list, type_list, sew_list, lmul_list, decorator_list): if (op == "cvt" and args["TYPES1"] == args["TYPES3"]): continue + if ((args["TYPES1"] == "bf" or args["TYPES3"] == "bf") and + op != "wcvtbf16" and op != "ncvtbf16"): + continue + + if ((op == "wcvtbf16" and args["TYPES3"] != "bf" ) or + (op == "ncvtbf16" and args["TYPES1"] != "bf" )): + continue + args["MIDDLE"] = "v" factor = "" - if op == "wcvt": + if op == "wcvt" or op == "wcvtbf16": factor = "W" - if op == "ncvt": + if op == "ncvt" or op == "ncvtbf16": factor = "N" args["MIDDLE"] = "w" @@ -101,7 +110,7 @@ def render(G, op_list, type_list, sew_list, lmul_list, decorator_list): **decorator.tu_dest_args(rt), src=src_type, vl=type_helper.size_t) - if args["TYPES1"] != args["TYPES3"] and args["TYPES3"] == "f": + if args["TYPES1"] != args["TYPES3"] and args["TYPES3"] == "f" and args["TYPES1"] != "bf": args["OP"] = args["OP"] + "_rtz" inst_info = InstInfo.get( args, decorator, InstType.VV, extra_attr=extra_attr) diff --git a/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/mac_template.py b/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/mac_template.py index 0a44115f2..9b316a8a5 100644 --- a/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/mac_template.py +++ b/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/mac_template.py @@ -41,6 +41,10 @@ def render(G, op_list, type_list, sew_list, lmul_list, decorator_list): args["S_TYPE"] = "f" args["OP"] = "f" + op inst_type = InstType.VVF + elif data_type == "bfloat": + args["S_TYPE"] = "f" + args["OP"] = "f" + op + "bf16" + inst_type = InstType.VVF else: args["S_TYPE"] = "x" inst_type = InstType.VVX @@ -146,6 +150,30 @@ def render(G, op_list, type_list, sew_list, lmul_list, decorator_list): vs1=type_helper.s, vs2=type_helper.v, vl=type_helper.size_t) + elif data_type == "bfloat": + if "wmacc" in op and args["SEW"] == 16: + G.func( + inst_info_vv, + name="{OP}_vv_f{WSEW}m{WLMUL}".format_map(args) + + decorator.func_suffix, + 
return_type="vfloat{WSEW}m{WLMUL}_t".format_map(args), + **decorator.mask_args(type_helper.m, type_helper.v), + vd="vfloat{WSEW}m{WLMUL}_t".format_map(args), + vs1=type_helper.v, + vs2=type_helper.v, + vl=type_helper.size_t) + G.func( + inst_info_vs, + name="{OP}_v{S_TYPE}_f{WSEW}m{WLMUL}".format_map(args) + + decorator.func_suffix, + return_type="vfloat{WSEW}m{WLMUL}_t".format_map(args), + **decorator.mask_args(type_helper.m, type_helper.v), + vd="vfloat{WSEW}m{WLMUL}_t".format_map(args), + vs1=type_helper.s, + vs2=type_helper.v, + vl=type_helper.size_t) + else: + continue else: G.func( inst_info_vv, diff --git a/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/reint_op_template.py b/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/reint_op_template.py index d7cdaf516..189596fa3 100644 --- a/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/reint_op_template.py +++ b/rvv-intrinsic-generator/rvv_intrinsic_gen/templates/reint_op_template.py @@ -40,8 +40,10 @@ def render(G, op_list, type_list, sew_list, lmul_list, decorator_list): # Variable in list means # [dst type, dst short type, src type, src short type] convert_set = [["float", "f", "int", "i"], ["float", "f", "uint", "u"], + ["bfloat", "bf", "int", "i"], ["bfloat", "bf", "uint", "u"], ["uint", "u", "int", "i"], ["int", "i", "uint", "u"], - ["int", "i", "float", "f"], ["uint", "u", "float", "f"]] + ["int", "i", "float", "f"], ["uint", "u", "float", "f"], + ["int", "i", "bfloat", "bf"], ["uint", "u", "bfloat", "bf"]] for args in prod( OP=op_list, SEW=sew_list, TYPES=convert_set, LMUL=lmul_list): diff --git a/rvv-intrinsic-rfc.md b/rvv-intrinsic-rfc.md index b193e018b..74cce92d3 100644 --- a/rvv-intrinsic-rfc.md +++ b/rvv-intrinsic-rfc.md @@ -51,32 +51,34 @@ Further, individual intrinsic functions depend on the availability of the corres Encode `SEW` and `LMUL` into data types. We enforce the constraint `LMUL ≥ SEW/ELEN` in the implementation. There are the following data types for `ELEN` = 64. 
-| Types | LMUL = 1 | LMUL = 2 | LMUL = 4 | LMUL = 8 | LMUL = 1/2 | LMUL = 1/4 | LMUL = 1/8 -| ------------ | ------------ | ------------ | ------------ | ----------- | ------------- | ------------- | -------------- -| **int64_t** | vint64m1_t | vint64m2_t | vint64m4_t | vint64m8_t | N/A | N/A | N/A -| **uint64_t** | vuint64m1_t | vuint64m2_t | vuint64m4_t | vuint64m8_t | N/A | N/A | N/A -| **int32_t** | vint32m1_t | vint32m2_t | vint32m4_t | vint32m8_t | vint32mf2_t | N/A | N/A -| **uint32_t** | vuint32m1_t | vuint32m2_t | vuint32m4_t | vuint32m8_t | vuint32mf2_t | N/A | N/A -| **int16_t** | vint16m1_t | vint16m2_t | vint16m4_t | vint16m8_t | vint16mf2_t | vint16mf4_t | N/A -| **uint16_t** | vuint16m1_t | vuint16m2_t | vuint16m4_t | vuint16m8_t | vuint16mf2_t | vuint16mf4_t | N/A -| **int8_t** | vint8m1_t | vint8m2_t | vint8m4_t | vint8m8_t | vint8mf2_t | vint8mf4_t | vint8mf8_t -| **uint8_t** | vuint8m1_t | vuint8m2_t | vuint8m4_t | vuint8m8_t | vuint8mf2_t | vuint8mf4_t | vuint8mf8_t -| **vfloat64** | vfloat64m1_t | vfloat64m2_t | vfloat64m4_t | vfloat64m8_t | N/A | N/A | N/A -| **vfloat32** | vfloat32m1_t | vfloat32m2_t | vfloat32m4_t | vfloat32m8_t | vfloat32mf2_t | N/A | N/A -| **vfloat16** | vfloat16m1_t | vfloat16m2_t | vfloat16m4_t | vfloat16m8_t | vfloat16mf2_t | vfloat16mf4_t | N/A +| Types | LMUL = 1 | LMUL = 2 | LMUL = 4 | LMUL = 8 | LMUL = 1/2 | LMUL = 1/4 | LMUL = 1/8 +| ------------ | ------------ | ------------ | ------------ | ----------- | ------------- | ------------- | -------------- +| **int64_t** | vint64m1_t | vint64m2_t | vint64m4_t | vint64m8_t | N/A | N/A | N/A +| **uint64_t** | vuint64m1_t | vuint64m2_t | vuint64m4_t | vuint64m8_t | N/A | N/A | N/A +| **int32_t** | vint32m1_t | vint32m2_t | vint32m4_t | vint32m8_t | vint32mf2_t | N/A | N/A +| **uint32_t** | vuint32m1_t | vuint32m2_t | vuint32m4_t | vuint32m8_t | vuint32mf2_t | N/A | N/A +| **int16_t** | vint16m1_t | vint16m2_t | vint16m4_t | vint16m8_t | vint16mf2_t | vint16mf4_t | N/A 
+| **uint16_t** | vuint16m1_t | vuint16m2_t | vuint16m4_t | vuint16m8_t | vuint16mf2_t | vuint16mf4_t | N/A +| **int8_t** | vint8m1_t | vint8m2_t | vint8m4_t | vint8m8_t | vint8mf2_t | vint8mf4_t | vint8mf8_t +| **uint8_t** | vuint8m1_t | vuint8m2_t | vuint8m4_t | vuint8m8_t | vuint8mf2_t | vuint8mf4_t | vuint8mf8_t +| **vfloat64** | vfloat64m1_t | vfloat64m2_t | vfloat64m4_t | vfloat64m8_t | N/A | N/A | N/A +| **vfloat32** | vfloat32m1_t | vfloat32m2_t | vfloat32m4_t | vfloat32m8_t | vfloat32mf2_t | N/A | N/A +| **vfloat16** | vfloat16m1_t | vfloat16m2_t | vfloat16m4_t | vfloat16m8_t | vfloat16mf2_t | vfloat16mf4_t | N/A +| **vbfloat16** | vbfloat16m1_t | vbfloat16m2_t | vbfloat16m4_t | vbfloat16m8_t | vbfloat16mf2_t | vbfloat16mf4_t | N/A There are the following data types for `ELEN` = 32. -| Types | LMUL = 1 | LMUL = 2 | LMUL = 4 | LMUL = 8 | LMUL = 1/2 | LMUL = 1/4 | LMUL = 1/8 -| ------------ | ------------ | ------------ | ------------ | ----------- | ------------- | ------------- | -------------- -| **int32_t** | vint32m1_t | vint32m2_t | vint32m4_t | vint32m8_t | N/A | N/A | N/A -| **uint32_t** | vuint32m1_t | vuint32m2_t | vuint32m4_t | vuint32m8_t | N/A | N/A | N/A -| **int16_t** | vint16m1_t | vint16m2_t | vint16m4_t | vint16m8_t | vint16mf2_t | N/A | N/A -| **uint16_t** | vuint16m1_t | vuint16m2_t | vuint16m4_t | vuint16m8_t | vuint16mf2_t | N/A | N/A -| **int8_t** | vint8m1_t | vint8m2_t | vint8m4_t | vint8m8_t | vint8mf2_t | vint8mf4_t | N/A -| **uint8_t** | vuint8m1_t | vuint8m2_t | vuint8m4_t | vuint8m8_t | vuint8mf2_t | vuint8mf4_t | N/A -| **vfloat32** | vfloat32m1_t | vfloat32m2_t | vfloat32m4_t | vfloat32m8_t | N/A | N/A | N/A -| **vfloat16** | vfloat16m1_t | vfloat16m2_t | vfloat16m4_t | vfloat16m8_t | vfloat16mf2_t | N/A | N/A +| Types | LMUL = 1 | LMUL = 2 | LMUL = 4 | LMUL = 8 | LMUL = 1/2 | LMUL = 1/4 | LMUL = 1/8 +| ------------ | ------------ | ------------ | ------------ | ----------- | ------------- | ------------- | -------------- +| 
**int32_t** | vint32m1_t | vint32m2_t | vint32m4_t | vint32m8_t | N/A | N/A | N/A +| **uint32_t** | vuint32m1_t | vuint32m2_t | vuint32m4_t | vuint32m8_t | N/A | N/A | N/A +| **int16_t** | vint16m1_t | vint16m2_t | vint16m4_t | vint16m8_t | vint16mf2_t | N/A | N/A +| **uint16_t** | vuint16m1_t | vuint16m2_t | vuint16m4_t | vuint16m8_t | vuint16mf2_t | N/A | N/A +| **int8_t** | vint8m1_t | vint8m2_t | vint8m4_t | vint8m8_t | vint8mf2_t | vint8mf4_t | N/A +| **uint8_t** | vuint8m1_t | vuint8m2_t | vuint8m4_t | vuint8m8_t | vuint8mf2_t | vuint8mf4_t | N/A +| **vfloat32** | vfloat32m1_t | vfloat32m2_t | vfloat32m4_t | vfloat32m8_t | N/A | N/A | N/A +| **vfloat16** | vfloat16m1_t | vfloat16m2_t | vfloat16m4_t | vfloat16m8_t | vfloat16mf2_t | N/A | N/A +| **vbfloat16** | vbfloat16m1_t | vbfloat16m2_t | vbfloat16m4_t | vbfloat16m8_t | vbfloat16mf2_t | N/A | N/A ### Mask Types