optimize ImageLoaderSVG::create_image_from_utf8_buffer

Redot-Engine · Oct 19, 2024 · 10fb526 · 10fb526
1 parent a480a05
commit 10fb526
Showing 1 changed file with 89 additions and 10 deletions.
diff --git a/modules/svg/image_loader_svg.cpp b/modules/svg/image_loader_svg.cpp
@@ -37,6 +37,16 @@
 
 #include <thorvg.h>
 
+#if defined(__SSSE3__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512BW__)
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
 HashMap<Color, Color> ImageLoaderSVG::forced_color_map = HashMap<Color, Color>();
 
 void ImageLoaderSVG::set_forced_color_map(const HashMap<Color, Color> &p_color_map) {
@@ -79,6 +89,84 @@ Ref<Image> ImageLoaderSVG::load_mem_svg(const uint8_t *p_svg, int p_size, float
 	return img;
 }
 
+static void RGBA_to_BGRA(uint32_t width, uint32_t height, Vector<uint8_t> &image, uint32_t *buffer) {
+	const uint32_t wh = width * height;
+	uint32_t i;
+#if defined(__AVX512BW__)
+	// Some of recent Intel processors and recent AMD processors (from Zen 4) support AVX-512.
+	{
+		uint8_t *dst = image.ptrw();
+		const __m512i mask = _mm512_setr_epi8(
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+		for (i = 0; i < wh / 16; ++i) {
+			__m512i s16 = _mm512_loadu_epi8(&buffer[i * 16]);
+			__m512i d16 = _mm512_shuffle_epi8(s16, mask);
+			_mm512_storeu_epi8(dst, d16);
+			dst += 64;
+		}
+		i = wh / 16 * 16;
+	}
+#elif defined(__AVX2__)
+	// AVX2 is widely aviable on recent x86-64 processors.
+	{
+		uint8_t *dst = image.ptrw();
+		const __m256i mask = _mm256_setr_epi8(
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
+				2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+		for (i = 0; i < wh / 8; ++i) {
+			__m256i s8 = _mm256_loadu_epi8(&buffer[i * 8]);
+			__m256i d8 = _mm256_shuffle_epi8(s8, mask);
+			_mm256_storeu_epi8(dst, d8);
+			dst += 32;
+		}
+		i = wh / 8 * 8;
+	}
+#elif defined(__AVX__) || defined(__SSSE3__)
+	// _mm_shuffle_epi8 is available from SSSE3
+	// Recent x86-64 processors support AVX/AVX2 and legacy SSE instructions are covered within AVX instructions.
+	// VC++ doesn't define __SSSE3__ when "/arch:AVX" is specified so __AVX__ is used for VC++.
+	// GCC supports fine-grained CFLAGS options such as -mssse3.
+	{
+		uint8_t *dst = image.ptrw();
+		const __m128i mask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+		for (i = 0; i < wh / 4; ++i) {
+			__m128i s4 = _mm_loadu_epi8(&buffer[i * 4]);
+			__m128i d4 = _mm_shuffle_epi8(s4, mask);
+			_mm_storeu_epi8(dst, d4);
+			dst += 16;
+		}
+		i = wh / 4 * 4;
+	}
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+	// vqtbl1q_u8 is available from ARMv8.
+	// ARMv7 NEON doesn't support vqtbl1q_u8.
+	{
+		uint8_t *dst = image.ptrw();
+		const uint8x16_t mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+		for (i = 0; i < wh / 4; ++i) {
+			uint8x16_t s4 = vreinterpretq_u8_u32(vld1q_u32(&buffer[i * 4]));
+			uint8x16_t d4 = vqtbl1q_u8(s4, mask);
+			vst1q_u8(dst, d4);
+			dst += 16;
+		}
+		i = wh / 4 * 4;
+	}
+#else
+	i = 0;
+#endif
+	for (; i < wh; ++i) {
+		uint32_t n = buffer[i];
+		const size_t offset = sizeof(uint32_t) * i;
+		image.write[offset + 0] = (n >> 16) & 0xff;
+		image.write[offset + 1] = (n >> 8) & 0xff;
+		image.write[offset + 2] = n & 0xff;
+		image.write[offset + 3] = (n >> 24) & 0xff;
+	}
+}
+
 Error ImageLoaderSVG::create_image_from_utf8_buffer(Ref<Image> p_image, const uint8_t *p_buffer, int p_buffer_size, float p_scale, bool p_upsample) {
 	ERR_FAIL_COND_V_MSG(Math::is_zero_approx(p_scale), ERR_INVALID_PARAMETER, "ImageLoaderSVG: Can't load SVG with a scale of 0.");
 
@@ -136,16 +224,7 @@ Error ImageLoaderSVG::create_image_from_utf8_buffer(Ref<Image> p_image, const ui
 	Vector<uint8_t> image;
 	image.resize(width * height * sizeof(uint32_t));
 
-	for (uint32_t y = 0; y < height; y++) {
-		for (uint32_t x = 0; x < width; x++) {
-			uint32_t n = buffer[y * width + x];
-			const size_t offset = sizeof(uint32_t) * width * y + sizeof(uint32_t) * x;
-			image.write[offset + 0] = (n >> 16) & 0xff;
-			image.write[offset + 1] = (n >> 8) & 0xff;
-			image.write[offset + 2] = n & 0xff;
-			image.write[offset + 3] = (n >> 24) & 0xff;
-		}
-	}
+	RGBA_to_BGRA(width, height, image, buffer);
 
 	res = sw_canvas->clear(true);
 	memfree(buffer);