Skip to content

Commit

Permalink
optimize ImageLoaderSVG::create_image_from_utf8_buffer
Browse files Browse the repository at this point in the history
  • Loading branch information
beru committed Oct 19, 2024
1 parent a480a05 commit 10fb526
Showing 1 changed file with 89 additions and 10 deletions.
99 changes: 89 additions & 10 deletions modules/svg/image_loader_svg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@

#include <thorvg.h>

#if defined(__SSSE3__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512BW__)
#ifdef _MSC_VER
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

HashMap<Color, Color> ImageLoaderSVG::forced_color_map = HashMap<Color, Color>();

void ImageLoaderSVG::set_forced_color_map(const HashMap<Color, Color> &p_color_map) {
Expand Down Expand Up @@ -79,6 +89,84 @@ Ref<Image> ImageLoaderSVG::load_mem_svg(const uint8_t *p_svg, int p_size, float
return img;
}

static void RGBA_to_BGRA(uint32_t width, uint32_t height, Vector<uint8_t> &image, uint32_t *buffer) {
const uint32_t wh = width * height;
uint32_t i;
#if defined(__AVX512BW__)
// Some of recent Intel processors and recent AMD processors (from Zen 4) support AVX-512.
{
uint8_t *dst = image.ptrw();
const __m512i mask = _mm512_setr_epi8(
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
for (i = 0; i < wh / 16; ++i) {
__m512i s16 = _mm512_loadu_epi8(&buffer[i * 16]);
__m512i d16 = _mm512_shuffle_epi8(s16, mask);
_mm512_storeu_epi8(dst, d16);
dst += 64;
}
i = wh / 16 * 16;
}
#elif defined(__AVX2__)
// AVX2 is widely aviable on recent x86-64 processors.
{
uint8_t *dst = image.ptrw();
const __m256i mask = _mm256_setr_epi8(
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15,
2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
for (i = 0; i < wh / 8; ++i) {
__m256i s8 = _mm256_loadu_epi8(&buffer[i * 8]);
__m256i d8 = _mm256_shuffle_epi8(s8, mask);
_mm256_storeu_epi8(dst, d8);
dst += 32;
}
i = wh / 8 * 8;
}
#elif defined(__AVX__) || defined(__SSSE3__)
// _mm_shuffle_epi8 is available from SSSE3
// Recent x86-64 processors support AVX/AVX2 and legacy SSE instructions are covered within AVX instructions.
// VC++ doesn't define __SSSE3__ when "/arch:AVX" is specified so __AVX__ is used for VC++.
// GCC supports fine-grained CFLAGS options such as -mssse3.
{
uint8_t *dst = image.ptrw();
const __m128i mask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
for (i = 0; i < wh / 4; ++i) {
__m128i s4 = _mm_loadu_epi8(&buffer[i * 4]);
__m128i d4 = _mm_shuffle_epi8(s4, mask);
_mm_storeu_epi8(dst, d4);
dst += 16;
}
i = wh / 4 * 4;
}
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
// vqtbl1q_u8 is available from ARMv8.
// ARMv7 NEON doesn't support vqtbl1q_u8.
{
uint8_t *dst = image.ptrw();
const uint8x16_t mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
for (i = 0; i < wh / 4; ++i) {
uint8x16_t s4 = vreinterpretq_u8_u32(vld1q_u32(&buffer[i * 4]));
uint8x16_t d4 = vqtbl1q_u8(s4, mask);
vst1q_u8(dst, d4);
dst += 16;
}
i = wh / 4 * 4;
}
#else
i = 0;
#endif
for (; i < wh; ++i) {
uint32_t n = buffer[i];
const size_t offset = sizeof(uint32_t) * i;
image.write[offset + 0] = (n >> 16) & 0xff;
image.write[offset + 1] = (n >> 8) & 0xff;
image.write[offset + 2] = n & 0xff;
image.write[offset + 3] = (n >> 24) & 0xff;
}
}

Error ImageLoaderSVG::create_image_from_utf8_buffer(Ref<Image> p_image, const uint8_t *p_buffer, int p_buffer_size, float p_scale, bool p_upsample) {
ERR_FAIL_COND_V_MSG(Math::is_zero_approx(p_scale), ERR_INVALID_PARAMETER, "ImageLoaderSVG: Can't load SVG with a scale of 0.");

Expand Down Expand Up @@ -136,16 +224,7 @@ Error ImageLoaderSVG::create_image_from_utf8_buffer(Ref<Image> p_image, const ui
Vector<uint8_t> image;
image.resize(width * height * sizeof(uint32_t));

for (uint32_t y = 0; y < height; y++) {
for (uint32_t x = 0; x < width; x++) {
uint32_t n = buffer[y * width + x];
const size_t offset = sizeof(uint32_t) * width * y + sizeof(uint32_t) * x;
image.write[offset + 0] = (n >> 16) & 0xff;
image.write[offset + 1] = (n >> 8) & 0xff;
image.write[offset + 2] = n & 0xff;
image.write[offset + 3] = (n >> 24) & 0xff;
}
}
RGBA_to_BGRA(width, height, image, buffer);

res = sw_canvas->clear(true);
memfree(buffer);
Expand Down

0 comments on commit 10fb526

Please sign in to comment.