99static inline void
1010_Py_hexlify_scalar (const unsigned char * src , Py_UCS1 * dst , Py_ssize_t len )
1111{
12+ /* Various optimizations like using math instead of a table lookup,
13+ manually unrolling the loop, storing the global table pointer locally,
14+ and doing wider dst writes have been tried and benchmarked; all produced
15+ nearly identical performance on gcc 15. Using a 256 entry uint16_t
16+ table was a bit slower. So we keep our old simple and obvious code. */
1217 for (Py_ssize_t i = 0 ; i < len ; i ++ ) {
1318 unsigned char c = src [i ];
1419 * dst ++ = Py_hexdigits [c >> 4 ];
@@ -18,12 +23,24 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
1823
1924/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
2025 Uses __builtin_shufflevector for portable interleave that compiles to
21- native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64,
22- NEON zip1/zip2 on ARM64, vzip on ARM32).
26+ native SIMD instructions: SSE2 punpcklbw/punpckhbw on x86-64 (always),
27+ NEON zip1/zip2 on ARM64 (always), and vzip on ARM32 when the compiler
28+ flags for the target microarchitecture enable NEON (try -march=native
29+ when running 32-bit on a Raspberry Pi 3 or later).
2330
2431 Requirements:
2532 - GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
26- - x86-64, ARM64, or ARM32 with NEON */
33+ - x86-64, ARM64, or ARM32 with NEON
34+
35+ Performance:
36+ - Up to 11x faster on larger data than the scalar code.
37+ - For more common small data it varies between 1.1-3x faster.
38+
39+ Larger speedups are possible for big data using AVX2 or AVX-512, but
40+ that adds complexity, and hexlifying truly huge data is uncommon.
41+
42+ Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
43+ */
2744#if (defined(__x86_64__ ) || defined(__aarch64__ ) || \
2845 (defined(__arm__ ) && defined(__ARM_NEON ))) && \
2946 (defined(__clang__ ) || (defined(__GNUC__ ) && __GNUC__ >= 12 ))
@@ -38,7 +55,8 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
3855typedef unsigned char v16u8 __attribute__((vector_size (16 )));
3956/* 128-bit vector of 16 signed bytes - for efficient comparison.
4057 Using signed comparison generates pcmpgtb on x86-64 instead of
41- the slower psubusb+pcmpeqb sequence from unsigned comparison. */
58+ the slower psubusb+pcmpeqb sequence from unsigned comparison.
59+ ARM NEON performs the same either way. */
4260typedef signed char v16s8 __attribute__((vector_size (16 )));
4361
4462/* Splat a byte value across all 16 lanes */
@@ -55,7 +73,7 @@ v16s8_splat(signed char x)
5573}
5674
5775/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
58- Compiles to native SSE2 on x86-64, NEON on ARM64. */
76+ Compiles to native SSE2 on x86-64, NEON on ARM64 (and on ARM32 when NEON is available). */
5977static void
6078_Py_hexlify_simd (const unsigned char * src , Py_UCS1 * dst , Py_ssize_t len )
6179{
@@ -88,7 +106,8 @@ _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
88106 lo = lo + ascii_0 + (lo_gt9 & offset );
89107
90108 /* Interleave hi/lo nibbles using portable shufflevector.
91- This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64. */
109+ This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
110+ or vzip on ARM32. */
92111 v16u8 result0 = __builtin_shufflevector (hi , lo ,
93112 0 , 16 , 1 , 17 , 2 , 18 , 3 , 19 , 4 , 20 , 5 , 21 , 6 , 22 , 7 , 23 );
94113 v16u8 result1 = __builtin_shufflevector (hi , lo ,
@@ -184,8 +203,8 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
184203
185204 if (bytes_per_sep_group == 0 ) {
186205#if PY_HEXLIFY_CAN_COMPILE_SIMD
187- /* Use portable SIMD for inputs >= 16 bytes */
188206 if (arglen >= 16 ) {
207+ // Inputs of 16 bytes or more take the portable SIMD path.
189208 _Py_hexlify_simd ((const unsigned char * )argbuf , retbuf , arglen );
190209 }
191210 else
0 commit comments