explain more in the comments

gpshead · gpshead · commit 0f94c061d498 · 2026-01-18T08:36:42.000Z
diff --git a/Python/pystrhex.c b/Python/pystrhex.c
@@ -9,6 +9,11 @@
 static inline void
 _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
 {
+    /* Various optimizations like using math instead of a table lookup,
+       manually unrolling the loop, storing the global table pointer locally,
+       and doing wider dst writes have been tried and benchmarked; all produced
+       nearly identical performance with GCC 15.  Using a 256-entry uint16_t
+       table was a bit slower, so we keep the old simple and obvious code. */
     for (Py_ssize_t i = 0; i < len; i++) {
         unsigned char c = src[i];
         *dst++ = Py_hexdigits[c >> 4];
@@ -18,12 +23,24 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
 
 /* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
    Uses __builtin_shufflevector for portable interleave that compiles to
-   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64,
-   NEON zip1/zip2 on ARM64, vzip on ARM32).
+   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
+   NEON zip1/zip2 on ARM64 [always], and vzip on ARM32 when the compiler
+   flags for the target microarchitecture enable NEON [try -march=native
+   if running 32-bit on a Raspberry Pi 3 or later]).
 
    Requirements:
    - GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
-   - x86-64, ARM64, or ARM32 with NEON */
+   - x86-64, ARM64, or ARM32 with NEON
+
+   Performance:
+   - Up to 11x faster on larger data than the scalar code.
+   - For more common small data it varies between 1.1-3x faster.
+
+   Even faster is possible for big data using AVX2 or AVX-512, but that
+   adds complexity and hexlifying _huge_ data is presumably rare.
+
+   Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
+*/
 #if (defined(__x86_64__) || defined(__aarch64__) || \
      (defined(__arm__) && defined(__ARM_NEON))) && \
     (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
@@ -38,7 +55,8 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
 typedef unsigned char v16u8 __attribute__((vector_size(16)));
 /* 128-bit vector of 16 signed bytes - for efficient comparison.
    Using signed comparison generates pcmpgtb on x86-64 instead of
-   the slower psubusb+pcmpeqb sequence from unsigned comparison. */
+   the slower psubusb+pcmpeqb sequence from unsigned comparison.
+   ARM NEON performs the same either way. */
 typedef signed char v16s8 __attribute__((vector_size(16)));
 
 /* Splat a byte value across all 16 lanes */
@@ -55,7 +73,7 @@ v16s8_splat(signed char x)
 }
 
 /* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
-   Compiles to native SSE2 on x86-64, NEON on ARM64. */
+   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
 static void
 _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
 {
@@ -88,7 +106,8 @@ _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
         lo = lo + ascii_0 + (lo_gt9 & offset);
 
         /* Interleave hi/lo nibbles using portable shufflevector.
-           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64. */
+           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
+           or vzip on ARM32. */
         v16u8 result0 = __builtin_shufflevector(hi, lo,
             0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
         v16u8 result1 = __builtin_shufflevector(hi, lo,
@@ -184,8 +203,8 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
 
     if (bytes_per_sep_group == 0) {
 #if PY_HEXLIFY_CAN_COMPILE_SIMD
-        /* Use portable SIMD for inputs >= 16 bytes */
         if (arglen >= 16) {
+            // Inputs of 16+ bytes take the portable 128-bit SIMD path.
             _Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
         }
         else