Skip to content

Commit 0f94c06

Browse files
committed
explain more in the comments
1 parent b6feaba commit 0f94c06

1 file changed

Lines changed: 26 additions & 7 deletions

File tree

Python/pystrhex.c

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99
static inline void
1010
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
1111
{
12+
/* Various optimizations like using math instead of a table lookup,
13+
manually unrolling the loop, storing the global table pointer locally,
14+
and doing wider dst writes have been tried and benchmarked; all produced
15+
nearly identical performance on GCC 15. Using a 256-entry uint16_t
16+
table was a bit slower. So we keep our old simple and obvious code. */
1217
for (Py_ssize_t i = 0; i < len; i++) {
1318
unsigned char c = src[i];
1419
*dst++ = Py_hexdigits[c >> 4];
@@ -18,12 +23,24 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
1823

1924
/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
2025
Uses __builtin_shufflevector for portable interleave that compiles to
21-
native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64,
22-
NEON zip1/zip2 on ARM64, vzip on ARM32).
26+
native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
27+
NEON zip1/zip2 on ARM64 [always], and vzip on ARM32 when compiler flags
28+
for the target microarch allow it [try -march=native if running 32-bit
29+
on a Raspberry Pi 3 or later]).
2330
2431
Requirements:
2532
- GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
26-
- x86-64, ARM64, or ARM32 with NEON */
33+
- x86-64, ARM64, or ARM32 with NEON
34+
35+
Performance:
36+
- Up to 11x faster on larger data than the scalar code.
37+
- For the more common small inputs it is between 1.1x and 3x faster.
38+
39+
Even faster is possible for big data using AVX2 or AVX512 but
40+
that adds complication. Honestly, who really hexes _huge_ data?!
41+
42+
Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
43+
*/
2744
#if (defined(__x86_64__) || defined(__aarch64__) || \
2845
(defined(__arm__) && defined(__ARM_NEON))) && \
2946
(defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
@@ -38,7 +55,8 @@ _Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
3855
typedef unsigned char v16u8 __attribute__((vector_size(16)));
3956
/* 128-bit vector of 16 signed bytes - for efficient comparison.
4057
Using signed comparison generates pcmpgtb on x86-64 instead of
41-
the slower psubusb+pcmpeqb sequence from unsigned comparison. */
58+
the slower psubusb+pcmpeqb sequence from unsigned comparison.
59+
ARM NEON performs the same either way. */
4260
typedef signed char v16s8 __attribute__((vector_size(16)));
4361

4462
/* Splat a byte value across all 16 lanes */
@@ -55,7 +73,7 @@ v16s8_splat(signed char x)
5573
}
5674

5775
/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
58-
Compiles to native SSE2 on x86-64, NEON on ARM64. */
76+
Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32). */
5977
static void
6078
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
6179
{
@@ -88,7 +106,8 @@ _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
88106
lo = lo + ascii_0 + (lo_gt9 & offset);
89107

90108
/* Interleave hi/lo nibbles using portable shufflevector.
91-
This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64. */
109+
This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
110+
or vzip on ARM32. */
92111
v16u8 result0 = __builtin_shufflevector(hi, lo,
93112
0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
94113
v16u8 result1 = __builtin_shufflevector(hi, lo,
@@ -184,8 +203,8 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
184203

185204
if (bytes_per_sep_group == 0) {
186205
#if PY_HEXLIFY_CAN_COMPILE_SIMD
187-
/* Use portable SIMD for inputs >= 16 bytes */
188206
if (arglen >= 16) {
207+
// Use portable SIMD for inputs >= 16 bytes; little vector units go brrrr...
189208
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
190209
}
191210
else

0 commit comments

Comments
 (0)