Skip to content

Commit a22e5ce

Browse files
gpshead and claude committed
pystrhex: Add SIMD optimization for large separator groups
Extend the portable SIMD hexlify to handle separator cases where bytes_per_sep >= 16. Uses in-place shuffle: SIMD hexlify to output buffer, then work backwards to insert separators via memmove.

For 4096 bytes with sep=32: ~3.3µs (vs ~7.3µs for sep=1 scalar).

Useful for hex dump style output like bytes.hex('\n', 32).

Also adds benchmark for newline separator every 32 bytes.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent ae3d7be commit a22e5ce

2 files changed

Lines changed: 64 additions & 0 deletions

File tree

Python/pystrhex.c

Lines changed: 48 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,50 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
184184
Py_ssize_t chunk;
185185
unsigned int k;
186186

187+
#if PY_HEXLIFY_CAN_COMPILE_SIMD
188+
/* SIMD path for large separator groups (>= 16 bytes per group).
189+
SIMD hexlify to output buffer, then shuffle in-place to insert
190+
separators. Working backwards avoids overlap issues since we're
191+
expanding (destination index >= source index). */
192+
if (abs_bytes_per_sep >= 16 && arglen >= 16) {
193+
/* SIMD hexlify all bytes to start of output buffer */
194+
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
195+
196+
/* Shuffle in-place, working backwards */
197+
Py_ssize_t hex_chunk_size = 2 * (Py_ssize_t)abs_bytes_per_sep;
198+
Py_ssize_t remainder_bytes = arglen - chunks * (Py_ssize_t)abs_bytes_per_sep;
199+
Py_ssize_t remainder_hex_len = 2 * remainder_bytes;
200+
Py_ssize_t hex_pos = 2 * arglen; /* End of hex data */
201+
Py_ssize_t out_pos = resultlen; /* End of output */
202+
203+
if (bytes_per_sep_group < 0) {
204+
/* Forward: remainder at end, separators after each chunk */
205+
if (remainder_hex_len > 0) {
206+
hex_pos -= remainder_hex_len;
207+
out_pos -= remainder_hex_len;
208+
memmove(retbuf + out_pos, retbuf + hex_pos, remainder_hex_len);
209+
}
210+
for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
211+
retbuf[--out_pos] = sep_char;
212+
hex_pos -= hex_chunk_size;
213+
out_pos -= hex_chunk_size;
214+
memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
215+
}
216+
}
217+
else {
218+
/* Backward: remainder at start, separators before each chunk */
219+
for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
220+
hex_pos -= hex_chunk_size;
221+
out_pos -= hex_chunk_size;
222+
memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
223+
retbuf[--out_pos] = sep_char;
224+
}
225+
/* Remainder at start stays in place (hex_pos == out_pos == remainder_hex_len) */
226+
}
227+
goto done_hexlify;
228+
}
229+
#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */
230+
187231
if (bytes_per_sep_group < 0) {
188232
i = j = 0;
189233
for (chunk = 0; chunk < chunks; chunk++) {
@@ -221,6 +265,10 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
221265
}
222266
}
223267

268+
#if PY_HEXLIFY_CAN_COMPILE_SIMD
269+
done_hexlify:
270+
#endif
271+
224272
#ifdef Py_DEBUG
225273
if (!return_bytes) {
226274
assert(_PyUnicode_CheckConsistency(retval, 1));

Tools/scripts/pystrhex_benchmark.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,21 @@ def bench_bytes_hex_sep_group():
7979
print(f" {ns:7.1f} ns {format_size(size)}")
8080

8181

82+
def bench_bytes_hex_newline_32():
83+
"""Benchmark bytes.hex() with newline separator every 32 bytes.
84+
85+
This simulates hex dump output with 32 bytes (64 hex chars) per line.
86+
"""
87+
print("\nbytes.hex('\\n', 32) with newline (every 32 bytes):")
88+
# Only test sizes >= 32 where this grouping is meaningful
89+
for size in SIZES:
90+
if size < 32:
91+
continue
92+
data = DATA[size]
93+
ns = run_benchmark(lambda d=data: d.hex('\n', 32))
94+
print(f" {ns:7.1f} ns {format_size(size)}")
95+
96+
8297
def bench_bytearray_hex():
8398
"""Benchmark bytearray.hex() for comparison."""
8499
print("\nbytearray.hex() by size:")
@@ -179,6 +194,7 @@ def bench_hash_hexdigest_only():
179194
bench_bytes_hex()
180195
bench_bytes_hex_sep()
181196
bench_bytes_hex_sep_group()
197+
bench_bytes_hex_newline_32()
182198
bench_bytearray_hex()
183199
bench_memoryview_hex()
184200
bench_binascii_hexlify()

0 commit comments

Comments
 (0)