Skip to content

Commit 92c281d

Browse files
gpshead and claude
committed
pystrhex: Use signed comparison for efficient SIMD codegen
GCC's vector extensions generate inefficient code for unsigned byte comparison (hi > nine): psubusb + pcmpeqb + pcmpeqb (3 instructions). By casting to signed bytes before comparison, GCC generates the efficient pcmpgtb instruction instead. This is safe because nibble values (0-15) are within signed byte range. This reduces the SIMD loop from 29 to 25 instructions, matching the performance of explicit SSE2 intrinsics while keeping the portable vector extensions approach. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent e643fb8 commit 92c281d

1 file changed

Lines changed: 17 additions & 4 deletions

File tree

Python/pystrhex.c

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,10 @@
2323

2424
/* 128-bit vector of 16 unsigned bytes */
2525
typedef unsigned char v16u8 __attribute__((vector_size(16)));
26+
/* 128-bit vector of 16 signed bytes - for efficient comparison.
27+
Using signed comparison generates pcmpgtb on x86-64 instead of
28+
the slower psubusb+pcmpeqb sequence from unsigned comparison. */
29+
typedef signed char v16s8 __attribute__((vector_size(16)));
2630

2731
/* Splat a byte value across all 16 lanes */
2832
static inline v16u8
@@ -31,6 +35,12 @@ v16u8_splat(unsigned char x)
3135
return (v16u8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
3236
}
3337

38+
static inline v16s8
39+
v16s8_splat(signed char x)
40+
{
41+
return (v16s8){x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
42+
}
43+
3444
/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
3545
Compiles to native SSE2 on x86-64, NEON on ARM64. */
3646
static void
@@ -39,7 +49,7 @@ _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
3949
const v16u8 mask_0f = v16u8_splat(0x0f);
4050
const v16u8 ascii_0 = v16u8_splat('0');
4151
const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
42-
const v16u8 nine = v16u8_splat(9);
52+
const v16s8 nine = v16s8_splat(9);
4353

4454
Py_ssize_t i = 0;
4555

@@ -53,9 +63,12 @@ _Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
5363
v16u8 hi = (data >> 4) & mask_0f;
5464
v16u8 lo = data & mask_0f;
5565

56-
/* Compare > 9 produces all-ones mask where true */
57-
v16u8 hi_gt9 = hi > nine;
58-
v16u8 lo_gt9 = lo > nine;
66+
/* Compare > 9 using signed comparison for efficient codegen.
67+
Nibble values 0-15 are safely in signed byte range.
68+
This generates pcmpgtb on x86-64, avoiding the slower
69+
psubusb+pcmpeqb sequence from unsigned comparison. */
70+
v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
71+
v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);
5972

6073
/* Convert nibbles to hex ASCII */
6174
hi = hi + ascii_0 + (hi_gt9 & offset);

0 commit comments

Comments (0)