Skip to content

Commit c6d938e

Browse files
authored
[mypyc] Fix b64decode to match new cpython behavior (#21200)
Fixes #21120. The behavior of `base64.b64decode` was recently [fixed in CPython](python/cpython#145264) so that it no longer stops processing the input data after the first padded quad. This caused one of our tests to fail, because the result of the librt implementation was no longer the same as that of the stdlib. Update the librt implementation to match the new, correct behavior. Inputs that contain valid base64 characters after padding are rejected by `libbase64`, so we fall back to the slow path, where we preprocess the input. Until now this preprocessing matched the buggy behavior, in which we stop processing after the first padded quad. Change it to instead ignore padding in the middle of the input and only copy padding at the end. The behavior of librt will now be correct regardless of the Python version at runtime, as it doesn't seem worthwhile to preserve the buggy behavior for compatibility. Accordingly, modify the test so that checks on inputs with data after padding are only performed on Python versions that have the bugfix.
1 parent a399e1c commit c6d938e

2 files changed

Lines changed: 29 additions & 31 deletions

File tree

mypyc/lib-rt/base64/librt_base64.c

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -240,39 +240,21 @@ b64decode_handle_invalid_input(
240240
return PyErr_NoMemory();
241241
}
242242

243-
// Copy base64 characters and some padding to the new buffer
243+
int pad_chars = 0;
244+
// Copy base64 characters to the new buffer. Ignore padding to conform to RFC 4648 section 3.3.
244245
for (size_t i = 0; i < srclen; i++) {
245246
char c = src[i];
246247
if (is_valid_base64_char(c, false)) {
247248
newbuf[newbuf_len++] = c;
249+
pad_chars = 0;
248250
} else if (c == '=') {
249-
// Copy a necessary amount of padding
250-
int remainder = newbuf_len % 4;
251-
if (remainder == 0) {
252-
// No padding needed
253-
break;
254-
}
255-
int numpad = 4 - remainder;
256-
// Check that there is at least the required amount padding (CPython ignores
257-
// extra padding)
258-
while (numpad > 0) {
259-
if (i == srclen || src[i] != '=') {
260-
break;
261-
}
262-
newbuf[newbuf_len++] = '=';
263-
i++;
264-
numpad--;
265-
// Skip non-base64 alphabet characters within padding
266-
while (i < srclen && !is_valid_base64_char(src[i], true)) {
267-
i++;
268-
}
269-
}
270-
break;
251+
pad_chars++;
271252
}
272253
}
273254

255+
int quad_pos = newbuf_len % 4;
274256
// Stdlib always performs a non-strict padding check
275-
if (newbuf_len % 4 != 0) {
257+
if (quad_pos != 0 && quad_pos + pad_chars < 4) {
276258
if (freesrc) {
277259
PyMem_Free((void *)src);
278260
}
@@ -282,6 +264,15 @@ b64decode_handle_invalid_input(
282264
return NULL;
283265
}
284266

267+
if (quad_pos != 0) {
268+
// Add padding at the end to make the input length a multiple of 4. We know that this padding
269+
// is present in src because otherwise we would report the "Incorrect padding" error above.
270+
while (quad_pos < 4) {
271+
newbuf[newbuf_len++] = '=';
272+
quad_pos++;
273+
}
274+
}
275+
285276
size_t outlen = max_out;
286277
int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0);
287278
PyMem_Free(newbuf);

mypyc/test-data/run-base64.test

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ from typing import Any, cast
33
import base64
44
import binascii
55
import random
6+
import sys
67

78
from librt.base64 import b64encode, b64decode, urlsafe_b64encode, urlsafe_b64decode
89

@@ -121,6 +122,14 @@ def test_decode_with_non_base64_chars() -> None:
121122
check_decode(b"e" + b + b"A==", encoded=True)
122123
check_decode(b"eA=" + b + b"=", encoded=True)
123124

125+
def has_stdlib_b64decode_bugfix() -> bool:
126+
# stdlib b64decode has a bug in older python versions where it skips processing the input data
127+
# after the first padded quad. It was changed to conform to RFC 4648 section 3.3 in cpython 3.13.13+,
128+
# 3.14.4+ and 3.15+. The librt implementation was changed to match the correct behavior regardless
129+
# of python version so some inputs result in different results than stdlib on older python.
130+
_, minor, micro, _, _ = sys.version_info
131+
return minor > 14 or (minor == 14 and micro >= 4) or (minor == 13 and micro >= 13)
132+
124133
def check_decode_error(b: bytes, ignore_stdlib: bool = False) -> None:
125134
if not ignore_stdlib:
126135
with assertRaises(binascii.Error):
@@ -135,9 +144,7 @@ def test_decode_with_invalid_padding() -> None:
135144
check_decode_error(b"eA=")
136145
check_decode_error(b"eHk")
137146
check_decode_error(b"eA = ")
138-
139-
# Here stdlib behavior seems nonsensical, so we don't try to duplicate it
140-
check_decode_error(b"eA=a=", ignore_stdlib=True)
147+
check_decode_error(b"eA==x", ignore_stdlib=not has_stdlib_b64decode_bugfix())
141148

142149
def test_decode_with_extra_data_after_padding() -> None:
143150
check_decode(b"=", encoded=True)
@@ -146,10 +153,10 @@ def test_decode_with_extra_data_after_padding() -> None:
146153
check_decode(b"====", encoded=True)
147154
check_decode(b"eA===", encoded=True)
148155
check_decode(b"eHk==", encoded=True)
149-
# TODO: behavior in these cases changed in Python 3.14.4, we should match that.
150-
# check_decode(b"eA==x", encoded=True)
151-
# check_decode(b"eHk=x", encoded=True)
152-
# check_decode(b"eA==abc=======efg", encoded=True)
156+
if has_stdlib_b64decode_bugfix():
157+
check_decode(b"eA=a=", encoded=True)
158+
check_decode(b"eHk=x", encoded=True)
159+
check_decode(b"eA==abc=======efg", encoded=True)
153160

154161
def test_decode_wrappers() -> None:
155162
funcs: list[Any] = [b64decode, urlsafe_b64decode]

0 commit comments

Comments
 (0)