[mypyc] Fix b64decode to match new cpython behavior (#21200)

p-sawicki · web-flow · commit c6d938efd83e · 2026-04-10T16:44:26.000+02:00
Fixes #21120. The behavior of `base64.b64decode` was recently [fixed in CPython](python/cpython#145264) so that it no longer stops processing the input data after the first padded quad. This caused one of our tests to fail because the librt implementation no longer produced the same result as the stdlib. Update the librt implementation to match the new, correct behavior. Inputs with valid base64 characters after padding are rejected by `libbase64`, so we fall back to the slow path where we preprocess the input. Currently this preprocessing matches the buggy behavior and stops processing after the first padded quad. Change it to instead ignore padding in the middle of the input and only copy padding at the end. The behavior of librt will now be correct regardless of the Python version at runtime, as it doesn't seem worthwhile to preserve the buggy behavior for compatibility. Consequently, modify the test so that checks on inputs with data after padding are only performed on Python versions that have the bugfix.
diff --git a/mypyc/lib-rt/base64/librt_base64.c b/mypyc/lib-rt/base64/librt_base64.c
@@ -240,39 +240,21 @@ b64decode_handle_invalid_input(
         return PyErr_NoMemory();
     }
 
-    // Copy base64 characters and some padding to the new buffer
+    int pad_chars = 0;
+    // Copy base64 characters to the new buffer. Ignore padding to conform to RFC 4648 section 3.3.
     for (size_t i = 0; i < srclen; i++) {
         char c = src[i];
         if (is_valid_base64_char(c, false)) {
             newbuf[newbuf_len++] = c;
+            pad_chars = 0;
         } else if (c == '=') {
-            // Copy a necessary amount of padding
-            int remainder = newbuf_len % 4;
-            if (remainder == 0) {
-                // No padding needed
-                break;
-            }
-            int numpad = 4 - remainder;
-            // Check that there is at least the required amount padding (CPython ignores
-            // extra padding)
-            while (numpad > 0) {
-                if (i == srclen || src[i] != '=') {
-                    break;
-                }
-                newbuf[newbuf_len++] = '=';
-                i++;
-                numpad--;
-                // Skip non-base64 alphabet characters within padding
-                while (i < srclen && !is_valid_base64_char(src[i], true)) {
-                    i++;
-                }
-            }
-            break;
+            pad_chars++;
         }
     }
 
+    int quad_pos = newbuf_len % 4;
     // Stdlib always performs a non-strict padding check
-    if (newbuf_len % 4 != 0) {
+    if (quad_pos != 0 && quad_pos + pad_chars < 4) {
         if (freesrc) {
             PyMem_Free((void *)src);
         }
@@ -282,6 +264,15 @@ b64decode_handle_invalid_input(
         return NULL;
     }
 
+    if (quad_pos != 0) {
+        // Add padding at the end to make the input length a multiple of 4. We know that this padding
+        // is present in src because otherwise we would report the "Incorrect padding" error above.
+        while (quad_pos < 4) {
+            newbuf[newbuf_len++] = '=';
+            quad_pos++;
+        }
+    }
+
     size_t outlen = max_out;
     int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0);
     PyMem_Free(newbuf);
diff --git a/mypyc/test-data/run-base64.test b/mypyc/test-data/run-base64.test
@@ -3,6 +3,7 @@ from typing import Any, cast
 import base64
 import binascii
 import random
+import sys
 
 from librt.base64 import b64encode, b64decode, urlsafe_b64encode, urlsafe_b64decode
 
@@ -121,6 +122,14 @@ def test_decode_with_non_base64_chars() -> None:
             check_decode(b"e" + b + b"A==", encoded=True)
             check_decode(b"eA=" + b + b"=", encoded=True)
 
+def has_stdlib_b64decode_bugfix() -> bool:
+    # stdlib b64decode has a bug in older python versions where it skips processing the input data
+    # after the first padded quad. It was changed to conform to RFC 4648 section 3.3 in cpython 3.13.13+,
+    # 3.14.4+ and 3.15+. The librt implementation was changed to match the correct behavior regardless
+    # of python version so some inputs result in different results than stdlib on older python.
+    _, minor, micro, _, _ = sys.version_info
+    return minor > 14 or (minor == 14 and micro >= 4) or (minor == 13 and micro >= 13)
+
 def check_decode_error(b: bytes, ignore_stdlib: bool = False) -> None:
     if not ignore_stdlib:
         with assertRaises(binascii.Error):
@@ -135,9 +144,7 @@ def test_decode_with_invalid_padding() -> None:
     check_decode_error(b"eA=")
     check_decode_error(b"eHk")
     check_decode_error(b"eA = ")
-
-    # Here stdlib behavior seems nonsensical, so we don't try to duplicate it
-    check_decode_error(b"eA=a=", ignore_stdlib=True)
+    check_decode_error(b"eA==x", ignore_stdlib=not has_stdlib_b64decode_bugfix())
 
 def test_decode_with_extra_data_after_padding() -> None:
     check_decode(b"=", encoded=True)
@@ -146,10 +153,10 @@ def test_decode_with_extra_data_after_padding() -> None:
     check_decode(b"====", encoded=True)
     check_decode(b"eA===", encoded=True)
     check_decode(b"eHk==", encoded=True)
-    # TODO: behavior in these cases changed in Python 3.14.4, we should match that.
-    # check_decode(b"eA==x", encoded=True)
-    # check_decode(b"eHk=x", encoded=True)
-    # check_decode(b"eA==abc=======efg", encoded=True)
+    if has_stdlib_b64decode_bugfix():
+        check_decode(b"eA=a=", encoded=True)
+        check_decode(b"eHk=x", encoded=True)
+        check_decode(b"eA==abc=======efg", encoded=True)
 
 def test_decode_wrappers() -> None:
     funcs: list[Any] = [b64decode, urlsafe_b64decode]