@@ -71,6 +71,8 @@ typedef struct _PyCfgBasicblock {
7171 unsigned b_cold : 1 ;
7272 /* b_warm is used by the cold-detection algorithm to mark blocks which are definitely not cold */
7373 unsigned b_warm : 1 ;
74+ /* b_dfs_visited_gen is used by the borrow-safe analysis to mark whether a block has been visited */
75+ int b_dfs_visited_gen ;
7476} basicblock ;
7577
7678
@@ -95,6 +97,8 @@ typedef struct _PyCfgBuilder cfg_builder;
9597#define LOCATION (LNO , END_LNO , COL , END_COL ) \
9698 ((const _Py_SourceLocation){(LNO), (END_LNO), (COL), (END_COL)})
9799
/* Generation id for borrow-safety DFS traversals.  Incremented once per
 * top-level safety check so that comparing a block's b_dfs_visited_gen
 * against this counter acts as a per-traversal "visited" marker without
 * having to clear flags between runs.
 * NOTE(review): file-scope mutable state — not safe if two compilations
 * run this pass concurrently; confirm the pass is serialized. */
static int cfg_dfs_generation_counter = 0;
101+
98102static inline int
99103is_block_push (cfg_instr * i )
100104{
@@ -2730,6 +2734,239 @@ load_fast_push_block(basicblock ***sp, basicblock *target,
27302734 }
27312735}
27322736
/*
 * Recursively determine if a borrowed reference remains safe along all CFG paths.
 *
 * Performs a DFS starting from 'current_block', tracking a value notionally
 * loaded by a LOAD_FAST_BORROW instruction that is currently at
 * 'depth_of_value_on_entry' on the operand stack (0 being TOS, -1 if already
 * known to be consumed).  'original_local_idx' is the index of the local
 * variable the value was borrowed from.
 *
 * A borrow is UNSAFE if, along any execution path before the borrowed value
 * is consumed from the stack:
 *   1. 'original_local_idx' (the backing store for the borrow) is overwritten
 *      or cleared (STORE_FAST / DELETE_FAST / LOAD_FAST_AND_CLEAR on it).
 *   2. The borrowed value itself, at TOS (depth 0), is consumed by a
 *      STORE_FAST — i.e. the borrow escapes into another local.
 *   3. The value is never consumed: the CFG path ends, or the path cycles
 *      back without consuming it.
 *
 * Cycle detection: 'b_dfs_visited_gen' is compared against
 * 'cfg_dfs_generation_counter'; a block already marked in the current
 * generation is treated as a cycle and deemed unsafe.
 * NOTE(review): the mark is never cleared on the way out, so a block reached
 * via two distinct acyclic paths is also rejected — conservative (may miss
 * optimizations) but never unsound.
 * NOTE(review): recursion depth is bounded only by CFG size; a pathologically
 * deep CFG could overflow the C stack — confirm this is acceptable here.
 *
 * Depth tracking:
 *   - 'depth_of_value_on_entry': depth of the tracked item on block entry.
 *   - 'current_target_depth': depth as instructions are processed in order
 *     (using the fall-through / no-jump stack effect).
 *   - 'depth_before_jump_op_effect': when the block ends in a jump, the
 *     instructions before the jump are re-simulated from the entry depth to
 *     obtain the item's depth just before the jump's own stack effect; that
 *     depth then yields the entry depth for the jump target.
 */
static bool
is_borrow_safe(
    basicblock *current_block,
    Py_ssize_t depth_of_value_on_entry,
    int original_local_idx,
    cfg_builder *g)
{
    // Already consumed on this path: nothing left to invalidate.
    if (depth_of_value_on_entry == -1) {
        return true;
    }

    // Cycle (or re-entry) within this traversal generation: treat as unsafe.
    if (current_block->b_dfs_visited_gen == cfg_dfs_generation_counter) {
        return false;
    }
    current_block->b_dfs_visited_gen = cfg_dfs_generation_counter;

    Py_ssize_t current_target_depth = depth_of_value_on_entry;

    for (int i = 0; i < current_block->b_iused; i++) {
        cfg_instr *instr = &current_block->b_instr[i];
        int opcode = instr->i_opcode;
        int oparg = instr->i_oparg;

        // Kill check: the backing local is overwritten/cleared while the
        // borrow is still live.
        if ((opcode == STORE_FAST || opcode == DELETE_FAST || opcode == LOAD_FAST_AND_CLEAR) &&
            oparg == original_local_idx) {
            return false;
        }

        // get_stack_effects() is used only to validate opcode/oparg here;
        // the pop/push counts come from _PyOpcode_num_popped/_num_pushed.
        stack_effects effects_noj;
        if (get_stack_effects(opcode, oparg, 0, &effects_noj) < 0) {
            return false;
        }
        int num_popped = _PyOpcode_num_popped(opcode, oparg);
        int num_pushed = _PyOpcode_num_pushed(opcode, oparg);

        if (current_target_depth < num_popped) {
            // This instruction consumes the tracked value.  Only a STORE_FAST
            // of the value at TOS lets the borrow escape into a local.
            if (opcode == STORE_FAST && current_target_depth == 0) {
                return false;
            }
            return true;
        }
        current_target_depth = current_target_depth - num_popped + num_pushed;

        if (HAS_TARGET(opcode)) {
            if (i != current_block->b_iused - 1) {
                continue; // Skip jumps that are not the last instruction in the block.
            }
            bool safe_on_all_branches = true;

            // Re-simulate the block up to (not including) the jump to find the
            // tracked value's depth just before the jump's own stack effect.
            Py_ssize_t depth_before_jump_op_effect = depth_of_value_on_entry;
            for (int k = 0; k < i; ++k) {
                cfg_instr *prev_instr_in_block = &current_block->b_instr[k];
                // Kill check for intermediate instructions
                if ((prev_instr_in_block->i_opcode == STORE_FAST ||
                     prev_instr_in_block->i_opcode == DELETE_FAST ||
                     prev_instr_in_block->i_opcode == LOAD_FAST_AND_CLEAR) &&
                    prev_instr_in_block->i_oparg == original_local_idx) {
                    return false;
                }
                stack_effects prev_effects;
                if (get_stack_effects(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg, 0, &prev_effects) < 0) {
                    return false;
                }
                int prev_popped_val = _PyOpcode_num_popped(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
                if (depth_before_jump_op_effect < prev_popped_val) {
                    // Consumed before the jump; same escape rule as above.
                    if (prev_instr_in_block->i_opcode == STORE_FAST && depth_before_jump_op_effect == 0) {
                        return false;
                    }
                    return true;
                }
                depth_before_jump_op_effect = depth_before_jump_op_effect - prev_popped_val +
                    _PyOpcode_num_pushed(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
            }

            // Analyze jump target path
            stack_effects effects_jump;
            if (get_stack_effects(opcode, oparg, 1, &effects_jump) < 0) return false;
            int jump_popped_val = _PyOpcode_num_popped(opcode, oparg);
            Py_ssize_t target_entry_depth = depth_before_jump_op_effect;

            if (target_entry_depth < jump_popped_val) {
                // Consumed by the jump op itself.
                if (opcode == STORE_FAST && target_entry_depth == 0) {
                    safe_on_all_branches = false;
                }
            } else { // Not consumed by jump op, recurse on target.
                target_entry_depth = target_entry_depth - jump_popped_val + _PyOpcode_num_pushed(opcode, oparg);
                if (!is_borrow_safe(instr->i_target, target_entry_depth, original_local_idx, g)) {
                    safe_on_all_branches = false;
                }
            }

            // Analyze fallthrough path for conditional jumps, if the jump path was safe.
            // 'current_target_depth' already includes the jump's no-jump effect
            // applied earlier in this iteration.
            if (safe_on_all_branches && IS_CONDITIONAL_JUMP_OPCODE(opcode) && current_block->b_next) {
                if (!is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g)) {
                    safe_on_all_branches = false;
                }
            }
            return safe_on_all_branches;
        }
    }

    // When the instructions finish and the value isn't consumed, check fallthrough.
    if (current_block->b_next && BB_HAS_FALLTHROUGH(current_block)) {
        return is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g);
    }

    /*
      No further path (or no fallthrough) and the value is still on the stack
      (current_target_depth is valid).  It is unconsumed at the end of a CFG
      path, so the borrow is unsafe.
    */
    return false;
}
2883+
2884+
2885+ /*
2886+ * Initiates a CFG wide safety check for a borrowed reference.
2887+ *
2888+ * This function is called when a LOAD_FAST instruction is a candidate for
2889+ * LOAD_FAST_BORROW, but its produced value ('REF_UNCONSUMED' flag is set)
2890+ * might live beyond the current basic block ('producer_block').
2891+ *
2892+ * It determines the immediate successor(s) of the 'producer_block' and the
2893+ * stack depth at which the borrowed value (identified by 'original_local_idx'
2894+ * and initially at 'depth_of_value_at_producer_end' from TOS) would enter
2895+ * these successors.
2896+ *
2897+ * It then calls 'is_borrow_safe()' for each successor path. The borrow is
2898+ * considered globally safe only if 'is_borrow_safe()' returns true for ALL
2899+ * possible successor paths.
2900+ *
2901+ * A new DFS traversal is started by incrementing 'cfg_dfs_generation_counter'
2902+ * to ensure that 'is_borrow_safe()' uses a fresh set of visited markers
2903+ * ('b_dfs_visited_gen') for its analysis.
2904+ *
2905+ */
2906+ static bool
2907+ check_borrow_safety_globally (
2908+ basicblock * producer_block ,
2909+ Py_ssize_t depth_of_value_at_producer_end ,
2910+ int original_local_idx ,
2911+ cfg_builder * g )
2912+ {
2913+ cfg_dfs_generation_counter ++ ;
2914+ bool overall_safety = true;
2915+
2916+ cfg_instr * last_instr = basicblock_last_instr (producer_block );
2917+
2918+ // If depth is -1, implies value was already consumed or is invalid for tracking.
2919+ if (depth_of_value_at_producer_end == -1 ) {
2920+ return false;
2921+ }
2922+
2923+ if (last_instr && HAS_TARGET (last_instr -> i_opcode )) {
2924+ stack_effects effects_jump ;
2925+ if (get_stack_effects (last_instr -> i_opcode , last_instr -> i_oparg , 1 , & effects_jump ) < 0 ) return false;
2926+ int jump_popped = _PyOpcode_num_popped (last_instr -> i_opcode , last_instr -> i_oparg );
2927+ Py_ssize_t entry_depth_for_target = depth_of_value_at_producer_end ;
2928+
2929+ if (entry_depth_for_target < jump_popped ) {
2930+ if (last_instr -> i_opcode == STORE_FAST && entry_depth_for_target == 0 ) {
2931+ overall_safety = false;
2932+ }
2933+ } else {
2934+ entry_depth_for_target = entry_depth_for_target - jump_popped +
2935+ _PyOpcode_num_pushed (last_instr -> i_opcode , last_instr -> i_oparg );
2936+ if (!is_borrow_safe (last_instr -> i_target , entry_depth_for_target , original_local_idx , g )) {
2937+ overall_safety = false;
2938+ }
2939+ }
2940+
2941+ // Analyze fallthrough path for conditional jumps, if the jump path was safe.
2942+ if (overall_safety && IS_CONDITIONAL_JUMP_OPCODE (last_instr -> i_opcode ) && producer_block -> b_next ) {
2943+ stack_effects effects_fall ;
2944+ if (get_stack_effects (last_instr -> i_opcode , last_instr -> i_oparg , 0 , & effects_fall ) < 0 ) return false;
2945+ int fall_popped = _PyOpcode_num_popped (last_instr -> i_opcode , last_instr -> i_oparg );
2946+ Py_ssize_t entry_depth_for_fallthrough = depth_of_value_at_producer_end ;
2947+
2948+ if (entry_depth_for_fallthrough < fall_popped ) {
2949+ if (last_instr -> i_opcode == STORE_FAST && entry_depth_for_fallthrough == 0 ) {
2950+ overall_safety = false;
2951+ }
2952+ } else { // Recurse on fallthrough.
2953+ entry_depth_for_fallthrough = entry_depth_for_fallthrough - fall_popped +
2954+ _PyOpcode_num_pushed (last_instr -> i_opcode , last_instr -> i_oparg );
2955+ if (!is_borrow_safe (producer_block -> b_next , entry_depth_for_fallthrough , original_local_idx , g )) {
2956+ overall_safety = false;
2957+ }
2958+ }
2959+ }
2960+ } else if (producer_block -> b_next && BB_HAS_FALLTHROUGH (producer_block )) { // Standard fallthrough, no jump.
2961+ if (!is_borrow_safe (producer_block -> b_next , depth_of_value_at_producer_end , original_local_idx , g )) {
2962+ overall_safety = false;
2963+ }
2964+ } else {
2965+ overall_safety = false;
2966+ }
2967+ return overall_safety ;
2968+ }
2969+
27332970/*
27342971 * Strength reduce LOAD_FAST{_LOAD_FAST} instructions into faster variants that
27352972 * load borrowed references onto the operand stack.
@@ -2776,6 +3013,7 @@ optimize_load_fast(cfg_builder *g)
27763013 basicblock * entryblock = g -> g_entryblock ;
27773014 for (basicblock * b = entryblock ; b != NULL ; b = b -> b_next ) {
27783015 max_instrs = Py_MAX (max_instrs , b -> b_iused );
3016+ b -> b_dfs_visited_gen = 0 ;
27793017 }
27803018 size_t instr_flags_size = max_instrs * sizeof (uint8_t );
27813019 uint8_t * instr_flags = PyMem_Malloc (instr_flags_size );
@@ -3031,17 +3269,82 @@ optimize_load_fast(cfg_builder *g)
30313269
30323270 // Optimize instructions
30333271 for (int i = 0 ; i < block -> b_iused ; i ++ ) {
3034- if (!instr_flags [i ]) {
3035- cfg_instr * instr = & block -> b_instr [i ];
3036- switch (instr -> i_opcode ) {
3037- case LOAD_FAST :
3272+ cfg_instr * instr = & block -> b_instr [i ];
3273+ bool is_load_fast_type = (instr -> i_opcode == LOAD_FAST || instr -> i_opcode == LOAD_FAST_LOAD_FAST );
3274+
3275+ if (is_load_fast_type ) {
3276+ bool killed_or_stored_locally = (instr_flags [i ] & (SUPPORT_KILLED | STORED_AS_LOCAL ));
3277+ if (killed_or_stored_locally ) {
3278+ continue ; // Definitely cannot borrow due to local block issues
3279+ }
3280+
3281+ bool unconsumed_in_block = (instr_flags [i ] & REF_UNCONSUMED );
3282+ bool can_borrow = false;
3283+
3284+ if (!unconsumed_in_block ) {
3285+ can_borrow = true; // Safe by local analysis, consumed in current block
3286+ } else {
3287+ if (instr -> i_opcode == LOAD_FAST ) {
3288+ int local_idx = instr -> i_oparg ;
3289+ Py_ssize_t depth_from_tos_at_block_end = -1 ;
3290+ // Find the specific item on the `refs` stack (at end of current block simulation)
3291+ for (Py_ssize_t k = 0 ; k < refs .size ; k ++ ) {
3292+ ref r = ref_stack_at (& refs , k );
3293+ if (r .instr == i && r .local == local_idx ) { // Match instruction and local
3294+ depth_from_tos_at_block_end = refs .size - 1 - k ;
3295+ break ;
3296+ }
3297+ }
3298+
3299+ if (depth_from_tos_at_block_end != -1 ) {
3300+ can_borrow = check_borrow_safety_globally (block , depth_from_tos_at_block_end , local_idx , g );
3301+ } else {
3302+ // If REF_UNCONSUMED is set but we couldn't find its depth, assume unsafe.
3303+ // This can happen if refs.size is 0, yet flag is set.
3304+ can_borrow = false;
3305+ }
3306+
3307+ } else { // LOAD_FAST_LOAD_FAST
3308+ can_borrow = true; // Assume true, prove false if any part is unsafe
3309+ int local_idx1 = instr -> i_oparg >> 4 ;
3310+ int local_idx2 = instr -> i_oparg & 15 ;
3311+ Py_ssize_t depth1 = -1 , depth2 = -1 ;
3312+
3313+ // Find depths on `refs` stack for both products of this LFLF instruction `i`
3314+ for (Py_ssize_t k = 0 ; k < refs .size ; k ++ ) {
3315+ ref r = ref_stack_at (& refs , k );
3316+ if (r .instr == i ) {
3317+ Py_ssize_t current_depth = refs .size - 1 - k ;
3318+ if (r .local == local_idx1 && depth1 == -1 ) depth1 = current_depth ;
3319+ else if (r .local == local_idx2 && depth2 == -1 ) depth2 = current_depth ;
3320+ }
3321+ }
3322+
3323+ bool found_any_on_stack = (depth1 != -1 || depth2 != -1 );
3324+
3325+ if (depth1 != -1 ) {
3326+ if (!check_borrow_safety_globally (block , depth1 , local_idx1 , g )) {
3327+ can_borrow = false;
3328+ }
3329+ }
3330+ if (can_borrow && depth2 != -1 ) {
3331+ if (!check_borrow_safety_globally (block , depth2 , local_idx2 , g )) {
3332+ can_borrow = false;
3333+ }
3334+ }
3335+
3336+ if (unconsumed_in_block && !found_any_on_stack ) {
3337+ can_borrow = false;
3338+ }
3339+ }
3340+ }
3341+
3342+ if (can_borrow ) {
3343+ if (instr -> i_opcode == LOAD_FAST ) {
30383344 instr -> i_opcode = LOAD_FAST_BORROW ;
3039- break ;
3040- case LOAD_FAST_LOAD_FAST :
3345+ } else if (instr -> i_opcode == LOAD_FAST_LOAD_FAST ) {
30413346 instr -> i_opcode = LOAD_FAST_BORROW_LOAD_FAST_BORROW ;
3042- break ;
3043- default :
3044- break ;
3347+ }
30453348 }
30463349 }
30473350 }
0 commit comments