Skip to content

Commit 7beb3fa

Browse files
ljfp and Copilot committed
gh-133672: Allow LOAD_FAST to be optimized to LOAD_FAST_BORROW
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6d7bbee commit 7beb3fa

3 files changed

Lines changed: 327 additions & 11 deletions

File tree

Lib/test/test_peepholer.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2598,7 +2598,19 @@ def test_for_iter(self):
25982598
("LOAD_CONST", 0, 7),
25992599
("RETURN_VALUE", None, 8),
26002600
]
2601-
self.cfg_optimization_test(insts, insts, consts=[None])
2601+
expected = [
2602+
("LOAD_FAST_BORROW", 0, 1),
2603+
top := self.Label(),
2604+
("FOR_ITER", end := self.Label(), 2),
2605+
("STORE_FAST", 2, 3),
2606+
("JUMP", top, 4),
2607+
end,
2608+
("END_FOR", None, 5),
2609+
("POP_TOP", None, 6),
2610+
("LOAD_CONST", 0, 7),
2611+
("RETURN_VALUE", None, 8),
2612+
]
2613+
self.cfg_optimization_test(insts, expected, consts=[None])
26022614

26032615
def test_load_attr(self):
26042616
insts = [
@@ -2664,7 +2676,7 @@ def test_send(self):
26642676
("RETURN_VALUE", None, 7)
26652677
]
26662678
expected = [
2667-
("LOAD_FAST", 0, 1),
2679+
("LOAD_FAST_BORROW", 0, 1),
26682680
("LOAD_FAST_BORROW", 1, 2),
26692681
("SEND", end := self.Label(), 3),
26702682
("LOAD_CONST", 0, 4),
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
The peephole optimizer now converts LOAD_FAST to LOAD_FAST_BORROW more aggressively. This optimization is applied even if the loaded variable remains live on the stack at the end of a basic block, as long as the borrow is otherwise determined to be safe. This can reduce reference counting overhead in certain code patterns, such as with iterables in loops.

Python/flowgraph.c

Lines changed: 312 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ typedef struct _PyCfgBasicblock {
7171
unsigned b_cold : 1;
7272
/* b_warm is used by the cold-detection algorithm to mark blocks which are definitely not cold */
7373
unsigned b_warm : 1;
74+
/* b_dfs_visited_gen is used by the borrow-safe analysis to mark whether a block has been visited */
75+
int b_dfs_visited_gen;
7476
} basicblock;
7577

7678

@@ -95,6 +97,8 @@ typedef struct _PyCfgBuilder cfg_builder;
9597
#define LOCATION(LNO, END_LNO, COL, END_COL) \
9698
((const _Py_SourceLocation){(LNO), (END_LNO), (COL), (END_COL)})
9799

100+
static int cfg_dfs_generation_counter = 0;
101+
98102
static inline int
99103
is_block_push(cfg_instr *i)
100104
{
@@ -2730,6 +2734,239 @@ load_fast_push_block(basicblock ***sp, basicblock *target,
27302734
}
27312735
}
27322736

2737+
/*
2738+
* Recursively determine if a borrowed reference remains safe along all CFG paths.
2739+
*
2740+
* This function performs a DFS starting from 'current_block'.
2741+
* It tracks a specific value that was notionally loaded by a LOAD_FAST_BORROW
2742+
* instruction and is currently at 'depth_of_value_on_entry' on the operand stack
2743+
* (0 being TOS, -1 if already known to be consumed). The 'original_local_idx'
2744+
* is the index of the local variable from which this value was borrowed.
2745+
*
2746+
* A borrow is considered UNSAFE if, along any execution path before the
2747+
* borrowed value is consumed from the stack:
2748+
* 1. The 'original_local_idx' (the backing store for the borrow) is overwritten
2749+
* (e.g., by STORE_FAST, DELETE_FAST on that local).
2750+
* 2. The borrowed value itself, when at the top of the stack (depth 0), is
2751+
* consumed by an instruction that stores it into any local variable
2752+
* (e.g., STORE_FAST).
2753+
* 3. The value is not consumed and the CFG path ends (e.g., end of function
2754+
* without consumption) or enters a cycle where it's not consumed.
2755+
*
2756+
* The function uses 'cfg_dfs_generation_counter' in conjunction with
2757+
* 'current_block->b_dfs_visited_gen' to detect cycles within the current
2758+
* specific DFS traversal, preventing infinite loops and deeming such cyclic
2759+
* paths unsafe if the value isn't consumed within the cycle.
2760+
*
2761+
* Stack depths are tracked by:
2762+
* - 'depth_of_value_on_entry': The depth of the tracked item upon entering 'current_block'.
2763+
* - 'current_target_depth': The depth of the item as instructions in 'current_block' are processed.
2764+
* - 'depth_before_jump_op_effect': When a jump instruction is encountered, this
2765+
* is calculated by simulating all prior instructions in 'current_block' to find
2766+
* the item's depth *just before* the jump instruction itself has any stack effect.
2767+
* This precise depth is then used to calculate the 'target_entry_depth' for the
2768+
* recursive call to the jump's target block.
2769+
*
2770+
*/
2771+
static bool
2772+
is_borrow_safe(
2773+
basicblock *current_block,
2774+
Py_ssize_t depth_of_value_on_entry,
2775+
int original_local_idx,
2776+
cfg_builder *g)
2777+
{
2778+
if (depth_of_value_on_entry == -1) {
2779+
return true;
2780+
}
2781+
2782+
if (current_block->b_dfs_visited_gen == cfg_dfs_generation_counter) {
2783+
return false;
2784+
}
2785+
current_block->b_dfs_visited_gen = cfg_dfs_generation_counter;
2786+
2787+
Py_ssize_t current_target_depth = depth_of_value_on_entry;
2788+
2789+
for (int i = 0; i < current_block->b_iused; i++) {
2790+
cfg_instr *instr = &current_block->b_instr[i];
2791+
int opcode = instr->i_opcode;
2792+
int oparg = instr->i_oparg;
2793+
2794+
if ((opcode == STORE_FAST || opcode == DELETE_FAST || opcode == LOAD_FAST_AND_CLEAR) &&
2795+
oparg == original_local_idx) {
2796+
return false;
2797+
}
2798+
2799+
stack_effects effects_noj;
2800+
if (get_stack_effects(opcode, oparg, 0, &effects_noj) < 0) {
2801+
return false;
2802+
}
2803+
int num_popped = _PyOpcode_num_popped(opcode, oparg);
2804+
int num_pushed = _PyOpcode_num_pushed(opcode, oparg);
2805+
2806+
if (current_target_depth < num_popped) {
2807+
if (opcode == STORE_FAST && current_target_depth == 0) {
2808+
return false;
2809+
}
2810+
return true;
2811+
}
2812+
current_target_depth = current_target_depth - num_popped + num_pushed;
2813+
2814+
if (HAS_TARGET(opcode)) {
2815+
if (i != current_block->b_iused - 1) {
2816+
continue; // Skip jumps that are not the last instruction in the block.
2817+
}
2818+
bool safe_on_all_branches = true;
2819+
2820+
Py_ssize_t depth_before_jump_op_effect = depth_of_value_on_entry;
2821+
for(int k=0; k < i; ++k) {
2822+
cfg_instr *prev_instr_in_block = &current_block->b_instr[k];
2823+
// Kill check for intermediate instructions
2824+
if ((prev_instr_in_block->i_opcode == STORE_FAST ||
2825+
prev_instr_in_block->i_opcode == DELETE_FAST ||
2826+
prev_instr_in_block->i_opcode == LOAD_FAST_AND_CLEAR) &&
2827+
prev_instr_in_block->i_oparg == original_local_idx) {
2828+
return false;
2829+
}
2830+
stack_effects prev_effects;
2831+
if (get_stack_effects(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg, 0, &prev_effects) < 0) {
2832+
return false;
2833+
}
2834+
int prev_popped_val = _PyOpcode_num_popped(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
2835+
if (depth_before_jump_op_effect < prev_popped_val) {
2836+
if (prev_instr_in_block->i_opcode == STORE_FAST && depth_before_jump_op_effect == 0) {
2837+
return false;
2838+
}
2839+
return true;
2840+
}
2841+
depth_before_jump_op_effect = depth_before_jump_op_effect - prev_popped_val +
2842+
_PyOpcode_num_pushed(prev_instr_in_block->i_opcode, prev_instr_in_block->i_oparg);
2843+
}
2844+
2845+
// Analyze jump target path
2846+
stack_effects effects_jump;
2847+
if (get_stack_effects(opcode, oparg, 1, &effects_jump) < 0) return false;
2848+
int jump_popped_val = _PyOpcode_num_popped(opcode, oparg);
2849+
Py_ssize_t target_entry_depth = depth_before_jump_op_effect;
2850+
2851+
if (target_entry_depth < jump_popped_val) {
2852+
if (opcode == STORE_FAST && target_entry_depth == 0) {
2853+
safe_on_all_branches = false;
2854+
}
2855+
} else { // Not consumed by jump op, recurse on target.
2856+
target_entry_depth = target_entry_depth - jump_popped_val + _PyOpcode_num_pushed(opcode, oparg);
2857+
if (!is_borrow_safe(instr->i_target, target_entry_depth, original_local_idx, g)) {
2858+
safe_on_all_branches = false;
2859+
}
2860+
}
2861+
2862+
// Analyze fallthrough path for conditional jumps, if the jump path was safe.
2863+
if (safe_on_all_branches && IS_CONDITIONAL_JUMP_OPCODE(opcode) && current_block->b_next) {
2864+
if (!is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g)) {
2865+
safe_on_all_branches = false;
2866+
}
2867+
}
2868+
return safe_on_all_branches;
2869+
}
2870+
}
2871+
2872+
// When the instructions finish and the value isn't consumed, check fallthrough.
2873+
if (current_block->b_next && BB_HAS_FALLTHROUGH(current_block)) {
2874+
return is_borrow_safe(current_block->b_next, current_target_depth, original_local_idx, g);
2875+
}
2876+
2877+
/*
2878+
No further path (or no fallthrough) and value is still on stack (current_target_depth is valid).
2879+
This means it's unconsumed at the end of a CFG path, so it's unsafe.
2880+
*/
2881+
return false;
2882+
}
2883+
2884+
2885+
/*
2886+
* Initiates a CFG wide safety check for a borrowed reference.
2887+
*
2888+
* This function is called when a LOAD_FAST instruction is a candidate for
2889+
* LOAD_FAST_BORROW, but its produced value ('REF_UNCONSUMED' flag is set)
2890+
* might live beyond the current basic block ('producer_block').
2891+
*
2892+
* It determines the immediate successor(s) of the 'producer_block' and the
2893+
* stack depth at which the borrowed value (identified by 'original_local_idx'
2894+
* and initially at 'depth_of_value_at_producer_end' from TOS) would enter
2895+
* these successors.
2896+
*
2897+
* It then calls 'is_borrow_safe()' for each successor path. The borrow is
2898+
* considered globally safe only if 'is_borrow_safe()' returns true for ALL
2899+
* possible successor paths.
2900+
*
2901+
* A new DFS traversal is started by incrementing 'cfg_dfs_generation_counter'
2902+
* to ensure that 'is_borrow_safe()' uses a fresh set of visited markers
2903+
* ('b_dfs_visited_gen') for its analysis.
2904+
*
2905+
*/
2906+
static bool
2907+
check_borrow_safety_globally(
2908+
basicblock *producer_block,
2909+
Py_ssize_t depth_of_value_at_producer_end,
2910+
int original_local_idx,
2911+
cfg_builder *g)
2912+
{
2913+
cfg_dfs_generation_counter++;
2914+
bool overall_safety = true;
2915+
2916+
cfg_instr *last_instr = basicblock_last_instr(producer_block);
2917+
2918+
// If depth is -1, implies value was already consumed or is invalid for tracking.
2919+
if (depth_of_value_at_producer_end == -1) {
2920+
return false;
2921+
}
2922+
2923+
if (last_instr && HAS_TARGET(last_instr->i_opcode)) {
2924+
stack_effects effects_jump;
2925+
if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 1, &effects_jump) < 0) return false;
2926+
int jump_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
2927+
Py_ssize_t entry_depth_for_target = depth_of_value_at_producer_end;
2928+
2929+
if (entry_depth_for_target < jump_popped) {
2930+
if (last_instr->i_opcode == STORE_FAST && entry_depth_for_target == 0) {
2931+
overall_safety = false;
2932+
}
2933+
} else {
2934+
entry_depth_for_target = entry_depth_for_target - jump_popped +
2935+
_PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
2936+
if (!is_borrow_safe(last_instr->i_target, entry_depth_for_target, original_local_idx, g)) {
2937+
overall_safety = false;
2938+
}
2939+
}
2940+
2941+
// Analyze fallthrough path for conditional jumps, if the jump path was safe.
2942+
if (overall_safety && IS_CONDITIONAL_JUMP_OPCODE(last_instr->i_opcode) && producer_block->b_next) {
2943+
stack_effects effects_fall;
2944+
if (get_stack_effects(last_instr->i_opcode, last_instr->i_oparg, 0, &effects_fall) < 0) return false;
2945+
int fall_popped = _PyOpcode_num_popped(last_instr->i_opcode, last_instr->i_oparg);
2946+
Py_ssize_t entry_depth_for_fallthrough = depth_of_value_at_producer_end;
2947+
2948+
if (entry_depth_for_fallthrough < fall_popped) {
2949+
if (last_instr->i_opcode == STORE_FAST && entry_depth_for_fallthrough == 0) {
2950+
overall_safety = false;
2951+
}
2952+
} else { // Recurse on fallthrough.
2953+
entry_depth_for_fallthrough = entry_depth_for_fallthrough - fall_popped +
2954+
_PyOpcode_num_pushed(last_instr->i_opcode, last_instr->i_oparg);
2955+
if (!is_borrow_safe(producer_block->b_next, entry_depth_for_fallthrough, original_local_idx, g)) {
2956+
overall_safety = false;
2957+
}
2958+
}
2959+
}
2960+
} else if (producer_block->b_next && BB_HAS_FALLTHROUGH(producer_block)) { // Standard fallthrough, no jump.
2961+
if (!is_borrow_safe(producer_block->b_next, depth_of_value_at_producer_end, original_local_idx, g)) {
2962+
overall_safety = false;
2963+
}
2964+
} else {
2965+
overall_safety = false;
2966+
}
2967+
return overall_safety;
2968+
}
2969+
27332970
/*
27342971
* Strength reduce LOAD_FAST{_LOAD_FAST} instructions into faster variants that
27352972
* load borrowed references onto the operand stack.
@@ -2776,6 +3013,7 @@ optimize_load_fast(cfg_builder *g)
27763013
basicblock *entryblock = g->g_entryblock;
27773014
for (basicblock *b = entryblock; b != NULL; b = b->b_next) {
27783015
max_instrs = Py_MAX(max_instrs, b->b_iused);
3016+
b->b_dfs_visited_gen = 0;
27793017
}
27803018
size_t instr_flags_size = max_instrs * sizeof(uint8_t);
27813019
uint8_t *instr_flags = PyMem_Malloc(instr_flags_size);
@@ -3031,17 +3269,82 @@ optimize_load_fast(cfg_builder *g)
30313269

30323270
// Optimize instructions
30333271
for (int i = 0; i < block->b_iused; i++) {
3034-
if (!instr_flags[i]) {
3035-
cfg_instr *instr = &block->b_instr[i];
3036-
switch (instr->i_opcode) {
3037-
case LOAD_FAST:
3272+
cfg_instr *instr = &block->b_instr[i];
3273+
bool is_load_fast_type = (instr->i_opcode == LOAD_FAST || instr->i_opcode == LOAD_FAST_LOAD_FAST);
3274+
3275+
if (is_load_fast_type) {
3276+
bool killed_or_stored_locally = (instr_flags[i] & (SUPPORT_KILLED | STORED_AS_LOCAL));
3277+
if (killed_or_stored_locally) {
3278+
continue; // Definitely cannot borrow due to local block issues
3279+
}
3280+
3281+
bool unconsumed_in_block = (instr_flags[i] & REF_UNCONSUMED);
3282+
bool can_borrow = false;
3283+
3284+
if (!unconsumed_in_block) {
3285+
can_borrow = true; // Safe by local analysis, consumed in current block
3286+
} else {
3287+
if (instr->i_opcode == LOAD_FAST) {
3288+
int local_idx = instr->i_oparg;
3289+
Py_ssize_t depth_from_tos_at_block_end = -1;
3290+
// Find the specific item on the `refs` stack (at end of current block simulation)
3291+
for (Py_ssize_t k = 0; k < refs.size; k++) {
3292+
ref r = ref_stack_at(&refs, k);
3293+
if (r.instr == i && r.local == local_idx) { // Match instruction and local
3294+
depth_from_tos_at_block_end = refs.size - 1 - k;
3295+
break;
3296+
}
3297+
}
3298+
3299+
if (depth_from_tos_at_block_end != -1) {
3300+
can_borrow = check_borrow_safety_globally(block, depth_from_tos_at_block_end, local_idx, g);
3301+
} else {
3302+
// If REF_UNCONSUMED is set but we couldn't find its depth, assume unsafe.
3303+
// This can happen if refs.size is 0, yet flag is set.
3304+
can_borrow = false;
3305+
}
3306+
3307+
} else { // LOAD_FAST_LOAD_FAST
3308+
can_borrow = true; // Assume true, prove false if any part is unsafe
3309+
int local_idx1 = instr->i_oparg >> 4;
3310+
int local_idx2 = instr->i_oparg & 15;
3311+
Py_ssize_t depth1 = -1, depth2 = -1;
3312+
3313+
// Find depths on `refs` stack for both products of this LFLF instruction `i`
3314+
for (Py_ssize_t k = 0; k < refs.size; k++) {
3315+
ref r = ref_stack_at(&refs, k);
3316+
if (r.instr == i) {
3317+
Py_ssize_t current_depth = refs.size - 1 - k;
3318+
if (r.local == local_idx1 && depth1 == -1) depth1 = current_depth;
3319+
else if (r.local == local_idx2 && depth2 == -1) depth2 = current_depth;
3320+
}
3321+
}
3322+
3323+
bool found_any_on_stack = (depth1 != -1 || depth2 != -1);
3324+
3325+
if (depth1 != -1) {
3326+
if (!check_borrow_safety_globally(block, depth1, local_idx1, g)) {
3327+
can_borrow = false;
3328+
}
3329+
}
3330+
if (can_borrow && depth2 != -1) {
3331+
if (!check_borrow_safety_globally(block, depth2, local_idx2, g)) {
3332+
can_borrow = false;
3333+
}
3334+
}
3335+
3336+
if (unconsumed_in_block && !found_any_on_stack) {
3337+
can_borrow = false;
3338+
}
3339+
}
3340+
}
3341+
3342+
if (can_borrow) {
3343+
if (instr->i_opcode == LOAD_FAST) {
30383344
instr->i_opcode = LOAD_FAST_BORROW;
3039-
break;
3040-
case LOAD_FAST_LOAD_FAST:
3345+
} else if (instr->i_opcode == LOAD_FAST_LOAD_FAST) {
30413346
instr->i_opcode = LOAD_FAST_BORROW_LOAD_FAST_BORROW;
3042-
break;
3043-
default:
3044-
break;
3347+
}
30453348
}
30463349
}
30473350
}

0 commit comments

Comments
 (0)