Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Include/cpython/pystate.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ struct _ts {
struct _PyInterpreterFrame *base_frame;

struct _PyInterpreterFrame *last_profiled_frame;
uintptr_t last_profiled_frame_seq;

Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
Expand Down
2 changes: 2 additions & 0 deletions Include/internal/pycore_debug_offsets.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ typedef struct _Py_DebugOffsets {
uint64_t current_frame;
uint64_t base_frame;
uint64_t last_profiled_frame;
uint64_t last_profiled_frame_seq;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
Expand Down Expand Up @@ -294,6 +295,7 @@ typedef struct _Py_DebugOffsets {
.current_frame = offsetof(PyThreadState, current_frame), \
.base_frame = offsetof(PyThreadState, base_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
.last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
Expand Down
3 changes: 3 additions & 0 deletions Include/internal/pycore_interpframe.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,12 +292,15 @@ _PyThreadState_GetFrame(PyThreadState *tstate)
// This avoids corrupting the cache when transient frames (called and returned
// between profiler samples) update last_profiled_frame to addresses the
// profiler never saw.
// The sequence distinguishes this anchor from a later frame that reuses the
// same _PyInterpreterFrame address.
#define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \
do { \
PyThreadState *tstate_ = (tstate); \
_PyInterpreterFrame *frame_ = (frame); \
if (tstate_->last_profiled_frame == frame_) { \
tstate_->last_profiled_frame = (previous); \
tstate_->last_profiled_frame_seq++; \
Comment thread
pablogsal marked this conversation as resolved.
} \
} while (0)

Expand Down
30 changes: 17 additions & 13 deletions InternalDocs/frames.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,26 @@ since the frame chain may have been in an inconsistent state due to concurrent u

### Remote Profiling Frame Cache

The `last_profiled_frame` field in `PyThreadState` supports an optimization for
remote profilers that sample call stacks from external processes. When a remote
profiler reads the call stack, it writes the current frame address to this field.
The eval loop then keeps this pointer valid by updating it to the parent frame
whenever a frame returns (in `_PyEval_FrameClearAndPop`).
The `last_profiled_frame` and `last_profiled_frame_seq` fields in
`PyThreadState` support an optimization for remote profilers that sample call
stacks from external processes. When a remote profiler reads the call stack, it
writes the current frame address to `last_profiled_frame`. The eval loop then
keeps this pointer valid by updating it to the parent frame whenever a frame
returns (in `_PyEval_FrameClearAndPop`) and increments the sequence.

This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
`last_profiled_frame`, knowing that frames from that point downward are unchanged
and can be retrieved from a cache. This significantly reduces the amount of remote
memory reads needed when call stacks are deep and stable at their base.

The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
`last_profiled_frame` is non-NULL AND matches the frame being popped. This
prevents transient frames (called and returned between profiler samples) from
corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
`last_profiled_frame`, then validate the pointer and sequence before using cached
callers. This prevents a later frame that reuses the same `_PyInterpreterFrame`
address from being mistaken for the sampled frame. The cache significantly
reduces the amount of remote memory reads needed when call stacks are deep and
stable at their base.

The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the
pointer and sequence when `last_profiled_frame` is non-NULL AND matches the
frame being popped. This prevents transient frames (called and returned between
profiler samples) from corrupting the cache anchor, while avoiding any overhead
when profiling is inactive.


### The Instruction Pointer
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Fix skewed stack trackes in the Tachyon profiler when caching is enabled and
Fix skewed stack traces in the Tachyon profiler when caching is enabled and
when generators and coroutines are profiled, by updating
``tstate->last_profiled_frame`` at every frame-removal site. The issue resulted
in total erasure of some callers. Patch by Maurycy Pawłowski-Wieroński.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix another way the Tachyon profiler frame cache could produce impossible
mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by
validating cached frame anchors with a sequence counter.
17 changes: 15 additions & 2 deletions Modules/_remote_debugging/_remote_debugging.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,15 @@ typedef struct {
#define FRAME_CACHE_MAX_THREADS 32
#define FRAME_CACHE_MAX_FRAMES 1024

typedef struct {
uintptr_t frame;
uintptr_t seq;
} FrameCacheAnchor;

typedef struct {
uint64_t thread_id; // 0 = empty slot
uintptr_t thread_state_addr;
uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0]
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs;
PyObject *thread_id_obj; // owned reference, NULL if empty
Expand Down Expand Up @@ -434,7 +440,7 @@ typedef struct {
uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
FrameCacheAnchor last_profiled; // Last cached frame anchor
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
Expand Down Expand Up @@ -622,15 +628,21 @@ extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
uintptr_t last_profiled_frame,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
extern int frame_cache_anchor_matches(
RemoteUnwinderObject *unwinder,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// Only stores complete stacks that reach base_frame_addr
extern int frame_cache_store(
Expand All @@ -640,6 +652,7 @@ extern int frame_cache_store(
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t last_profiled_frame_seq,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);

Expand Down
5 changes: 3 additions & 2 deletions Modules/_remote_debugging/debug_offsets_validation.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)

enum {
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880,
PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888,
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
};

Expand Down Expand Up @@ -261,7 +261,8 @@ validate_fixed_field(
APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)

#define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \
APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \
Expand Down
96 changes: 85 additions & 11 deletions Modules/_remote_debugging/frame_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -147,50 +147,119 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].last_profiled_frame_seq = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
}
}

static int
read_last_profiled_anchor(RemoteUnwinderObject *unwinder,
uintptr_t thread_state_addr,
FrameCacheAnchor *anchor)
{
uintptr_t frame_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame;
uintptr_t seq_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame_seq;

// These fields are adjacent in PyThreadState. Read them together when the
// layout allows it so validation uses a pointer and sequence from the same
// remote-memory read.
if (seq_offset == frame_offset + sizeof(uintptr_t)) {
uintptr_t live_anchor[2];
if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle,
thread_state_addr + frame_offset,
sizeof(live_anchor),
live_anchor) < 0) {
return -1;
}
anchor->frame = live_anchor[0];
anchor->seq = live_anchor[1];
return 0;
}

if (read_ptr(unwinder, thread_state_addr + frame_offset, &anchor->frame) < 0) {
return -1;
}
return read_ptr(unwinder, thread_state_addr + seq_offset, &anchor->seq);
}

static Py_ssize_t
find_cached_frame_addr(const FrameCacheEntry *entry, uintptr_t frame_addr,
uintptr_t *real_pops)
{
*real_pops = 0;
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
if (entry->addrs[i] == frame_addr) {
return i;
}
if (entry->addrs[i] != 0) {
(*real_pops)++;
}
}
return -1;
}

int
frame_cache_anchor_matches(
RemoteUnwinderObject *unwinder,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor)
{
FrameCacheAnchor live_anchor = {0, 0};
if (read_last_profiled_anchor(unwinder, thread_state_addr, &live_anchor) < 0) {
PyErr_Clear();
return 0;
}
return live_anchor.frame == anchor.frame && live_anchor.seq == anchor.seq;
}

// Find last_profiled_frame in cache and extend frame_info with cached continuation
// If frame_addrs is provided (not NULL), also extends it with cached addresses
int
frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
uintptr_t last_profiled_frame,
uintptr_t thread_state_addr,
FrameCacheAnchor anchor,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs)
{
if (!unwinder->frame_cache || last_profiled_frame == 0) {
if (!unwinder->frame_cache || anchor.frame == 0) {
return 0;
}

FrameCacheEntry *entry = frame_cache_find(unwinder, thread_id);
if (!entry || !entry->frame_list) {
return 0;
}
if (entry->thread_state_addr != thread_state_addr) {
return 0;
}

assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES);

// Find the index where last_profiled_frame matches
Py_ssize_t start_idx = -1;
for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
if (entry->addrs[i] == last_profiled_frame) {
start_idx = i;
break;
}
}

uintptr_t real_pops = 0;
Py_ssize_t start_idx = find_cached_frame_addr(entry, anchor.frame, &real_pops);
if (start_idx < 0) {
return 0; // Not found
}
assert(start_idx < entry->num_addrs);

// Synthetic marker frames (<native>/<GC>) are stored as addr-0 entries but
// never increment last_profiled_frame_seq in the target (only real frame
// pops do). Count the real frames before start_idx so the sequence check is
// not thrown off by markers sitting between the leaf and the anchor.
if (entry->last_profiled_frame_seq + real_pops != anchor.seq) {
return 0;
}

Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
if (start_idx >= num_frames) {
return 0;
}

// Extend frame_info with frames ABOVE start_idx (not including it).
// The frame at start_idx (last_profiled_frame) was the executing frame
Expand All @@ -200,6 +269,9 @@ frame_cache_lookup_and_extend(
if (cache_start >= num_frames) {
return 0; // Nothing above last_profiled_frame to extend with
}
if (!frame_cache_anchor_matches(unwinder, thread_state_addr, anchor)) {
return 0;
}

PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames);
if (!slice) {
Expand Down Expand Up @@ -235,6 +307,7 @@ frame_cache_store(
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
uintptr_t last_profiled_frame_seq,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
Expand Down Expand Up @@ -277,6 +350,7 @@ frame_cache_store(
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
entry->last_profiled_frame_seq = last_profiled_frame_seq;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
Expand Down
Loading
Loading