diff --git a/acceptance/bundle/deployment/bind/dashboard/recreation/out.state_after_bind.direct.json b/acceptance/bundle/deployment/bind/dashboard/recreation/out.state_after_bind.direct.json index 771dd3f908b..586e33472fb 100644 --- a/acceptance/bundle/deployment/bind/dashboard/recreation/out.state_after_bind.direct.json +++ b/acceptance/bundle/deployment/bind/dashboard/recreation/out.state_after_bind.direct.json @@ -12,7 +12,7 @@ "etag": [ETAG], "parent_path": "/Workspace/Users/[USERNAME]/.bundle/test-bundle-[UNIQUE_NAME]/default/resources", "published": true, - "serialized_dashboard": "{\"pages\":[{\"displayName\":\"Page One\",\"name\":\"02724bf2\"}]}", + "serialized_dashboard": "sha256:[HASH]", "warehouse_id": "[TEST_DEFAULT_WAREHOUSE_ID]" } } diff --git a/acceptance/bundle/migrate/dashboards/out.new_state.json b/acceptance/bundle/migrate/dashboards/out.new_state.json index 695d14602a7..b2e70881a8e 100644 --- a/acceptance/bundle/migrate/dashboards/out.new_state.json +++ b/acceptance/bundle/migrate/dashboards/out.new_state.json @@ -12,7 +12,7 @@ "etag": "[NUMID]", "parent_path": "/Workspace/Users/[USERNAME]", "published": true, - "serialized_dashboard": "{\"pages\":[{\"name\":\"02724bf2\",\"displayName\":\"Dashboard test bundle-deploy-dashboard\"}]}\n", + "serialized_dashboard": "sha256:[HASH]", "warehouse_id": "123456" } } diff --git a/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json b/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json index 6b55f64bd8d..d28383d0bfb 100644 --- a/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json +++ b/acceptance/bundle/migrate/dashboards/out.plan_after_migrate.json @@ -30,9 +30,9 @@ "serialized_dashboard": { "action": "skip", "reason": "etag_based", - "old": "{\"pages\":[{\"name\":\"02724bf2\",\"displayName\":\"Dashboard test bundle-deploy-dashboard\"}]}\n", - "new": "{\"pages\":[{\"name\":\"02724bf2\",\"displayName\":\"Dashboard test bundle-deploy-dashboard\"}]}\n", - "remote": "{\"pages\":[{\"displayName\":\"Dashboard test bundle-deploy-dashboard\",\"name\":\"02724bf2\",\"pageType\":\"PAGE_TYPE_CANVAS\"}]}\n" + "old": "sha256:[HASH]", + "new": "sha256:[HASH]", + "remote": "sha256:[HASH]" } } } diff --git a/acceptance/bundle/resources/dashboard-state-sha/dashboard.lvdash.json b/acceptance/bundle/resources/dashboard-state-sha/dashboard.lvdash.json new file mode 100644 index 00000000000..e92b987863c --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/dashboard.lvdash.json @@ -0,0 +1 @@ +{"pages":[{"name":"main","displayName":"Sales Overview"}]} \ No newline at end of file diff --git a/acceptance/bundle/resources/dashboard-state-sha/databricks.yml.tmpl b/acceptance/bundle/resources/dashboard-state-sha/databricks.yml.tmpl new file mode 100644 index 00000000000..a41576273c0 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/databricks.yml.tmpl @@ -0,0 +1,11 @@ +bundle: + name: dashboard-state-sha-$UNIQUE_NAME + +resources: + dashboards: + dashboard1: + display_name: $DASHBOARD_DISPLAY_NAME + warehouse_id: $TEST_DEFAULT_WAREHOUSE_ID + embed_credentials: true + file_path: "dashboard.lvdash.json" + parent_path: /Users/$CURRENT_USER_NAME diff --git a/acceptance/bundle/resources/dashboard-state-sha/out.create.serialized.json b/acceptance/bundle/resources/dashboard-state-sha/out.create.serialized.json new file mode 100644 index 00000000000..e3ad638d7d3 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/out.create.serialized.json @@ -0,0 +1,3 @@ +[ + "{\"pages\":[{\"name\":\"main\",\"displayName\":\"Sales Overview\"}]}" +] diff --git a/acceptance/bundle/resources/dashboard-state-sha/out.plan_create.direct.json b/acceptance/bundle/resources/dashboard-state-sha/out.plan_create.direct.json new file mode 100644 index 00000000000..cd992ae4c25 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/out.plan_create.direct.json @@ -0,0 +1,19 @@ +{ + "plan_version": 2, + "cli_version": "[DEV_VERSION]", + "plan": { + "resources.dashboards.dashboard1": { + "action": "create", + "new_state": { + "value": { + "display_name": "dashboard-state-sha [UUID]", + "embed_credentials": true, + "parent_path": "/Workspace/Users/[USERNAME]", + "published": true, + "serialized_dashboard": "{\"pages\":[{\"name\":\"main\",\"displayName\":\"Sales Overview\"}]}", + "warehouse_id": "[TEST_DEFAULT_WAREHOUSE_ID]" + } + } + } + } +} diff --git a/acceptance/bundle/resources/dashboard-state-sha/out.plan_skip.direct.json b/acceptance/bundle/resources/dashboard-state-sha/out.plan_skip.direct.json new file mode 100644 index 00000000000..87b3dd06841 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/out.plan_skip.direct.json @@ -0,0 +1,40 @@ +{ + "plan_version": 2, + "cli_version": "[DEV_VERSION]", + "lineage": "[UUID]", + "serial": 1, + "plan": { + "resources.dashboards.dashboard1": { + "action": "skip", + "remote_state": { + "create_time": "[TIMESTAMP]", + "dashboard_id": "[DASHBOARD_ID]", + "display_name": "dashboard-state-sha [UUID]", + "embed_credentials": true, + "etag": [ETAG], + "lifecycle_state": "ACTIVE", + "parent_path": "/Workspace/Users/[USERNAME]", + "path": "/Users/[USERNAME]/dashboard-state-sha [UUID].lvdash.json", + "published": true, + "serialized_dashboard": "{\"pages\":[{\"displayName\":\"Sales Overview\",\"name\":\"main\",\"pageType\":\"PAGE_TYPE_CANVAS\"}]}\n", + "update_time": "[TIMESTAMP]", + "warehouse_id": "[TEST_DEFAULT_WAREHOUSE_ID]" + }, + "changes": { + "etag": { + "action": "skip", + "reason": "custom", + "old": [ETAG], + "remote": [ETAG] + }, + "serialized_dashboard": { + "action": "skip", + "reason": "etag_based", + "old": "sha256:[HASH]", + "new": "sha256:[HASH]", + "remote": "sha256:[HASH]" + } + } + } + } +} diff --git a/acceptance/bundle/resources/dashboard-state-sha/out.state.direct.txt b/acceptance/bundle/resources/dashboard-state-sha/out.state.direct.txt new file mode 100644 index 00000000000..c04e637a0b6 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/out.state.direct.txt @@ -0,0 +1 @@ +json.state.resources.dashboards.dashboard1.state.serialized_dashboard = "sha256:[HASH]"; diff --git a/acceptance/bundle/resources/dashboard-state-sha/out.test.toml b/acceptance/bundle/resources/dashboard-state-sha/out.test.toml new file mode 100644 index 00000000000..2a4f200653a --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = true +RequiresWarehouse = true +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.READPLAN = ["", "1"] diff --git a/acceptance/bundle/resources/dashboard-state-sha/output.txt b/acceptance/bundle/resources/dashboard-state-sha/output.txt new file mode 100644 index 00000000000..925d48c5e99 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/output.txt @@ -0,0 +1,17 @@ + +>>> [CLI] bundle plan -o json +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/dashboard-state-sha-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> [CLI] bundle plan -o json + +>>> [CLI] bundle destroy --auto-approve +The following resources will be deleted: + delete resources.dashboards.dashboard1 + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/dashboard-state-sha-[UNIQUE_NAME]/default + +Deleting files... +Destroy complete! diff --git a/acceptance/bundle/resources/dashboard-state-sha/script b/acceptance/bundle/resources/dashboard-state-sha/script new file mode 100644 index 00000000000..1a43a4f592b --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/script @@ -0,0 +1,38 @@ +DASHBOARD_DISPLAY_NAME="dashboard-state-sha $(uuid)" +if [ -z "$CLOUD_ENV" ]; then + export TEST_DEFAULT_WAREHOUSE_ID="warehouse-1234" + echo "warehouse-1234:TEST_DEFAULT_WAREHOUSE_ID" >> ACC_REPLS +fi +export DASHBOARD_DISPLAY_NAME +envsubst < databricks.yml.tmpl > databricks.yml + +cleanup() { + trace $CLI bundle destroy --auto-approve + rm -f out.requests.txt +} +trap cleanup EXIT +rm -f out.requests.txt + +# The direct-engine plan keeps the FULL serialized_dashboard in new_state (so it can be +# deployed), and reports the diff against state as a content hash. +trace $CLI bundle plan -o json > out.plan_create.$DATABRICKS_BUNDLE_ENGINE.json + +# Deploy. With READPLAN=1 this applies the SAVED plan file instead of re-planning. The plan +# and the persisted state keep only the hash, but the create API call must still send the +# FULL serialized_dashboard. out.create.serialized.json is identical for READPLAN="" and +# READPLAN=1, which proves the saved plan applies the real content rather than the hash. +# Not traced: the deploy command line differs by READPLAN (--plan vs none). +$CLI bundle deploy $(readplanarg out.plan_create.direct.json) + +DASHBOARD_ID=$($CLI bundle summary --output json | jq -r '.resources.dashboards.dashboard1.id') +echo "$DASHBOARD_ID:DASHBOARD_ID" >> ACC_REPLS + +jq -s '[.[] | select(.method == "POST" and (.path | endswith("/api/2.0/lakeview/dashboards")) and .body.serialized_dashboard != null) | .body.serialized_dashboard]' out.requests.txt > out.create.serialized.json +rm -f out.requests.txt + +# Persisted state holds ONLY the content hash, never the dashboard JSON. +print_state.py | gron.py | grep serialized_dashboard > out.state.$DATABRICKS_BUNDLE_ENGINE.txt + +# Re-plan with no local change: the server normalizes serialized_dashboard, but it is +# ignore_remote_changes (etag_based), so the resource is unchanged (no spurious update). +trace $CLI bundle plan -o json > out.plan_skip.$DATABRICKS_BUNDLE_ENGINE.json diff --git a/acceptance/bundle/resources/dashboard-state-sha/test.toml b/acceptance/bundle/resources/dashboard-state-sha/test.toml new file mode 100644 index 00000000000..3f7a4f7a0a6 --- /dev/null +++ b/acceptance/bundle/resources/dashboard-state-sha/test.toml @@ -0,0 +1,22 @@ +# Direct-engine hashed-state behaviour for dashboards. Kept outside dashboards/ so it can run +# direct-only: terraform does not hash state, and the saved-plan path (deploy --plan, i.e. +# READPLAN=1) is direct-only. RecordRequests is inherited from resources/test.toml. +Local = true +Cloud = true +RequiresWarehouse = true + +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] +EnvMatrix.READPLAN = ["", "1"] + +[Env] +# Prevent MSYS2 from rewriting /Users/... paths on Windows. +MSYS_NO_PATHCONV = "1" + +# Etag can be both negative and positive. +[[Repls]] +Old = "\"[-0-9]{8,}\"" +New = "[ETAG]" + +[[Repls]] +Old = "\"[0-9]{8,}\"" +New = "[ETAG]" diff --git a/acceptance/bundle/resources/dashboards/detect-change/out.plan.direct.json b/acceptance/bundle/resources/dashboards/detect-change/out.plan.direct.json index 7af15357c25..525f981d100 100644 --- a/acceptance/bundle/resources/dashboards/detect-change/out.plan.direct.json +++ b/acceptance/bundle/resources/dashboards/detect-change/out.plan.direct.json @@ -39,9 +39,9 @@ "serialized_dashboard": { "action": "skip", "reason": "etag_based", - "old": "{\n \"pages\": [\n {\n \"displayName\": \"New Page\",\n \"layout\": [\n {\n \"position\": {\n \"height\": 2,\n \"width\": 6,\n \"x\": 0,\n \"y\": 0\n },\n \"widget\": {\n \"name\": \"82eb9107\",\n \"textbox_spec\": \"# I'm a title\"\n }\n },\n {\n \"position\": {\n \"height\": 2,\n \"width\": 6,\n \"x\": 0,\n \"y\": 2\n },\n \"widget\": {\n \"name\": \"ffa6de4f\",\n \"textbox_spec\": \"Text\"\n }\n }\n ],\n \"name\": \"fdd21a3c\"\n }\n ]\n}\n", - "new": "{\n \"pages\": [\n {\n \"displayName\": \"New Page\",\n \"layout\": [\n {\n \"position\": {\n \"height\": 2,\n \"width\": 6,\n \"x\": 0,\n \"y\": 0\n },\n \"widget\": {\n \"name\": \"82eb9107\",\n \"textbox_spec\": \"# I'm a title\"\n }\n },\n {\n \"position\": {\n \"height\": 2,\n \"width\": 6,\n \"x\": 0,\n \"y\": 2\n },\n \"widget\": {\n \"name\": \"ffa6de4f\",\n \"textbox_spec\": \"Text\"\n }\n }\n ],\n \"name\": \"fdd21a3c\"\n }\n ]\n}\n", - "remote": "{}\n" + "old": "sha256:[HASH]", + "new": "sha256:[HASH]", + "remote": "sha256:[HASH]" } } } diff --git a/acceptance/bundle/resources/dashboards/simple/out.plan.direct.json b/acceptance/bundle/resources/dashboards/simple/out.plan.direct.json index 59ab070e00e..5770db0e58f 100644 --- a/acceptance/bundle/resources/dashboards/simple/out.plan.direct.json +++ b/acceptance/bundle/resources/dashboards/simple/out.plan.direct.json @@ -30,9 +30,9 @@ "serialized_dashboard": { "action": "skip", "reason": "etag_based", - "old": "{ }\n", - "new": "{ }\n", - "remote": "{}\n" + "old": "sha256:[HASH]", + "new": "sha256:[HASH]", + "remote": "sha256:[HASH]" } } } diff --git a/acceptance/bundle/resources/dashboards/unpublish-out-of-band/out.plan.direct.json b/acceptance/bundle/resources/dashboards/unpublish-out-of-band/out.plan.direct.json index 558a4ddcfd8..3aeff915c1c 100644 --- a/acceptance/bundle/resources/dashboards/unpublish-out-of-band/out.plan.direct.json +++ b/acceptance/bundle/resources/dashboards/unpublish-out-of-band/out.plan.direct.json @@ -46,9 +46,9 @@ "serialized_dashboard": { "action": "skip", "reason": "etag_based", - "old": "{\"pages\":[{\"displayName\":\"Test Dashboard\",\"name\":\"test-page\"}]}", - "new": "{\"pages\":[{\"displayName\":\"Test Dashboard\",\"name\":\"test-page\"}]}", - "remote": "{\"pages\":[{\"displayName\":\"Test Dashboard\",\"name\":\"test-page\",\"pageType\":\"PAGE_TYPE_CANVAS\"}]}" + "old": "sha256:[HASH]", + "new": "sha256:[HASH]", + "remote": "sha256:[HASH]" } } } diff --git a/acceptance/bundle/test.toml b/acceptance/bundle/test.toml index fca8259a3b5..fb81209776e 100644 --- a/acceptance/bundle/test.toml +++ b/acceptance/bundle/test.toml @@ -22,3 +22,12 @@ New = 'os/[OS]' [[Repls]] Old = ' cicd/github' New = '' + +# serialized_dashboard is persisted in direct-engine state as a content hash. Mask the +# 64-char digest with a hash-specific token (not the generic [ALPHANUMID]) so the output +# reads clearly as a sha256 hash, at a low Order so it runs before the generic [NUMID] / +# [DASHBOARD_ID] rules that would otherwise split the hex into pieces. +[[Repls]] +Old = 'sha256:[0-9a-f]{64}' +New = 'sha256:[HASH]' +Order = -1 diff --git a/bundle/direct/apply.go b/bundle/direct/apply.go index a4a61f727f2..036c70d0a41 100644 --- a/bundle/direct/apply.go +++ b/bundle/direct/apply.go @@ -50,6 +50,16 @@ func (d *DeploymentUnit) Deploy(ctx context.Context, db *dstate.DeploymentState, } } +// saveState compacts the state (replacing fields declared hashed_in_state with content +// hashes, see dresources.CompactState) before persisting it. +func (d *DeploymentUnit) saveState(db *dstate.DeploymentState, id string, newState any) error { + compacted, err := dresources.CompactState(d.Adapter.ResourceConfig(), newState) + if err != nil { + return fmt.Errorf("compacting state: %w", err) + } + return db.SaveState(d.ResourceKey, id, compacted, d.DependsOn) +} + func (d *DeploymentUnit) Create(ctx context.Context, db *dstate.DeploymentState, newState any) error { var newID string var remoteState any @@ -75,7 +85,7 @@ func (d *DeploymentUnit) Create(ctx context.Context, db *dstate.DeploymentState, return err } - err = db.SaveState(d.ResourceKey, newID, newState, d.DependsOn) + err = d.saveState(db, newID, newState) if err != nil { return fmt.Errorf("saving state after creating id=%s: %w", newID, err) } @@ -146,7 +156,7 @@ func (d *DeploymentUnit) Update(ctx context.Context, db *dstate.DeploymentState, return err } - err = db.SaveState(d.ResourceKey, id, newState, d.DependsOn) + err = d.saveState(db, id, newState) if err != nil { return fmt.Errorf("saving state id=%s: %w", id, err) } @@ -190,7 +200,7 @@ func (d *DeploymentUnit) UpdateWithID(ctx context.Context, db *dstate.Deployment return err } - err = db.SaveState(d.ResourceKey, newID, newState, d.DependsOn) + err = d.saveState(db, newID, newState) if err != nil { return fmt.Errorf("saving state id=%s: %w", oldID, err) } @@ -252,7 +262,7 @@ func (d *DeploymentUnit) Resize(ctx context.Context, db *dstate.DeploymentState, return fmt.Errorf("resizing id=%s: %w", id, err) } - err = db.SaveState(d.ResourceKey, id, newState, d.DependsOn) + err = d.saveState(db, id, newState) if err != nil { return fmt.Errorf("saving state id=%s: %w", id, err) } diff --git a/bundle/direct/bind.go b/bundle/direct/bind.go index 9760ce95666..3d074faf64b 100644 --- a/bundle/direct/bind.go +++ b/bundle/direct/bind.go @@ -8,6 +8,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/bundle/direct/dresources" "github.com/databricks/cli/bundle/direct/dstate" "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/structs/structaccess" @@ -145,13 +146,24 @@ func (b *DeploymentBundle) Bind(ctx context.Context, client *databricks.Workspac } } + adapter, err := b.getAdapterForKey(resourceKey) + if err != nil { + os.Remove(tmpStatePath) + return nil, err + } + compacted, err := dresources.CompactState(adapter.ResourceConfig(), sv.Value) + if err != nil { + os.Remove(tmpStatePath) + return nil, fmt.Errorf("compacting state: %w", err) + } + err = b.StateDB.Open(ctx, tmpStatePath, dstate.WithRecovery(true), dstate.WithWrite(true)) if err != nil { os.Remove(tmpStatePath) return nil, err } - err = b.StateDB.SaveState(resourceKey, resourceID, sv.Value, dependsOn) + err = b.StateDB.SaveState(resourceKey, resourceID, compacted, dependsOn) if err != nil { os.Remove(tmpStatePath) return nil, err diff --git a/bundle/direct/bundle_apply.go b/bundle/direct/bundle_apply.go index a9981ee63d8..8cf4565f170 100644 --- a/bundle/direct/bundle_apply.go +++ b/bundle/direct/bundle_apply.go @@ -130,7 +130,7 @@ func (b *DeploymentBundle) Apply(ctx context.Context, client *databricks.Workspa logdiag.LogError(ctx, fmt.Errorf("state entry not found for %q", resourceKey)) return false } - err = b.StateDB.SaveState(resourceKey, id, sv.Value, entry.DependsOn) + err = d.saveState(&b.StateDB, id, sv.Value) } else { // TODO: redo calcDiff to downgrade planned action if possible (?) err = d.Deploy(ctx, &b.StateDB, sv.Value, action, entry) diff --git a/bundle/direct/bundle_plan.go b/bundle/direct/bundle_plan.go index 552d2067cc3..a22429d90db 100644 --- a/bundle/direct/bundle_plan.go +++ b/bundle/direct/bundle_plan.go @@ -197,6 +197,28 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks return false } + // Replace the hashed_in_state fields with their "sha256:..." hash in the state we + // just read off disk, so the diff below compares like-for-like. + // + // We only ever keep a content hash for big fields like serialized_dashboard, never + // the full contents. The new config we diff against (localState, below) is hashed + // too, so the saved side has to be hashed as well or the two could never match. The + // state we read might still hold the full, un-hashed contents though: either it was + // written by an older CLI from before this change, or the field was only just added + // to hashed_in_state. Hashing it here, on read, lines the two sides up so an + // unchanged resource correctly shows "no change". + // + // This only changes the in-memory copy used for the diff. The on-disk entry keeps + // its full contents until the resource is next saved (which rewrites it as a hash), + // so no state_version bump or explicit migration is needed. + // + // See https://github.com/databricks/cli/pull/5609 + savedState, err = dresources.CompactState(adapter.ResourceConfig(), savedState) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("%s: compacting saved state: %w", errorPrefix, err)) + return false + } + // Note, currently we're diffing static structs, not dynamic value. // This means for fields that contain references like ${resources.group.foo.id} we do one of the following: // for strings: comparing unresolved string like "${resoures.group.foo.id}" with actual object id. As long as IDs do not have ${...} format we're good. @@ -208,7 +230,15 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks logdiag.LogError(ctx, fmt.Errorf("%s: internal error: no state cache entry found for %q", errorPrefix, resourceKey)) return false } - localDiff, err := structdiff.GetStructDiff(savedState, sv.Value, adapter.KeyedSlices()) + + // Compact a copy for comparison only; sv.Value keeps the full contents, which + // the deploy sends to the API. + localState, err := dresources.CompactState(adapter.ResourceConfig(), sv.Value) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("%s: compacting local state: %w", errorPrefix, err)) + return false + } + localDiff, err := structdiff.GetStructDiff(savedState, localState, adapter.KeyedSlices()) if err != nil { logdiag.LogError(ctx, fmt.Errorf("%s: diffing local state: %w", errorPrefix, err)) return false @@ -241,7 +271,21 @@ func (b *DeploymentBundle) CalculatePlan(ctx context.Context, client *databricks return false } - remoteDiff, err = structdiff.GetStructDiff(remoteStateComparable, sv.Value, adapter.KeyedSlices()) + // Compact the remapped remote on the same fields, so a hashed_in_state field + // is a hash on all three sides of the diff (saved, config, remote). Once the + // saved value is a hash, every comparison must be hash-vs-hash to be meaningful + // — including remote drift (Remote vs New). This is what keeps hashed_in_state + // orthogonal to ignore_remote_changes: remote drift is detected as hash != hash, + // so a field can be hashed without being ignore_remote_changes. serialized_dashboard + // is also ignore_remote_changes, but for an independent reason: the server + // normalizes it, so its remote hash never matches the config hash (see resources.yml). + remoteStateComparable, err = dresources.CompactState(adapter.ResourceConfig(), remoteStateComparable) + if err != nil { + logdiag.LogError(ctx, fmt.Errorf("%s: compacting remote state id=%q: %w", errorPrefix, dbentry.ID, err)) + return false + } + + remoteDiff, err = structdiff.GetStructDiff(remoteStateComparable, localState, adapter.KeyedSlices()) if err != nil { logdiag.LogError(ctx, fmt.Errorf("%s: diffing remote state: %w", errorPrefix, err)) return false diff --git a/bundle/direct/dresources/README.md b/bundle/direct/dresources/README.md index a20b68c4ebd..f99652673de 100644 --- a/bundle/direct/dresources/README.md +++ b/bundle/direct/dresources/README.md @@ -47,6 +47,16 @@ If the API may return a slice's elements in a different order between calls (e.g The state struct is serialized to JSON and persisted between deploys. Backward incompatible changes will result in a drift, which depending on field behaviour might result in recreate. See dstate/migrate.go on how to handle state migration. +## hashed_in_state: storing large fields as content hashes + +Declare a field under `hashed_in_state` in `resources.yml` when it holds large content that is only ever compared for equality and never read back from state. The engine then persists only a `sha256:` content hash for that field (via `CompactState`), and — crucially — hashes it on **every** value entering the diff: saved state, the local config, and the remapped remote. Once the saved value is a hash, all three sides must be hashes or the comparisons (`Old==New` for a local change, `Remote==New` for remote drift) would be hash-vs-content nonsense. The full contents stay only in the plan's `new_state` and are sent to the API on every deploy, so the deploy is unaffected. + +A field qualifies if it is large (a small field gains nothing — the hash placeholder is ~70 bytes) and is never read back from state by any code path (e.g. not consumed raw by `OverrideChangeDesc` or by state export). + +**`hashed_in_state` is orthogonal to `ignore_remote_changes`.** Because the remote side is hashed too, remote drift on a hashed field is detected as `hash != hash` — so a field can be `hashed_in_state` *without* being `ignore_remote_changes` (as long as the server echoes the value back unchanged). The two are declared independently. + +`dashboards.serialized_dashboard` happens to need both, for unrelated reasons: it is `hashed_in_state` because the inlined dashboard JSON is large, and it is *separately* `ignore_remote_changes` because the **server normalizes** it (adds `pageType`, reorders keys) so its remote hash never equals the config hash — drift is detected via `etag` instead. The plan therefore reports the field as a hash on all three sides. No state version bump is needed: legacy full-content state is hashed on read for comparison and rewritten compactly on the next save. + ## OverrideChangeDesc Use `OverrideChangeDesc` only as a last resort when `resources.yml` settings cannot express the needed logic. Skipping an action with `change.Action = deployplan.Skip` in `OverrideChangeDesc` creates a silent no-op: the plan shows no change even if the user's config differs from remote. Document the skip reason clearly in both the comment and `change.Reason`. diff --git a/bundle/direct/dresources/config.go b/bundle/direct/dresources/config.go index a425d7d7208..6976c0a1696 100644 --- a/bundle/direct/dresources/config.go +++ b/bundle/direct/dresources/config.go @@ -70,6 +70,12 @@ type ResourceLifecycleConfig struct { // BackendDefaults: fields where the backend may set defaults. // When old and new are nil but remote is set, and the remote value matches allowed values (if specified), the change is skipped. BackendDefaults []BackendDefaultRule `yaml:"backend_defaults,omitempty"` + + // HashedInState: fields persisted to state as a content hash ("sha256:") + // instead of their full contents. Only valid for large, equality-only fields + // that are never read back from state (e.g. dashboards' serialized_dashboard, + // which is ignore_remote_changes and re-sent from config on every deploy). + HashedInState []FieldRule `yaml:"hashed_in_state,omitempty"` } // Config is the root configuration structure for resource lifecycle behavior. @@ -91,6 +97,7 @@ var empty = ResourceLifecycleConfig{ NormalizeCase: nil, NormalizeSlash: nil, BackendDefaults: nil, + HashedInState: nil, } func mustParseConfig(data []byte) func() *Config { diff --git a/bundle/direct/dresources/dashboard_test.go b/bundle/direct/dresources/dashboard_test.go index 177e1236221..17f93475d28 100644 --- a/bundle/direct/dresources/dashboard_test.go +++ b/bundle/direct/dresources/dashboard_test.go @@ -2,9 +2,11 @@ package dresources import ( "encoding/json" + "strings" "testing" "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/cli/libs/structs/structpath" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -23,3 +25,53 @@ func TestDashboardState_JSONSerialization_PublishedField(t *testing.T) { assert.Contains(t, string(data), `"published":true`) } + +func TestDashboardCompactState(t *testing.T) { + state := &DashboardState{ + DashboardConfig: resources.DashboardConfig{ + DisplayName: "test-dashboard", + Etag: "etag-123", + SerializedDashboard: `{"pages":[{"name":"p1"}]}`, + }, + } + + out, err := CompactState(GetResourceConfig("dashboards"), state) + require.NoError(t, err) + compacted := out.(*DashboardState) + + // serialized_dashboard is replaced by a content hash; other fields are preserved. + require.IsType(t, "", compacted.SerializedDashboard) + assert.True(t, strings.HasPrefix(compacted.SerializedDashboard.(string), stateHashPrefix)) + assert.Equal(t, "test-dashboard", compacted.DisplayName) + assert.Equal(t, "etag-123", compacted.Etag) + + // The original state is not mutated. + assert.Equal(t, `{"pages":[{"name":"p1"}]}`, state.SerializedDashboard) + + // Compacting is idempotent. + out2, err := CompactState(GetResourceConfig("dashboards"), compacted) + require.NoError(t, err) + assert.Equal(t, compacted.SerializedDashboard, out2.(*DashboardState).SerializedDashboard) +} + +// TestDashboardSerializedDashboardStateRules documents that serialized_dashboard carries +// two independent declarations: hashed_in_state (persist only its hash, since the blob is +// large) and ignore_remote_changes (the server normalizes the content, so its remote hash +// never matches the config hash — drift is detected via etag instead). The two are +// orthogonal in general; serialized_dashboard just happens to need both. +func TestDashboardSerializedDashboardStateRules(t *testing.T) { + cfg := GetResourceConfig("dashboards") + path := structpath.NewStringKey(nil, "serialized_dashboard") + + hasRule := func(rules []FieldRule) bool { + for _, rule := range rules { + if path.HasPatternPrefix(rule.Field) { + return true + } + } + return false + } + + assert.True(t, hasRule(cfg.HashedInState), "serialized_dashboard must be declared hashed_in_state") + assert.True(t, hasRule(cfg.IgnoreRemoteChanges), "serialized_dashboard must be ignore_remote_changes (server normalizes the content)") +} diff --git a/bundle/direct/dresources/resources.yml b/bundle/direct/dresources/resources.yml index 9c12da9f39b..65965472662 100644 --- a/bundle/direct/dresources/resources.yml +++ b/bundle/direct/dresources/resources.yml @@ -382,6 +382,13 @@ resources: - field: dataset_schema reason: input_only + hashed_in_state: + # serialized_dashboard holds the inlined dashboard JSON (often megabytes). It is + # ignore_remote_changes (etag_based) and re-sent from config on every deploy, so it + # is never read back from state; persist only its content hash to keep state small. + - field: serialized_dashboard + reason: large_content + genie_spaces: ignore_remote_changes: # serialized_space locally (structured YAML) and remotely (JSON string) will differ diff --git a/bundle/direct/dresources/state_compaction.go b/bundle/direct/dresources/state_compaction.go new file mode 100644 index 00000000000..419e6ee1f47 --- /dev/null +++ b/bundle/direct/dresources/state_compaction.go @@ -0,0 +1,88 @@ +package dresources + +import ( + "crypto/sha256" + "encoding/hex" + "encoding/json" + "errors" + "fmt" + "reflect" + "strings" + + "github.com/databricks/cli/libs/structs/structaccess" +) + +// stateHashPrefix marks a state value that holds a content hash instead of the +// full content. It is part of the on-disk state format, so changing it is a +// backward-incompatible change. +const stateHashPrefix = "sha256:" + +// hashStateValue returns a content-hash placeholder ("sha256:") over the JSON +// encoding of v. It is used to store large, equality-only fields (e.g. a dashboard's +// serialized_dashboard) compactly in state instead of their full contents. +// +// It is idempotent and stable: nil, an empty string, and a value that is already a +// placeholder are returned unchanged, so re-compacting an already-compact state and +// comparing a fresh config against stored state both behave predictably. +func hashStateValue(v any) (any, error) { + if s, ok := v.(string); ok { + if s == "" || strings.HasPrefix(s, stateHashPrefix) { + return v, nil + } + } + if v == nil { + return v, nil + } + + // json.Marshal is deterministic (map keys are sorted), so equal content always + // produces an equal hash across runs and platforms. + data, err := json.Marshal(v) + if err != nil { + return nil, err + } + sum := sha256.Sum256(data) + return stateHashPrefix + hex.EncodeToString(sum[:]), nil +} + +// CompactState returns a copy of state with every field declared in cfg.HashedInState +// replaced by a content hash, so the state persists only the hash and not the full +// contents. It is applied both before persisting state and to every value entering the +// state diff, so stored and compared values share one form. The caller's value is never +// mutated (it is reused for the deploy API call, which needs the full contents). +// +// Returns state unchanged when no fields are declared or state is not a non-nil pointer. +func CompactState(cfg *ResourceLifecycleConfig, state any) (any, error) { + if cfg == nil || len(cfg.HashedInState) == 0 { + return state, nil + } + + rv := reflect.ValueOf(state) + if rv.Kind() != reflect.Pointer || rv.IsNil() { + return state, nil + } + + // Shallow copy so the caller's value (reused for the deploy) is untouched. + out := reflect.New(rv.Type().Elem()) + out.Elem().Set(rv.Elem()) + compacted := out.Interface() + + for _, rule := range cfg.HashedInState { + field := rule.Field.String() + current, err := structaccess.GetByString(compacted, field) + if err != nil { + if _, ok := errors.AsType[*structaccess.NotFoundError](err); ok { + continue + } + return nil, fmt.Errorf("compacting state field %q: %w", field, err) + } + hashed, err := hashStateValue(current) + if err != nil { + return nil, fmt.Errorf("compacting state field %q: %w", field, err) + } + if err := structaccess.SetByString(compacted, field, hashed); err != nil { + return nil, fmt.Errorf("compacting state field %q: %w", field, err) + } + } + + return compacted, nil +} diff --git a/bundle/direct/dresources/state_compaction_test.go b/bundle/direct/dresources/state_compaction_test.go new file mode 100644 index 00000000000..9fd6611cc8b --- /dev/null +++ b/bundle/direct/dresources/state_compaction_test.go @@ -0,0 +1,68 @@ +package dresources + +import ( + "strings" + "testing" + + "github.com/databricks/cli/bundle/config/resources" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCompactStateNoDeclaredFields(t *testing.T) { + state := &DashboardState{DashboardConfig: resources.DashboardConfig{SerializedDashboard: `{"a":1}`}} + + // A resource type with no hashed_in_state declaration returns the value untouched. + out, err := CompactState(GetResourceConfig("jobs"), state) + require.NoError(t, err) + assert.Same(t, state, out.(*DashboardState)) + + // A nil config is also a no-op. + out, err = CompactState(nil, state) + require.NoError(t, err) + assert.Same(t, state, out.(*DashboardState)) +} + +func TestHashStateValue(t *testing.T) { + stringHash, err := hashStateValue("hello") + require.NoError(t, err) + require.IsType(t, "", stringHash) + assert.True(t, strings.HasPrefix(stringHash.(string), stateHashPrefix)) + + // Same content always hashes to the same value. + again, err := hashStateValue("hello") + require.NoError(t, err) + assert.Equal(t, stringHash, again) + + // Different content hashes differently. + other, err := hashStateValue("world") + require.NoError(t, err) + assert.NotEqual(t, stringHash, other) + + // A map hashes deterministically regardless of literal key order. + mapHash, err := hashStateValue(map[string]any{"a": 1, "b": 2}) + require.NoError(t, err) + mapHash2, err := hashStateValue(map[string]any{"b": 2, "a": 1}) + require.NoError(t, err) + assert.Equal(t, mapHash, mapHash2) +} + +func TestHashStateValueIdempotent(t *testing.T) { + hashed, err := hashStateValue("some content") + require.NoError(t, err) + + // Re-hashing a placeholder returns it unchanged. + again, err := hashStateValue(hashed) + require.NoError(t, err) + assert.Equal(t, hashed, again) +} + +func TestHashStateValueEmptyAndNil(t *testing.T) { + empty, err := hashStateValue("") + require.NoError(t, err) + assert.Empty(t, empty) + + null, err := hashStateValue(nil) + require.NoError(t, err) + assert.Nil(t, null) +}