From 94866e813b7a6ccc746f9a34c1b3a1a82c8cd44b Mon Sep 17 00:00:00 2001 From: Marc LeBlanc <7050295+marcleblanc2@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:01:07 -0600 Subject: [PATCH 1/3] Scope --sync-saml-orgs to the set run's selected users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Org sync now mirrors the permission-sync mode of the same run: - Org names gain a synced- ownership prefix (synced--); the sync only ever modifies orgs carrying it, so manually created orgs are never touched. - Additive set modes (--users, --users-without-explicit-perms, --created-after) with --sync-saml-orgs run a SCOPED org sync: per-user additions AND removals computed from each selected user's own SAML assertion and org list. Other users are never touched, and no full user stream or org member pages are loaded — API traffic stays proportional to the selection. User org memberships ride along inline in the existing user queries (zero extra requests). - Full org sync (standalone sync-saml-orgs, set --full / --repos*) discovers every synced org in one search request (replacing per-name lookups) and now also empties — but never deletes — synced orgs whose SAML group disappeared. - Org snapshots record their scope (schema_version 2); scoped applies validate by re-reading just the scoped users' org lists via aliased batch lookups. - The fixture fake gains organizations/currentUser support; new local cases cover scoped add+remove with an out-of-scope canary member, scoped idempotence, and full-mode orphan cleanup. Amp-Thread-ID: https://ampcode.com/threads/T-019ebdc3-c01c-72cb-aa46-52a0183c2ab1 Co-authored-by: Amp --- README.md | 17 + src/src_auth_perms_sync/orgs/queries.py | 46 ++ src/src_auth_perms_sync/orgs/sync.py | 533 ++++++++++++++---- src/src_auth_perms_sync/orgs/types.py | 6 + .../permissions/command.py | 89 ++- .../permissions/queries.py | 44 +- .../permissions/sourcegraph.py | 6 + src/src_auth_perms_sync/shared/queries.py | 11 +- src/src_auth_perms_sync/shared/run_context.py | 8 + src/src_auth_perms_sync/shared/saml_groups.py | 96 +++- src/src_auth_perms_sync/shared/sourcegraph.py | 4 + src/src_auth_perms_sync/shared/types.py | 24 + tests/e2e/case_runner.py | 179 +++++- .../set-users-sync-saml-orgs-noop/before.json | 108 ++++ .../set-users-sync-saml-orgs-noop/maps.yaml | 8 + .../after.json | 109 ++++ .../before.json | 104 ++++ .../set-users-sync-saml-orgs-scoped/maps.yaml | 8 + .../sync-saml-orgs-full-cleanup/after.json | 111 ++++ .../sync-saml-orgs-full-cleanup/before.json | 106 ++++ tests/tests.yaml | 46 ++ tests/unit/test_saml_groups.py | 86 +++ 22 files changed, 1622 insertions(+), 127 deletions(-) create mode 100644 tests/e2e/fixtures/set-users-sync-saml-orgs-noop/before.json create mode 100644 tests/e2e/fixtures/set-users-sync-saml-orgs-noop/maps.yaml create mode 100644 tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/after.json create mode 100644 tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/before.json create mode 100644 tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/maps.yaml create mode 100644 tests/e2e/fixtures/sync-saml-orgs-full-cleanup/after.json create mode 100644 tests/e2e/fixtures/sync-saml-orgs-full-cleanup/before.json diff --git a/README.md b/README.md index c1eeb88..26a296e 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,23 @@ snapshots that make `--apply` reversible. - Creates the orgs if they don't exist, and sync the members from the SAML groups to the orgs - `--sync-saml-orgs` can also be added to a `set` run, to run both at the same time +### Org sync behavior + +- Org names are `synced--` (non-alphanumeric characters + become `-`). The `synced-` prefix marks tool ownership: the sync only ever + modifies orgs whose name carries it, so manually created orgs are never touched. +- The org sync mode follows the permission sync mode of the same run — no surprises: + - **Full** (standalone `sync-saml-orgs`, or `set --full` / `--repos*` + `--sync-saml-orgs`): converges every synced org against all users. A synced + org whose SAML group disappeared has all members removed, but the org itself + is kept (its settings survive in case the group comes back). + - **Scoped** (`set --users` / `--users-without-explicit-perms` / + `--created-after` with `--sync-saml-orgs`): syncs org membership for exactly + the users the set run selected — per-user additions AND removals, computed + from each user's own SAML assertion and org list. Other users' memberships + never change, and no full user scan or org member listing is needed, so API + traffic stays proportional to the selection. + ## Options Run `src-auth-perms-sync --help` for options diff --git a/src/src_auth_perms_sync/orgs/queries.py b/src/src_auth_perms_sync/orgs/queries.py index bb6888c..ad76b3c 100644 --- a/src/src_auth_perms_sync/orgs/queries.py +++ b/src/src_auth_perms_sync/orgs/queries.py @@ -2,6 +2,52 @@ from __future__ import annotations +QUERY_CURRENT_USER = """ +query SamlOrganizationSyncCurrentUser { + currentUser { id username } +} +""" + +# One search request finds every tool-managed (`synced-` prefixed) org. +# totalCount > len(nodes) signals truncation; callers must then fall back +# to per-name lookups. +QUERY_SYNCED_ORGANIZATIONS = """ +query SyncedOrganizations($first: Int!, $query: String!) { + currentUser { id username } + organizations(first: $first, query: $query) { + totalCount + nodes { + id + name + } + } +} +""" + + +def users_organizations_batch_query(batch_size: int) -> str: + """Fetch many users' org memberships in one aliased `node()` request. + + Used to validate a scoped org sync: re-reading each scoped user's own + org list is far cheaper than paging every touched org's member list. + """ + variables = ", ".join(f"$user{index}: ID!" for index in range(batch_size)) + aliases = "".join( + f""" + user{index}: node(id: $user{index}) {{ + ... on User {{ + id + username + organizations {{ + nodes {{ id name }} + }} + }} + }}""" + for index in range(batch_size) + ) + return f"query UsersOrganizationsBatch({variables}) {{{aliases}\n}}" + + QUERY_ORGANIZATION_MEMBERS_PAGE = """ query OrganizationMembersPage($id: ID!, $first: Int!, $after: String) { node(id: $id) { diff --git a/src/src_auth_perms_sync/orgs/sync.py b/src/src_auth_perms_sync/orgs/sync.py index ce435b7..76148af 100644 --- a/src/src_auth_perms_sync/orgs/sync.py +++ b/src/src_auth_perms_sync/orgs/sync.py @@ -5,7 +5,6 @@ import datetime import json import logging -import re import time from collections.abc import Iterable from concurrent.futures import CancelledError, ThreadPoolExecutor @@ -27,15 +26,21 @@ ORGANIZATION_LOOKUP_BATCH_SIZE: int = 50 ORGANIZATION_SEARCH_RESULT_LIMIT: int = 100 ORGANIZATION_MEMBER_PAGE_SIZE: int = 1000 -ORGANIZATION_NAME_MAX_LENGTH: int = 255 -ORGANIZATION_SNAPSHOT_SCHEMA_VERSION: int = 1 +ORGANIZATION_SNAPSHOT_SCHEMA_VERSION: int = 2 ORGANIZATION_SNAPSHOT_DIFF_SCHEMA_VERSION: int = 1 +# One `organizations(query: "synced-")` search discovers every tool-managed +# org in a single request. Sourcegraph caps `first:` at 5000 on most +# connections; when an instance exceeds the returned page we fall back to +# per-name lookups (and skip orphaned-org cleanup) rather than miss orgs. +SYNCED_ORGANIZATION_SEARCH_LIMIT: int = 5000 -_ORGANIZATION_NAME_PART_RE = re.compile(r"[^A-Za-z0-9]+") -_ORGANIZATION_NAME_DASH_RUN_RE = re.compile(r"-+") _ALREADY_MEMBER_TEXT = "user is already a member of the organization" _ORGANIZATION_EXISTS_TEXT = "organization name is already taken" +# Re-exported here for callers that think in org-sync terms; the naming +# rule lives in shared.saml_groups so user compaction can share it. +organization_name_for_saml_group = saml_groups.organization_name_for_saml_group + @dataclass(frozen=True) class _OrganizationSyncState: @@ -92,24 +97,158 @@ def _load_organization_sync_state( client, command_data.saml_group_users, ) + + discovered = _discover_synced_organization_states(client) + if discovered is None: + # Truncated discovery: resolve target names individually and skip + # orphaned-org cleanup this run. + if not targets: + log.warning("No SAML group memberships found in user accountData — nothing to sync.") + return None + current_user, current_states = _load_current_organization_states( + client, + sorted(targets), + parallelism, + worker_pool, + ) + else: + current_user, synced_states = discovered + for organization_name in sorted(set(synced_states) - set(targets)): + # The SAML group behind this synced org is gone (or lost its + # last member): remove every member but keep the org so admin + # settings survive if the group comes back. + targets[organization_name] = organization_types.TargetOrganization( + name=organization_name, + provider_config_id="", + saml_group="", + ) + log.info( + "Synced org %s no longer matches any SAML group: " + "removing all members, keeping the org.", + organization_name, + ) + if not targets: + log.warning( + "No SAML group memberships found in user accountData " + "and no synced orgs exist — nothing to sync." + ) + return None + current_states = { + organization_name: synced_states.get(organization_name) + or organization_types.OrganizationState( + id=None, + name=organization_name, + members_by_id={}, + ) + for organization_name in targets + } + with src.span( + "load_current_organization_states", + organization_count=len(targets), + member_page_size=ORGANIZATION_MEMBER_PAGE_SIZE, + ) as load_event: + _fetch_members_into_states(client, current_states, parallelism, worker_pool, load_event) + + command_event["target_organizations"] = len(targets) + command_event["desired_memberships"] = sum( + len(target.desired_members_by_id) for target in targets.values() + ) + before_snapshot = _snapshot_from_states(client.endpoint, targets, current_states) + plan = _plan_organization_sync(targets, current_states, current_user) + expected_states = _expected_states_from_targets(targets, current_states) + expected_snapshot = _snapshot_from_states(client.endpoint, targets, expected_states) + return _OrganizationSyncState( + targets=targets, + current_user=current_user, + current_states=current_states, + before_snapshot=before_snapshot, + plan=plan, + expected_snapshot=expected_snapshot, + ) + + +def _load_scoped_organization_sync_state( + client: src.SourcegraphClient, + scoped_users: list[shared_types.ScopedSamlGroupUser], + parallelism: int, + command_event: dict[str, Any], + worker_pool: ThreadPoolExecutor | None = None, +) -> _OrganizationSyncState | None: + """Plan a per-user org sync covering only the given users. + + Each user's `accountData` is the complete truth for that user's SAML + groups, and their own org list is the complete truth for their current + synced-org memberships — so additions AND removals are both safe per + user. Users outside the scope are never touched, and neither full user + streams nor org member pages are loaded. + """ + log.info( + "Scoped SAML org sync: planning membership for %d selected user(s) only; " + "no other users' org memberships will change.", + len(scoped_users), + ) + command_event["scoped_user_count"] = len(scoped_users) + targets: dict[str, organization_types.TargetOrganization] = {} + collisions: set[str] = set() + for scoped_user in scoped_users: + _record_saml_group_user_memberships(targets, collisions, scoped_user) + _raise_for_target_collisions(collisions) + + current_members_by_organization: dict[str, dict[str, organization_types.OrgMember]] = {} + organization_ids_by_name: dict[str, str] = {} + for scoped_user in scoped_users: + for organization in scoped_user.synced_organizations: + organization_ids_by_name[organization["name"]] = organization["id"] + current_members_by_organization.setdefault(organization["name"], {})[ + scoped_user.user_id + ] = {"id": scoped_user.user_id, "username": scoped_user.username} + for organization_name in sorted(set(current_members_by_organization) - set(targets)): + # In-scope user(s) are members of a synced org that matches none of + # their SAML groups any more: plan their removal (org kept). + targets[organization_name] = organization_types.TargetOrganization( + name=organization_name, + provider_config_id="", + saml_group="", + ) + command_event["target_organizations"] = len(targets) command_event["desired_memberships"] = sum( len(target.desired_members_by_id) for target in targets.values() ) if not targets: - log.warning("No SAML group memberships found in user accountData — nothing to sync.") + log.info("Selected user(s) hold no SAML group or synced org memberships — nothing to sync.") return None - current_user, current_states = _load_current_organization_states( + # Org IDs for orgs the scoped users belong to come from their own org + # lists; only the remaining target names need a lookup (also yielding + # currentUser for the create path). No member pages are fetched. + names_needing_lookup = sorted(set(targets) - set(organization_ids_by_name)) + current_user, looked_up_states = _lookup_organization_states( client, - sorted(targets), + names_needing_lookup, parallelism, worker_pool, ) - before_snapshot = _snapshot_from_states(client.endpoint, targets, current_states) + current_states: dict[str, organization_types.OrganizationState] = {} + for organization_name in targets: + known_id = organization_ids_by_name.get(organization_name) + if known_id is not None: + state = organization_types.OrganizationState( + id=known_id, + name=organization_name, + members_by_id={}, + ) + else: + state = looked_up_states[organization_name] + state.members_by_id = dict(current_members_by_organization.get(organization_name, {})) + current_states[organization_name] = state + + before_snapshot = _snapshot_from_states(client.endpoint, targets, current_states, scope="users") plan = _plan_organization_sync(targets, current_states, current_user) expected_states = _expected_states_from_targets(targets, current_states) - expected_snapshot = _snapshot_from_states(client.endpoint, targets, expected_states) + expected_snapshot = _snapshot_from_states( + client.endpoint, targets, expected_states, scope="users" + ) return _OrganizationSyncState( targets=targets, current_user=current_user, @@ -288,6 +427,113 @@ def _finish_organization_apply_with_backup( log.info("To inspect the pre-sync org membership state, read:\n %s", before_path) +def _finish_scoped_organization_apply_with_backup( + client: src.SourcegraphClient, + run_paths: backups.RunPaths, + sync_state: _OrganizationSyncState, + scoped_users: list[shared_types.ScopedSamlGroupUser], + before_path: Path | None, + parallelism: int, + worker_pool: ThreadPoolExecutor | None = None, +) -> None: + """Validate a scoped apply by re-reading only the scoped users' org lists.""" + after_states = _load_scoped_after_states( + client, + sync_state, + scoped_users, + parallelism, + worker_pool, + ) + after_snapshot = _snapshot_from_states( + client.endpoint, sync_state.targets, after_states, scope="users" + ) + if run_paths.write_files: + after_path, diff_path = _write_organization_after_snapshot( + run_paths, + sync_state.before_snapshot, + after_snapshot, + ) + log.info("Wrote after org snapshot: %s diff=%s.", after_path, diff_path) + else: + log.info("Skipping after and diff org snapshots because --no-files is set.") + _validate_organization_sync(after_snapshot, sync_state.expected_snapshot) + if before_path is not None: + log.info("To inspect the pre-sync org membership state, read:\n %s", before_path) + + +def _load_scoped_after_states( + client: src.SourcegraphClient, + sync_state: _OrganizationSyncState, + scoped_users: list[shared_types.ScopedSamlGroupUser], + parallelism: int, + worker_pool: ThreadPoolExecutor | None = None, +) -> dict[str, organization_types.OrganizationState]: + """Rebuild target org states from the scoped users' refreshed org lists.""" + organizations_by_user_id = _fetch_users_organizations( + client, + [scoped_user.user_id for scoped_user in scoped_users], + parallelism, + worker_pool, + ) + after_states = { + organization_name: organization_types.OrganizationState( + id=sync_state.current_states[organization_name].id, + name=organization_name, + members_by_id={}, + ) + for organization_name in sync_state.targets + } + for scoped_user in scoped_users: + for organization in organizations_by_user_id.get(scoped_user.user_id, []): + state = after_states.get(organization["name"]) + if state is not None: + state.members_by_id[scoped_user.user_id] = { + "id": scoped_user.user_id, + "username": scoped_user.username, + } + return after_states + + +def _fetch_users_organizations( + client: src.SourcegraphClient, + user_ids: list[str], + parallelism: int, + worker_pool: ThreadPoolExecutor | None = None, +) -> dict[str, list[shared_types.OrganizationReference]]: + """Fetch many users' org memberships via aliased batch lookups.""" + + def fetch_batch( + batch: list[str], + ) -> list[tuple[str, list[shared_types.OrganizationReference]]]: + data = client.graphql( + queries.users_organizations_batch_query(len(batch)), + cast(src.JSONDict, {f"user{index}": user_id for index, user_id in enumerate(batch)}), + follow_pages=False, + ) + batch_organizations: list[tuple[str, list[shared_types.OrganizationReference]]] = [] + for index, user_id in enumerate(batch): + node = cast(dict[str, Any] | None, data.get(f"user{index}")) + organizations: list[shared_types.OrganizationReference] = [] + if node: + organizations = cast( + list[shared_types.OrganizationReference], + node["organizations"]["nodes"], + ) + batch_organizations.append((user_id, organizations)) + return batch_organizations + + organizations_by_user_id: dict[str, list[shared_types.OrganizationReference]] = {} + for batch_results in run_context.parallel_map( + fetch_batch, + list(_chunks(user_ids, ORGANIZATION_LOOKUP_BATCH_SIZE)), + parallelism=parallelism, + worker_pool=worker_pool, + ): + for user_id, organizations in batch_results: + organizations_by_user_id[user_id] = organizations + return organizations_by_user_id + + def _write_organization_after_snapshot( run_paths: backups.RunPaths, before_snapshot: organization_types.OrganizationSnapshot, @@ -325,27 +571,52 @@ def cmd_sync_saml_organizations( command_data: run_context.CommandData | None = None, worker_pool: ThreadPoolExecutor | None = None, ) -> None: - """Create/update Sourcegraph orgs from every discovered SAML group. + """Create/update Sourcegraph orgs from discovered SAML groups. Org names are deterministic and config-free: the Sourcegraph-safe form - of `-`. Invalid org-name + of `synced--`. Invalid org-name characters are converted to `-`; any resulting name collision fails before mutation so we never merge unrelated SAML groups accidentally. + The `synced-` prefix marks tool ownership: only orgs carrying it are + ever modified. + + Two modes, matching the permission-sync mode of the same run: + + - Full (standalone `sync-saml-orgs`, or after a full-overwrite set): + converges every synced org to the complete user population, + including emptying (but never deleting) synced orgs whose SAML + group disappeared. + - Scoped (`command_data.scoped_saml_group_users` is set, after an + additive set): per-user additions and removals for exactly the + selected users; no other users or orgs are touched and no full + user stream or member pages are loaded. """ + resolved_command_data = command_data or run_context.CommandData() + scoped_users = resolved_command_data.scoped_saml_group_users with src.span( "cmd_sync_saml_organizations", dry_run=dry_run, parallelism=parallelism, do_backup=do_backup, + scoped=scoped_users is not None, ) as command_event: - sync_state = _load_organization_sync_state( - client, - saml_groups_attribute_name_by_config_id, - parallelism, - command_event, - command_data or run_context.CommandData(), - worker_pool, - ) + if scoped_users is not None: + sync_state = _load_scoped_organization_sync_state( + client, + scoped_users, + parallelism, + command_event, + worker_pool, + ) + else: + sync_state = _load_organization_sync_state( + client, + saml_groups_attribute_name_by_config_id, + parallelism, + command_event, + resolved_command_data, + worker_pool, + ) if sync_state is None: return @@ -372,14 +643,25 @@ def cmd_sync_saml_organizations( _record_organization_apply_event(command_event, apply_result) if do_backup: - _finish_organization_apply_with_backup( - client, - run_paths, - sync_state, - before_path, - parallelism, - worker_pool, - ) + if scoped_users is not None: + _finish_scoped_organization_apply_with_backup( + client, + run_paths, + sync_state, + scoped_users, + before_path, + parallelism, + worker_pool, + ) + else: + _finish_organization_apply_with_backup( + client, + run_paths, + sync_state, + before_path, + parallelism, + worker_pool, + ) _raise_for_failed_organization_sync(apply_result) @@ -427,7 +709,7 @@ def _collect_target_organizations( def _record_saml_group_user_memberships( targets: dict[str, organization_types.TargetOrganization], collisions: set[str], - user: shared_types.SamlGroupUser, + user: shared_types.SamlGroupUser | shared_types.ScopedSamlGroupUser, ) -> None: for membership in user.saml_group_memberships: _record_target_organization_membership( @@ -444,7 +726,7 @@ def _record_target_organization_membership( collisions: set[str], provider_config_id: str, group_name: str, - user: shared_types.SamlGroupUser, + user: shared_types.SamlGroupUser | shared_types.ScopedSamlGroupUser, ) -> None: organization_name = organization_name_for_saml_group(provider_config_id, group_name) existing_target = targets.get(organization_name) @@ -509,89 +791,145 @@ def _log_target_collection_summary( ) -def organization_name_for_saml_group(provider_config_id: str, group_name: str) -> str: - provider_part = _organization_name_part(provider_config_id, "auth provider configID") - group_part = _organization_name_part(group_name, "SAML group name") - organization_name = f"{provider_part}-{group_part}" - if len(organization_name) > ORGANIZATION_NAME_MAX_LENGTH: - raise SystemExit( - f"FATAL: generated org name for configID={provider_config_id!r} " - f"group={group_name!r} is {len(organization_name)} characters; " - f"Sourcegraph org names must be <= {ORGANIZATION_NAME_MAX_LENGTH}." - ) - return organization_name - - -def _organization_name_part(value: str, label: str) -> str: - normalized = _ORGANIZATION_NAME_PART_RE.sub("-", value.strip()) - normalized = _ORGANIZATION_NAME_DASH_RUN_RE.sub("-", normalized).strip("-") - if not normalized: - raise SystemExit( - f"FATAL: {label} {value!r} cannot be converted to a Sourcegraph org-name part." - ) - return normalized - - def _load_current_organization_states( client: src.SourcegraphClient, organization_names: list[str], parallelism: int, worker_pool: ThreadPoolExecutor | None = None, ) -> tuple[organization_types.OrgMember, dict[str, organization_types.OrganizationState]]: - states: dict[str, organization_types.OrganizationState] = {} - current_user: organization_types.OrgMember | None = None - name_batches = list(_chunks(organization_names, ORGANIZATION_LOOKUP_BATCH_SIZE)) + """Resolve target org IDs and load their full member lists.""" with src.span( "load_current_organization_states", organization_count=len(organization_names), - lookup_batch_count=len(name_batches), member_page_size=ORGANIZATION_MEMBER_PAGE_SIZE, ) as load_event: + current_user, states = _lookup_organization_states( + client, + organization_names, + parallelism, + worker_pool, + ) + _fetch_members_into_states(client, states, parallelism, worker_pool, load_event) + return current_user, states - def fetch_organization_batch( - batch: list[str], - ) -> organization_types.OrganizationBatchLookup: - return _fetch_organization_batch(client, batch) - for result in run_context.parallel_map( - fetch_organization_batch, - name_batches, - parallelism=parallelism, - worker_pool=worker_pool, - ): - batch_current_user = result["current_user"] - if current_user is None: - current_user = batch_current_user - elif current_user["id"] != batch_current_user["id"]: - raise RuntimeError( - "currentUser changed between organization lookup batches " - f"({current_user['username']} vs {batch_current_user['username']})" - ) - states.update(result["states"]) +def _lookup_organization_states( + client: src.SourcegraphClient, + organization_names: list[str], + parallelism: int, + worker_pool: ThreadPoolExecutor | None = None, +) -> tuple[organization_types.OrgMember, dict[str, organization_types.OrganizationState]]: + """Resolve org names to IDs (no member pages) via aliased batch lookups.""" + states: dict[str, organization_types.OrganizationState] = {} + current_user: organization_types.OrgMember | None = None + name_batches = list(_chunks(organization_names, ORGANIZATION_LOOKUP_BATCH_SIZE)) + + def fetch_organization_batch( + batch: list[str], + ) -> organization_types.OrganizationBatchLookup: + return _fetch_organization_batch(client, batch) - existing_states = [state for state in states.values() if state.id is not None] - load_event["existing_organizations_needing_member_pages"] = len(existing_states) + for result in run_context.parallel_map( + fetch_organization_batch, + name_batches, + parallelism=parallelism, + worker_pool=worker_pool, + ): + batch_current_user = result["current_user"] + if current_user is None: + current_user = batch_current_user + elif current_user["id"] != batch_current_user["id"]: + raise RuntimeError( + "currentUser changed between organization lookup batches " + f"({current_user['username']} vs {batch_current_user['username']})" + ) + states.update(result["states"]) - def fetch_members( - state: organization_types.OrganizationState, - ) -> tuple[organization_types.OrganizationState, list[organization_types.OrgMember]]: - return state, _fetch_all_members(client, state) + if current_user is None: + current_user = _fetch_current_user(client) + return current_user, states - for state, members in run_context.parallel_map( - fetch_members, - existing_states, - parallelism=parallelism, - worker_pool=worker_pool, - ): - for member in members: - state.members_by_id[member["id"]] = member - load_event["existing_organizations"] = sum(1 for state in states.values() if state.id) - load_event["total_current_members"] = sum( - len(state.members_by_id) for state in states.values() - ) +def _fetch_members_into_states( + client: src.SourcegraphClient, + states: dict[str, organization_types.OrganizationState], + parallelism: int, + worker_pool: ThreadPoolExecutor | None, + load_event: dict[str, Any], +) -> None: + """Page every existing org's full member list into its state.""" + existing_states = [state for state in states.values() if state.id is not None] + load_event["existing_organizations_needing_member_pages"] = len(existing_states) + + def fetch_members( + state: organization_types.OrganizationState, + ) -> tuple[organization_types.OrganizationState, list[organization_types.OrgMember]]: + return state, _fetch_all_members(client, state) + + for state, members in run_context.parallel_map( + fetch_members, + existing_states, + parallelism=parallelism, + worker_pool=worker_pool, + ): + for member in members: + state.members_by_id[member["id"]] = member + load_event["existing_organizations"] = len(existing_states) + load_event["total_current_members"] = sum(len(state.members_by_id) for state in states.values()) + + +def _fetch_current_user(client: src.SourcegraphClient) -> organization_types.OrgMember: + data = client.graphql(queries.QUERY_CURRENT_USER) + current_user = cast(organization_types.OrgMember | None, data.get("currentUser")) if current_user is None: - raise RuntimeError("currentUser was not returned while loading organizations") + raise RuntimeError("currentUser is null; SAML org sync requires an authenticated token") + return current_user + + +def _discover_synced_organization_states( + client: src.SourcegraphClient, +) -> tuple[organization_types.OrgMember, dict[str, organization_types.OrganizationState]] | None: + """Find every existing `synced-` org (ID + name) in one search request. + + Returns None when the search result was truncated (more matches than + `SYNCED_ORGANIZATION_SEARCH_LIMIT`); callers must then fall back to + per-name lookups and skip orphaned-org cleanup. + """ + with src.span("discover_synced_organizations") as discovery_event: + data = client.graphql( + queries.QUERY_SYNCED_ORGANIZATIONS, + { + "first": SYNCED_ORGANIZATION_SEARCH_LIMIT, + "query": saml_groups.SYNCED_ORGANIZATION_NAME_PREFIX, + }, + ) + current_user = cast(organization_types.OrgMember | None, data.get("currentUser")) + if current_user is None: + raise RuntimeError("currentUser is null; SAML org sync requires an authenticated token") + connection = cast(dict[str, Any], data["organizations"]) + raw_organizations = cast(list[dict[str, Any]], connection["nodes"]) + total_count = cast(int, connection["totalCount"]) + discovery_event["total_count"] = total_count + discovery_event["returned_count"] = len(raw_organizations) + if total_count > len(raw_organizations): + log.warning( + "Synced-org discovery returned %d of %d matches; falling back to " + "per-name lookups and skipping orphaned synced-org cleanup this run.", + len(raw_organizations), + total_count, + ) + return None + # The search also matches display names; keep only true synced- names. + states = { + raw_organization["name"]: organization_types.OrganizationState( + id=cast(str, raw_organization["id"]), + name=cast(str, raw_organization["name"]), + members_by_id={}, + ) + for raw_organization in raw_organizations + if saml_groups.is_synced_organization_name(cast(str, raw_organization["name"])) + } + discovery_event["synced_organizations"] = len(states) return current_user, states @@ -990,6 +1328,8 @@ def _snapshot_from_states( endpoint: str, targets: dict[str, organization_types.TargetOrganization], states: dict[str, organization_types.OrganizationState], + *, + scope: organization_types.OrganizationSyncScope = "full", ) -> organization_types.OrganizationSnapshot: organizations: dict[str, organization_types.OrganizationSnapshotEntry] = {} for organization_name, target in sorted(targets.items()): @@ -1005,6 +1345,9 @@ def _snapshot_from_states( "schema_version": ORGANIZATION_SNAPSHOT_SCHEMA_VERSION, "captured_at": datetime.datetime.now(datetime.UTC).isoformat(timespec="seconds"), "endpoint": endpoint, + # "users" snapshots cover only the scoped users' memberships per + # org, not each org's full member list. + "scope": scope, "stats": { "target_organizations": len(organizations), "existing_organizations": sum( diff --git a/src/src_auth_perms_sync/orgs/types.py b/src/src_auth_perms_sync/orgs/types.py index 9924a91..fc44fb2 100644 --- a/src/src_auth_perms_sync/orgs/types.py +++ b/src/src_auth_perms_sync/orgs/types.py @@ -8,6 +8,9 @@ from ..shared import types as shared_types OrganizationChangeKind: TypeAlias = Literal["add", "remove"] +# "full": converge whole orgs against the complete user population. +# "users": per-user sync covering only the run's selected users. +OrganizationSyncScope: TypeAlias = Literal["full", "users"] class OrgMember(TypedDict): @@ -39,6 +42,9 @@ class OrganizationSnapshot(TypedDict): schema_version: int captured_at: str endpoint: str + # "users"-scoped snapshots record only the scoped users' memberships + # per org, not each org's full member list. + scope: OrganizationSyncScope stats: OrganizationSnapshotStats organizations: dict[str, OrganizationSnapshotEntry] diff --git a/src/src_auth_perms_sync/permissions/command.py b/src/src_auth_perms_sync/permissions/command.py index b968a18..7b029e1 100644 --- a/src/src_auth_perms_sync/permissions/command.py +++ b/src/src_auth_perms_sync/permissions/command.py @@ -523,6 +523,7 @@ def _hydrate_site_user_candidates( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, parallelism: int, worker_pool: ThreadPoolExecutor | None, ) -> list[shared_types.User]: @@ -542,6 +543,7 @@ def _hydrate_site_user_candidates( [candidate["id"] for candidate in candidates], include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, parallelism=parallelism, worker_pool=worker_pool, progress_label="Hydrated selected Sourcegraph user metadata", @@ -581,6 +583,32 @@ def _log_user_planning_progress( ) +def _additive_command_data( + context: permission_types.MappingContext | None, + selected_users: list[shared_types.User], + retain_saml_group_users: bool, +) -> run_context.CommandData: + """Build an additive set command's result data. + + When `retain_saml_group_users` is set, the selected users are compacted + into `scoped_saml_group_users` so a subsequent `--sync-saml-orgs` phase + syncs org membership for exactly these users — per-user additions and + removals — without streaming all users again. An empty selection yields + an empty scope (org sync no-ops), never a full-instance sync. + """ + providers = context.providers if context is not None else None + if not retain_saml_group_users: + return run_context.CommandData(auth_providers=providers) + return run_context.CommandData( + auth_providers=providers, + scoped_saml_group_users=saml_groups.compact_scoped_saml_group_users( + selected_users, + providers or [], + context.saml_groups_attribute_names if context is not None else {}, + ), + ) + + def cmd_set( client: src.SourcegraphClient, run_paths: backups.RunPaths, @@ -688,7 +716,8 @@ def cmd_set( bind_id_mode, saml_groups_attribute_name_by_config_id, do_backup, - worker_pool, + retain_saml_group_users=retain_saml_group_users, + worker_pool=worker_pool, mapping_rules=mapping_rules, ) if options.mode == "users_without_explicit_perms": @@ -702,7 +731,8 @@ def cmd_set( bind_id_mode, saml_groups_attribute_name_by_config_id, do_backup, - worker_pool, + retain_saml_group_users=retain_saml_group_users, + worker_pool=worker_pool, mapping_rules=mapping_rules, ) if options.mode == "created_after": @@ -716,7 +746,8 @@ def cmd_set( bind_id_mode, saml_groups_attribute_name_by_config_id, do_backup, - worker_pool, + retain_saml_group_users=retain_saml_group_users, + worker_pool=worker_pool, mapping_rules=mapping_rules, ) return run_context.CommandData() @@ -732,6 +763,7 @@ def cmd_set_additive_users( bind_id_mode: str, saml_groups_attribute_name_by_config_id: dict[str, str], do_backup: bool, + retain_saml_group_users: bool = False, worker_pool: ThreadPoolExecutor | None = None, mapping_rules: list[permission_types.MappingRule] | None = None, ) -> run_context.CommandData: @@ -748,16 +780,18 @@ def cmd_set_additive_users( mapping_rules = resolve_mapping_rules(mapping_rules, run_paths.maps_path) if not mapping_rules: log.warning("No maps defined in %s — nothing to do.", run_paths.maps_path) - return run_context.CommandData() + return _additive_command_data(None, [], retain_saml_group_users) include_user_emails = permissions_mapping.mapping_rules_need_user_emails(mapping_rules) - include_user_account_data = permissions_mapping.mapping_rules_need_saml_account_data( - mapping_rules + include_user_account_data = ( + permissions_mapping.mapping_rules_need_saml_account_data(mapping_rules) + or retain_saml_group_users ) users = _resolve_user_identifiers( client, user_identifiers, include_emails=include_user_emails, include_account_data=include_user_account_data, + include_organizations=retain_saml_group_users, ) context = load_mapping_context_discovery( client, @@ -778,7 +812,7 @@ def cmd_set_additive_users( ) users = selected_users if not users: - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) matching_rules = _mapping_rules_matching_selected_users(context, users) log.info( @@ -798,7 +832,7 @@ def cmd_set_additive_users( do_backup=do_backup, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) service_ids = _service_ids_required_by_mapping_rules(context, matching_rules) log.info( @@ -843,7 +877,7 @@ def cmd_set_additive_users( existing_repos_by_user_id=existing_repos_by_user_id, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) def cmd_set_additive_users_without_explicit_perms( @@ -856,6 +890,7 @@ def cmd_set_additive_users_without_explicit_perms( bind_id_mode: str, saml_groups_attribute_name_by_config_id: dict[str, str], do_backup: bool, + retain_saml_group_users: bool = False, worker_pool: ThreadPoolExecutor | None = None, mapping_rules: list[permission_types.MappingRule] | None = None, ) -> run_context.CommandData: @@ -876,15 +911,16 @@ def cmd_set_additive_users_without_explicit_perms( mapping_rules = resolve_mapping_rules(mapping_rules, run_paths.maps_path) if not mapping_rules: log.warning("No maps defined in %s — nothing to do.", run_paths.maps_path) - return run_context.CommandData() + return _additive_command_data(None, [], retain_saml_group_users) context = load_mapping_context_discovery( client, mapping_rules, saml_groups_attribute_name_by_config_id, ) include_user_emails = permissions_mapping.mapping_rules_need_user_emails(mapping_rules) - include_user_account_data = permissions_mapping.mapping_rules_need_saml_account_data( - mapping_rules + include_user_account_data = ( + permissions_mapping.mapping_rules_need_saml_account_data(mapping_rules) + or retain_saml_group_users ) # Match mapping rules BEFORE the explicit-permission check: the # per-user `permissionsInfo.repositories(source: API, first: 1)` @@ -904,6 +940,7 @@ def cmd_set_additive_users_without_explicit_perms( candidates, include_emails=include_user_emails, include_account_data=include_user_account_data, + include_organizations=retain_saml_group_users, parallelism=parallelism, worker_pool=worker_pool, ) @@ -947,7 +984,7 @@ def cmd_set_additive_users_without_explicit_perms( do_backup=do_backup, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) matching_rules = _mapping_rules_matching_selected_users(context, users) log.info( @@ -967,7 +1004,7 @@ def cmd_set_additive_users_without_explicit_perms( do_backup=do_backup, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) service_ids = _service_ids_required_by_mapping_rules(context, matching_rules) log.info( @@ -1020,7 +1057,7 @@ def cmd_set_additive_users_without_explicit_perms( do_backup=do_backup, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) def cmd_set_additive_created_after( @@ -1032,6 +1069,7 @@ def cmd_set_additive_created_after( bind_id_mode: str, saml_groups_attribute_name_by_config_id: dict[str, str], do_backup: bool, + retain_saml_group_users: bool = False, worker_pool: ThreadPoolExecutor | None = None, mapping_rules: list[permission_types.MappingRule] | None = None, ) -> run_context.CommandData: @@ -1050,15 +1088,16 @@ def cmd_set_additive_created_after( mapping_rules = resolve_mapping_rules(mapping_rules, run_paths.maps_path) if not mapping_rules: log.warning("No maps defined in %s — nothing to do.", run_paths.maps_path) - return run_context.CommandData() + return _additive_command_data(None, [], retain_saml_group_users) context = load_mapping_context_discovery( client, mapping_rules, saml_groups_attribute_name_by_config_id, ) include_user_emails = permissions_mapping.mapping_rules_need_user_emails(mapping_rules) - include_user_account_data = permissions_mapping.mapping_rules_need_saml_account_data( - mapping_rules + include_user_account_data = ( + permissions_mapping.mapping_rules_need_saml_account_data(mapping_rules) + or retain_saml_group_users ) candidates = permissions_sourcegraph.list_site_user_candidates( client, @@ -1076,6 +1115,7 @@ def cmd_set_additive_created_after( candidates, include_emails=include_user_emails, include_account_data=include_user_account_data, + include_organizations=retain_saml_group_users, parallelism=parallelism, worker_pool=worker_pool, ) @@ -1091,7 +1131,7 @@ def cmd_set_additive_created_after( do_backup=do_backup, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) matching_rules = _mapping_rules_matching_selected_users(context, users) log.info( @@ -1111,7 +1151,7 @@ def cmd_set_additive_created_after( do_backup=do_backup, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) service_ids = _service_ids_required_by_mapping_rules(context, matching_rules) log.info( @@ -1156,7 +1196,7 @@ def cmd_set_additive_created_after( existing_repos_by_user_id=existing_repos_by_user_id, worker_pool=worker_pool, ) - return run_context.CommandData(auth_providers=context.providers) + return _additive_command_data(context, users, retain_saml_group_users) def _resolve_user_identifiers( @@ -1165,6 +1205,7 @@ def _resolve_user_identifiers( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> list[shared_types.User]: """Resolve username/email inputs to distinct Sourcegraph users in caller order.""" users: list[shared_types.User] = [] @@ -1175,6 +1216,7 @@ def _resolve_user_identifiers( user_identifier, include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ) if user["id"] in seen_user_ids: continue @@ -1189,6 +1231,7 @@ def _resolve_user_identifier( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> shared_types.User: """Resolve username/email input to one Sourcegraph user.""" user: shared_types.User | None @@ -1198,11 +1241,13 @@ def _resolve_user_identifier( user_identifier, include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ) or permissions_sourcegraph.get_user_by_username( client, user_identifier, include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ) else: user = permissions_sourcegraph.get_user_by_username( @@ -1210,11 +1255,13 @@ def _resolve_user_identifier( user_identifier, include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ) or permissions_sourcegraph.get_user_by_email( client, user_identifier, include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ) if user is None: raise SystemExit(f"No Sourcegraph user found for {user_identifier!r}.") diff --git a/src/src_auth_perms_sync/permissions/queries.py b/src/src_auth_perms_sync/permissions/queries.py index 5d5fcb3..ae5b64e 100644 --- a/src/src_auth_perms_sync/permissions/queries.py +++ b/src/src_auth_perms_sync/permissions/queries.py @@ -159,11 +159,20 @@ } """ +# Inlining org memberships into user hydration saves a separate per-user +# lookup when scoped org sync needs them. +USER_ORGANIZATIONS_FIELDS = """ +organizations { + nodes { id name } +} +""" + def user_fields( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> str: """Return user fields, adding heavier fields only when downstream needs them.""" fields = USER_BASE_FIELDS.replace( @@ -171,7 +180,9 @@ def user_fields( USER_ACCOUNT_DATA_FIELD if include_account_data else "", ) if include_emails: - return f"{fields}\n{USER_EMAIL_FIELDS}" + fields = f"{fields}\n{USER_EMAIL_FIELDS}" + if include_organizations: + fields = f"{fields}\n{USER_ORGANIZATIONS_FIELDS}" return fields @@ -179,11 +190,17 @@ def query_user_by_username( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> str: + fields = user_fields( + include_emails=include_emails, + include_account_data=include_account_data, + include_organizations=include_organizations, + ) return f""" query UserByUsername($username: String!) {{ user(username: $username) {{ - {user_fields(include_emails=include_emails, include_account_data=include_account_data)} + {fields} }} }} """ @@ -193,11 +210,17 @@ def query_user_by_email( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> str: + fields = user_fields( + include_emails=include_emails, + include_account_data=include_account_data, + include_organizations=include_organizations, + ) return f""" query UserByEmail($email: String!) {{ user(email: $email) {{ - {user_fields(include_emails=include_emails, include_account_data=include_account_data)} + {fields} }} }} """ @@ -207,12 +230,18 @@ def query_user_by_id( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> str: + fields = user_fields( + include_emails=include_emails, + include_account_data=include_account_data, + include_organizations=include_organizations, + ) return f""" query UserByID($id: ID!) {{ node(id: $id) {{ ... on User {{ - {user_fields(include_emails=include_emails, include_account_data=include_account_data)} + {fields} }} }} }} @@ -224,6 +253,7 @@ def users_by_ids_batch_query( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> str: """Hydrate many users in one request via aliased `node()` lookups. @@ -231,7 +261,11 @@ def users_by_ids_batch_query( dominates user hydration, so batching cuts request count by the batch size with the same per-user fields. """ - fields = user_fields(include_emails=include_emails, include_account_data=include_account_data) + fields = user_fields( + include_emails=include_emails, + include_account_data=include_account_data, + include_organizations=include_organizations, + ) variables = ", ".join(f"$user{index}: ID!" for index in range(batch_size)) aliases = "".join( f""" diff --git a/src/src_auth_perms_sync/permissions/sourcegraph.py b/src/src_auth_perms_sync/permissions/sourcegraph.py index c96bb51..fa7a9d4 100644 --- a/src/src_auth_perms_sync/permissions/sourcegraph.py +++ b/src/src_auth_perms_sync/permissions/sourcegraph.py @@ -153,6 +153,7 @@ def get_user_by_username( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> shared_types.User | None: """Return the exact Sourcegraph user for `username`, if it exists.""" data = cast( @@ -161,6 +162,7 @@ def get_user_by_username( queries.query_user_by_username( include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ), cast(src.JSONDict, {"username": username}), ), @@ -174,6 +176,7 @@ def get_user_by_email( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> shared_types.User | None: """Return the user owning the verified email address, if it exists.""" data = cast( @@ -182,6 +185,7 @@ def get_user_by_email( queries.query_user_by_email( include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ), cast(src.JSONDict, {"email": email}), ), @@ -216,6 +220,7 @@ def get_users_by_ids( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, parallelism: int = 1, worker_pool: ThreadPoolExecutor | None = None, progress_label: str | None = None, @@ -232,6 +237,7 @@ def fetch_batch(batch: Sequence[str]) -> list[shared_types.User | None]: len(batch), include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ), cast(src.JSONDict, {f"user{index}": user_id for index, user_id in enumerate(batch)}), follow_pages=False, diff --git a/src/src_auth_perms_sync/shared/queries.py b/src/src_auth_perms_sync/shared/queries.py index 0a8126d..821aa4f 100644 --- a/src/src_auth_perms_sync/shared/queries.py +++ b/src/src_auth_perms_sync/shared/queries.py @@ -53,15 +53,24 @@ accountData """ +# Inlining the user's org memberships into the same user query saves a +# separate per-user lookup when scoped org sync needs them. +USER_ORGANIZATIONS_FIELD = """ organizations { + nodes { id name } + } +""" + def query_users( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> str: """Return the users page query, adding heavier fields only when requested.""" email_fields = USER_EMAIL_FIELDS if include_emails else "" account_data_field = USER_ACCOUNT_DATA_FIELD if include_account_data else "" + organizations_field = USER_ORGANIZATIONS_FIELD if include_organizations else "" return f""" query ListUsers($first: Int!, $after: String) {{ users(first: $first, after: $after) {{ @@ -69,7 +78,7 @@ def query_users( id username builtinAuth -{email_fields} externalAccounts(first: 50) {{ +{email_fields}{organizations_field} externalAccounts(first: 50) {{ nodes {{ serviceType serviceID diff --git a/src/src_auth_perms_sync/shared/run_context.py b/src/src_auth_perms_sync/shared/run_context.py index 1364358..127d16c 100644 --- a/src/src_auth_perms_sync/shared/run_context.py +++ b/src/src_auth_perms_sync/shared/run_context.py @@ -26,10 +26,18 @@ class CommandData: `auth_provider_views` and `code_host_views` carry the same dicts the get command writes to `auth-providers.yaml` and `code-hosts.yaml`, so module callers receive discovery data without re-parsing files. + + `saml_group_users` carries the complete user population (full set + modes); a subsequent org sync reuses it for a full sync. Additive set + modes instead carry their selected user subset in + `scoped_saml_group_users`; a subsequent org sync then runs scoped to + exactly those users (per-user additions and removals, nobody else + touched) without streaming all users again. """ auth_providers: list[shared_types.AuthProvider] | None = None saml_group_users: list[shared_types.SamlGroupUser] | None = None + scoped_saml_group_users: list[shared_types.ScopedSamlGroupUser] | None = None auth_provider_views: list[dict[str, Any]] | None = None code_host_views: list[dict[str, Any]] | None = None maps_created: bool = False diff --git a/src/src_auth_perms_sync/shared/saml_groups.py b/src/src_auth_perms_sync/shared/saml_groups.py index 1dcc1dd..f0d9275 100644 --- a/src/src_auth_perms_sync/shared/saml_groups.py +++ b/src/src_auth_perms_sync/shared/saml_groups.py @@ -22,6 +22,7 @@ from __future__ import annotations +import re from collections.abc import Iterable from typing import Any, cast @@ -30,6 +31,48 @@ DEFAULT_GROUPS_ATTRIBUTE_NAME: str = "groups" SAML_SERVICE_TYPE: str = "saml" +# Every organization this tool creates gets this name prefix. No human +# would name an org `synced-...`, so the prefix doubles as the ownership +# marker: an org is tool-managed if and only if its name starts with it. +SYNCED_ORGANIZATION_NAME_PREFIX: str = "synced-" +ORGANIZATION_NAME_MAX_LENGTH: int = 255 + +_ORGANIZATION_NAME_PART_RE = re.compile(r"[^A-Za-z0-9]+") +_ORGANIZATION_NAME_DASH_RUN_RE = re.compile(r"-+") + + +def organization_name_for_saml_group(provider_config_id: str, group_name: str) -> str: + """Return the deterministic synced org name for one SAML group. + + Shape: `synced--`. + """ + provider_part = _organization_name_part(provider_config_id, "auth provider configID") + group_part = _organization_name_part(group_name, "SAML group name") + organization_name = f"{SYNCED_ORGANIZATION_NAME_PREFIX}{provider_part}-{group_part}" + if len(organization_name) > ORGANIZATION_NAME_MAX_LENGTH: + raise SystemExit( + f"FATAL: generated org name for configID={provider_config_id!r} " + f"group={group_name!r} is {len(organization_name)} characters; " + f"Sourcegraph org names must be <= {ORGANIZATION_NAME_MAX_LENGTH}." + ) + return organization_name + + +def _organization_name_part(value: str, label: str) -> str: + normalized = _ORGANIZATION_NAME_PART_RE.sub("-", value.strip()) + normalized = _ORGANIZATION_NAME_DASH_RUN_RE.sub("-", normalized).strip("-") + if not normalized: + raise SystemExit( + f"FATAL: {label} {value!r} cannot be converted to a Sourcegraph org-name part." + ) + return normalized + + +def is_synced_organization_name(organization_name: str) -> bool: + """Return whether an org name marks the org as managed by this tool.""" + return organization_name.startswith(SYNCED_ORGANIZATION_NAME_PREFIX) + + # Per-(serviceID, clientID) override of the SAML groups attribute name. # `None` or a missing key means "use DEFAULT_GROUPS_ATTRIBUTE_NAME for # this provider". Built by `attribute_names_by_provider_key()` from the @@ -150,12 +193,12 @@ def _dict_items(value: Any) -> list[dict[str, Any]]: return [cast(dict[str, Any], item) for item in items if isinstance(item, dict)] -def compact_saml_group_user( +def saml_group_memberships_for_user( user: shared_types.User, providers_by_account_key: dict[tuple[str, str], shared_types.AuthProvider], attribute_names_by_provider: SamlGroupsAttributeNameByProvider, -) -> shared_types.SamlGroupUser | None: - """Return only the user fields org sync needs from one full user row.""" +) -> tuple[shared_types.SamlGroupMembership, ...]: + """Extract one user's distinct (configID, group) SAML memberships.""" memberships: list[shared_types.SamlGroupMembership] = [] seen: set[tuple[str, str]] = set() for account in user["externalAccounts"]["nodes"]: @@ -180,12 +223,24 @@ def compact_saml_group_user( ) ) seen.add(membership_key) + return tuple(memberships) + + +def compact_saml_group_user( + user: shared_types.User, + providers_by_account_key: dict[tuple[str, str], shared_types.AuthProvider], + attribute_names_by_provider: SamlGroupsAttributeNameByProvider, +) -> shared_types.SamlGroupUser | None: + """Return only the user fields org sync needs from one full user row.""" + memberships = saml_group_memberships_for_user( + user, providers_by_account_key, attribute_names_by_provider + ) if not memberships: return None return shared_types.SamlGroupUser( user_id=user["id"], username=user["username"], - saml_group_memberships=tuple(memberships), + saml_group_memberships=memberships, ) @@ -206,6 +261,39 @@ def compact_saml_group_users( return compact_users +def compact_scoped_saml_group_users( + users: Iterable[shared_types.User], + providers: Iterable[shared_types.AuthProvider], + attribute_names_by_provider: SamlGroupsAttributeNameByProvider, +) -> list[shared_types.ScopedSamlGroupUser]: + """Compact in-scope users for a scoped (per-user) organization sync. + + Every user is kept — even with zero SAML group memberships — because + scoped org sync also removes users from synced orgs they left. Each + user's row must have been fetched with `include_organizations=True`; + only `synced-` prefixed org memberships are retained. + """ + providers_by_account_key = saml_providers_by_account_key(providers) + scoped_users: list[shared_types.ScopedSamlGroupUser] = [] + for user in users: + synced_organizations = tuple( + organization + for organization in user.get("organizations", {"nodes": []})["nodes"] + if is_synced_organization_name(organization["name"]) + ) + scoped_users.append( + shared_types.ScopedSamlGroupUser( + user_id=user["id"], + username=user["username"], + saml_group_memberships=saml_group_memberships_for_user( + user, providers_by_account_key, attribute_names_by_provider + ), + synced_organizations=synced_organizations, + ) + ) + return scoped_users + + def count_users_per_saml_group( users: Iterable[shared_types.User], attribute_names_by_provider: SamlGroupsAttributeNameByProvider | None = None, diff --git a/src/src_auth_perms_sync/shared/sourcegraph.py b/src/src_auth_perms_sync/shared/sourcegraph.py index e0ba123..969a66f 100644 --- a/src/src_auth_perms_sync/shared/sourcegraph.py +++ b/src/src_auth_perms_sync/shared/sourcegraph.py @@ -37,6 +37,7 @@ def list_users_with_accounts( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> list[shared_types.User]: return [ cast(shared_types.User, node) @@ -44,6 +45,7 @@ def list_users_with_accounts( queries.query_users( include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ), connection_path=("users",), page_size=DEFAULT_PAGE_SIZE, @@ -57,6 +59,7 @@ def list_users_streaming( *, include_emails: bool = False, include_account_data: bool = True, + include_organizations: bool = False, ) -> Iterator[shared_types.User]: """Stream ListUsers pages one at a time, yielding each User as it arrives. @@ -73,6 +76,7 @@ def list_users_streaming( queries.query_users( include_emails=include_emails, include_account_data=include_account_data, + include_organizations=include_organizations, ), connection_path=("users",), page_size=DEFAULT_PAGE_SIZE, diff --git a/src/src_auth_perms_sync/shared/types.py b/src/src_auth_perms_sync/shared/types.py index 46f2032..20c1df0 100644 --- a/src/src_auth_perms_sync/shared/types.py +++ b/src/src_auth_perms_sync/shared/types.py @@ -35,12 +35,22 @@ class UserEmail(TypedDict): verified: bool +class OrganizationReference(TypedDict): + id: str + name: str + + +class OrganizationReferenceConnection(TypedDict): + nodes: list[OrganizationReference] + + class User(TypedDict): id: str username: str builtinAuth: bool externalAccounts: ExternalAccountConnection emails: NotRequired[list[UserEmail]] + organizations: NotRequired[OrganizationReferenceConnection] @dataclass(frozen=True, slots=True) @@ -68,6 +78,20 @@ class SamlGroupUser(UserIdentity): saml_group_memberships: tuple[SamlGroupMembership, ...] +@dataclass(frozen=True, slots=True) +class ScopedSamlGroupUser(UserIdentity): + """One in-scope user for a scoped (per-user) SAML organization sync. + + Unlike `SamlGroupUser`, users with zero group memberships are kept: + scoped org sync must still remove them from synced orgs they no + longer belong to. `synced_organizations` carries the user's current + memberships in tool-managed (`synced-` prefixed) organizations. + """ + + saml_group_memberships: tuple[SamlGroupMembership, ...] + synced_organizations: tuple[OrganizationReference, ...] + + class SiteUserCandidate(TypedDict): id: str username: str diff --git a/tests/e2e/case_runner.py b/tests/e2e/case_runner.py index 85362d9..9158068 100644 --- a/tests/e2e/case_runner.py +++ b/tests/e2e/case_runner.py @@ -88,12 +88,22 @@ class FixtureRepo(TypedDict): createdAt: NotRequired[str] # default: 2026-01-01T00:00:00Z +class FixtureOrganization(TypedDict): + id: int + name: str + members: list[str] # usernames + + class FixtureState(TypedDict): endpoint: str authProviders: list[shared_types.AuthProvider] externalServices: list[FixtureExternalService] users: list[FixtureUser] repos: list[FixtureRepo] + organizations: NotRequired[list[FixtureOrganization]] # default: [] + # The user behind the fixture token (createOrganization auto-adds it). + # Default: the first user. + currentUsername: NotRequired[str] class FixtureCase(TypedDict): @@ -187,6 +197,18 @@ def __init__(self, state: FixtureState) -> None: self._external_services = list(state["externalServices"]) self._users = list(state["users"]) self._repos = list(state["repos"]) + self._organizations: list[FixtureOrganization] = [ + { + "id": organization["id"], + "name": organization["name"], + "members": list(organization["members"]), + } + for organization in state.get("organizations", []) + ] + self._organization_members_by_id: dict[int, set[str]] = { + organization["id"]: set(organization["members"]) for organization in self._organizations + } + self._current_username = state.get("currentUsername") or self._users[0]["username"] self._mutation_count = 0 self._users_by_graphql_id = { @@ -269,6 +291,42 @@ def graphql( return self._repository_names_by_id(variable_values) if "query PendingBindIDs" in query: return {"usersWithPendingPermissions": self._pending_bind_ids()} + if "query SamlOrganizationSyncCurrentUser" in query: + return {"currentUser": self._graphql_current_user()} + if "query SyncedOrganizations" in query: + return { + "currentUser": self._graphql_current_user(), + "organizations": self._search_organizations( + variable_values["query"], variable_values["first"] + ), + } + if "query SamlOrganizationLookup" in query: + lookup_data: dict[str, Any] = {"currentUser": self._graphql_current_user()} + index = 0 + while f"name{index}" in variable_values: + lookup_data[f"organization{index}"] = self._search_organizations( + variable_values[f"name{index}"], + variable_values["organizationFirst"], + ) + index += 1 + return lookup_data + if "query UsersOrganizationsBatch" in query: + organizations_data: dict[str, Any] = {} + index = 0 + while f"user{index}" in variable_values: + organizations_data[f"user{index}"] = self._graphql_user_by_id( + variable_values[f"user{index}"] + ) + index += 1 + return organizations_data + if "mutation CreateOrganization" in query: + return {"createOrganization": self._create_organization(variable_values["name"])} + if "mutation AddUserToOrganization" in query: + self._add_user_to_organization(variable_values) + return {"addUserToOrganization": {"alwaysNil": None}} + if "mutation RemoveUserFromOrganization" in query: + self._remove_user_from_organization(variable_values) + return {"removeUserFromOrganization": {"alwaysNil": None}} if "mutation SetRepoPerms" in query: self._set_repo_permissions(variable_values) return {"setRepositoryPermissionsForUsers": {"alwaysNil": None}} @@ -307,6 +365,8 @@ def stream_connection_nodes( return iter(self._explicit_repository_nodes_for_user(variable_values["id"])) if path == ("authorizedUserRepositories",): return iter(self._authorized_user_repositories(variable_values["bindID"])) + if path == ("node", "members"): + return iter(self._organization_member_nodes(variable_values["id"])) raise AssertionError(f"Unhandled fixture connection path: {path}") def export_state(self) -> FixtureState: @@ -331,6 +391,17 @@ def export_state(self) -> FixtureState: "externalServices": self._external_services, "users": self._users, "repos": repos, + "organizations": [ + { + "id": organization["id"], + "name": organization["name"], + "members": sorted(self._organization_members_by_id[organization["id"]]), + } + for organization in sorted( + self._organizations, key=lambda organization: organization["name"] + ) + ], + "currentUsername": self._current_username, } def _graphql_user_by_username(self, username_value: object) -> dict[str, Any] | None: @@ -363,6 +434,13 @@ def _graphql_user(self, user: FixtureUser) -> dict[str, Any]: "builtinAuth": user["builtinAuth"], "emails": list(user["emails"]), "externalAccounts": {"nodes": list(user["externalAccounts"])}, + "organizations": { + "nodes": [ + self._graphql_organization_reference(organization) + for organization in self._organizations + if user["username"] in self._organization_members_by_id[organization["id"]] + ] + }, } def _graphql_external_services(self) -> list[dict[str, Any]]: @@ -564,6 +642,99 @@ def _require_graphql_repository(self, repository: FixtureRepo) -> dict[str, Any] assert graphql_repository is not None return graphql_repository + def _graphql_current_user(self) -> dict[str, Any]: + user = self._users_by_username[self._current_username] + return {"id": self._user_graphql_id(user["id"]), "username": user["username"]} + + def _graphql_organization_reference(self, organization: FixtureOrganization) -> dict[str, Any]: + return { + "id": self._organization_graphql_id(organization["id"]), + "name": organization["name"], + } + + def _search_organizations(self, query_value: object, first_value: object) -> dict[str, Any]: + """Substring search over org names, like the real organizations(query:).""" + if not isinstance(query_value, str): + raise AssertionError("organizations query variable must be a string") + if not isinstance(first_value, int): + raise AssertionError("organizations first variable must be an integer") + matches = [ + organization + for organization in self._organizations + if query_value in organization["name"] + ] + return { + "totalCount": len(matches), + "nodes": [ + self._graphql_organization_reference(organization) + for organization in matches[:first_value] + ], + } + + def _organization_member_nodes(self, organization_id_value: object) -> list[dict[str, Any]]: + organization = self._organization_by_graphql_id(organization_id_value) + return [ + self._graphql_current_user_like(username) + for username in sorted(self._organization_members_by_id[organization["id"]]) + ] + + def _graphql_current_user_like(self, username: str) -> dict[str, Any]: + user = self._users_by_username[username] + return {"id": self._user_graphql_id(user["id"]), "username": username} + + def _create_organization(self, name_value: object) -> dict[str, Any]: + if not isinstance(name_value, str): + raise AssertionError("organization name variable must be a string") + if any(organization["name"] == name_value for organization in self._organizations): + raise src.GraphQLError( + f"createOrganization: organization name is already taken: {name_value}" + ) + organization_id = ( + max((organization["id"] for organization in self._organizations), default=0) + 1 + ) + organization: FixtureOrganization = { + "id": organization_id, + "name": name_value, + "members": [], + } + self._organizations.append(organization) + # createOrganization adds the caller as the first member. + self._organization_members_by_id[organization_id] = {self._current_username} + self._mutation_count += 1 + return self._graphql_organization_reference(organization) + + def _add_user_to_organization(self, variables: dict[str, object]) -> None: + organization = self._organization_by_graphql_id(variables["organization"]) + username_value = variables["username"] + if not isinstance(username_value, str): + raise AssertionError("username variable must be a string") + if username_value not in self._users_by_username: + raise src.GraphQLError(f"addUserToOrganization: no such user: {username_value}") + members = self._organization_members_by_id[organization["id"]] + if username_value in members: + raise src.GraphQLError( + "addUserToOrganization: user is already a member of the organization" + ) + members.add(username_value) + self._mutation_count += 1 + + def _remove_user_from_organization(self, variables: dict[str, object]) -> None: + organization = self._organization_by_graphql_id(variables["organization"]) + username = self._username_from_user_graphql_id(variables["user"]) + self._organization_members_by_id[organization["id"]].discard(username) + self._mutation_count += 1 + + def _organization_by_graphql_id(self, organization_id_value: object) -> FixtureOrganization: + if not isinstance(organization_id_value, str): + raise AssertionError("organization variable must be a string") + for organization in self._organizations: + if self._organization_graphql_id(organization["id"]) == organization_id_value: + return organization + raise AssertionError(f"unknown organization GraphQL ID: {organization_id_value}") + + def _organization_graphql_id(self, organization_id: int) -> str: + return src.encode_sourcegraph_node_id("Org", organization_id) + def _add_repo_permission(self, variables: dict[str, object]) -> None: repository_id = self._repository_integer_id(variables["repo"]) username = self._username_from_user_graphql_id(variables["user"]) @@ -664,7 +835,13 @@ def case_cli_arguments(case: FixtureCase, case_name: str) -> list[str]: argv = shlex.split(cli_command) else: flags = cli_flags_by_field_name() - argv = [cast(str, args["command"])] + # args carry the CommandName (e.g. sync_saml_orgs); the argv needs + # the CLI argument spelling (sync-saml-orgs). + argument_names = { + command.command_name: command.argument_name for command in cli.CLI_COMMANDS + } + command_name = cast(str, args["command"]) + argv = [argument_names.get(command_name, command_name)] for field_name, value in args.items(): if field_name == "command" or value is None or value is False: continue diff --git a/tests/e2e/fixtures/set-users-sync-saml-orgs-noop/before.json b/tests/e2e/fixtures/set-users-sync-saml-orgs-noop/before.json new file mode 100644 index 0000000..4daeaf4 --- /dev/null +++ b/tests/e2e/fixtures/set-users-sync-saml-orgs-noop/before.json @@ -0,0 +1,108 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": [ + "test_user_09991" + ] + } + ], + "organizations": [ + { + "id": 1, + "name": "synced-okta-engineering", + "members": [ + "test_user_09991" + ] + } + ] +} diff --git a/tests/e2e/fixtures/set-users-sync-saml-orgs-noop/maps.yaml b/tests/e2e/fixtures/set-users-sync-saml-orgs-noop/maps.yaml new file mode 100644 index 0000000..fc31cb1 --- /dev/null +++ b/tests/e2e/fixtures/set-users-sync-saml-orgs-noop/maps.yaml @@ -0,0 +1,8 @@ +maps: + - name: Grant the app repo to the selected user + users: + usernames: + - test_user_09991 + repos: + names: + - test-repo-49981 diff --git a/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/after.json b/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/after.json new file mode 100644 index 0000000..4ebbac7 --- /dev/null +++ b/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/after.json @@ -0,0 +1,109 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": ["test_user_09991"] + } + ], + "organizations": [ + { + "id": 2, + "name": "synced-okta-engineering", + "members": ["test_user_09991"] + }, + { + "id": 1, + "name": "synced-okta-stale", + "members": ["test_user_09992"] + } + ] +} diff --git a/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/before.json b/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/before.json new file mode 100644 index 0000000..9f17136 --- /dev/null +++ b/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/before.json @@ -0,0 +1,104 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": [] + } + ], + "organizations": [ + { + "id": 1, + "name": "synced-okta-stale", + "members": ["test_user_09991", "test_user_09992"] + } + ] +} diff --git a/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/maps.yaml b/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/maps.yaml new file mode 100644 index 0000000..fc31cb1 --- /dev/null +++ b/tests/e2e/fixtures/set-users-sync-saml-orgs-scoped/maps.yaml @@ -0,0 +1,8 @@ +maps: + - name: Grant the app repo to the selected user + users: + usernames: + - test_user_09991 + repos: + names: + - test-repo-49981 diff --git a/tests/e2e/fixtures/sync-saml-orgs-full-cleanup/after.json b/tests/e2e/fixtures/sync-saml-orgs-full-cleanup/after.json new file mode 100644 index 0000000..7c5b5b2 --- /dev/null +++ b/tests/e2e/fixtures/sync-saml-orgs-full-cleanup/after.json @@ -0,0 +1,111 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": [] + } + ], + "organizations": [ + { + "id": 2, + "name": "synced-okta-engineering", + "members": [ + "test_user_09991" + ] + }, + { + "id": 1, + "name": "synced-okta-stale", + "members": [] + } + ] +} diff --git a/tests/e2e/fixtures/sync-saml-orgs-full-cleanup/before.json b/tests/e2e/fixtures/sync-saml-orgs-full-cleanup/before.json new file mode 100644 index 0000000..9374140 --- /dev/null +++ b/tests/e2e/fixtures/sync-saml-orgs-full-cleanup/before.json @@ -0,0 +1,106 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": [] + } + ], + "organizations": [ + { + "id": 1, + "name": "synced-okta-stale", + "members": [ + "test_user_09992" + ] + } + ] +} diff --git a/tests/tests.yaml b/tests/tests.yaml index 4898ef0..603df30 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -1317,6 +1317,52 @@ cases: no_backup: true expectedMutations: 2 + set-users-sync-saml-orgs-scoped: + description: >- + Additive set --users with --sync-saml-orgs runs a SCOPED org sync: + the selected user is added to the synced org their SAML group + requires (created on demand, creator membership cleaned up) and + removed from a synced org their groups no longer justify — while an + out-of-scope member of that same org is left untouched. One repo + grant + create + add + creator removal + stale removal = 5 mutations. + modes: + - local + args: + command: set + users: + - test_user_09991 + sync_saml_organizations: true + apply: true + expectedMutations: 5 + + set-users-sync-saml-orgs-noop: + description: >- + Scoped org sync is idempotent: with repo grants and synced org + membership already converged, a set --users --sync-saml-orgs apply + issues zero mutations and changes nothing. + modes: + - local + args: + command: set + users: + - test_user_09991 + sync_saml_organizations: true + apply: true + expectedMutations: 0 + + sync-saml-orgs-full-cleanup: + description: >- + Standalone full org sync converges synced orgs both ways: creates the + org a SAML group requires (cleaning up the creator's auto-membership) + and empties — but never deletes — a synced org whose SAML group + disappeared from every user's assertion. + modes: + - local + args: + command: sync_saml_orgs + apply: true + expectedMutations: 4 + full-overwrite-removes-stale-grant: # scope: # users: 2 diff --git a/tests/unit/test_saml_groups.py b/tests/unit/test_saml_groups.py index 7640785..67932b9 100644 --- a/tests/unit/test_saml_groups.py +++ b/tests/unit/test_saml_groups.py @@ -240,3 +240,89 @@ def test_compact_saml_group_users_keeps_only_org_sync_fields(self) -> None: ], compact_users, ) + + def test_organization_name_for_saml_group_uses_synced_prefix_and_sanitizes(self) -> None: + self.assertEqual( + "synced-okta-eng-team", + saml_groups.organization_name_for_saml_group("okta", "eng team!"), + ) + self.assertTrue( + saml_groups.is_synced_organization_name("synced-okta-eng-team"), + ) + self.assertFalse(saml_groups.is_synced_organization_name("okta-eng-team")) + + def test_organization_name_for_saml_group_rejects_unconvertible_parts(self) -> None: + with self.assertRaises(SystemExit): + saml_groups.organization_name_for_saml_group("okta", "!!!") + + def test_compact_scoped_saml_group_users_keeps_users_without_groups(self) -> None: + providers: list[shared_types.AuthProvider] = [ + { + "serviceType": "saml", + "serviceID": "https://idp.example.com", + "clientID": "sourcegraph", + "displayName": "SAML", + "isBuiltin": False, + "configID": "okta", + }, + ] + users: list[shared_types.User] = [ + { + "id": "user-1", + "username": "alice", + "builtinAuth": False, + "externalAccounts": { + "nodes": [ + { + "serviceType": "saml", + "serviceID": "https://idp.example.com", + "clientID": "sourcegraph", + "accountData": { + "Values": {"groups": {"Values": [{"Value": "engineering"}]}} + }, + }, + ] + }, + "organizations": { + "nodes": [ + {"id": "org-1", "name": "synced-okta-stale"}, + {"id": "org-2", "name": "manually-created-org"}, + ] + }, + }, + { + "id": "user-2", + "username": "bob", + "builtinAuth": True, + "externalAccounts": {"nodes": []}, + "organizations": {"nodes": [{"id": "org-1", "name": "synced-okta-stale"}]}, + }, + ] + + scoped_users = saml_groups.compact_scoped_saml_group_users(users, providers, {}) + + self.assertEqual( + [ + shared_types.ScopedSamlGroupUser( + user_id="user-1", + username="alice", + saml_group_memberships=( + shared_types.SamlGroupMembership( + provider_config_id="okta", group_name="engineering" + ), + ), + # The manually created org is NOT tool-managed: it must + # never appear as a removal candidate. + synced_organizations=({"id": "org-1", "name": "synced-okta-stale"},), + ), + # Users with zero group memberships are kept: scoped org + # sync must still remove them from synced orgs they left. + shared_types.ScopedSamlGroupUser( + user_id="user-2", + username="bob", + saml_group_memberships=(), + synced_organizations=({"id": "org-1", "name": "synced-okta-stale"},), + ), + ], + scoped_users, + ) From 6d24d8734550da4883c37afce2a46a7cf9bf7b45 Mon Sep 17 00:00:00 2001 From: Marc LeBlanc <7050295+marcleblanc2@users.noreply.github.com> Date: Fri, 12 Jun 2026 17:32:40 -0600 Subject: [PATCH 2/3] Give sync-saml-orgs the set command's user filters and --full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The standalone command now requires an explicit mode — bare sync-saml-orgs is rejected: - --full keeps the existing whole-instance sync (with orphaned synced-org cleanup). - --users / --users-without-explicit-perms / --created-after run a scoped per-user sync (additions AND removals for the selected users only), reusing the get/set user-selection pipeline; accountData and each user's org list ride along in the same queries. - Artifact names carry the mode (sync-saml-orgs-full-apply, sync-saml-orgs-users-dry-run, ...). - cmd_get's user loader is now load_selected_users, shared by the standalone scoped org-sync modes. - Rejection matrix updated: bare invocation, --full + user-filter conflicts, and repo filters are rejected; live cases pass --full. Amp-Thread-ID: https://ampcode.com/threads/T-019ebdc3-c01c-72cb-aa46-52a0183c2ab1 Co-authored-by: Amp --- README.md | 32 +++-- src/src_auth_perms_sync/cli.py | 82 ++++++++++++- src/src_auth_perms_sync/orgs/sync.py | 74 ++++++++++-- .../permissions/command.py | 22 +++- .../sync-saml-orgs-users-scoped/after.json | 113 ++++++++++++++++++ .../sync-saml-orgs-users-scoped/before.json | 107 +++++++++++++++++ tests/run.py | 2 +- tests/tests.yaml | 107 ++++++++++++++--- tests/unit/test_cli_config.py | 55 ++++++++- tests/unit/test_command_files.py | 2 +- 10 files changed, 547 insertions(+), 49 deletions(-) create mode 100644 tests/e2e/fixtures/sync-saml-orgs-users-scoped/after.json create mode 100644 tests/e2e/fixtures/sync-saml-orgs-users-scoped/before.json diff --git a/README.md b/README.md index 26a296e..aff3891 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ snapshots that make `--apply` reversible. 1. **Get user and org metadata** ```bash - src-auth-perms-sync sync-saml-orgs + src-auth-perms-sync sync-saml-orgs --full ``` - Queries the Sourcegraph instance for auth providers, users, users' SAML groups, and orgs @@ -278,28 +278,40 @@ snapshots that make `--apply` reversible. 2. **Apply org sync** ```bash - src-auth-perms-sync sync-saml-orgs --apply + src-auth-perms-sync sync-saml-orgs --full --apply ``` - Creates the orgs if they don't exist, and sync the members from the SAML groups to the orgs - `--sync-saml-orgs` can also be added to a `set` run, to run both at the same time +3. **Scoped org sync for selected users** + + ```bash + src-auth-perms-sync sync-saml-orgs --users alice,bob + src-auth-perms-sync sync-saml-orgs --created-after 2026-06-01 + src-auth-perms-sync sync-saml-orgs --users-without-explicit-perms + ``` + + - Same user filters as `get` and `set`; a mode flag is required — there + is no bare `sync-saml-orgs` + ### Org sync behavior - Org names are `synced--` (non-alphanumeric characters become `-`). The `synced-` prefix marks tool ownership: the sync only ever modifies orgs whose name carries it, so manually created orgs are never touched. -- The org sync mode follows the permission sync mode of the same run — no surprises: - - **Full** (standalone `sync-saml-orgs`, or `set --full` / `--repos*` +- The org sync mode is always explicit — no surprises: + - **Full** (`sync-saml-orgs --full`, or `set --full` / `--repos*` `--sync-saml-orgs`): converges every synced org against all users. A synced org whose SAML group disappeared has all members removed, but the org itself is kept (its settings survive in case the group comes back). - - **Scoped** (`set --users` / `--users-without-explicit-perms` / - `--created-after` with `--sync-saml-orgs`): syncs org membership for exactly - the users the set run selected — per-user additions AND removals, computed - from each user's own SAML assertion and org list. Other users' memberships - never change, and no full user scan or org member listing is needed, so API - traffic stays proportional to the selection. + - **Scoped** (user filters on `sync-saml-orgs`, or `set --users` / + `--users-without-explicit-perms` / `--created-after` with + `--sync-saml-orgs`): syncs org membership for exactly the selected users — + per-user additions AND removals, computed from each user's own SAML + assertion and org list. Other users' memberships never change, and no full + user scan or org member listing is needed, so API traffic stays + proportional to the selection. ## Options diff --git a/src/src_auth_perms_sync/cli.py b/src/src_auth_perms_sync/cli.py index dd0347a..fe93498 100644 --- a/src/src_auth_perms_sync/cli.py +++ b/src/src_auth_perms_sync/cli.py @@ -92,10 +92,15 @@ ) SYNC_SAML_ORGS_CONFIG_FIELDS = src.config_field_names( *COMMON_CONFIG_FIELDS_BEFORE, + "full", + "users", + "users_without_explicit_perms", + "created_after", "apply", "no_backup", "artifacts_dir", "no_files", + "explicit_permissions_batch_size", "parallelism", *COMMON_CONFIG_FIELDS_AFTER, ) @@ -146,6 +151,12 @@ "repos_without_explicit_perms": "set_repos_without_explicit_perms_sync_saml_orgs", "repos_created_after": "set_repos_created_after_sync_saml_orgs", } +SYNC_SAML_ORGS_ARTIFACT_NAMES: dict[str, str] = { + "full": "sync-saml-orgs-full-{run_mode}", + "users": "sync-saml-orgs-users-{run_mode}", + "users_without_explicit_perms": "sync-saml-orgs-users-without-explicit-perms-{run_mode}", + "created_after": "sync-saml-orgs-created-after-{run_mode}", +} SYNC_SET_COMMAND_ARTIFACT_NAMES: dict[permission_types.SetCommandMode, str] = { "full": "set-sync-saml-orgs-{run_mode}", "users": "set-add-users-sync-saml-orgs-{run_mode}", @@ -431,6 +442,7 @@ def validate_config(command_name: CommandName, config: Config) -> None: validate_user_filter_selection(command_name, config) validate_repository_filter_selection(command_name, config) validate_set_mode_selection(command_name, config) + validate_sync_saml_orgs_mode_selection(command_name, config) def validate_command_options(command_name: CommandName, config: Config) -> None: @@ -460,10 +472,11 @@ def validate_user_filter_selection(command_name: CommandName, config: Config) -> config_error("choose only one of --users or --users-without-explicit-perms") user_filter_selected = user_scope_filter_count > 0 or config.created_after is not None - user_filter_allowed = command_name in {"get", "set"} + user_filter_allowed = command_name in {"get", "set", "sync_saml_orgs"} if user_filter_selected and not user_filter_allowed: config_error( - "--users, --users-without-explicit-perms, and --created-after require get or set" + "--users, --users-without-explicit-perms, and --created-after " + "require get, set, or sync-saml-orgs" ) @@ -495,10 +508,39 @@ def validate_repository_filter_selection(command_name: CommandName, config: Conf config_error("choose either user filters or repo filters, not both") +def validate_sync_saml_orgs_mode_selection(command_name: CommandName, config: Config) -> None: + """Validate sync-saml-orgs command mode flags.""" + if command_name != "sync_saml_orgs": + return + + if config.full and config.created_after is not None: + config_error( + "--full cannot be combined with --created-after; full mode already syncs every user" + ) + if config.full and (config.users or config.users_without_explicit_perms): + config_error( + "with sync-saml-orgs, choose at most one of --full, --users, " + "or --users-without-explicit-perms" + ) + mode_selected = any( + ( + config.full, + bool(config.users), + config.users_without_explicit_perms, + config.created_after is not None, + ) + ) + if not mode_selected: + config_error( + "sync-saml-orgs requires one of --full, --users, " + "--users-without-explicit-perms, or --created-after" + ) + + def validate_set_mode_selection(command_name: CommandName, config: Config) -> None: """Validate set command mode flags.""" - if config.full and command_name != "set": - config_error("--full requires the set command") + if config.full and command_name not in {"set", "sync_saml_orgs"}: + config_error("--full requires the set or sync-saml-orgs command") if command_name != "set": return @@ -600,11 +642,24 @@ def resolve_command(command_name: CommandName, config: Config) -> ResolvedComman return ResolvedCommand( name="sync_saml_orgs", log_name="sync_saml_orgs", - artifact_name=f"sync-saml-orgs-{run_mode}", + artifact_name=SYNC_SAML_ORGS_ARTIFACT_NAMES[sync_saml_orgs_mode(config)].format( + run_mode=run_mode + ), sync_saml_organizations=True, ) +def sync_saml_orgs_mode(config: Config) -> str: + """Return the validated standalone sync-saml-orgs mode.""" + if config.users: + return "users" + if config.users_without_explicit_perms: + return "users_without_explicit_perms" + if config.created_after is not None: + return "created_after" + return "full" + + def resolve_set_command(config: Config, run_mode: str) -> ResolvedCommand: """Return resolved metadata for the selected set command mode.""" set_options = set_command_options(config) @@ -790,6 +845,8 @@ def run_command( run_restore(config, client, sourcegraph_site_config, run_paths, worker_pool) return command_data else: + # Standalone command: the config's user filters (or --full) choose + # between a scoped and a full org sync. run_sync_saml_organizations( config, client, @@ -797,6 +854,7 @@ def run_command( command_data, run_paths, worker_pool, + standalone=True, ) return command_data @@ -883,8 +941,16 @@ def run_sync_saml_organizations( command_data: run_context.CommandData, run_paths: backups.RunPaths, worker_pool: ThreadPoolExecutor, + *, + standalone: bool = False, ) -> None: - """Run the selected SAML organization sync command.""" + """Run the selected SAML organization sync command. + + Only the standalone command forwards the config's user filters: in + combined `set ... --sync-saml-orgs` runs those filters belong to the + set phase, which already hands over its selected users via + `command_data`. + """ organizations_command.cmd_sync_saml_organizations( client, run_paths, @@ -895,6 +961,10 @@ def run_sync_saml_organizations( ), do_backup=not config.no_backup, command_data=command_data, + user_identifiers=config.users if standalone else (), + users_without_explicit_perms=(config.users_without_explicit_perms if standalone else False), + user_created_after=config.created_after if standalone else None, + explicit_permissions_batch_size=config.explicit_permissions_batch_size, worker_pool=worker_pool, ) diff --git a/src/src_auth_perms_sync/orgs/sync.py b/src/src_auth_perms_sync/orgs/sync.py index 76148af..181f84e 100644 --- a/src/src_auth_perms_sync/orgs/sync.py +++ b/src/src_auth_perms_sync/orgs/sync.py @@ -15,6 +15,7 @@ import src_py_lib as src from ..permissions import apply as permissions_apply +from ..permissions import command as permissions_command from ..shared import backups, run_context, saml_groups from ..shared import sourcegraph as shared_sourcegraph from ..shared import types as shared_types @@ -259,6 +260,45 @@ def _load_scoped_organization_sync_state( ) +def _load_scoped_users_for_filters( + client: src.SourcegraphClient, + *, + user_identifiers: tuple[str, ...], + users_without_explicit_perms: bool, + user_created_after: str | None, + parallelism: int, + explicit_permissions_batch_size: int, + saml_groups_attribute_name_by_config_id: dict[str, str], + worker_pool: ThreadPoolExecutor | None = None, +) -> list[shared_types.ScopedSamlGroupUser]: + """Load and compact the users a standalone scoped org sync covers. + + Reuses the get/set user-selection pipeline; accountData and each + user's org memberships ride along in the same queries. + """ + users = permissions_command.load_selected_users( + client, + user_identifiers=user_identifiers, + users_without_explicit_perms=users_without_explicit_perms, + user_created_after=user_created_after, + parallelism=parallelism, + explicit_permissions_batch_size=explicit_permissions_batch_size, + include_account_data=True, + include_organizations=True, + worker_pool=worker_pool, + ) + log.info("Querying auth providers from %s ...", client.endpoint) + providers = shared_sourcegraph.list_auth_providers(client) + attribute_names_by_provider = saml_groups.attribute_names_by_provider_key( + providers, saml_groups_attribute_name_by_config_id + ) + return saml_groups.compact_scoped_saml_group_users( + users, + providers, + attribute_names_by_provider, + ) + + def _log_organization_sync_plan(sync_state: _OrganizationSyncState) -> None: log.info( "Organization sync plan: create %d org(s), add %d member(s), remove %d member(s).", @@ -569,6 +609,10 @@ def cmd_sync_saml_organizations( saml_groups_attribute_name_by_config_id: dict[str, str], do_backup: bool, command_data: run_context.CommandData | None = None, + user_identifiers: tuple[str, ...] = (), + users_without_explicit_perms: bool = False, + user_created_after: str | None = None, + explicit_permissions_batch_size: int = 50, worker_pool: ThreadPoolExecutor | None = None, ) -> None: """Create/update Sourcegraph orgs from discovered SAML groups. @@ -580,26 +624,42 @@ def cmd_sync_saml_organizations( The `synced-` prefix marks tool ownership: only orgs carrying it are ever modified. - Two modes, matching the permission-sync mode of the same run: + Two modes: - - Full (standalone `sync-saml-orgs`, or after a full-overwrite set): + - Full (`sync-saml-orgs --full`, or after a full-overwrite set): converges every synced org to the complete user population, including emptying (but never deleting) synced orgs whose SAML group disappeared. - - Scoped (`command_data.scoped_saml_group_users` is set, after an - additive set): per-user additions and removals for exactly the - selected users; no other users or orgs are touched and no full - user stream or member pages are loaded. + - Scoped: per-user additions and removals for exactly the selected + users; no other users or orgs are touched and no full user stream + or member pages are loaded. The scope comes either from a preceding + additive set (`command_data.scoped_saml_group_users`) or from this + command's own user filters (`user_identifiers`, + `users_without_explicit_perms`, `user_created_after`). """ resolved_command_data = command_data or run_context.CommandData() scoped_users = resolved_command_data.scoped_saml_group_users + user_filters_selected = ( + bool(user_identifiers) or users_without_explicit_perms or user_created_after is not None + ) with src.span( "cmd_sync_saml_organizations", dry_run=dry_run, parallelism=parallelism, do_backup=do_backup, - scoped=scoped_users is not None, + scoped=scoped_users is not None or user_filters_selected, ) as command_event: + if scoped_users is None and user_filters_selected: + scoped_users = _load_scoped_users_for_filters( + client, + user_identifiers=user_identifiers, + users_without_explicit_perms=users_without_explicit_perms, + user_created_after=user_created_after, + parallelism=parallelism, + explicit_permissions_batch_size=explicit_permissions_batch_size, + saml_groups_attribute_name_by_config_id=(saml_groups_attribute_name_by_config_id), + worker_pool=worker_pool, + ) if scoped_users is not None: sync_state = _load_scoped_organization_sync_state( client, diff --git a/src/src_auth_perms_sync/permissions/command.py b/src/src_auth_perms_sync/permissions/command.py index 7b029e1..d0aa5a1 100644 --- a/src/src_auth_perms_sync/permissions/command.py +++ b/src/src_auth_perms_sync/permissions/command.py @@ -274,7 +274,7 @@ def cmd_get( cmd_event["external_service_count"] = len(services) include_user_account_data = _providers_need_saml_account_data(raw_providers) - users = _load_get_users( + users = load_selected_users( client, user_identifiers=user_identifiers, users_without_explicit_perms=users_without_explicit_perms, @@ -394,7 +394,7 @@ def cmd_get( ) -def _load_get_users( +def load_selected_users( client: src.SourcegraphClient, *, user_identifiers: tuple[str, ...], @@ -403,14 +403,21 @@ def _load_get_users( parallelism: int, explicit_permissions_batch_size: int, include_account_data: bool, + include_organizations: bool = False, worker_pool: ThreadPoolExecutor | None, ) -> list[shared_types.User]: - """Load the Sourcegraph users selected by get/set-compatible user filters.""" + """Load the Sourcegraph users selected by the shared user filters. + + Used by the get command and by the standalone scoped sync-saml-orgs + modes; `include_organizations` rides the users' org memberships along + in the same queries for scoped org sync. + """ if user_identifiers: users = _resolve_user_identifiers( client, user_identifiers, include_account_data=include_account_data, + include_organizations=include_organizations, ) if user_created_after is None: return users @@ -463,19 +470,25 @@ def _load_get_users( client, candidates, include_account_data=include_account_data, + include_organizations=include_organizations, parallelism=parallelism, worker_pool=worker_pool, ) log.info("Selected %d user(s) for get output.", len(users)) return users - return _load_all_get_users(client, include_account_data=include_account_data) + return _load_all_get_users( + client, + include_account_data=include_account_data, + include_organizations=include_organizations, + ) def _load_all_get_users( client: src.SourcegraphClient, *, include_account_data: bool, + include_organizations: bool = False, ) -> list[shared_types.User]: """Load all users for get output, with progress logs for large instances.""" total_users = shared_sourcegraph.count_users(client) @@ -495,6 +508,7 @@ def _load_all_get_users( shared_sourcegraph.list_users_streaming( client, include_account_data=include_account_data, + include_organizations=include_organizations, ), start=1, ): diff --git a/tests/e2e/fixtures/sync-saml-orgs-users-scoped/after.json b/tests/e2e/fixtures/sync-saml-orgs-users-scoped/after.json new file mode 100644 index 0000000..62850bc --- /dev/null +++ b/tests/e2e/fixtures/sync-saml-orgs-users-scoped/after.json @@ -0,0 +1,113 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": [] + } + ], + "organizations": [ + { + "id": 2, + "name": "synced-okta-engineering", + "members": [ + "test_user_09991" + ] + }, + { + "id": 1, + "name": "synced-okta-stale", + "members": [ + "test_user_09992" + ] + } + ] +} diff --git a/tests/e2e/fixtures/sync-saml-orgs-users-scoped/before.json b/tests/e2e/fixtures/sync-saml-orgs-users-scoped/before.json new file mode 100644 index 0000000..7dce83f --- /dev/null +++ b/tests/e2e/fixtures/sync-saml-orgs-users-scoped/before.json @@ -0,0 +1,107 @@ +{ + "endpoint": "https://fixture.sourcegraph.test", + "currentUsername": "org_admin_09990", + "authProviders": [ + { + "serviceType": "builtin", + "serviceID": "", + "clientID": "", + "displayName": "Builtin username/password", + "isBuiltin": true, + "configID": "" + }, + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "displayName": "Okta", + "isBuiltin": false, + "configID": "okta" + } + ], + "externalServices": [ + { + "id": 1, + "kind": "GITHUB", + "displayName": "GitHub", + "url": "https://github.com/", + "config": "{}" + } + ], + "users": [ + { + "id": 1, + "username": "org_admin_09990", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "org_admin_09990@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + }, + { + "id": 2, + "username": "test_user_09991", + "builtinAuth": false, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09991@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [ + { + "serviceType": "saml", + "serviceID": "http://www.okta.com/test123", + "clientID": "https://sourcegraph.test/.auth/saml/metadata", + "accountData": { + "Values": { + "groups": { + "Values": [ + { + "Value": "engineering" + } + ] + } + } + } + } + ] + }, + { + "id": 3, + "username": "test_user_09992", + "builtinAuth": true, + "createdAt": "2026-01-01T00:00:00Z", + "emails": [ + { + "email": "test_user_09992@perms-sync.test", + "verified": true + } + ], + "externalAccounts": [] + } + ], + "repos": [ + { + "id": 101, + "name": "test-repo-49981", + "externalServiceID": 1, + "explicitPermissionsUsers": [] + } + ], + "organizations": [ + { + "id": 1, + "name": "synced-okta-stale", + "members": [ + "test_user_09991", + "test_user_09992" + ] + } + ] +} diff --git a/tests/run.py b/tests/run.py index 7048380..f12c68c 100644 --- a/tests/run.py +++ b/tests/run.py @@ -1980,7 +1980,7 @@ def run_seeded_org_sync_check(self, environment: dict[str, str]) -> None: self.run_cli_case( CliCase( f"{label} [apply]", - ("sync-saml-orgs", "--apply"), + ("sync-saml-orgs", "--full", "--apply"), 0, (ORGANIZATION_SYNC_VALIDATION_OK,), ), diff --git a/tests/tests.yaml b/tests/tests.yaml index 603df30..1b8b72f 100644 --- a/tests/tests.yaml +++ b/tests/tests.yaml @@ -435,7 +435,7 @@ cases: expectedOutput: - "unrecognized arguments: --sync-saml-orgs" - reject-sync-saml-orgs-created-after: + reject-bare-sync-saml-orgs: # scope: # users: 0 # repos: 0 @@ -443,15 +443,35 @@ cases: # cost: # bigO: O(1) # note: parse-only -- no instance reads or writes - description: sync-saml-orgs does not take user filters. + description: >- + sync-saml-orgs requires an explicit mode: --full for a whole-instance + sync, or a user filter for a scoped sync. modes: - local - cliCommand: sync-saml-orgs --created-after 2099-01-01 + cliCommand: sync-saml-orgs expectedExitCode: 2 expectedOutput: - - unrecognized arguments + - >- + sync-saml-orgs requires one of --full, --users, + --users-without-explicit-perms, or --created-after + + reject-sync-saml-orgs-full-and-users: + # scope: + # users: 0 + # repos: 0 + # perms: 0 + # cost: + # bigO: O(1) + # note: parse-only -- no instance reads or writes + description: "--full means all users; combining it with a user filter is contradictory." + modes: + - local + cliCommand: sync-saml-orgs --full --users username_doesnt_exist_01 + expectedExitCode: 2 + expectedOutput: + - "choose at most one of --full, --users" - reject-sync-saml-orgs-users: + reject-sync-saml-orgs-full-and-created-after: # scope: # users: 0 # repos: 0 @@ -459,15 +479,15 @@ cases: # cost: # bigO: O(1) # note: parse-only -- no instance reads or writes - description: sync-saml-orgs does not take a user list. + description: "--full means all users; combining it with --created-after is contradictory." modes: - local - cliCommand: sync-saml-orgs --users username_doesnt_exist_01 + cliCommand: sync-saml-orgs --full --created-after 2099-01-01 expectedExitCode: 2 expectedOutput: - - "unrecognized arguments: --users" + - "--full cannot be combined with --created-after" - reject-sync-saml-orgs-full: + reject-sync-saml-orgs-repos: # scope: # users: 0 # repos: 0 @@ -475,13 +495,13 @@ cases: # cost: # bigO: O(1) # note: parse-only -- no instance reads or writes - description: "--full belongs to set; sync-saml-orgs does not accept it." + description: sync-saml-orgs takes user filters, never repo filters. modes: - local - cliCommand: sync-saml-orgs --full + cliCommand: sync-saml-orgs --repos test-repo-49981 expectedExitCode: 2 expectedOutput: - - "unrecognized arguments: --full" + - "unrecognized arguments: --repos" reject-sync-saml-orgs-restore-path: # scope: @@ -1318,6 +1338,16 @@ cases: expectedMutations: 2 set-users-sync-saml-orgs-scoped: + # scope: + # users: 3 + # repos: 1 + # perms: 0 + # cost: + # bigO: O(1) + # reads: + # users: 1 + # orgLookups: 1 + # writes: 5 mutations description: >- Additive set --users with --sync-saml-orgs runs a SCOPED org sync: the selected user is added to the synced org their SAML group @@ -1336,6 +1366,15 @@ cases: expectedMutations: 5 set-users-sync-saml-orgs-noop: + # scope: + # users: 3 + # repos: 1 + # perms: 1 + # cost: + # bigO: O(1) + # reads: + # users: 1 + # writes: none description: >- Scoped org sync is idempotent: with repo grants and synced org membership already converged, a set --users --sync-saml-orgs apply @@ -1351,6 +1390,16 @@ cases: expectedMutations: 0 sync-saml-orgs-full-cleanup: + # scope: + # users: 3 + # repos: 1 + # perms: 0 + # cost: + # bigO: O(U) + # reads: + # users: all + # orgMembers: all + # writes: 4 mutations description: >- Standalone full org sync converges synced orgs both ways: creates the org a SAML group requires (cleaning up the creator's auto-membership) @@ -1360,6 +1409,34 @@ cases: - local args: command: sync_saml_orgs + full: true + apply: true + expectedMutations: 4 + + sync-saml-orgs-users-scoped: + # scope: + # users: 3 + # repos: 1 + # perms: 0 + # cost: + # bigO: O(1) + # reads: + # users: 1 + # orgLookups: 1 + # writes: 4 mutations + description: >- + Standalone scoped org sync via sync-saml-orgs --users: the selected + user is added to the synced org their SAML group requires (created + on demand, creator membership cleaned up) and removed from a synced + org their groups no longer justify; an out-of-scope member of that + org is untouched and repo permissions never change. create + add + + creator removal + stale removal = 4 mutations. + modes: + - local + args: + command: sync_saml_orgs + users: + - test_user_09991 apply: true expectedMutations: 4 @@ -1600,11 +1677,11 @@ cases: # writes: none # note: streams every user's SAML account data description: >- - Standalone organization sync dry run. Also pins the explicit + Standalone full organization sync dry run. Also pins the explicit --env-file flag against its default value. modes: - live - cliCommand: sync-saml-orgs --env-file .env + cliCommand: sync-saml-orgs --full --env-file .env expectedExitCode: 0 expectedOutput: - Dry run complete @@ -1763,7 +1840,7 @@ cases: outcome; it is safe to re-run. modes: - live - cliCommand: sync-saml-orgs --apply + cliCommand: sync-saml-orgs --full --apply expectedExitCode: 0 expectedOutput: - "VALIDATION OK: all target org memberships match" diff --git a/tests/unit/test_cli_config.py b/tests/unit/test_cli_config.py index b581552..118218c 100644 --- a/tests/unit/test_cli_config.py +++ b/tests/unit/test_cli_config.py @@ -357,7 +357,7 @@ def test_validate_config_allows_no_files_without_apply(self) -> None: "restore", make_config(restore_path=Path("snapshot.json"), no_files=True), ) - cli.validate_config("sync_saml_orgs", make_config(no_files=True)) + cli.validate_config("sync_saml_orgs", make_config(full=True, no_files=True)) def test_validate_config_rejects_maps_path_outside_get_and_set(self) -> None: expected_message = "--maps-path requires the get or set command" @@ -397,7 +397,7 @@ def test_artifacts_dir_flag_is_accepted_on_every_command(self) -> None: "get": [], "set": ["--full"], "restore": ["--restore-path", str(snapshot_path)], - "sync-saml-orgs": [], + "sync-saml-orgs": ["--full"], } for command_argument, extra_arguments in extra_arguments_by_command.items(): @@ -440,7 +440,9 @@ def test_validate_config_rejects_restore_path_without_restore(self) -> None: ) def test_validate_config_rejects_set_modes_without_set(self) -> None: - self.assert_config_error("get", make_config(full=True), "requires the set command") + self.assert_config_error( + "get", make_config(full=True), "requires the set or sync-saml-orgs command" + ) def test_validate_config_allows_get_user_filters_without_set(self) -> None: cli.validate_config("get", make_config(users=("alice", "bob@example.com"))) @@ -463,7 +465,7 @@ def test_validate_config_rejects_user_filters_on_non_get_set_commands(self) -> N self.assert_config_error( "restore", make_config(restore_path=Path("snapshot.json"), users=("alice",)), - "require get or set", + "require get, set, or sync-saml-orgs", ) def test_validate_config_rejects_repo_filter_conflicts(self) -> None: @@ -498,6 +500,49 @@ def test_validate_config_rejects_set_without_explicit_mode(self) -> None: "set requires one of --full", ) + def test_validate_config_rejects_sync_saml_orgs_without_explicit_mode(self) -> None: + self.assert_config_error( + "sync_saml_orgs", + make_config(), + "sync-saml-orgs requires one of --full, --users", + ) + + def test_validate_config_rejects_sync_saml_orgs_full_with_user_filters(self) -> None: + self.assert_config_error( + "sync_saml_orgs", + make_config(full=True, users=("alice",)), + "choose at most one of --full, --users", + ) + self.assert_config_error( + "sync_saml_orgs", + make_config(full=True, created_after="2099-01-01"), + "--full cannot be combined with --created-after", + ) + + def test_validate_config_allows_sync_saml_orgs_modes(self) -> None: + cli.validate_config("sync_saml_orgs", make_config(full=True)) + cli.validate_config("sync_saml_orgs", make_config(users=("alice",))) + cli.validate_config("sync_saml_orgs", make_config(users_without_explicit_perms=True)) + cli.validate_config("sync_saml_orgs", make_config(created_after="2026-01-01")) + + def test_resolve_command_names_sync_saml_orgs_artifacts_by_mode(self) -> None: + self.assertEqual( + "sync-saml-orgs-full-dry-run", + cli.resolve_command("sync_saml_orgs", make_config(full=True)).artifact_name, + ) + self.assertEqual( + "sync-saml-orgs-users-apply", + cli.resolve_command( + "sync_saml_orgs", make_config(users=("alice",), apply=True) + ).artifact_name, + ) + self.assertEqual( + "sync-saml-orgs-created-after-dry-run", + cli.resolve_command( + "sync_saml_orgs", make_config(created_after="2026-01-01") + ).artifact_name, + ) + def test_created_after_config_accepts_yyyy_mm_dd_date_arguments(self) -> None: config = load_config_from_env(SRC_AUTH_PERMS_SYNC_CREATED_AFTER="2026-01-01") @@ -733,7 +778,7 @@ def test_cmd_get_no_backup_skips_snapshot_artifacts(self) -> None: with ( mock.patch.object(permissions_command, "load_discovery", return_value=([], [], {})), - mock.patch.object(permissions_command, "_load_get_users", return_value=[]), + mock.patch.object(permissions_command, "load_selected_users", return_value=[]), mock.patch.object(permissions_command.permissions_maps, "dump_code_hosts_yaml"), mock.patch.object(permissions_command.permissions_maps, "dump_auth_providers_yaml"), mock.patch.object( diff --git a/tests/unit/test_command_files.py b/tests/unit/test_command_files.py index 44e278c..91223bc 100644 --- a/tests/unit/test_command_files.py +++ b/tests/unit/test_command_files.py @@ -113,7 +113,7 @@ def run_cmd_get( "load_discovery", return_value=([make_auth_provider()], [make_external_service()], {}), ), - mock.patch.object(permissions_command, "_load_get_users", return_value=[]), + mock.patch.object(permissions_command, "load_selected_users", return_value=[]), mock.patch.object( permissions_command.permission_snapshot, "build_snapshot", From cf613a0a2712c8aef66b1e97b241452ec3adfdd6 Mon Sep 17 00:00:00 2001 From: Marc LeBlanc <7050295+marcleblanc2@users.noreply.github.com> Date: Fri, 12 Jun 2026 18:05:16 -0600 Subject: [PATCH 3/3] Log 'Dry run complete' when an org sync has nothing to plan A scoped org sync whose selection is empty (e.g. set --created-after 2099-01-01 --sync-saml-orgs) returned before the dry-run completion line, so operators (and the live set-created-after-sync-saml-orgs case) lost the run-finished marker. Log it on the nothing-to-sync path; --apply runs still return quietly after their own summary. Amp-Thread-ID: https://ampcode.com/threads/T-019ebdc3-c01c-72cb-aa46-52a0183c2ab1 Co-authored-by: Amp --- src/src_auth_perms_sync/orgs/sync.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/src_auth_perms_sync/orgs/sync.py b/src/src_auth_perms_sync/orgs/sync.py index 181f84e..9931280 100644 --- a/src/src_auth_perms_sync/orgs/sync.py +++ b/src/src_auth_perms_sync/orgs/sync.py @@ -678,6 +678,8 @@ def cmd_sync_saml_organizations( worker_pool, ) if sync_state is None: + if dry_run: + log.info("Dry run complete. Nothing to sync, so --apply would make no changes.") return _log_organization_sync_plan(sync_state)