Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion scripts/codex_audit_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,23 @@ def _job_dedupe_key(payload: dict[str, Any]) -> str:

def _classify_failure(error: str) -> str:
text = error.lower()
if any(word in text for word in ("permission", "unauth", "forbidden", "oidc", "token", "allow", "secret")):
auth_config_signals = (
"permission denied",
"unauthorized",
"forbidden",
"oidc",
"missing bearer",
"missing token",
"invalid token",
"bad credentials",
"not allowed",
"allowlist",
"api key is required",
"no api key configured",
"secret is missing",
"secret not configured",
Comment on lines +675 to +689

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Restore auth classification for clone authentication failures

When the service clone step fails with GitHub’s usual `fatal: authentication failed ...` message, `_prepare_repo()` wraps it as `git clone failed ...`, but this new signal list no longer matches that phrase. Those jobs are now marked `unknown_failure`, so `run_monthly_codex_audit.py` will skip the infra-failure path and, in `auto` mode, can fall back to API remediation instead of reporting the service auth/config outage. Please include the explicit authentication-failure wording here and in the mirrored classifiers.

Useful? React with 👍 / 👎.

)
if any(signal in text for signal in auth_config_signals):
return "auth_or_config_failure"
if any(word in text for word in ("quota", "rate limit", "too many active", "budget")):
return "quota_or_capacity_failure"
Expand Down
18 changes: 17 additions & 1 deletion scripts/run_monthly_codex_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,23 @@ class BridgeError(RuntimeError):

def classify_service_failure(error: str) -> str:
text = error.lower()
if any(word in text for word in ("permission", "unauth", "forbidden", "oidc", "token", "allow", "secret")):
auth_config_signals = (
"permission denied",
"unauthorized",
"forbidden",
"oidc",
"missing bearer",
"missing token",
"invalid token",

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Classify expired service tokens as auth failures

When the script-based audit service rejects a request because the GitHub OIDC JWT is past its `exp` claim, it raises `PermissionError("Token expired")` (`scripts/codex_audit_service.py:557`), and the client wraps that 401 body before this classifier runs. Because the narrowed signals only include `missing token` and `invalid token`, that request-time auth outage is now classified as `unknown_failure`, so `auto` runs can fall back to API remediation instead of posting the service-infrastructure failure. Please keep an explicit `token expired` signal here and in the mirrored service classifier.

Useful? React with 👍 / 👎.

"bad credentials",
"not allowed",
"allowlist",
"api key is required",
"no api key configured",
"secret is missing",
"secret not configured",
Comment on lines +184 to +188

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Classify request-time service config failures

For request-time 401s there is no async job `failure_category`, so `run_codex_service()` relies on this tuple to decide whether to return the infra exit code. Before a job is created, the service still emits config errors such as `CODEX_AUDIT_SERVICE_ALLOWED_REPOSITORIES is required`; after replacing the broad `allow` match with only `allowlist`, those messages become `unknown_failure`, and `auto` runs can proceed to API fallback instead of posting the service-infra failure comment. Add explicit signals for these required service allowlist/config errors.

Useful? React with 👍 / 👎.

)
if any(signal in text for signal in auth_config_signals):
return "auth_or_config_failure"
if any(word in text for word in ("quota", "rate limit", "too many active", "budget")):
return "quota_or_capacity_failure"
Expand Down
18 changes: 17 additions & 1 deletion service/ai_gateway_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,23 @@ def _job_dedupe_key(payload: dict[str, Any]) -> str:

def _classify_failure(error: str) -> str:
text = error.lower()
if any(word in text for word in ("permission", "unauth", "forbidden", "oidc", "token", "allow", "secret")):
auth_config_signals = (
"permission denied",
"unauthorized",
"forbidden",
"oidc",
"missing bearer",
"missing token",
"invalid token",
"bad credentials",
"not allowed",
"allowlist",
"api key is required",
"no api key configured",
"secret is missing",
"secret not configured",
)
if any(signal in text for signal in auth_config_signals):
return "auth_or_config_failure"
if any(word in text for word in ("quota", "rate limit", "too many active", "budget")):
return "quota_or_capacity_failure"
Expand Down
4 changes: 4 additions & 0 deletions tests/test_run_monthly_codex_audit.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,10 @@ def test_service_failure_classification_identifies_infra_failures(self) -> None:
self.assertTrue(is_service_infrastructure_failure("Codex audit service job failed [transient_service_failure]: timed out"))
self.assertFalse(is_service_infrastructure_failure("Codex audit service job failed [patch_contract_failure]: invalid JSON"))

def test_service_failure_classification_ignores_source_code_secret_words(self) -> None:
message = "codex exec failed: BLOCKED_PATH_RE = r'.*token.*|.*secret.*'"
self.assertEqual(classify_service_failure(message), "unknown_failure")

def test_codex_audit_service_async_job_lifecycle(self) -> None:
with tempfile.TemporaryDirectory() as tmp:
env = {
Expand Down
Loading