From 3009c86ffb00db410d6dfce3265ddb38f2b490a2 Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 02:37:13 +0800 Subject: [PATCH 1/8] Discriminate WG21 index refresh failures and propagate stale signal to scheduler --- src/paperscout/errors.py | 8 +++ src/paperscout/models.py | 8 +++ src/paperscout/monitor.py | 59 +++++++++------ src/paperscout/sources.py | 43 ++++++----- tests/test_callback_protocols.py | 4 +- tests/test_datasource_protocol.py | 4 +- tests/test_monitor.py | 102 +++++++++++++++++++++++++- tests/test_sources.py | 115 +++++++++++++++++++++++------- 8 files changed, 272 insertions(+), 71 deletions(-) diff --git a/src/paperscout/errors.py b/src/paperscout/errors.py index 675e9e2..23b0ece 100644 --- a/src/paperscout/errors.py +++ b/src/paperscout/errors.py @@ -17,3 +17,11 @@ class FailureCategory(str, enum.Enum): class ConfigurationError(Exception): """Permanent misconfiguration (credentials, required integration settings).""" + + +class IndexRefreshError(Exception): + """Transient index download failure (retry-eligible).""" + + def __init__(self, category: FailureCategory, message: str = "") -> None: + super().__init__(message) + self.category = category diff --git a/src/paperscout/models.py b/src/paperscout/models.py index e63413d..3138787 100644 --- a/src/paperscout/models.py +++ b/src/paperscout/models.py @@ -192,6 +192,14 @@ class CycleStatus(str, Enum): FAILED = "failed" +@dataclass(frozen=True, slots=True) +class IndexRefreshResult: + """Outcome of ``WG21Index.refresh()`` / ``fetch()``.""" + + papers: dict[str, Paper] + stale: bool = False + + @dataclass(frozen=True, slots=True) class CycleResult: """Discriminated probe cycle result (success vs empty vs failed).""" diff --git a/src/paperscout/monitor.py b/src/paperscout/monitor.py index 643bd0b..4c8c703 100644 --- a/src/paperscout/monitor.py +++ b/src/paperscout/monitor.py @@ -18,8 +18,8 @@ from .concurrency import run_blocking_io from .config import Settings, settings -from .errors import ConfigurationError, FailureCategory -from .models import CycleResult, CycleStatus, Paper, PerUserMatches, ProbeHit +from .errors import ConfigurationError, FailureCategory, IndexRefreshError +from .models import CycleResult, CycleStatus, IndexRefreshResult, Paper, PerUserMatches, ProbeHit from .protocols import ( SOURCE_ISO_PROBE, SOURCE_OPEN_STD, @@ -187,6 +187,7 @@ def __init__( self._last_cycle_status: CycleStatus | None = None self._last_cycle_error: str | None = None self._last_ops_alert: float | None = None + self._index_stale = False self._health_lock = threading.Lock() self._health_snapshot: SchedulerSnapshot | None = None @@ -233,12 +234,24 @@ def _log_index_diff(self, diff: DiffResult) -> None: async def _poll_sources(self, *, baseline: bool = False) -> tuple[DiffResult, list[ProbeHit]]: diff = DiffResult(new_papers=[], updated_papers=[]) probe_hits: list[ProbeHit] = [] + self._index_stale = False for source in self.sources: if not self._source_enabled(source.source_id): continue - current = await source.fetch() + raw = await source.fetch() + if source.source_id == SOURCE_WG21_INDEX: + refresh_result = cast(IndexRefreshResult, raw) + current = refresh_result.papers + if refresh_result.stale: + log.warning( + "INDEX-STALE papers=%d (poll using persisted cache)", + len(current), + ) + self._index_stale = True + else: + current = raw if baseline: self._snapshots[source.source_id] = current if source.source_id == SOURCE_ISO_PROBE: @@ -253,8 +266,7 @@ async def _poll_sources(self, *, baseline: bool = False) -> tuple[DiffResult, li if source.source_id == SOURCE_WG21_INDEX: diff = result - papers = cast(dict[str, Paper], current) - log.info("INDEX-LOAD papers=%d", len(papers)) + log.info("INDEX-LOAD papers=%d", len(current)) self._log_index_diff(diff) elif source.source_id == SOURCE_ISO_PROBE: cycle = cast(CycleResult, current) @@ -293,6 +305,15 @@ def _mark_poll_successful_if_probe_ok(self) -> None: if self._last_cycle_status is not CycleStatus.FAILED: self._last_successful_poll = time.time() + def _advance_staleness_clock_if_ok(self) -> None: + """Advance staleness clock when index is fresh and probe cycle did not fail.""" + if self._index_stale: + return + if self.cfg.enable_iso_probe: + self._mark_poll_successful_if_probe_ok() + else: + self._last_successful_poll = time.time() + def _publish_health_snapshot(self) -> None: """Publish immutable snapshot for cross-thread health reads (event loop only).""" lsp = self._last_successful_poll @@ -371,11 +392,8 @@ async def poll_once(self) -> PollResult: if not self._seeded: seed_result = await self.seed() if not seed_result.had_prior_state: - if self.cfg.enable_iso_probe: - # Stats already recorded in seed() after run_cycle. - self._mark_poll_successful_if_probe_ok() - else: - self._last_successful_poll = time.time() + self._advance_staleness_clock_if_ok() + if not self.cfg.enable_iso_probe: self._record_probe_cycle_completion() self._publish_health_snapshot() return PollResult( @@ -415,11 +433,8 @@ async def poll_once(self) -> PollResult: ) if self.notify_callback: self.notify_callback(result) - if self.cfg.enable_iso_probe: - # Stats already recorded in seed() after run_cycle. - self._mark_poll_successful_if_probe_ok() - else: - self._last_successful_poll = time.time() + self._advance_staleness_clock_if_ok() + if not self.cfg.enable_iso_probe: self._record_probe_cycle_completion() self._publish_health_snapshot() return result @@ -512,11 +527,8 @@ async def poll_once(self) -> PollResult: len(dp_transitions), len(per_user_matches), ) - if self.cfg.enable_iso_probe: - # Stats already recorded above after run_cycle. - self._mark_poll_successful_if_probe_ok() - else: - self._last_successful_poll = time.time() + self._advance_staleness_clock_if_ok() + if not self.cfg.enable_iso_probe: self._record_probe_cycle_completion() self._publish_health_snapshot() return result @@ -581,6 +593,13 @@ async def run_forever(self, shutdown_event: asyncio.Event | None = None) -> None exc, ) return + except IndexRefreshError as exc: + log.error( + "POLL-ERROR failure_category=%s poll=%d %s", + exc.category.value, + self._poll_count, + exc, + ) except httpx.TimeoutException as exc: log.error( "POLL-ERROR failure_category=%s poll=%d %s", diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index e9475b7..31d6cec 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -17,8 +17,8 @@ import httpx from .config import Settings, settings -from .errors import FailureCategory -from .models import CycleResult, CycleStatus, Paper, ProbeHit, Tier +from .errors import ConfigurationError, FailureCategory, IndexRefreshError +from .models import CycleResult, CycleStatus, IndexRefreshResult, Paper, ProbeHit, Tier from .protocols import SOURCE_ISO_PROBE, SOURCE_OPEN_STD, SOURCE_WG21_INDEX from .storage import PaperCache, ProbeState, UserWatchlist @@ -54,20 +54,20 @@ def __init__(self, pool, cfg: Settings | None = None): self._max_p: int = 0 # absolute highest P-number self._sorted_p_nums: list[int] = [] # sorted unique P-numbers, for gap analysis - async def refresh(self) -> dict[str, Paper]: + async def refresh(self) -> IndexRefreshResult: """Load index from cache or network; populate ``self.papers``.""" cached = self._cache.read_if_fresh() if cached is not None: log.info("Loaded %d entries from cache", len(cached)) self.papers = self._parse_and_index(cached) - return self.papers + return IndexRefreshResult(self.papers, stale=False) - raw = await self._download() + raw, category = await self._download() if raw is not None: self._cache.write(raw) log.info("Downloaded and cached %d entries", len(raw)) self.papers = self._parse_and_index(raw) - return self.papers + return IndexRefreshResult(self.papers, stale=False) stale = self._cache.read() if stale is not None: @@ -76,12 +76,17 @@ async def refresh(self) -> dict[str, Paper]: len(stale), ) self.papers = self._parse_and_index(stale) - return self.papers + return IndexRefreshResult(self.papers, stale=True) log.error("No index data available") - return self.papers + self._raise_on_download_failure(category or FailureCategory.CONFIGURATION) - async def _download(self) -> dict | None: + def _raise_on_download_failure(self, category: FailureCategory) -> None: + if category is FailureCategory.CONFIGURATION: + raise ConfigurationError("No index data available") + raise IndexRefreshError(category=category) + + async def _download(self) -> tuple[dict | None, FailureCategory | None]: timeout = httpx.Timeout(self._cfg.wg21_index_timeout_s) try: async with httpx.AsyncClient( @@ -93,9 +98,13 @@ async def _download(self) -> dict | None: resp.raise_for_status() data = resp.json() if isinstance(data, dict): - return data - log.warning("Index response is not a dict") - return None + return data, None + log.warning( + "INDEX-FETCH failure_category=%s url=%s response is not a dict", + FailureCategory.CONFIGURATION.value, + WG21_INDEX_URL, + ) + return None, FailureCategory.CONFIGURATION except httpx.TimeoutException as exc: log.warning( "INDEX-FETCH failure_category=%s url=%s %s", @@ -103,7 +112,7 @@ async def _download(self) -> dict | None: WG21_INDEX_URL, exc, ) - return None + return None, FailureCategory.TIMEOUT except httpx.HTTPStatusError as exc: cat = ( FailureCategory.RATE_LIMIT @@ -116,7 +125,7 @@ async def _download(self) -> dict | None: WG21_INDEX_URL, exc.response.status_code, ) - return None + return None, cat except (httpx.HTTPError, ValueError) as exc: log.error( "INDEX-FETCH failure_category=%s url=%s %s", @@ -124,7 +133,7 @@ async def _download(self) -> dict | None: WG21_INDEX_URL, exc, ) - return None + return None, FailureCategory.NETWORK def _parse_and_index(self, raw: dict) -> dict[str, Paper]: papers: dict[str, Paper] = {} @@ -188,8 +197,8 @@ def get_papers_snapshot(self) -> Mapping[str, Paper]: """Read-only mapping copy of ``papers`` (not the live dict object).""" return MappingProxyType(dict(self.papers)) - async def fetch(self) -> dict[str, Paper]: - """DataSource: load index and return the current paper map.""" + async def fetch(self) -> IndexRefreshResult: + """DataSource: load index and return papers plus stale signal.""" return await self.refresh() def diff( diff --git a/tests/test_callback_protocols.py b/tests/test_callback_protocols.py index bca724e..7a9e3bc 100644 --- a/tests/test_callback_protocols.py +++ b/tests/test_callback_protocols.py @@ -8,7 +8,7 @@ from pathlib import Path from unittest.mock import AsyncMock, MagicMock -from paperscout.models import CycleResult, CycleStatus +from paperscout.models import CycleResult, CycleStatus, IndexRefreshResult from paperscout.monitor import DiffResult, PollResult, Scheduler from paperscout.protocols import SOURCE_ISO_PROBE, SOURCE_WG21_INDEX, NotifyCallback, OpsAlertFn from paperscout.sources import ISOProber, WG21Index @@ -45,7 +45,7 @@ def on_ops_alert(message: str) -> None: wg21 = MagicMock(spec=WG21Index) wg21.source_id = SOURCE_WG21_INDEX - wg21.fetch = AsyncMock(return_value={}) + wg21.fetch = AsyncMock(return_value=IndexRefreshResult({}, stale=False)) wg21.diff = MagicMock(return_value=DiffResult([], [])) iso = MagicMock(spec=ISOProber) diff --git a/tests/test_datasource_protocol.py b/tests/test_datasource_protocol.py index 8f15021..cd84fcc 100644 --- a/tests/test_datasource_protocol.py +++ b/tests/test_datasource_protocol.py @@ -6,7 +6,7 @@ from typing import Any from unittest.mock import AsyncMock, MagicMock -from paperscout.models import CycleResult, CycleStatus, Paper +from paperscout.models import CycleResult, CycleStatus, IndexRefreshResult, Paper from paperscout.monitor import DiffResult, Scheduler from paperscout.protocols import SOURCE_ISO_PROBE, SOURCE_WG21_INDEX, DataSource from paperscout.sources import ISOProber, WG21Index @@ -67,7 +67,7 @@ async def test_scheduler_polls_custom_datasource(self, fake_pool): ) wg21 = MagicMock(spec=WG21Index) wg21.source_id = SOURCE_WG21_INDEX - wg21.fetch = AsyncMock(return_value={}) + wg21.fetch = AsyncMock(return_value=IndexRefreshResult({}, stale=False)) wg21.diff = MagicMock(return_value=DiffResult([], [])) iso = MagicMock(spec=ISOProber) diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 9711e1e..71fa6be 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -11,10 +11,11 @@ import httpx import pytest -from paperscout.errors import ConfigurationError +from paperscout.errors import ConfigurationError, FailureCategory, IndexRefreshError from paperscout.models import ( CycleResult, CycleStatus, + IndexRefreshResult, MatchReason, Paper, PerUserMatches, @@ -29,7 +30,7 @@ from paperscout.protocols import SOURCE_ISO_PROBE, SOURCE_WG21_INDEX from paperscout.sources import ISOProber, WG21Index from paperscout.storage import ProbeState, UserWatchlist -from tests.conftest import make_test_settings +from tests.conftest import SAMPLE_INDEX_DATA, make_test_settings def _wait_for_timeout(awaitable, timeout=None): @@ -211,7 +212,7 @@ def _make_mock_wg21() -> MagicMock: mock.papers = {} async def _fetch(): - return dict(mock.papers) + return IndexRefreshResult(dict(mock.papers), stale=False) mock.fetch = AsyncMock(side_effect=_fetch) mock.diff = lambda previous, current: diff_snapshots(previous or {}, current) @@ -260,6 +261,22 @@ def _make_scheduler(fake_pool, **cfg_overrides): return scheduler, wg21, iso, user_watchlist, state +def _make_scheduler_with_real_wg21(fake_pool, **cfg_overrides): + cfg = make_test_settings(**cfg_overrides) + wg21 = WG21Index(fake_pool, cfg=cfg) + iso = _make_mock_iso() + user_watchlist = MagicMock(spec=UserWatchlist) + user_watchlist.matches_for_users.return_value = {} + state = ProbeState(fake_pool) + scheduler = Scheduler( + sources=[wg21, iso], + user_watchlist=user_watchlist, + state=state, + cfg=cfg, + ) + return scheduler, wg21, iso, user_watchlist, state + + class TestScheduler: async def test_poll_once_seeds_on_first_call(self, fake_pool): scheduler, index, prober, _, _ = _make_scheduler(fake_pool) @@ -778,3 +795,82 @@ def capture_wait_for(awaitable, timeout=None): assert len(slept) == 1 assert slept[0] == pytest.approx(300.0) + + async def test_poll_once_propagates_index_refresh_error(self, fake_pool): + scheduler, wg21, _, _, _ = _make_scheduler_with_real_wg21(fake_pool, enable_iso_probe=False) + req = MagicMock() + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=httpx.ReadTimeout("timed out", request=req)) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with pytest.raises(IndexRefreshError) as exc_info: + await scheduler.poll_once() + assert exc_info.value.category is FailureCategory.TIMEOUT + assert wg21.papers == {} + + async def test_poll_once_uses_stale_index(self, fake_pool, caplog): + scheduler, wg21, prober, _, _ = _make_scheduler_with_real_wg21(fake_pool) + wg21._cache.write(SAMPLE_INDEX_DATA) + wg21._cache.ttl_seconds = 0 + req = MagicMock() + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=httpx.ReadTimeout("timed out", request=req)) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with caplog.at_level(logging.WARNING, logger="paperscout.monitor"): + await scheduler.poll_once() + assert "P2300R10" in wg21.papers + assert "INDEX-STALE" in caplog.text + assert scheduler._last_successful_poll is None + prober.fetch.assert_called_once() + + async def test_run_forever_continues_after_index_refresh_error(self, fake_pool, caplog): + scheduler, _, _, _, _ = _make_scheduler_with_real_wg21( + fake_pool, + enable_iso_probe=False, + poll_interval_minutes=0, + poll_overrun_cooldown_seconds=0, + ) + shutdown_event = asyncio.Event() + call_count = 0 + req = MagicMock() + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=httpx.ReadTimeout("timed out", request=req)) + real_poll_once = scheduler.poll_once + + async def counting_poll_once(): + nonlocal call_count + call_count += 1 + if call_count >= 2: + shutdown_event.set() + return await real_poll_once() + + scheduler.poll_once = counting_poll_once + with caplog.at_level(logging.ERROR, logger="paperscout.monitor"): + with patch("asyncio.wait_for", _wait_for_timeout): + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + await scheduler.run_forever(shutdown_event) + assert call_count == 2 + assert "failure_category=TIMEOUT" in caplog.text + + async def test_run_forever_halts_on_configuration_error_from_index(self, fake_pool, caplog): + scheduler, _, _, _, _ = _make_scheduler_with_real_wg21(fake_pool, enable_iso_probe=False) + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json = MagicMock(return_value=[1, 2, 3]) + mock_resp.raise_for_status = MagicMock() + mock_resp.headers = {} + mock_client = AsyncMock() + mock_client.get = AsyncMock(return_value=mock_resp) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with caplog.at_level(logging.CRITICAL, logger="paperscout.monitor"): + with patch("asyncio.sleep", AsyncMock()): + await scheduler.run_forever() + assert "failure_category=CONFIGURATION" in caplog.text + assert "POLL-FATAL" in caplog.text diff --git a/tests/test_sources.py b/tests/test_sources.py index b8a714e..e6fe06e 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -12,6 +12,7 @@ import pytest from paperscout.config import override_settings +from paperscout.errors import ConfigurationError, FailureCategory, IndexRefreshError from paperscout.models import CycleStatus, Paper from paperscout.sources import ( ISOProber, @@ -92,33 +93,41 @@ def _old_lm() -> datetime: class TestWG21Index: async def test_refresh_downloads_when_no_cache(self, fake_pool): index = WG21Index(fake_pool) - with patch.object(index, "_download", AsyncMock(return_value=SAMPLE_INDEX_DATA)): - papers = await index.refresh() - assert "P2300R10" in papers - assert "N4950" in papers + with patch.object(index, "_download", AsyncMock(return_value=(SAMPLE_INDEX_DATA, None))): + result = await index.refresh() + assert result.stale is False + assert "P2300R10" in result.papers + assert "N4950" in result.papers async def test_refresh_uses_cache_when_fresh(self, fake_pool): index = WG21Index(fake_pool) index._cache.write(SAMPLE_INDEX_DATA) mock_download = AsyncMock() with patch.object(index, "_download", mock_download): - papers = await index.refresh() + result = await index.refresh() mock_download.assert_not_called() - assert "P2300R10" in papers + assert result.stale is False + assert "P2300R10" in result.papers async def test_refresh_falls_back_to_stale_cache(self, fake_pool): index = WG21Index(fake_pool) index._cache.write(SAMPLE_INDEX_DATA) index._cache.ttl_seconds = 0 - with patch.object(index, "_download", AsyncMock(return_value=None)): - papers = await index.refresh() - assert "P2300R10" in papers - - async def test_refresh_returns_empty_when_no_data(self, fake_pool): + with patch.object( + index, "_download", AsyncMock(return_value=(None, FailureCategory.NETWORK)) + ): + result = await index.refresh() + assert result.stale is True + assert "P2300R10" in result.papers + + async def test_refresh_raises_on_transient_failure_when_no_data(self, fake_pool): index = WG21Index(fake_pool) - with patch.object(index, "_download", AsyncMock(return_value=None)): - papers = await index.refresh() - assert papers == {} + with patch.object( + index, "_download", AsyncMock(return_value=(None, FailureCategory.NETWORK)) + ): + with pytest.raises(IndexRefreshError) as exc_info: + await index.refresh() + assert exc_info.value.category is FailureCategory.NETWORK async def test_download_success(self, fake_pool): index = WG21Index(fake_pool) @@ -127,18 +136,22 @@ async def test_download_success(self, fake_pool): with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) - result = await index._download() - assert result == SAMPLE_INDEX_DATA + data, category = await index._download() + assert data == SAMPLE_INDEX_DATA + assert category is None - async def test_download_non_dict_response(self, fake_pool): + async def test_download_non_dict_response(self, fake_pool, caplog): index = WG21Index(fake_pool) mock_resp = _make_response(200, json_data=[1, 2, 3]) mock_client = _make_async_client(get_resp=mock_resp) with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) - result = await index._download() - assert result is None + with caplog.at_level(logging.WARNING, logger="paperscout.sources"): + data, category = await index._download() + assert data is None + assert category is FailureCategory.CONFIGURATION + assert "failure_category=CONFIGURATION" in caplog.text async def test_download_http_error(self, fake_pool, caplog): index = WG21Index(fake_pool) @@ -148,8 +161,9 @@ async def test_download_http_error(self, fake_pool, caplog): mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) with caplog.at_level(logging.ERROR, logger="paperscout.sources"): - result = await index._download() - assert result is None + data, category = await index._download() + assert data is None + assert category is FailureCategory.NETWORK assert "failure_category=NETWORK" in caplog.text async def test_download_http_status_429_emits_rate_limit(self, fake_pool, caplog): @@ -160,8 +174,9 @@ async def test_download_http_status_429_emits_rate_limit(self, fake_pool, caplog mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) with caplog.at_level(logging.ERROR, logger="paperscout.sources"): - result = await index._download() - assert result is None + data, category = await index._download() + assert data is None + assert category is FailureCategory.RATE_LIMIT assert "failure_category=RATE_LIMIT" in caplog.text async def test_download_http_status_500_emits_network(self, fake_pool, caplog): @@ -172,8 +187,9 @@ async def test_download_http_status_500_emits_network(self, fake_pool, caplog): mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) with caplog.at_level(logging.ERROR, logger="paperscout.sources"): - result = await index._download() - assert result is None + data, category = await index._download() + assert data is None + assert category is FailureCategory.NETWORK assert "failure_category=NETWORK" in caplog.text async def test_download_uses_wg21_index_timeout_from_settings(self, fake_pool): @@ -204,12 +220,57 @@ async def test_refresh_timeout_then_stale_fallback(self, fake_pool, caplog): mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) with caplog.at_level(logging.WARNING, logger="paperscout.sources"): - papers = await index.refresh() + result = await index.refresh() - assert "P2300R10" in papers + assert result.stale is True + assert "P2300R10" in result.papers assert "failure_category=TIMEOUT" in caplog.text assert "INDEX-STALE-FALLBACK" in caplog.text + async def test_refresh_timeout_raises_when_no_stale_cache(self, fake_pool): + index = WG21Index(fake_pool) + req = MagicMock() + mock_client = AsyncMock() + mock_client.get = AsyncMock(side_effect=httpx.ReadTimeout("timed out", request=req)) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with pytest.raises(IndexRefreshError) as exc_info: + await index.refresh() + assert exc_info.value.category is FailureCategory.TIMEOUT + + async def test_refresh_429_raises_rate_limit(self, fake_pool): + index = WG21Index(fake_pool) + mock_resp = _make_response(429, json_data={}) + mock_client = _make_async_client(get_resp=mock_resp) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with pytest.raises(IndexRefreshError) as exc_info: + await index.refresh() + assert exc_info.value.category is FailureCategory.RATE_LIMIT + + async def test_refresh_stale_fallback_sets_stale_signal(self, fake_pool): + index = WG21Index(fake_pool) + index._cache.write(SAMPLE_INDEX_DATA) + index._cache.ttl_seconds = 0 + with patch.object( + index, "_download", AsyncMock(return_value=(None, FailureCategory.TIMEOUT)) + ): + result = await index.refresh() + assert result.stale is True + assert len(result.papers) > 0 + + async def test_refresh_no_data_raises_configuration_error(self, fake_pool): + index = WG21Index(fake_pool) + mock_resp = _make_response(200, json_data=[1, 2, 3]) + mock_client = _make_async_client(get_resp=mock_resp) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with pytest.raises(ConfigurationError, match="No index data available"): + await index.refresh() + def test_parse_and_index(self, fake_pool): index = WG21Index(fake_pool) papers = index._parse_and_index(SAMPLE_INDEX_DATA) From f34047b8457c054fd875c2f801b0ecea8fad5ce8 Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 02:56:47 +0800 Subject: [PATCH 2/8] addressed ai reviews --- src/paperscout/sources.py | 46 ++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 31d6cec..47ea0f1 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -98,6 +98,13 @@ async def _download(self) -> tuple[dict | None, FailureCategory | None]: resp.raise_for_status() data = resp.json() if isinstance(data, dict): + if not data: + log.warning( + "INDEX-FETCH failure_category=%s url=%s empty index payload", + FailureCategory.CONFIGURATION.value, + WG21_INDEX_URL, + ) + return None, FailureCategory.CONFIGURATION return data, None log.warning( "INDEX-FETCH failure_category=%s url=%s response is not a dict", @@ -371,7 +378,9 @@ async def run_cycle(self) -> CycleResult: try: urls = self._build_probe_list() known_ids = self.index.get_known_paper_ids() - hot_count = sum(1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT)) + hot_count = sum( + 1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT) + ) cold_count = sum(1 for u in urls if u[1] == Tier.COLD) slice_idx = (self._cycle - 1) % self.cfg.cold_cycle_divisor log.info( @@ -393,7 +402,9 @@ async def run_cycle(self) -> CycleResult: follow_redirects=True, ) as client: tasks = [ - self._probe_one(client, sem, url, prefix, num, rev, ext, tier, known_ids) + self._probe_one( + client, sem, url, prefix, num, rev, ext, tier, known_ids + ) for url, tier, prefix, num, rev, ext in urls ] results = await asyncio.gather(*tasks, return_exceptions=True) @@ -484,9 +495,9 @@ def _build_probe_list(self) -> list[_Entry]: extra_p_numbers=self.state.paper_nums_from_discovered_iso_urls(), ) hot_known, hot_unknown = self._hot_numbers(frontier) - return self._build_hot_list(frontier, hot_known, hot_unknown) + self._build_cold_slice( - self._cycle, frontier, hot_known, hot_unknown - ) + return self._build_hot_list( + frontier, hot_known, hot_unknown + ) + self._build_cold_slice(self._cycle, frontier, hot_known, hot_unknown) def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: """Return (known_hot, unknown_hot) P-number sets to probe every cycle.""" @@ -504,9 +515,16 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: # Recently active papers if self.cfg.hot_lookback_months > 0: - cutoff = date.today() - timedelta(days=int(self.cfg.hot_lookback_months * 30.44)) + cutoff = date.today() - timedelta( + days=int(self.cfg.hot_lookback_months * 30.44) + ) for p in self.index.get_papers_snapshot().values(): - if p.prefix != "P" or p.number is None or not p.date or p.date == "unknown": + if ( + p.prefix != "P" + or p.number is None + or not p.date + or p.date == "unknown" + ): continue try: if date.fromisoformat(p.date[:10]) >= cutoff: @@ -517,7 +535,9 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: known_p_nums = self.index.known_p_numbers() return hot & known_p_nums, hot - known_p_nums - def _tier_label(self, num: int, watchlist_set: set[int], frontier_range: set[int]) -> Tier: + def _tier_label( + self, num: int, watchlist_set: set[int], frontier_range: set[int] + ) -> Tier: if num in watchlist_set: return Tier.WATCHLIST if num in frontier_range: @@ -665,7 +685,9 @@ async def _probe_one( if last_modified.tzinfo is None: last_modified = last_modified.replace(tzinfo=timezone.utc) threshold = timedelta(hours=self.cfg.alert_modified_hours) - is_recent = (datetime.now(timezone.utc) - last_modified) <= threshold + is_recent = ( + datetime.now(timezone.utc) - last_modified + ) <= threshold except (TypeError, ValueError): # Bad Last-Modified: merge with no-LM so we don't silently drop hits. last_modified = None @@ -675,7 +697,11 @@ async def _probe_one( # file; treat as recent so we don't silently drop it. is_recent = True - lm_display = last_modified.strftime("%Y-%m-%d %H:%M UTC") if last_modified else "no-lm" + lm_display = ( + last_modified.strftime("%Y-%m-%d %H:%M UTC") + if last_modified + else "no-lm" + ) log.info( "HIT tier=%-10s recent=%-5s lm=%-20s %s", tier, From fb2e8358fc5d0fc8b4c9af029b9edbdcaad392cb Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 02:57:30 +0800 Subject: [PATCH 3/8] fixed lint error --- src/paperscout/sources.py | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 47ea0f1..6d662d7 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -378,9 +378,7 @@ async def run_cycle(self) -> CycleResult: try: urls = self._build_probe_list() known_ids = self.index.get_known_paper_ids() - hot_count = sum( - 1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT) - ) + hot_count = sum(1 for u in urls if u[1] in (Tier.WATCHLIST, Tier.FRONTIER, Tier.RECENT)) cold_count = sum(1 for u in urls if u[1] == Tier.COLD) slice_idx = (self._cycle - 1) % self.cfg.cold_cycle_divisor log.info( @@ -402,9 +400,7 @@ async def run_cycle(self) -> CycleResult: follow_redirects=True, ) as client: tasks = [ - self._probe_one( - client, sem, url, prefix, num, rev, ext, tier, known_ids - ) + self._probe_one(client, sem, url, prefix, num, rev, ext, tier, known_ids) for url, tier, prefix, num, rev, ext in urls ] results = await asyncio.gather(*tasks, return_exceptions=True) @@ -495,9 +491,9 @@ def _build_probe_list(self) -> list[_Entry]: extra_p_numbers=self.state.paper_nums_from_discovered_iso_urls(), ) hot_known, hot_unknown = self._hot_numbers(frontier) - return self._build_hot_list( - frontier, hot_known, hot_unknown - ) + self._build_cold_slice(self._cycle, frontier, hot_known, hot_unknown) + return self._build_hot_list(frontier, hot_known, hot_unknown) + self._build_cold_slice( + self._cycle, frontier, hot_known, hot_unknown + ) def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: """Return (known_hot, unknown_hot) P-number sets to probe every cycle.""" @@ -515,16 +511,9 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: # Recently active papers if self.cfg.hot_lookback_months > 0: - cutoff = date.today() - timedelta( - days=int(self.cfg.hot_lookback_months * 30.44) - ) + cutoff = date.today() - timedelta(days=int(self.cfg.hot_lookback_months * 30.44)) for p in self.index.get_papers_snapshot().values(): - if ( - p.prefix != "P" - or p.number is None - or not p.date - or p.date == "unknown" - ): + if p.prefix != "P" or p.number is None or not p.date or p.date == "unknown": continue try: if date.fromisoformat(p.date[:10]) >= cutoff: @@ -535,9 +524,7 @@ def _hot_numbers(self, frontier: int) -> tuple[set[int], set[int]]: known_p_nums = self.index.known_p_numbers() return hot & known_p_nums, hot - known_p_nums - def _tier_label( - self, num: int, watchlist_set: set[int], frontier_range: set[int] - ) -> Tier: + def _tier_label(self, num: int, watchlist_set: set[int], frontier_range: set[int]) -> Tier: if num in watchlist_set: return Tier.WATCHLIST if num in frontier_range: @@ -685,9 +672,7 @@ async def _probe_one( if last_modified.tzinfo is None: last_modified = last_modified.replace(tzinfo=timezone.utc) threshold = timedelta(hours=self.cfg.alert_modified_hours) - is_recent = ( - datetime.now(timezone.utc) - last_modified - ) <= threshold + is_recent = (datetime.now(timezone.utc) - last_modified) <= threshold except (TypeError, ValueError): # Bad Last-Modified: merge with no-LM so we don't silently drop hits. last_modified = None @@ -697,11 +682,7 @@ async def _probe_one( # file; treat as recent so we don't silently drop it. is_recent = True - lm_display = ( - last_modified.strftime("%Y-%m-%d %H:%M UTC") - if last_modified - else "no-lm" - ) + lm_display = last_modified.strftime("%Y-%m-%d %H:%M UTC") if last_modified else "no-lm" log.info( "HIT tier=%-10s recent=%-5s lm=%-20s %s", tier, From bb04d777de73b0095e8f0ff2c86d3e32c2686100 Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 22:07:29 +0800 Subject: [PATCH 4/8] addressed ai reviews --- src/paperscout/sources.py | 16 ++++++++++++---- tests/test_monitor.py | 2 +- tests/test_sources.py | 24 ++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 6d662d7..62282e4 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -57,20 +57,20 @@ def __init__(self, pool, cfg: Settings | None = None): async def refresh(self) -> IndexRefreshResult: """Load index from cache or network; populate ``self.papers``.""" cached = self._cache.read_if_fresh() - if cached is not None: + if cached: log.info("Loaded %d entries from cache", len(cached)) self.papers = self._parse_and_index(cached) return IndexRefreshResult(self.papers, stale=False) raw, category = await self._download() - if raw is not None: + if raw: self._cache.write(raw) log.info("Downloaded and cached %d entries", len(raw)) self.papers = self._parse_and_index(raw) return IndexRefreshResult(self.papers, stale=False) stale = self._cache.read() - if stale is not None: + if stale: log.warning( "INDEX-STALE-FALLBACK entries=%d (download failed; using persisted cache)", len(stale), @@ -133,7 +133,7 @@ async def _download(self) -> tuple[dict | None, FailureCategory | None]: exc.response.status_code, ) return None, cat - except (httpx.HTTPError, ValueError) as exc: + except httpx.HTTPError as exc: log.error( "INDEX-FETCH failure_category=%s url=%s %s", FailureCategory.NETWORK.value, @@ -141,6 +141,14 @@ async def _download(self) -> tuple[dict | None, FailureCategory | None]: exc, ) return None, FailureCategory.NETWORK + except ValueError as exc: + log.error( + "INDEX-FETCH failure_category=%s url=%s %s", + FailureCategory.CONFIGURATION.value, + WG21_INDEX_URL, + exc, + ) + return None, FailureCategory.CONFIGURATION def _parse_and_index(self, raw: dict) -> dict[str, Paper]: papers: dict[str, Paper] = {} diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 71fa6be..126e424 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -870,7 +870,7 @@ async def test_run_forever_halts_on_configuration_error_from_index(self, fake_po mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) with caplog.at_level(logging.CRITICAL, logger="paperscout.monitor"): - with patch("asyncio.sleep", AsyncMock()): + with patch("asyncio.sleep", AsyncMock(side_effect=AssertionError("sleep reached"))): await scheduler.run_forever() assert "failure_category=CONFIGURATION" in caplog.text assert "POLL-FATAL" in caplog.text diff --git a/tests/test_sources.py b/tests/test_sources.py index e6fe06e..116ce34 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -271,6 +271,30 @@ async def test_refresh_no_data_raises_configuration_error(self, fake_pool): with pytest.raises(ConfigurationError, match="No index data available"): await index.refresh() + async def test_refresh_empty_cache_does_not_return_fresh_success(self, fake_pool): + index = WG21Index(fake_pool) + index._cache.write({}) + with patch.object( + index, "_download", AsyncMock(return_value=(None, FailureCategory.NETWORK)) + ): + with pytest.raises(IndexRefreshError) as exc_info: + await index.refresh() + assert exc_info.value.category is FailureCategory.NETWORK + + async def test_download_json_parse_error_emits_configuration(self, fake_pool, caplog): + index = WG21Index(fake_pool) + mock_resp = _make_response(200) + mock_resp.json = MagicMock(side_effect=ValueError("bad json")) + mock_client = _make_async_client(get_resp=mock_resp) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with caplog.at_level(logging.ERROR, logger="paperscout.sources"): + data, category = await index._download() + assert data is None + assert category is FailureCategory.CONFIGURATION + assert "failure_category=CONFIGURATION" in caplog.text + def test_parse_and_index(self, fake_pool): index = WG21Index(fake_pool) papers = index._parse_and_index(SAMPLE_INDEX_DATA) From afc6ef2094cf83f79f7f6263f5d0e15dce15c34f Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 22:18:52 +0800 Subject: [PATCH 5/8] _parse_and_index() can return {} when the raw dict has keys but no valid entry dicts, so len(cached) > 0 / if raw: was not enough to guarantee usable index data. --- src/paperscout/sources.py | 38 +++++++++++++++++++++++++------------- tests/test_sources.py | 23 +++++++++++++++++++++++ 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 62282e4..f00f227 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -58,25 +58,37 @@ async def refresh(self) -> IndexRefreshResult: """Load index from cache or network; populate ``self.papers``.""" cached = self._cache.read_if_fresh() if cached: - log.info("Loaded %d entries from cache", len(cached)) - self.papers = self._parse_and_index(cached) - return IndexRefreshResult(self.papers, stale=False) + papers = self._parse_and_index(cached) + if papers: + self.papers = papers + log.info("Loaded %d entries from cache", len(papers)) + return IndexRefreshResult(self.papers, stale=False) raw, category = await self._download() if raw: - self._cache.write(raw) - log.info("Downloaded and cached %d entries", len(raw)) - self.papers = self._parse_and_index(raw) - return IndexRefreshResult(self.papers, stale=False) + papers = self._parse_and_index(raw) + if papers: + self._cache.write(raw) + self.papers = papers + log.info("Downloaded and cached %d entries", len(papers)) + return IndexRefreshResult(self.papers, stale=False) + log.warning( + "INDEX-FETCH failure_category=%s url=%s payload produced zero parseable papers", + FailureCategory.CONFIGURATION.value, + WG21_INDEX_URL, + ) + category = FailureCategory.CONFIGURATION stale = self._cache.read() if stale: - log.warning( - "INDEX-STALE-FALLBACK entries=%d (download failed; using persisted cache)", - len(stale), - ) - self.papers = self._parse_and_index(stale) - return IndexRefreshResult(self.papers, stale=True) + papers = self._parse_and_index(stale) + if papers: + self.papers = papers + log.warning( + "INDEX-STALE-FALLBACK entries=%d (download failed; using persisted cache)", + len(papers), + ) + return IndexRefreshResult(self.papers, stale=True) log.error("No index data available") self._raise_on_download_failure(category or FailureCategory.CONFIGURATION) diff --git a/tests/test_sources.py b/tests/test_sources.py index 116ce34..63ee5b5 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -281,6 +281,29 @@ async def test_refresh_empty_cache_does_not_return_fresh_success(self, fake_pool await index.refresh() assert exc_info.value.category is FailureCategory.NETWORK + async def test_refresh_unparseable_fresh_cache_falls_through(self, fake_pool): + index = WG21Index(fake_pool) + index._cache.write({"bad": "not-a-dict", "also": 1}) + with patch.object( + index, "_download", AsyncMock(return_value=(None, FailureCategory.NETWORK)) + ): + with pytest.raises(IndexRefreshError) as exc_info: + await index.refresh() + assert exc_info.value.category is FailureCategory.NETWORK + + async def test_refresh_unparseable_download_preserves_stale_cache(self, fake_pool): + index = WG21Index(fake_pool) + index._cache.write(SAMPLE_INDEX_DATA) + index._cache.ttl_seconds = 0 + with patch.object( + index, + "_download", + AsyncMock(return_value=({"bad": "not-a-dict"}, None)), + ): + result = await index.refresh() + assert result.stale is True + assert "P2300R10" in result.papers + async def test_download_json_parse_error_emits_configuration(self, fake_pool, caplog): index = WG21Index(fake_pool) mock_resp = _make_response(200) From 505dacbcf98cc901e1377b24b12c216d4eff74ac Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 22:41:43 +0800 Subject: [PATCH 6/8] =?UTF-8?q?refresh()=20called=20=5Fparse=5Fand=5Findex?= =?UTF-8?q?()=20while=20probing=20candidates,=20which=20mutated=20=5Fmax?= =?UTF-8?q?=5Frev,=20=5Fmax=5Fp,=20and=20=5Fsorted=5Fp=5Fnums=20before=20a?= =?UTF-8?q?cceptance=20=E2=80=94=20leaving=20frontier=20metadata=20out=20o?= =?UTF-8?q?f=20sync=20with=20self.papers=20when=20a=20payload=20was=20reje?= =?UTF-8?q?cted.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/paperscout/sources.py | 24 +++++++++++++++++------- tests/test_sources.py | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index f00f227..870ea21 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -58,18 +58,18 @@ async def refresh(self) -> IndexRefreshResult: """Load index from cache or network; populate ``self.papers``.""" cached = self._cache.read_if_fresh() if cached: - papers = self._parse_and_index(cached) + papers, max_rev, max_p = self._parse_raw(cached) if papers: - self.papers = papers + self._commit_index(papers, max_rev, max_p) log.info("Loaded %d entries from cache", len(papers)) return IndexRefreshResult(self.papers, stale=False) raw, category = await self._download() if raw: - papers = self._parse_and_index(raw) + papers, max_rev, max_p = self._parse_raw(raw) if papers: self._cache.write(raw) - self.papers = papers + self._commit_index(papers, max_rev, max_p) log.info("Downloaded and cached %d entries", len(papers)) return IndexRefreshResult(self.papers, stale=False) log.warning( @@ -81,9 +81,9 @@ async def refresh(self) -> IndexRefreshResult: stale = self._cache.read() if stale: - papers = self._parse_and_index(stale) + papers, max_rev, max_p = self._parse_raw(stale) if papers: - self.papers = papers + self._commit_index(papers, max_rev, max_p) log.warning( "INDEX-STALE-FALLBACK entries=%d (download failed; using persisted cache)", len(papers), @@ -162,7 +162,7 @@ async def _download(self) -> tuple[dict | None, FailureCategory | None]: ) return None, FailureCategory.CONFIGURATION - def _parse_and_index(self, raw: dict) -> dict[str, Paper]: + def _parse_raw(self, raw: dict) -> tuple[dict[str, Paper], dict[int, int], int]: papers: dict[str, Paper] = {} max_rev: dict[int, int] = {} max_p = 0 @@ -177,9 +177,19 @@ def _parse_and_index(self, raw: dict) -> dict[str, Paper]: prev = max_rev.get(paper.number, -1) if paper.revision > prev: max_rev[paper.number] = paper.revision + return papers, max_rev, max_p + + def _commit_index( + self, papers: dict[str, Paper], max_rev: dict[int, int], max_p: int + ) -> None: + self.papers = papers self._max_rev = max_rev self._max_p = max_p self._sorted_p_nums = sorted(max_rev.keys()) + + def _parse_and_index(self, raw: dict) -> dict[str, Paper]: + papers, max_rev, max_p = self._parse_raw(raw) + self._commit_index(papers, max_rev, max_p) return papers def highest_p_number(self) -> int: diff --git a/tests/test_sources.py b/tests/test_sources.py index 63ee5b5..b5b77d5 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -304,6 +304,20 @@ async def test_refresh_unparseable_download_preserves_stale_cache(self, fake_poo assert result.stale is True assert "P2300R10" in result.papers + async def test_refresh_rejected_parse_preserves_index_metadata(self, fake_pool): + index = WG21Index(fake_pool) + index._parse_and_index(SAMPLE_INDEX_DATA) + original_max_p = index._max_p + original_papers = dict(index.papers) + index._cache.write({"bad": "not-a-dict"}) + with patch.object( + index, "_download", AsyncMock(return_value=(None, FailureCategory.NETWORK)) + ): + with pytest.raises(IndexRefreshError): + await index.refresh() + assert index._max_p == original_max_p + assert index.papers == original_papers + async def test_download_json_parse_error_emits_configuration(self, fake_pool, caplog): index = WG21Index(fake_pool) mock_resp = _make_response(200) From 262a8ce88256f99a60e82d2cd9ef18f9b33f7cc5 Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 22:42:24 +0800 Subject: [PATCH 7/8] fixed lint error --- src/paperscout/sources.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/paperscout/sources.py b/src/paperscout/sources.py index 870ea21..501dd26 100644 --- a/src/paperscout/sources.py +++ b/src/paperscout/sources.py @@ -179,9 +179,7 @@ def _parse_raw(self, raw: dict) -> tuple[dict[str, Paper], dict[int, int], int]: max_rev[paper.number] = paper.revision return papers, max_rev, max_p - def _commit_index( - self, papers: dict[str, Paper], max_rev: dict[int, int], max_p: int - ) -> None: + def _commit_index(self, papers: dict[str, Paper], max_rev: dict[int, int], max_p: int) -> None: self.papers = papers self._max_rev = max_rev self._max_p = max_p From b6170d14bac671e3fd2840d43bbec5a0c434d6cc Mon Sep 17 00:00:00 2001 From: mac Date: Wed, 24 Jun 2026 22:53:43 +0800 Subject: [PATCH 8/8] update tests - Added a second await scheduler.poll_once() after resetting the ISO prober mock, asserting _last_successful_poll stays None on the incremental poll path. - Added test_refresh_malformed_json_raises_configuration_error asserting refresh() raises ConfigurationError("No index data available") when resp.json() raises ValueError with no cache. --- tests/test_monitor.py | 13 +++++++++---- tests/test_sources.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 126e424..fe68369 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -821,10 +821,15 @@ async def test_poll_once_uses_stale_index(self, fake_pool, caplog): mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) with caplog.at_level(logging.WARNING, logger="paperscout.monitor"): await scheduler.poll_once() - assert "P2300R10" in wg21.papers - assert "INDEX-STALE" in caplog.text - assert scheduler._last_successful_poll is None - prober.fetch.assert_called_once() + assert "P2300R10" in wg21.papers + assert "INDEX-STALE" in caplog.text + assert scheduler._last_successful_poll is None + prober.fetch.assert_called_once() + + prober.fetch.reset_mock() + await scheduler.poll_once() + assert scheduler._last_successful_poll is None + prober.fetch.assert_called_once() async def test_run_forever_continues_after_index_refresh_error(self, fake_pool, caplog): scheduler, _, _, _, _ = _make_scheduler_with_real_wg21( diff --git a/tests/test_sources.py b/tests/test_sources.py index b5b77d5..43756ba 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -332,6 +332,17 @@ async def test_download_json_parse_error_emits_configuration(self, fake_pool, ca assert category is FailureCategory.CONFIGURATION assert "failure_category=CONFIGURATION" in caplog.text + async def test_refresh_malformed_json_raises_configuration_error(self, fake_pool): + index = WG21Index(fake_pool) + mock_resp = _make_response(200) + mock_resp.json = MagicMock(side_effect=ValueError("bad json")) + mock_client = _make_async_client(get_resp=mock_resp) + with patch("paperscout.sources.httpx.AsyncClient") as mock_cls: + mock_cls.return_value.__aenter__ = AsyncMock(return_value=mock_client) + mock_cls.return_value.__aexit__ = AsyncMock(return_value=False) + with pytest.raises(ConfigurationError, match="No index data available"): + await index.refresh() + def test_parse_and_index(self, fake_pool): index = WG21Index(fake_pool) papers = index._parse_and_index(SAMPLE_INDEX_DATA)