trpc-group · woshidage77 · Jul 1, 2026
diff --git a/examples/optimization/eval_optimize_loop/.gitignore b/examples/optimization/eval_optimize_loop/.gitignore
@@ -0,0 +1,4 @@
+output/
+__pycache__/
+.pytest_cache/
+*.pyc
diff --git a/examples/optimization/eval_optimize_loop/config/optimizer.json b/examples/optimization/eval_optimize_loop/config/optimizer.json
@@ -0,0 +1,87 @@
+{
+    "_description": "Evaluation + Optimization 自动回归闭环配置",
+    "pipeline": {
+        "name": "PlateAgent Eval-Optimize Loop",
+        "version": "1.0.0",
+        "max_iterations": 5,
+        "random_seed": 42
+    },
+    "gate": {
+        "rules": {
+            "total_score_improvement": {
+                "enabled": true,
+                "threshold": 0.03,
+                "description": "验证集总分提升 ≥ 3%"
+            },
+            "no_new_hard_fail": {
+                "enabled": true,
+                "max_new_fails": 0,
+                "description": "不允许新增 hard fail"
+            },
+            "critical_case_no_regress": {
+                "enabled": true,
+                "critical_case_ids": [],
+                "description": "关键 case 不退步"
+            },
+            "cost_within_budget": {
+                "enabled": true,
+                "max_cost_ratio": 1.2,
+                "description": "成本不超过 baseline 的 120%"
+            },
+            "overfit_detection": {
+                "enabled": true,
+                "description": "训练集提升 + 验证集退化 → 拒绝候选"
+            }
+        },
+        "acceptance_strategy": "all_must_pass",
+        "description": "all_must_pass: 所有启用的规则都通过才接受; majority: 多数通过即可"
+    },
+    "attribution": {
+        "categories": [
+            "final_answer_mismatch",
+            "tool_call_error",
+            "param_error",
+            "llm_rubric_fail",
+            "knowledge_recall_insufficient",
+            "format_invalid"
+        ],
+        "rules": {
+            "final_answer_mismatch": {
+                "trigger": "predicted != ground_truth",
+                "priority": 1
+            },
+            "tool_call_error": {
+                "trigger": "tool execution failed or timeout",
+                "priority": 2
+            },
+            "param_error": {
+                "trigger": "tool parameter invalid",
+                "priority": 3
+            },
+            "llm_rubric_fail": {
+                "trigger": "LLM Judge score below threshold",
+                "threshold": 0.6,
+                "priority": 4
+            },
+            "knowledge_recall_insufficient": {
+                "trigger": "blacklist miss or confusion char not recalled",
+                "priority": 5
+            },
+            "format_invalid": {
+                "trigger": "output does not match expected JSON schema",
+                "priority": 6
+            }
+        }
+    },
+    "optimizer": {
+        "target_prompts": ["system_prompt", "skill_prompt"],
+        "strategy": "failure_driven",
+        "description": "根据归因结果，优先优化失败率最高的类别对应的 prompt 片段"
+    },
+    "output": {
+        "dir": "output",
+        "formats": ["json", "markdown"],
+        "retain_audit_trail": true,
+        "max_audit_entries": 50
+    }
+}
diff --git a/examples/optimization/eval_optimize_loop/config/train.evalset.json b/examples/optimization/eval_optimize_loop/config/train.evalset.json
@@ -0,0 +1,43 @@
+{
+  "_description": "Training evaluation set",
+  "version": "1.0.0",
+  "cases": [
+    {
+      "case_id": "train_001",
+      "image": "plate_001.jpg",
+      "ground_truth": "\u4eacA12345",
+      "conditions": {
+        "type": "clear"
+      },
+      "expected_behavior": "should_pass",
+      "description": "Clear plate"
+    },
+    {
+      "case_id": "train_002",
+      "image": "plate_028.jpg",
+      "ground_truth": "\u4eacA12345",
+      "conditions": {
+        "type": "noise",
+        "noise_level": 0.15
+      },
+      "expected_behavior": "may_fail",
+      "description": "Noisy plate"
+    },
+    {
+      "case_id": "train_003",
+      "image": "plate_012.jpg",
+      "ground_truth": "\u82cfA88U88",
+      "conditions": {
+        "type": "blur",
+        "blur_kernel": 5
+      },
+      "expected_behavior": "may_fail",
+      "description": "Blurred plate"
+    }
+  ],
+  "stats": {
+    "total": 3,
+    "should_pass": 1,
+    "may_fail": 2
+  }
+}
diff --git a/examples/optimization/eval_optimize_loop/config/val.evalset.json b/examples/optimization/eval_optimize_loop/config/val.evalset.json
@@ -0,0 +1,47 @@
+{
+  "_description": "Validation evaluation set",
+  "version": "1.0.0",
+  "cases": [
+    {
+      "case_id": "val_001",
+      "image": "plate_005.jpg",
+      "ground_truth": "\u7ca4B54321",
+      "conditions": {
+        "type": "clear"
+      },
+      "expected_behavior": "should_pass",
+      "critical": true,
+      "description": "Critical case"
+    },
+    {
+      "case_id": "val_002",
+      "image": "plate_029.jpg",
+      "ground_truth": "\u82cfD13579",
+      "conditions": {
+        "type": "noise",
+        "noise_level": 0.2
+      },
+      "expected_behavior": "should_fail_baseline",
+      "critical": false,
+      "description": "Noise + blacklist"
+    },
+    {
+      "case_id": "val_003",
+      "image": "plate_018.jpg",
+      "ground_truth": "\u6d59C36912",
+      "conditions": {
+        "type": "blur",
+        "blur_kernel": 7
+      },
+      "expected_behavior": "should_fail_baseline",
+      "critical": false,
+      "description": "Severe blur"
+    }
+  ],
+  "stats": {
+    "total": 3,
+    "should_pass": 1,
+    "should_fail_baseline": 2,
+    "critical": 1
+  }
+}
diff --git a/examples/optimization/eval_optimize_loop/fake/__init__.py b/examples/optimization/eval_optimize_loop/fake/__init__.py
@@ -0,0 +1,11 @@
+"""Public exports of the fake package."""
+from .fake_model import FakeLLM, FakeLLMResponse
+from .fake_judge import FakeJudge, JudgeResult, JudgeScore
+
+__all__ = [
+    "FakeLLM",
+    "FakeLLMResponse",
+    "FakeJudge",
+    "JudgeResult",
+    "JudgeScore",
+]
diff --git a/examples/optimization/eval_optimize_loop/fake/fake_judge.py b/examples/optimization/eval_optimize_loop/fake/fake_judge.py
@@ -0,0 +1,110 @@
+"""Fake Judge — 无 LLM API 调用下模拟评测打分。
+
+基于规则引擎（非 LLM）对预测结果和 ground truth 进行对比评分，
+输出与 LLMJudge 相同的数据结构，保证 pipeline 可无缝切换。
+
+三维评分均基于字符匹配率推导，模拟真实 LLM Judge 行为：
+识别差 → 黑名单召回和回复质量也会相应下降。
+"""
+
+from dataclasses import dataclass
+
+
+@dataclass
+class JudgeScore:
+    """Simulated three-axis judge score; each axis is nominally in 0.0-1.0."""
+    recognition_quality: float  # recognition axis
+    blacklist_quality: float  # blacklist axis
+    response_quality: float  # response axis
+
+    @property
+    def overall(self) -> float:  # unweighted mean of the three axes
+        return (self.recognition_quality + self.blacklist_quality + self.response_quality) / 3.0
+
+    @property
+    def passed(self) -> bool:  # pass mark: overall score of at least 0.6
+        return 0.6 <= self.overall
+
+
+@dataclass
+class JudgeResult:
+    """Simulated evaluation result for a single case."""
+    case_id: str  # identifier of the evaluated case
+    ground_truth: str  # labelled reference answer
+    predicted: str  # answer that was evaluated
+    score: JudgeScore  # three-axis score behind the verdict
+    passed: bool  # pass/fail verdict for the case
+    failure_reason: str = ""  # defaults to an empty string
+
+
+class FakeJudge:
+    """Rule-based stand-in for an LLM judge.
+
+    Scoring is fully deterministic and makes no LLM call:
+    - recognition_quality: character match rate (0.0-1.0)
+    - blacklist_quality: 0.9 x recognition, floored at 0.1
+    - response_quality: 1.05 x recognition, kept within 0.2-1.0
+
+    Usage:
+        judge = FakeJudge()
+        result = judge.evaluate("val_001", "京A12345", "京A12345")
+    """
+
+    def evaluate(
+        self,
+        case_id: str,
+        ground_truth: str,
+        predicted: str,
+    ) -> JudgeResult:
+        """Score a single case against its ground truth.
+
+        Args:
+            case_id: Identifier of the case.
+            ground_truth: Labelled reference answer.
+            predicted: Answer produced by the agent.
+
+        Returns:
+            JudgeResult: the three-axis score plus the pass/fail verdict.
+        """
+        recognition = self._char_match_score(ground_truth, predicted)
+        # Derived scores scale with recognition; cap response at 1.0 (1.05x overshoots).
+        blacklist = max(0.1, recognition * 0.9)
+        response = min(1.0, max(0.2, recognition * 1.05))
+
+        score = JudgeScore(
+            recognition_quality=recognition,
+            blacklist_quality=blacklist,
+            response_quality=response,
+        )
+
+        passed = score.passed
+        reason = ""
+        if not passed:  # with the scaling above, a fail implies recognition < 0.8
+            if recognition < 0.8:
+                reason = f"final_answer_mismatch: char_match={recognition:.2f}"
+            elif blacklist < 0.6:  # fallback; not reachable with the current scaling
+                reason = "knowledge_recall_insufficient: blacklist miss"
+            else:  # fallback; not reachable with the current scaling
+                reason = f"llm_rubric_fail: overall={score.overall:.2f}"
+
+        return JudgeResult(
+            case_id=case_id,
+            ground_truth=ground_truth,
+            predicted=predicted,
+            score=score,
+            passed=passed,
+            failure_reason=reason,
+        )
+
+    @staticmethod
+    def _char_match_score(a: str, b: str) -> float:
+        """Return the position-wise character match rate of two strings.
+
+        An empty side scores 0.0, equal strings 1.0, else matches / longer length.
+        """
+        if not a or not b:
+            return 0.0
+        if a == b:
+            return 1.0
+        matches = sum(1 for ca, cb in zip(a, b) if ca == cb)
+        return matches / max(len(a), len(b))
diff --git a/examples/optimization/eval_optimize_loop/fake/fake_model.py b/examples/optimization/eval_optimize_loop/fake/fake_model.py
@@ -0,0 +1,80 @@
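+"""Fake LLM — simulates LLM responses when no API key is available.
+
+Design notes:
+- Responses come from a preset mapping keyed by case_id.
+- Intended to cover several scenarios: pass, failure, tool-call error, etc.
+- No network requests are made; every result comes from the scenario mapping.
+"""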
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class FakeLLMResponse:
+    """A single simulated LLM response."""
+    content: str  # response text
+    tool_calls: list[dict] = field(default_factory=list)  # new empty list per instance by default
+    finish_reason: str = "stop"  # defaults to "stop"
+
+
+class FakeLLM:
+    """Dependency-free fake LLM for quickly validating the pipeline.
+
+    Usage:
+        fake = FakeLLM(scenarios={"plate_001": "京A12345"})
+        response = await fake.generate("识别 plate_001")
+    """
+
+    def __init__(self, scenarios: Optional[dict[str, str]] = None):
+        """
+        Args:
+            scenarios: {case_id: predicted_result} mapping, stored by reference.
+                       Built-in defaults apply only when this is None ({} is honoured).
+        """
+        self.scenarios = self._default_scenarios() if scenarios is None else scenarios
+        self.call_count = 0
+        self.call_history: list[dict] = []
+
+    @staticmethod
+    def _default_scenarios() -> dict[str, str]:
+        """Built-in default scenarios covering the six sample cases."""
+        return {
+            "train_001": "京A12345",   # clear -> passes
+            "train_002": "京A12345",   # noise -> blacklist should hit
+            "train_003": "苏A88U88",   # blur -> may be misrecognized
+            "val_001": "粤B54321",     # critical case -> should pass
+            "val_002": "苏D13579",     # noise + blacklist -> baseline fails
+            "val_003": "浙C36912",     # severe blur -> overfit risk
+        }
+
+    async def generate(self, prompt: str) -> FakeLLMResponse:
+        """Simulate one LLM call and record it in call_history.
+
+        The case_id is looked up in *prompt* and its preset result is returned;
+        "UNKNOWN" is returned when the mapping has no result for it.
+        """
+        self.call_count += 1
+        case_id = self._extract_case_id(prompt)
+        result = self.scenarios.get(case_id, "UNKNOWN")
+
+        response = FakeLLMResponse(content=result)
+        self.call_history.append({
+            "call": self.call_count,
+            "case_id": case_id,
+            "result": result,
+            "prompt_snippet": prompt[:200],  # only the first 200 characters are kept
+        })
+        return response
+
+    def _extract_case_id(self, prompt: str) -> str:
+        """Return the first scenario key that occurs as a substring of *prompt*.
+
+        Keys are tried in mapping order; "unknown" is returned when none occurs.
+        """
+        return next((cid for cid in self.scenarios if cid in prompt), "unknown")
+
+    def reset(self):
+        """Reset the call counter and empty the call history in place."""
+        self.call_count = 0
+        self.call_history.clear()