Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions examples/optimization/eval_optimize_loop/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
output/
__pycache__/
.pytest_cache/
*.pyc
87 changes: 87 additions & 0 deletions examples/optimization/eval_optimize_loop/config/optimizer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
{
"_description": "Evaluation + Optimization 自动回归闭环配置",
"pipeline": {
"name": "PlateAgent Eval-Optimize Loop",
"version": "1.0.0",
"max_iterations": 5,
"random_seed": 42
},
"gate": {
"rules": {
"total_score_improvement": {
"enabled": true,
"threshold": 0.03,
"description": "验证集总分提升 ≥ 3%"
},
"no_new_hard_fail": {
"enabled": true,
"max_new_fails": 0,
"description": "不允许新增 hard fail"
},
"critical_case_no_regress": {
"enabled": true,
"critical_case_ids": [],
"description": "关键 case 不退步"
},
"cost_within_budget": {
"enabled": true,
"max_cost_ratio": 1.2,
"description": "成本不超过 baseline 的 120%"
},
"overfit_detection": {
"enabled": true,
"description": "训练集提升 + 验证集退化 → 拒绝候选"
}
},
"acceptance_strategy": "all_must_pass",
"description": "all_must_pass: 所有启用的规则都通过才接受; majority: 多数通过即可"
},
"attribution": {
"categories": [
"final_answer_mismatch",
"tool_call_error",
"param_error",
"llm_rubric_fail",
"knowledge_recall_insufficient",
"format_invalid"
],
"rules": {
"final_answer_mismatch": {
"trigger": "predicted != ground_truth",
"priority": 1
},
"tool_call_error": {
"trigger": "tool execution failed or timeout",
"priority": 2
},
"param_error": {
"trigger": "tool parameter invalid",
"priority": 3
},
"llm_rubric_fail": {
"trigger": "LLM Judge score below threshold",
"threshold": 0.6,
"priority": 4
},
"knowledge_recall_insufficient": {
"trigger": "blacklist miss or confusion char not recalled",
"priority": 5
},
"format_invalid": {
"trigger": "output does not match expected JSON schema",
"priority": 6
}
}
},
"optimizer": {
"target_prompts": ["system_prompt", "skill_prompt"],
"strategy": "failure_driven",
"description": "根据归因结果,优先优化失败率最高的类别对应的 prompt 片段"
},
"output": {
"dir": "output",
"formats": ["json", "markdown"],
"retain_audit_trail": true,
"max_audit_entries": 50
}
}
43 changes: 43 additions & 0 deletions examples/optimization/eval_optimize_loop/config/train.evalset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
"_description": "训练集",
"version": "1.0.0",
"cases": [
{
"case_id": "train_001",
"image": "plate_001.jpg",
"ground_truth": "\u4eacA12345",
"conditions": {
"type": "clear"
},
"expected_behavior": "should_pass",
"description": "清晰车牌"
},
{
"case_id": "train_002",
"image": "plate_028.jpg",
"ground_truth": "\u4eacA12345",
"conditions": {
"type": "noise",
"noise_level": 0.15
},
"expected_behavior": "may_fail",
"description": "噪声车牌"
},
{
"case_id": "train_003",
"image": "plate_012.jpg",
"ground_truth": "\u82cfA88U88",
"conditions": {
"type": "blur",
"blur_kernel": 5
},
"expected_behavior": "may_fail",
"description": "模糊车牌"
}
],
"stats": {
"total": 3,
"should_pass": 1,
"may_fail": 2
}
}
47 changes: 47 additions & 0 deletions examples/optimization/eval_optimize_loop/config/val.evalset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"_description": "验证集",
"version": "1.0.0",
"cases": [
{
"case_id": "val_001",
"image": "plate_005.jpg",
"ground_truth": "\u7ca4B54321",
"conditions": {
"type": "clear"
},
"expected_behavior": "should_pass",
"critical": true,
"description": "关键case"
},
{
"case_id": "val_002",
"image": "plate_029.jpg",
"ground_truth": "\u82cfD13579",
"conditions": {
"type": "noise",
"noise_level": 0.2
},
"expected_behavior": "should_fail_baseline",
"critical": false,
"description": "噪声+黑名单"
},
{
"case_id": "val_003",
"image": "plate_018.jpg",
"ground_truth": "\u6d59C36912",
"conditions": {
"type": "blur",
"blur_kernel": 7
},
"expected_behavior": "should_fail_baseline",
"critical": false,
"description": "严重模糊"
}
],
"stats": {
"total": 3,
"should_pass": 1,
"should_fail_baseline": 2,
"critical": 1
}
}
11 changes: 11 additions & 0 deletions examples/optimization/eval_optimize_loop/fake/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
"""Fake 模块公共导出"""
from .fake_model import FakeLLM, FakeLLMResponse
from .fake_judge import FakeJudge, JudgeResult, JudgeScore

__all__ = [
"FakeLLM",
"FakeLLMResponse",
"FakeJudge",
"JudgeResult",
"JudgeScore",
]
110 changes: 110 additions & 0 deletions examples/optimization/eval_optimize_loop/fake/fake_judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
"""Fake Judge — 无 LLM API 调用下模拟评测打分。

基于规则引擎(非 LLM)对预测结果和 ground truth 进行对比评分,
输出与 LLMJudge 相同的数据结构,保证 pipeline 可无缝切换。

三维评分均基于字符匹配率推导,模拟真实 LLM Judge 行为:
识别差 → 黑名单召回和回复质量也会相应下降。
"""

from dataclasses import dataclass


@dataclass
class JudgeScore:
    """Simulated three-dimensional judge score.

    Each dimension is intended to lie in the range 0.0-1.0.
    """

    recognition_quality: float  # intended range 0.0-1.0
    blacklist_quality: float  # intended range 0.0-1.0
    response_quality: float  # intended range 0.0-1.0

    @property
    def overall(self) -> float:
        """Return the unweighted mean of the three dimension scores."""
        total = (
            self.recognition_quality
            + self.blacklist_quality
            + self.response_quality
        )
        return total / 3.0

    @property
    def passed(self) -> bool:
        """Return True when the overall score reaches the 0.6 pass mark."""
        pass_mark = 0.6
        return self.overall >= pass_mark


@dataclass
class JudgeResult:
    """Simulated evaluation result for a single case."""
    case_id: str  # identifier of the evaluated case
    ground_truth: str  # labelled ground-truth value
    predicted: str  # value predicted by the agent
    score: JudgeScore  # three-dimensional score for this case
    passed: bool  # pass/fail verdict
    failure_reason: str = ""  # defaults to an empty string


class FakeJudge:
    """Rule-based fake judge.

    Scoring is fully deterministic and makes no LLM calls:
    - recognition_quality: character match ratio (0.0-1.0)
    - blacklist_quality: derived from the recognition quality
    - response_quality: derived from the recognition quality, capped at 1.0

    Usage:
        judge = FakeJudge()
        result = judge.evaluate("val_001", "京A12345", "京A12345")
    """

    def evaluate(
        self,
        case_id: str,
        ground_truth: str,
        predicted: str,
    ) -> "JudgeResult":
        """Evaluate a single case.

        Args:
            case_id: Identifier of the case.
            ground_truth: Labelled ground-truth value.
            predicted: Value predicted by the agent.

        Returns:
            JudgeResult: The three-dimensional score plus the pass/fail
            verdict and, on failure, a reason prefixed with the attribution
            category.
        """
        recognition = self._char_match_score(ground_truth, predicted)
        blacklist, response = self._derived_scores(recognition)

        score = JudgeScore(
            recognition_quality=recognition,
            blacklist_quality=blacklist,
            response_quality=response,
        )

        passed = score.passed
        reason = ""
        if not passed:
            # With the current scaling factors an overall score below the
            # pass mark implies recognition < 0.8, so the two fallbacks below
            # are defensive; they keep the reason meaningful if the factors
            # are ever changed.
            if recognition < 0.8:
                reason = f"final_answer_mismatch: char_match={recognition:.2f}"
            elif blacklist < 0.6:
                reason = "knowledge_recall_insufficient: blacklist miss"
            else:
                reason = f"llm_rubric_fail: overall={score.overall:.2f}"

        return JudgeResult(
            case_id=case_id,
            ground_truth=ground_truth,
            predicted=predicted,
            score=score,
            passed=passed,
            failure_reason=reason,
        )

    @staticmethod
    def _derived_scores(recognition: float) -> tuple[float, float]:
        """Derive (blacklist_quality, response_quality) from recognition.

        Both scores scale with the recognition quality and have a floor
        (0.1 and 0.2 respectively). The response score is additionally
        capped at 1.0: the 1.05 factor would otherwise push a perfect match
        to 1.05, outside the 0.0-1.0 range the score fields are meant to
        stay in.
        """
        blacklist = max(0.1, recognition * 0.9)
        response = min(1.0, max(0.2, recognition * 1.05))
        return blacklist, response

    @staticmethod
    def _char_match_score(a: str, b: str) -> float:
        """Return the character-level match score of two strings.

        An empty string on either side scores 0.0 and an exact match scores
        1.0. Otherwise the score is the number of positions holding the same
        character divided by the length of the longer string.
        """
        if not a or not b:
            return 0.0
        if a == b:
            return 1.0
        # zip stops at the shorter string; dividing by the longer length
        # penalises missing or extra characters.
        matches = sum(1 for ca, cb in zip(a, b) if ca == cb)
        return matches / max(len(a), len(b))
80 changes: 80 additions & 0 deletions examples/optimization/eval_optimize_loop/fake/fake_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Fake LLM — 无 API Key 模式下模拟 LLM 响应。

设计思路:
- 基于 case_id 匹配预设的响应映射表
- 支持多种场景:通过、失败、工具调用错误等
- 不产生任何网络请求,所有数据来自配置文件
"""

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class FakeLLMResponse:
    """A single simulated LLM response."""
    content: str  # response text
    tool_calls: list[dict] = field(default_factory=list)  # fresh empty list per instance by default
    finish_reason: str = "stop"  # defaults to "stop"


class FakeLLM:
    """Dependency-free fake LLM for quick pipeline validation.

    Usage:
        fake = FakeLLM(scenarios={"plate_001": "京A12345"})
        response = await fake.generate("识别 plate_001")
    """

    def __init__(self, scenarios: Optional[dict[str, str]] = None):
        """Create the fake LLM.

        Args:
            scenarios: {case_id: predicted_result} mapping. When omitted
                (None) the built-in defaults are used; an explicitly passed
                mapping, even an empty one, is used as given.
        """
        # Compare against None instead of relying on truthiness so that an
        # explicit empty mapping is not silently replaced by the defaults.
        self.scenarios = (
            self._default_scenarios() if scenarios is None else scenarios
        )
        self.call_count = 0
        self.call_history: list[dict] = []

    @staticmethod
    def _default_scenarios() -> dict[str, str]:
        """Return the built-in default scenarios covering the 6 sample cases."""
        return {
            "train_001": "京A12345",  # clear -> pass
            "train_002": "京A12345",  # noise -> blacklist should hit
            "train_003": "苏A88U88",  # blur -> may be misrecognised
            "val_001": "粤B54321",  # critical case -> should pass
            "val_002": "苏D13579",  # noise + blacklist -> baseline fails
            "val_003": "浙C36912",  # heavy blur -> overfitting risk
        }

    async def generate(self, prompt: str) -> "FakeLLMResponse":
        """Simulate one LLM call.

        Extracts the case_id from the prompt and returns the preset result
        for it. When no case_id matches, the response content is "UNKNOWN".
        Every call increments ``call_count`` and appends a record to
        ``call_history``.
        """
        self.call_count += 1
        case_id = self._extract_case_id(prompt)
        result = self.scenarios.get(case_id, "UNKNOWN")

        response = FakeLLMResponse(content=result)
        self.call_history.append({
            "call": self.call_count,
            "case_id": case_id,
            "result": result,
            # Only the first 200 characters of the prompt are recorded.
            "prompt_snippet": prompt[:200],
        })
        return response

    def _extract_case_id(self, prompt: str) -> str:
        """Return the scenario case_id contained in the prompt.

        When several ids occur in the prompt, the longest one wins so that
        an id which is a prefix of another (e.g. "plate_1" vs "plate_10")
        cannot shadow it; ties keep the mapping's insertion order. Returns
        "unknown" when no id matches.
        """
        matches = [cid for cid in self.scenarios if cid in prompt]
        return max(matches, key=len, default="unknown")

    def reset(self):
        """Reset the call counter and clear the call history."""
        self.call_count = 0
        self.call_history.clear()
Loading
Loading