"""Exploratory LiveCodeBench Pro leaderboard client.
Purpose:
Provide typed, read-only access to the public LiveCodeBench Pro leaderboard
surfaces currently used by the project website.
Design:
- Keep this module explicitly best-effort: the backend endpoints are
discoverable from the public frontend but are not documented as a stable
API contract.
- Use only the Python standard library for HTTP so the package does not
gain a required network-client dependency.
- Normalize the useful result shapes while allowing extra fields because
the upstream payload may evolve.
Examples:
>>> from ooai_llm.benchmarks.livecodebench_pro import LiveCodeBenchProModel
>>> row = LiveCodeBenchProModel(name="gpt-5", provider="openai", rating=2176)
>>> row.label
'gpt-5'
"""
from __future__ import annotations
import json
from collections.abc import Callable, Mapping, Sequence
from typing import Any, Literal
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from pydantic import BaseModel, ConfigDict, Field, computed_field
# Default backend root used by ``LiveCodeBenchProClient`` when no
# ``base_url`` is supplied.
DEFAULT_LIVECODEBENCH_PRO_BASE_URL = "https://webhook.cp-bench.orzzh.com"

# Difficulty buckets accepted by the per-difficulty endpoints.
LiveCodeBenchProDifficulty = Literal["easy", "medium", "hard"]

# Keys understood by the local sort helper.
LiveCodeBenchProSort = Literal["rating", "provider", "organization", "model", "status"]

# Signature of the optional transport hook: it receives the fully built URL
# and returns an already-decoded JSON-compatible object.
JsonTransport = Callable[[str], Any]
class LiveCodeBenchProError(RuntimeError):
    """Signal that LiveCodeBench Pro data could not be fetched or parsed.

    The client raises this when a request fails or when a payload does not
    have the expected top-level shape.
    """
class LiveCodeBenchProEndpoint(BaseModel):
    """Document one known LiveCodeBench Pro endpoint.

    Instances are immutable (``frozen=True``) and reject unknown keys
    (``extra="forbid"``), so every attribute used by callers must be
    declared here.

    Args:
        name: Stable local label for the endpoint.
        method: HTTP method.
        path: Backend path.
        description: What the endpoint currently returns.
        query: Required or useful query parameters.
        stability: Stability note for callers.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    name: str
    # The endpoint catalogue omits ``method`` for every endpoint the client
    # fetches with GET, so GET has to be the default.
    method: str = "GET"
    path: str
    description: str
    query: list[str] = Field(default_factory=list)
    stability: str = "undocumented"
class LiveCodeBenchProRatingEvent(BaseModel):
    """A single rating event attached to a leaderboard entry.

    Every field is optional. Unknown upstream keys are kept
    (``extra="allow"``), and aliased fields may be populated either by
    their camelCase alias or by their Python name
    (``populate_by_name=True``).
    """

    model_config = ConfigDict(extra="allow", populate_by_name=True)

    rating: int | None = None
    time: int | None = None
    # Upstream key: ``contestId``.
    contest_id: int | None = Field(default=None, alias="contestId")
    name: str | None = None
    # Upstream key: ``calcRating``.
    calc_rating: int | None = Field(default=None, alias="calcRating")
class LiveCodeBenchProDifficultyRow(LiveCodeBenchProModel):
    """Leaderboard row for one difficulty, adding validation and pass rates.

    The two ``*_percent`` computed fields expose the stored rates
    multiplied by 100 and are ``None`` whenever the underlying rate is.
    """

    validrate: float | None = None
    passrate: float | None = None

    @computed_field  # type: ignore[prop-decorator]
    @property
    def passrate_percent(self) -> float | None:
        """Return ``passrate`` as a percentage."""
        rate = self.passrate
        if rate is None:
            return None
        return rate * 100

    @computed_field  # type: ignore[prop-decorator]
    @property
    def validrate_percent(self) -> float | None:
        """Return ``validrate`` as a percentage."""
        rate = self.validrate
        if rate is None:
            return None
        return rate * 100
class LiveCodeBenchProDifficultyResult(BaseModel):
    """Leaderboard payload for a single difficulty bucket.

    Unknown upstream keys are retained (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Bucket this payload describes; required.
    difficulty: LiveCodeBenchProDifficulty
    # Per-model rows; an absent key yields an empty list.
    llms: list[LiveCodeBenchProDifficultyRow] = Field(default_factory=list)
class LiveCodeBenchProProblemResult(BaseModel):
    """Verdict for one problem within a model's submission set.

    All fields are optional and unknown upstream keys are retained
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    problem_index: str | None = None
    problem_link: str | None = None
    problem_name: str | None = None
    verdict: str | None = None
    status: str | None = None
    submission_id: str | None = None

    @computed_field  # type: ignore[prop-decorator]
    @property
    def accepted(self) -> bool:
        """Whether the verdict indicates an accepted solution.

        True when ``verdict`` equals "accepted" ignoring case, or when
        ``status`` equals "AC" ignoring case. Missing values count as
        not accepted.
        """
        verdict_text = (self.verdict or "").lower()
        if verdict_text == "accepted":
            return True
        status_text = (self.status or "").upper()
        return status_text == "AC"
class LiveCodeBenchProContestResult(BaseModel):
    """Group of problem verdicts belonging to one contest.

    Unknown upstream keys are retained (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    contest_title: str | None = None
    contest_link: str | None = None
    contest_start_time: int | None = None
    # Problem verdicts for this contest; an absent key yields an empty list.
    problems: list[LiveCodeBenchProProblemResult] = Field(default_factory=list)
class LiveCodeBenchProSubmissionsResult(BaseModel):
    """Contest and problem verdicts for one model at one difficulty.

    Unknown upstream keys are retained (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    # Bucket this payload describes; required.
    difficulty: LiveCodeBenchProDifficulty
    # Contest groupings; an absent key yields an empty list.
    contests: list[LiveCodeBenchProContestResult] = Field(default_factory=list)

    @computed_field  # type: ignore[prop-decorator]
    @property
    def problem_count(self) -> int:
        """Return the number of listed problems."""
        total = 0
        for contest in self.contests:
            total += len(contest.problems)
        return total

    @computed_field  # type: ignore[prop-decorator]
    @property
    def accepted_count(self) -> int:
        """Return the number of accepted listed problems."""
        total = 0
        for contest in self.contests:
            for problem in contest.problems:
                if problem.accepted:
                    total += 1
        return total
class LiveCodeBenchProSubmissionDetail(BaseModel):
    """Detail record for one submission, with generated code when exposed.

    All fields are optional and unknown upstream keys are retained
    (``extra="allow"``).
    """

    model_config = ConfigDict(extra="allow")

    problem_id: str | None = None
    model_name: str | None = None
    model_provider: str | None = None
    verdict: str | None = None
    status: str | None = None
    # Stays ``None`` when the payload carries no ``code`` key.
    code: str | None = None
class LiveCodeBenchProSnapshot(BaseModel):
    """Aggregate view of leaderboard rows and per-difficulty results.

    This container is built locally, so unknown keys are rejected
    (``extra="forbid"``). Every collection defaults to empty.
    """

    model_config = ConfigDict(extra="forbid")

    models: list[LiveCodeBenchProModel] = Field(default_factory=list)
    # Keyed by difficulty name.
    difficulties: dict[LiveCodeBenchProDifficulty, LiveCodeBenchProDifficultyResult] = Field(
        default_factory=dict
    )
    endpoints: list[LiveCodeBenchProEndpoint] = Field(default_factory=list)
    notes: list[str] = Field(default_factory=list)

    @computed_field  # type: ignore[prop-decorator]
    @property
    def active_count(self) -> int:
        """Return the number of active model rows."""
        active_rows = [row for row in self.models if row.is_active]
        return len(active_rows)
class LiveCodeBenchProClient:
    """Small client for the current LiveCodeBench Pro leaderboard backend.

    All requests are read-only GETs. Filtering, sorting and limiting are
    applied locally after the payload has been fetched.

    Args:
        base_url: Backend URL. Defaults to the public website backend observed
            in the frontend bundle.
        timeout: HTTP timeout in seconds.
        transport: Optional testing hook. It receives the fully built URL and
            returns a decoded JSON-compatible object.
    """

    def __init__(
        self,
        *,
        base_url: str = DEFAULT_LIVECODEBENCH_PRO_BASE_URL,
        timeout: float = 30,
        transport: JsonTransport | None = None,
    ) -> None:
        # Trailing slashes are dropped because ``_build_url`` always joins
        # with a path that starts with "/".
        self.base_url = base_url.rstrip("/")
        # ``_get_json`` hands this to ``urlopen``; without storing it every
        # real network call would fail with AttributeError.
        self.timeout = timeout
        self._transport = transport

    def list_models(
        self,
        *,
        status: str | None = None,
        providers: Sequence[str] | None = None,
        organizations: Sequence[str] | None = None,
        query: str | None = None,
        sort_by: LiveCodeBenchProSort = "rating",
        descending: bool = True,
        limit: int | None = None,
    ) -> list[LiveCodeBenchProModel]:
        """Return leaderboard model rows with local filters applied.

        Args:
            status: Keep only rows whose status matches, ignoring case.
            providers: Keep only rows from these providers.
            organizations: Keep only rows from these organizations.
            query: Case-insensitive substring to look for in the row's
                name, display name, provider and organization.
            sort_by: Sort key.
            descending: Sort direction.
            limit: Maximum number of rows; ``None`` or a non-positive value
                returns every row.

        Raises:
            LiveCodeBenchProError: If the request fails or the payload is
                not a JSON list.
        """
        payload = self._get_json("/leaderboard/llm")
        if not isinstance(payload, list):
            raise LiveCodeBenchProError("Expected /leaderboard/llm to return a JSON list.")
        rows = [LiveCodeBenchProModel.model_validate(item) for item in payload]
        rows = _filter_models(
            rows,
            status=status,
            providers=providers,
            organizations=organizations,
            query=query,
        )
        rows = _sort_models(rows, sort_by=sort_by, descending=descending)
        return _limit(rows, limit)

    def get_difficulty(
        self,
        difficulty: LiveCodeBenchProDifficulty,
        *,
        providers: Sequence[str] | None = None,
        organizations: Sequence[str] | None = None,
        query: str | None = None,
        sort_by: LiveCodeBenchProSort = "rating",
        descending: bool = True,
        limit: int | None = None,
    ) -> LiveCodeBenchProDifficultyResult:
        """Return a per-difficulty leaderboard view.

        The ``llms`` rows of the fetched result are filtered, sorted and
        limited locally; no status filter is applied. The other fields of
        the result are returned unchanged.

        Raises:
            LiveCodeBenchProError: If the request fails.
        """
        payload = self._get_json("/leaderboard/llm/difficulty", query={"difficulty": difficulty})
        result = LiveCodeBenchProDifficultyResult.model_validate(payload)
        rows = _filter_models(
            result.llms,
            status=None,
            providers=providers,
            organizations=organizations,
            query=query,
        )
        rows = _sort_models(rows, sort_by=sort_by, descending=descending)
        return result.model_copy(update={"llms": _limit(rows, limit)})

    def get_submissions(
        self,
        *,
        model_name: str,
        model_provider: str,
        difficulty: LiveCodeBenchProDifficulty,
    ) -> LiveCodeBenchProSubmissionsResult:
        """Return contest/problem verdicts for one model and difficulty.

        Raises:
            LiveCodeBenchProError: If the request fails.
        """
        payload = self._get_json(
            "/leaderboard/llm/submissions",
            query={
                "model_name": model_name,
                "model_provider": model_provider,
                "difficulty": difficulty,
            },
        )
        return LiveCodeBenchProSubmissionsResult.model_validate(payload)

    def get_submission(self, submission_id: str) -> LiveCodeBenchProSubmissionDetail:
        """Return an individual submission detail.

        Raises:
            LiveCodeBenchProError: If the request fails.
        """
        from urllib.parse import quote

        # Percent-encode the id so characters such as "/", "?" or spaces
        # cannot alter the request path or smuggle in a query string.
        encoded_id = quote(str(submission_id), safe="")
        payload = self._get_json(f"/leaderboard/submission/{encoded_id}")
        return LiveCodeBenchProSubmissionDetail.model_validate(payload)

    def snapshot(
        self,
        *,
        include_difficulties: bool = True,
        active_only: bool = False,
        limit: int | None = None,
    ) -> LiveCodeBenchProSnapshot:
        """Return a high-level snapshot of currently exposed data.

        Args:
            include_difficulties: Also fetch the easy, medium and hard
                views (one extra request each).
            active_only: Restrict the overall model list to rows whose
                status is "active".
            limit: Row limit applied to the model list and to each
                difficulty view.

        Raises:
            LiveCodeBenchProError: If any request fails.
        """
        models = self.list_models(status="active" if active_only else None, limit=limit)
        difficulties: dict[LiveCodeBenchProDifficulty, LiveCodeBenchProDifficultyResult] = {}
        if include_difficulties:
            for difficulty in ("easy", "medium", "hard"):
                difficulties[difficulty] = self.get_difficulty(difficulty, limit=limit)
        return LiveCodeBenchProSnapshot(
            models=models,
            difficulties=difficulties,
            endpoints=livecodebench_pro_endpoints(),
            notes=[
                "LiveCodeBench Pro endpoints are public but undocumented; treat payloads as exploratory.",
                "Use the local benchmark toolkit or standard-judge upload flow for your own model runs.",
            ],
        )

    def _get_json(self, path: str, *, query: Mapping[str, Any] | None = None) -> Any:
        """Fetch ``path`` and return the decoded JSON payload.

        When a transport hook was supplied it is called with the built URL
        and its result is returned as-is; nothing raised by the hook is
        wrapped. Otherwise a GET request is issued with ``urlopen``.

        Raises:
            LiveCodeBenchProError: If the request, UTF-8 decoding or JSON
                parsing fails.
        """
        url = self._build_url(path, query=query)
        if self._transport is not None:
            return self._transport(url)
        request = Request(url, method="GET", headers={"Accept": "application/json"})
        try:
            with urlopen(request, timeout=self.timeout) as response:
                payload = response.read().decode("utf-8")
            return json.loads(payload)
        except Exception as exc:  # pragma: no cover - exercised by callers with live network
            # Deliberately broad: any failure is surfaced to callers as the
            # package error, with the original exception chained.
            raise LiveCodeBenchProError(f"Could not fetch LiveCodeBench Pro URL {url}: {exc}") from exc

    def _build_url(self, path: str, *, query: Mapping[str, Any] | None = None) -> str:
        """Join ``base_url`` and ``path`` and append the encoded query, if any."""
        path = path if path.startswith("/") else f"/{path}"
        url = f"{self.base_url}{path}"
        if query:
            url = f"{url}?{urlencode(query)}"
        return url
def livecodebench_pro_endpoints() -> list[LiveCodeBenchProEndpoint]:
    """Return the currently known LiveCodeBench Pro endpoint surfaces.

    A fresh list is built on every call, with entries in a fixed order:
    models, difficulty, submissions, submission, standard_judge.
    """
    catalogue: list[dict[str, Any]] = [
        {
            "name": "models",
            "path": "/leaderboard/llm",
            "description": "Overall model leaderboard rows with ratings and rating events.",
        },
        {
            "name": "difficulty",
            "path": "/leaderboard/llm/difficulty",
            "description": "Per-difficulty validation and pass rates for active models.",
            "query": ["difficulty=easy|medium|hard"],
        },
        {
            "name": "submissions",
            "path": "/leaderboard/llm/submissions",
            "description": "Contest/problem verdicts for one model, provider, and difficulty.",
            "query": ["model_name", "model_provider", "difficulty=easy|medium|hard"],
        },
        {
            "name": "submission",
            "path": "/leaderboard/submission/{submission_id}",
            "description": "Individual submission detail, including generated code when exposed.",
            "query": ["submission_id"],
        },
        {
            "name": "standard_judge",
            "method": "POST",
            "path": "/standard_judge",
            "description": "Website upload flow for benchmark JSON artifacts after email verification.",
            "query": ["model", "author", "email_challenge_id", "email_verification_code", "file"],
        },
    ]
    return [LiveCodeBenchProEndpoint(**entry) for entry in catalogue]
def list_livecodebench_pro_models(
    *,
    client: LiveCodeBenchProClient | None = None,
    **kwargs: Any,
) -> list[LiveCodeBenchProModel]:
    """Return LiveCodeBench Pro model rows using a default client.

    Args:
        client: Client to use; a new default-configured client is created
            when omitted.
        **kwargs: Forwarded unchanged to ``client.list_models``.
    """
    if not client:
        client = LiveCodeBenchProClient()
    return client.list_models(**kwargs)
def get_livecodebench_pro_difficulty(
    difficulty: LiveCodeBenchProDifficulty,
    *,
    client: LiveCodeBenchProClient | None = None,
    **kwargs: Any,
) -> LiveCodeBenchProDifficultyResult:
    """Return a LiveCodeBench Pro per-difficulty result using a default client.

    Args:
        difficulty: Difficulty bucket to fetch.
        client: Client to use; a new default-configured client is created
            when omitted.
        **kwargs: Forwarded unchanged to ``client.get_difficulty``.
    """
    if not client:
        client = LiveCodeBenchProClient()
    return client.get_difficulty(difficulty, **kwargs)
def get_livecodebench_pro_submissions(
    *,
    model_name: str,
    model_provider: str,
    difficulty: LiveCodeBenchProDifficulty,
    client: LiveCodeBenchProClient | None = None,
) -> LiveCodeBenchProSubmissionsResult:
    """Return LiveCodeBench Pro contest/problem verdicts using a default client.

    Args:
        model_name: Model name to look up.
        model_provider: Provider of that model.
        difficulty: Difficulty bucket to fetch.
        client: Client to use; a new default-configured client is created
            when omitted.
    """
    if not client:
        client = LiveCodeBenchProClient()
    lookup = {
        "model_name": model_name,
        "model_provider": model_provider,
        "difficulty": difficulty,
    }
    return client.get_submissions(**lookup)
def get_livecodebench_pro_submission(
    submission_id: str,
    *,
    client: LiveCodeBenchProClient | None = None,
) -> LiveCodeBenchProSubmissionDetail:
    """Return one LiveCodeBench Pro submission detail using a default client.

    Args:
        submission_id: Identifier of the submission to fetch.
        client: Client to use; a new default-configured client is created
            when omitted.
    """
    if not client:
        client = LiveCodeBenchProClient()
    return client.get_submission(submission_id)
def get_livecodebench_pro_snapshot(
    *,
    client: LiveCodeBenchProClient | None = None,
    **kwargs: Any,
) -> LiveCodeBenchProSnapshot:
    """Return a high-level LiveCodeBench Pro snapshot using a default client.

    Args:
        client: Client to use; a new default-configured client is created
            when omitted.
        **kwargs: Forwarded unchanged to ``client.snapshot``.
    """
    if not client:
        client = LiveCodeBenchProClient()
    return client.snapshot(**kwargs)
def _filter_models(
    rows: Sequence[LiveCodeBenchProModel],
    *,
    status: str | None,
    providers: Sequence[str] | None,
    organizations: Sequence[str] | None,
    query: str | None,
) -> list[LiveCodeBenchProModel]:
    """Return the rows that satisfy every supplied filter, in input order.

    A filter that is ``None`` or empty is skipped. All comparisons are
    case-insensitive. ``query`` is a substring match against the row's
    name, display name, provider and organization joined by spaces.
    """
    wanted_providers = _normalize_terms(providers)
    wanted_organizations = _normalize_terms(organizations)
    wanted_status = status.lower() if status else None
    needle = query.lower() if query else None

    def keep(row: LiveCodeBenchProModel) -> bool:
        # Checks run in a fixed order and stop at the first mismatch.
        if wanted_status and (row.status or "").lower() != wanted_status:
            return False
        if wanted_providers and row.provider.lower() not in wanted_providers:
            return False
        if wanted_organizations and (row.organization or "").lower() not in wanted_organizations:
            return False
        if needle:
            haystack = " ".join(
                [
                    row.name,
                    row.display_name or "",
                    row.provider,
                    row.organization or "",
                ]
            ).lower()
            if needle not in haystack:
                return False
        return True

    return [row for row in rows if keep(row)]
def _sort_models(
    rows: Sequence[LiveCodeBenchProModel],
    *,
    sort_by: LiveCodeBenchProSort,
    descending: bool,
) -> list[LiveCodeBenchProModel]:
    """Return a new list of ``rows`` ordered by the requested key.

    Text keys compare case-insensitively, with a missing organization or
    status treated as an empty string. A missing rating is treated as -1.
    Any ``sort_by`` value without a dedicated key, including "model",
    sorts by the row's label.
    """
    extractors: dict[str, Callable[[LiveCodeBenchProModel], Any]] = {
        "rating": lambda row: -1 if row.rating is None else row.rating,
        "provider": lambda row: row.provider.lower(),
        "organization": lambda row: (row.organization or "").lower(),
        "status": lambda row: (row.status or "").lower(),
    }
    key = extractors.get(sort_by, lambda row: row.label.lower())
    ordered = list(rows)
    ordered.sort(key=key, reverse=descending)
    return ordered
def _limit[T](rows: Sequence[T], limit: int | None) -> list[T]:
if limit is None or limit <= 0:
return list(rows)
return list(rows[:limit])
def _normalize_terms(values: Sequence[str] | str | None) -> set[str]:
    """Return the lower-cased, stripped filter terms found in ``values``.

    Each value may itself hold several comma-separated terms. Blank terms
    are dropped.

    Args:
        values: A sequence of strings, a single string, or ``None``.

    Returns:
        The set of normalized terms; empty when ``values`` is ``None`` or
        empty.
    """
    if not values:
        return set()
    if isinstance(values, str):
        # A bare string would otherwise be iterated character by
        # character, yielding single-letter terms.
        values = [values]
    terms: set[str] = set()
    for value in values:
        for part in value.split(","):
            cleaned = part.strip().lower()
            if cleaned:
                terms.add(cleaned)
    return terms