The first usable API

dustalov · Jul 6, 2024 · 3e95c03 · 3e95c03
1 parent 3f5e33a
commit 3e95c03
Show file tree

Hide file tree

Showing 5 changed files with 359 additions and 109 deletions.
diff --git a/Tutorial.ipynb b/Tutorial.ipynb
@@ -12,7 +12,6 @@
     "import numpy.typing as npt\n",
     "import pandas as pd\n",
     "import plotly.express as px\n",
-    "from evalica import Winner\n",
     "from plotly.graph_objects import Figure"
    ]
   },
@@ -34,33 +33,44 @@
    "outputs": [],
    "source": [
     "df_food = pd.read_csv(\"food.csv\", dtype=str)\n",
+    "\n",
+    "df_food['winner'] = df_food[\"winner\"].map({\n",
+    "    \"left\": evalica.Winner.X,\n",
+    "    \"right\": evalica.Winner.Y,\n",
+    "    \"tie\": evalica.Winner.Draw,\n",
+    "})\n",
+    "\n",
     "df_food.head(5)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "024e6a61-0ddb-4235-aa7b-6fb5280b5d9e",
+   "id": "27ebe4b8-d159-4689-be31-ba5e25848e69",
    "metadata": {},
    "outputs": [],
    "source": [
-    "index: dict[str, int] = {}\n",
-    "\n",
-    "for xy in zip(df_food[\"left\"], df_food[\"right\"], strict=False):\n",
-    "    for e in xy:\n",
-    "        index[e] = index.get(e, len(index))\n",
-    "\n",
-    "xs = [index[x] for x in df_food[\"left\"]]\n",
-    "ys = [index[y] for y in df_food[\"right\"]]\n",
-    "ws = df_food[\"winner\"].map({\n",
-    "        \"left\": Winner.X,\n",
-    "        \"right\": Winner.Y,\n",
-    "        \"tie\": Winner.Draw,\n",
-    "    }).tolist()\n",
-    "\n",
-    "wins, ties = evalica.matrices(xs, ys, ws)\n",
-    "\n",
-    "wins, ties"
+    "matrices = evalica.matrices(df_food['left'], df_food['right'], df_food['winner'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42da86e0-c2c6-42c4-9c5e-84d56587fdea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(matrices.win_matrix, index=matrices.index, columns=matrices.index)  # win matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c5dc51f6-bca7-4153-8265-18c019e8a639",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.DataFrame(matrices.tie_matrix, index=matrices.index, columns=matrices.index)  # tie matrix"
    ]
   },
   {
@@ -70,40 +80,55 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "scores, _iterations = evalica.bradley_terry(wins + ties / 2, tolerance=1e-4, limit=100)\n",
-    "scores"
+    "bt_result = evalica.bradley_terry(df_food['left'], df_food['right'], df_food['winner'])\n",
+    "bt_result.scores.sort_values(ascending=False).to_frame()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64bf8f2b-a08f-46b9-b4df-e5d58d8c8a50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_result = evalica.newman(df_food['left'], df_food['right'], df_food['winner'])\n",
+    "n_result.scores.sort_values(ascending=False).to_frame()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "1d331724-d4aa-4270-b0d7-d07ecab17ac7",
+   "id": "35cdcc91-d182-4d1c-842e-251052defcd4",
    "metadata": {},
    "outputs": [],
    "source": [
-    "scores, _v, _iterations = evalica.newman(wins.astype(np.float64), ties / 2, .5, tolerance=1e-4, limit=100)\n",
-    "scores"
+    "eigen_result = evalica.eigen(df_food['left'], df_food['right'], df_food['winner'])\n",
+    "eigen_result.scores.sort_values(ascending=False).to_frame()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "62c6a626-0d64-4b8a-979b-5bf33dac67ba",
+   "id": "39c5d898-944c-472d-abc2-553c54503adf",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def to_pairwise(scores: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:\n",
-    "    return scores[:, np.newaxis] / (scores + scores[:, np.newaxis])"
+    "elo_result = evalica.elo(df_food['left'], df_food['right'], df_food['winner'])\n",
+    "elo_result.scores.sort_values(ascending=False).to_frame()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "d1bff4e2-441d-48ad-93e4-e075b0ecb007",
+   "id": "66e1a1c5-b32d-4988-92b3-7549fbad3845",
    "metadata": {},
    "outputs": [],
    "source": [
-    "to_pairwise(scores)"
+    "bt_pairwise = evalica.pairwise(bt_result.scores)\n",
+    "\n",
+    "df_bt_pairwise = pd.DataFrame(bt_pairwise, index=bt_result.scores.index, columns=bt_result.scores.index)\n",
+    "\n",
+    "df_bt_pairwise"
    ]
   },
   {
@@ -127,7 +152,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "visualize(to_pairwise(scores))"
+    "visualize(df_bt_pairwise)"
    ]
   },
   {

diff --git a/python/evalica/__init__.py b/python/evalica/__init__.py
@@ -1,10 +1,13 @@
-from .evalica import Winner, __version__
-from .evalica import py_bradley_terry as bradley_terry
-from .evalica import py_counting as counting
-from .evalica import py_eigen as eigen
-from .evalica import py_elo as elo
-from .evalica import py_matrices as matrices
-from .evalica import py_newman as newman
+from collections import OrderedDict
+from collections.abc import Hashable, Iterable
+from dataclasses import dataclass
+from typing import TypeVar
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+
+from .evalica import Winner, __version__, py_bradley_terry, py_counting, py_eigen, py_elo, py_matrices, py_newman
 from .naive import bradley_terry as bradley_terry_naive
 from .naive import newman as newman_naive
 
@@ -15,5 +18,207 @@
     Winner.Ignore,
 ]
 
-__all__ = ["__version__", "Winner", "matrices", "counting", "bradley_terry", "newman", "elo", "eigen", "WINNERS",
-           "bradley_terry_naive", "newman_naive"]
+T = TypeVar("T", bound=Hashable)
+
+
+def index(xs: Iterable[T], *yss: Iterable[T]) -> dict[T, int]:
+    """Number the distinct elements of one or more iterables in first-seen order.
+
+    Args:
+        xs: The first iterable of hashable elements.
+        *yss: Any further iterables, scanned after ``xs`` in the order given.
+
+    Returns:
+        An insertion-ordered mapping from each distinct element to its position:
+        ``0`` for the first new element encountered, ``1`` for the next, and so on.
+    """
+    positions: dict[T, int] = OrderedDict()
+
+    for elements in (xs, *yss):
+        for element in elements:
+            # An element already present keeps the position of its first sighting;
+            # a new one receives the next free position, i.e. the current size.
+            positions.setdefault(element, len(positions))
+
+    return positions
+
+def _index_elements(xs: Iterable[T], ys: Iterable[T]) -> tuple["pd.Index[T]", list[int], list[int]]:
+    """Build one shared index over two element streams and encode both with it.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+
+    Returns:
+        A tuple of three items: a pandas Index of the distinct elements in
+        first-seen order (all of ``xs`` before ``ys``), ``xs`` encoded as
+        positions in that index, and ``ys`` encoded the same way.
+    """
+    # Both inputs are traversed twice (once to build the index, once to encode),
+    # so one-shot iterables such as generators are materialised up front;
+    # otherwise the second pass would silently yield empty lists.
+    xs, ys = list(xs), list(ys)
+
+    xy_index = index(xs, ys)
+
+    xs_indexed = [xy_index[x] for x in xs]
+    ys_indexed = [xy_index[y] for y in ys]
+
+    return pd.Index(xy_index), xs_indexed, ys_indexed
+
+
+@dataclass(frozen=True)
+class MatricesResult:
+    """Immutable container for the values returned by :func:`matrices`."""
+
+    # First array returned by ``py_matrices`` (the win matrix).
+    win_matrix: npt.NDArray[np.int64]
+    # Second array returned by ``py_matrices`` (the tie matrix).
+    tie_matrix: npt.NDArray[np.int64]
+    # Distinct compared elements in first-seen order; the tutorial uses it to
+    # label both the rows and the columns of the two matrices.
+    index: "pd.Index[Hashable]"
+
+def matrices(
+    xs: Iterable[T],
+    ys: Iterable[T],
+    ws: Iterable[Winner],
+) -> MatricesResult:
+    """Compute the win and tie matrices for a list of pairwise comparisons.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+        ws: Outcome of each comparison.
+
+    Returns:
+        The two matrices produced by ``py_matrices`` together with the index
+        of the compared elements.
+    """
+    labels, left, right = _index_elements(xs, ys)
+    wins, ties = py_matrices(left, right, ws)
+
+    return MatricesResult(win_matrix=wins, tie_matrix=ties, index=labels)
+
+
+@dataclass(frozen=True)
+class CountingResult:
+    """Immutable container for the values returned by :func:`counting`."""
+
+    # Output of ``py_counting`` as a Series labelled with the compared elements.
+    scores: "pd.Series[T]"
+    # Win matrix (from ``py_matrices``) that the scores were computed from.
+    win_matrix: npt.NDArray[np.int64]
+
+
+def counting(
+    xs: Iterable[T],
+    ys: Iterable[T],
+    ws: Iterable[Winner],
+) -> CountingResult:
+    """Score elements by applying ``py_counting`` to their win matrix.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+        ws: Outcome of each comparison.
+
+    Returns:
+        The scores as a Series named ``"counting"`` and labelled with the
+        compared elements, plus the win matrix they were derived from.
+    """
+    labels, left, right = _index_elements(xs, ys)
+
+    # Only the win matrix is needed here; the tie matrix is discarded.
+    wins, _ties = py_matrices(left, right, ws)
+
+    scores = pd.Series(py_counting(wins), index=labels, name=counting.__name__)
+
+    return CountingResult(scores=scores, win_matrix=wins)
+
+@dataclass(frozen=True)
+class BradleyTerryResult:
+    """Immutable container for the values returned by :func:`bradley_terry`."""
+
+    # Scores from ``py_bradley_terry`` as a Series labelled with the compared elements.
+    scores: "pd.Series[T]"
+    # Matrix handed to the solver: wins + tie_weight * ties, as floats.
+    matrix: npt.NDArray[np.float64]
+    # Factor that was applied to the tie matrix when building ``matrix``.
+    tie_weight: float
+    # Second value returned by ``py_bradley_terry`` (presumably the number of iterations run).
+    iterations: int
+
+def bradley_terry(
+    xs: Iterable[T],
+    ys: Iterable[T],
+    ws: Iterable[Winner],
+    tie_weight: float = .5,
+    tolerance: float = 1e-4,
+    limit: int = 100,
+) -> BradleyTerryResult:
+    """Score elements from pairwise comparisons via ``py_bradley_terry``.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+        ws: Outcome of each comparison.
+        tie_weight: Factor applied to the tie matrix before it is added to the win matrix.
+        tolerance: Forwarded to ``py_bradley_terry`` (presumably the convergence threshold).
+        limit: Forwarded to ``py_bradley_terry`` (presumably the iteration cap).
+
+    Returns:
+        The scores as a Series named ``"bradley_terry"`` and labelled with the
+        compared elements, the combined matrix, the tie weight used and the
+        iteration count reported by the solver.
+
+    Raises:
+        ValueError: If ``tie_weight`` is NaN or infinite.
+    """
+    # Raise explicitly rather than assert: assertions are stripped under ``python -O``,
+    # which would let a non-finite weight reach the solver unnoticed.
+    if not np.isfinite(tie_weight):
+        raise ValueError("tie_weight must be finite")
+
+    xy_index, _xs, _ys = _index_elements(xs, ys)
+
+    # Named ``ties`` rather than ``T`` so the module-level TypeVar is not shadowed.
+    wins, ties = py_matrices(_xs, _ys, ws)
+
+    M = wins.astype(float) + tie_weight * ties.astype(float)  # noqa: N806
+
+    scores, iterations = py_bradley_terry(M, tolerance, limit)
+
+    return BradleyTerryResult(
+        scores=pd.Series(scores, index=xy_index, name=bradley_terry.__name__),
+        matrix=M,
+        tie_weight=tie_weight,
+        iterations=iterations,
+    )
+
+@dataclass(frozen=True)
+class NewmanResult:
+    """Immutable container for the values returned by :func:`newman`."""
+
+    # Scores from ``py_newman`` as a Series labelled with the compared elements.
+    scores: "pd.Series[T]"
+    # Win matrix converted to floats, exactly as passed to ``py_newman``.
+    win_matrix: npt.NDArray[np.float64]
+    # Tie matrix converted to floats, exactly as passed to ``py_newman``.
+    tie_matrix: npt.NDArray[np.float64]
+    # Second value returned by ``py_newman`` (presumably the fitted tie parameter — confirm).
+    v: float
+    # Value of ``v_init`` that the caller supplied to :func:`newman`.
+    v_init: float
+    # Third value returned by ``py_newman`` (presumably the number of iterations run).
+    iterations: int
+
+def newman(
+    xs: Iterable[T],
+    ys: Iterable[T],
+    ws: Iterable[Winner],
+    v_init: float = .5,
+    tolerance: float = 1e-4,
+    limit: int = 100,
+) -> NewmanResult:
+    """Score elements from pairwise comparisons via ``py_newman``.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+        ws: Outcome of each comparison.
+        v_init: Starting value passed to ``py_newman`` alongside the matrices.
+        tolerance: Forwarded to ``py_newman`` (presumably the convergence threshold).
+        limit: Forwarded to ``py_newman`` (presumably the iteration cap).
+
+    Returns:
+        The scores as a Series named ``"newman"`` and labelled with the compared
+        elements, the float win and tie matrices, the ``v`` value returned by the
+        solver, the ``v_init`` supplied, and the reported iteration count.
+
+    Raises:
+        ValueError: If ``v_init`` is NaN or infinite.
+    """
+    # Raise explicitly rather than assert: assertions are stripped under ``python -O``,
+    # which would let a non-finite starting value reach the solver unnoticed.
+    if not np.isfinite(v_init):
+        raise ValueError("v_init must be finite")
+
+    xy_index, _xs, _ys = _index_elements(xs, ys)
+
+    # Named ``ties`` rather than ``T`` so the module-level TypeVar is not shadowed.
+    wins, ties = py_matrices(_xs, _ys, ws)
+    W_float, T_float = wins.astype(float), ties.astype(float)  # noqa: N806
+
+    scores, v, iterations = py_newman(W_float, T_float, v_init, tolerance, limit)
+
+    return NewmanResult(
+        scores=pd.Series(scores, index=xy_index, name=newman.__name__),
+        win_matrix=W_float,
+        tie_matrix=T_float,
+        v=v,
+        v_init=v_init,
+        iterations=iterations,
+    )
+
+@dataclass(frozen=True)
+class EloResult:
+    """Immutable container for the values returned by :func:`elo`."""
+
+    # Scores from ``py_elo`` as a Series labelled with the compared elements.
+    scores: "pd.Series[T]"
+    # ``r`` argument passed to ``py_elo`` (presumably the initial rating — confirm).
+    r: float
+    # ``k`` argument passed to ``py_elo`` (presumably the K-factor — confirm).
+    k: int
+    # ``s`` argument passed to ``py_elo`` (presumably the rating scale — confirm).
+    s: float
+
+def elo(
+    xs: Iterable[T],
+    ys: Iterable[T],
+    ws: Iterable[Winner],
+    r: float = 1500,
+    k: int = 30,
+    s: float = 400,
+) -> EloResult:
+    """Score elements from pairwise comparisons via ``py_elo``.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+        ws: Outcome of each comparison.
+        r: Forwarded to ``py_elo`` (presumably the initial rating — confirm).
+        k: Forwarded to ``py_elo`` (presumably the K-factor — confirm).
+        s: Forwarded to ``py_elo`` (presumably the rating scale — confirm).
+
+    Returns:
+        The scores as a Series named ``"elo"`` and labelled with the compared
+        elements, together with the ``r``, ``k`` and ``s`` values used.
+    """
+    labels, left, right = _index_elements(xs, ys)
+
+    ratings = pd.Series(py_elo(left, right, ws, r, k, s), index=labels, name=elo.__name__)
+
+    return EloResult(scores=ratings, r=r, k=k, s=s)
+
+@dataclass(frozen=True)
+class EigenResult:
+    """Immutable container for the values returned by :func:`eigen`."""
+
+    # Scores from ``py_eigen`` as a Series labelled with the compared elements.
+    scores: "pd.Series[T]"
+    # Matrix handed to ``py_eigen``: wins + tie_weight * ties, as floats.
+    matrix: npt.NDArray[np.float64]
+    # Factor that was applied to the tie matrix when building ``matrix``.
+    tie_weight: float
+
+def eigen(
+    xs: Iterable[T],
+    ys: Iterable[T],
+    ws: Iterable[Winner],
+    tie_weight: float = .5,
+) -> EigenResult:
+    """Score elements from pairwise comparisons via ``py_eigen``.
+
+    Args:
+        xs: First element of each comparison.
+        ys: Second element of each comparison.
+        ws: Outcome of each comparison.
+        tie_weight: Factor applied to the tie matrix before it is added to the win matrix.
+
+    Returns:
+        The scores as a Series named ``"eigen"`` and labelled with the compared
+        elements, the combined matrix and the tie weight used.
+
+    Raises:
+        ValueError: If ``tie_weight`` is NaN or infinite.
+    """
+    # ``bradley_terry`` builds the same combined matrix and rejects a non-finite
+    # weight; without the same guard here the matrix would silently fill with NaN/inf.
+    if not np.isfinite(tie_weight):
+        raise ValueError("tie_weight must be finite")
+
+    xy_index, _xs, _ys = _index_elements(xs, ys)
+
+    # Named ``ties`` rather than ``T`` so the module-level TypeVar is not shadowed.
+    wins, ties = py_matrices(_xs, _ys, ws)
+
+    M = wins.astype(float) + tie_weight * ties.astype(float)  # noqa: N806
+
+    scores = py_eigen(M)
+
+    return EigenResult(
+        scores=pd.Series(scores, index=xy_index, name=eigen.__name__),
+        matrix=M,
+        tie_weight=tie_weight,
+    )
+
+def pairwise(scores: "pd.Series[T] | npt.NDArray[np.float64]") -> npt.NDArray[np.float64]:
+    """Turn a score vector into a square matrix of pairwise ratios.
+
+    Entry ``[i, j]`` of the result is ``scores[i] / (scores[i] + scores[j])``.
+    Rows and columns follow the order of ``scores`` as given, so the result for
+    a ``pd.Series`` can be labelled directly with ``scores.index``.
+
+    Args:
+        scores: One-dimensional scores, either a ``pd.Series`` or a NumPy array.
+
+    Returns:
+        A square array with one row and one column per score.
+    """
+    # A Series is deliberately NOT re-sorted: only a bare array is returned, so
+    # reordering the values would detach them from the caller's index and
+    # mislabel every row and column.
+    values = scores.to_numpy() if isinstance(scores, pd.Series) else scores
+
+    # Broadcasting the column vector against the row vector yields the full grid.
+    return values[:, np.newaxis] / (values + values[:, np.newaxis])
+
+# Names exported by ``from evalica import *``.
+__all__ = [
+    "Winner",
+    "__version__",
+    "bradley_terry",
+    "counting",
+    "eigen",
+    "elo",
+    # The public wrapper was exported before and must stay importable via ``*``.
+    "matrices",
+    # Raw binding, kept so existing star-imports of it keep working.
+    "py_matrices",
+    "newman",
+    "bradley_terry_naive",
+    "newman_naive",
+    "WINNERS",
+    "index",
+    "pairwise",
+    # Result containers returned by the functions above.
+    "MatricesResult",
+    "CountingResult",
+    "BradleyTerryResult",
+    "NewmanResult",
+    "EloResult",
+    "EigenResult",
+]