🔀 Merge pull request #37 from FleksySDK/leaderboard_improvement
Leaderboard improvements
astariul authored May 28, 2024
2 parents b6dfbc3 + 94e7326 commit 5a5deb8
Showing 8 changed files with 217 additions and 76 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -8,6 +8,7 @@ repos:
- id: end-of-file-fixer
exclude: "coverage_report/.*"
- id: check-yaml
args: [ --unsafe ]
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.4
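A note on the `--unsafe` flag: `check-yaml` safe-loads YAML files by default, and safe loading has no constructor for the `!!python/name:` tags that the `mkdocs.yml` change later in this diff introduces for the emoji extension. With `--unsafe`, the hook only checks syntax instead of loading the file. A minimal sketch of the failure the flag avoids, reproduced with PyYAML directly:

```python
import yaml

# The tag added to mkdocs.yml in this commit; safe_load has no
# constructor registered for python/name tags, so it raises.
doc = "emoji_index: !!python/name:material.extensions.emoji.twemoji"
try:
    yaml.safe_load(doc)
except yaml.YAMLError as exc:
    print("safe_load rejects the tag:", exc)
```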
8 changes: 8 additions & 0 deletions docs/css/mkdocstrings.css
@@ -3,3 +3,11 @@ div.doc-contents:not(.first) {
border-left: 4px solid rgba(230, 230, 230);
margin-bottom: 80px;
}

.x_icon {
color: #ef5552;
}

.v_icon {
color: #4cae50;
}
158 changes: 119 additions & 39 deletions docs/hooks.py
@@ -2,11 +2,24 @@

import json
import os
from dataclasses import dataclass
from typing import Dict, List

from mkdocs.config.defaults import MkDocsConfig
from mkdocs.structure.nav import Page


@dataclass
class DynamicEntry:
"""Represents a dynamic entry for a data table : the data will be pulled
from the results files.
"""

name: str
results: Dict
additional_fields: List[str]


def on_page_markdown(markdown: str, page: Page, config: MkDocsConfig, **kwargs) -> str:
"""Function that runs before rendering the markdown.
@@ -21,58 +34,125 @@ def on_page_markdown(markdown: str, page: Page, config: MkDocsConfig, **kwargs)
Returns:
str: The updated markdown content.
"""
- if page.file.src_uri == "leaderboard.md":
+ if "leaderboards" in page.file.src_uri:
lines = markdown.split("\n")
entries = []
for line in lines:
if line.startswith(">>>"):
# This is a line with a path to a result file
# -> parse it and extract the results
- kb_name, result_file_path = line[3:].split("|")
+ name, result_file_path, *args = line[3:].split("|")

with open(os.path.join(config.docs_dir, result_file_path), "r") as f:
res = json.load(f)

- entries.append(
-     {
-         "kb_name": kb_name,
-         "score": res["overall_score"],
-         "nwp": res["next_word_prediction"]["score"]["top3_accuracy"],
-         "acp": res["auto_completion"]["score"]["top3_accuracy"],
-         "acr": res["auto_correction"]["score"]["fscore"],
-     }
- )
-
- # Sort according to the overall score
- entries.sort(reverse=True, key=lambda x: x["score"])
-
- # Find the best scores to highlight
- best_score = max(entries, key=lambda x: x["score"])["score"]
- best_nwp = max(entries, key=lambda x: x["nwp"])["nwp"]
- best_acp = max(entries, key=lambda x: x["acp"])["acp"]
- best_acr = max(entries, key=lambda x: x["acr"])["acr"]
+ entries.append(DynamicEntry(name, res, args))
+
+ # Each leaderboard implements its own render logic
+ rendered_entries = [None for _ in entries]
+ if page.file.src_uri.endswith("main.md"):
+     rendered_entries = render_main(entries)
+ elif page.file.src_uri.endswith("compare.md"):
+     rendered_entries = render_compare(entries)

# Replace the lines accordingly
for i, line in enumerate(lines):
if line.startswith(">>>"):
- e = entries.pop(0)
-
- score = f"{round(e['score'], 2):g}"
- nwp = f"{round(e['nwp'], 2):g}"
- acp = f"{round(e['acp'], 2):g}"
- acr = f"{round(e['acr'], 2):g}"
-
- # Highlight the best scores
- if e["score"] == best_score:
-     score = f"**{score}**"
- if e["nwp"] == best_nwp:
-     nwp = f"**{nwp}**"
- if e["acp"] == best_acp:
-     acp = f"**{acp}**"
- if e["acr"] == best_acr:
-     acr = f"**{acr}**"
-
- # Overwrite the line
- lines[i] = f"| {e['kb_name']} | {score} | {nwp} | {acp} | {acr} |"
+ lines[i] = rendered_entries.pop(0)

return "\n".join(lines)


def render_main(entries: List[DynamicEntry]) -> List[str]:
"""Code for rendering the leaderboard : `leaderboards/main.md`."""
# Extract the scores we are going to use
for e in entries:
e.score = e.results["overall_score"]
e.nwp = e.results["next_word_prediction"]["score"]["top3_accuracy"]
e.acp = e.results["auto_completion"]["score"]["top3_accuracy"]
e.acr = e.results["auto_correction"]["score"]["fscore"]

# Sort entries according to the overall score
entries.sort(reverse=True, key=lambda e: e.score)

# Find the best scores to highlight for each column
best_score = max(e.score for e in entries)
best_nwp = max(e.nwp for e in entries)
best_acp = max(e.acp for e in entries)
best_acr = max(e.acr for e in entries)

# Render the entries
rendered_entries = []
for e in entries:
score = f"{round(e.score, 2):g}"
nwp = f"{round(e.nwp, 2):g}"
acp = f"{round(e.acp, 2):g}"
acr = f"{round(e.acr, 2):g}"

# Highlight the best scores
if e.score == best_score:
score = f"**{score}**"
if e.nwp == best_nwp:
nwp = f"**{nwp}**"
if e.acp == best_acp:
acp = f"**{acp}**"
if e.acr == best_acr:
acr = f"**{acr}**"

# Render
rendered_entries.append(f"| {e.name} | {score} | {acr} | {acp} | {nwp} |")

return rendered_entries


def render_compare(entries: List[DynamicEntry]) -> List[str]:
"""Code for rendering the leaderboard : `leaderboards/compare.md`."""
# Extract the scores we are going to use
for e in entries:
e.score = e.results["overall_score"]
e.nwp = e.results["next_word_prediction"]["score"]["top3_accuracy"]
e.acp = e.results["auto_completion"]["score"]["top3_accuracy"]
e.acr_detection = e.results["auto_correction"]["score"]["recall"]
e.acr_frustration = 1 - e.results["auto_correction"]["score"]["precision"]

# Sort entries according to the overall score
entries.sort(reverse=True, key=lambda e: e.score)

# Find the best scores to highlight for each column
best_score = max(e.score for e in entries)
best_nwp = max(e.nwp for e in entries)
best_acp = max(e.acp for e in entries)
best_acr_detection = max(e.acr_detection for e in entries)
best_acr_frustration = min(e.acr_frustration for e in entries)

# Render the entries
rendered_entries = []
for e in entries:
score = f"{round(e.score * 1000)}"
nwp = f"{round(e.nwp * 100)}%"
acp = f"{round(e.acp * 100)}%"
acr_detection = f"{round(e.acr_detection * 100)}%"
acr_frustration = f"{round(e.acr_frustration * 100)}%"

# Highlight the best scores
if e.score == best_score:
score = f"**{score}**"
if e.nwp == best_nwp:
nwp = f"**{nwp}**"
if e.acp == best_acp:
acp = f"**{acp}**"
if e.acr_detection == best_acr_detection:
acr_detection = f"**{acr_detection}**"
if e.acr_frustration == best_acr_frustration:
acr_frustration = f"**{acr_frustration}**"

# Render
additional_fields = " | ".join(e.additional_fields)
if additional_fields != "":
rendered_entries.append(
f"| {e.name} | {score} | {acr_detection} | {acr_frustration} | {acp} | {nwp} | {additional_fields} |"
)
else:
rendered_entries.append(f"| {e.name} | {score} | {acr_detection} | {acr_frustration} | {acp} | {nwp} |")

return rendered_entries
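The net effect of the refactor: parsing and rendering are now decoupled, and each leaderboard page brings its own `render_*` function. A minimal sketch of the parsing step, using one of the directive lines from `docs/leaderboards/compare.md` below (any fields after the result-file path become `DynamicEntry.additional_fields`):

```python
# Sketch of the hook's directive parsing (mirrors hooks.py above).
line = ">>>Fleksy|results/fleksy.json|:fontawesome-solid-circle-check:{ .v_icon }"
name, result_file_path, *args = line[3:].split("|")

assert name == "Fleksy"
assert result_file_path == "results/fleksy.json"
assert args == [":fontawesome-solid-circle-check:{ .v_icon }"]
```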
6 changes: 0 additions & 6 deletions docs/javascripts/tablesort.js

This file was deleted.

26 changes: 0 additions & 26 deletions docs/leaderboard.md

This file was deleted.

53 changes: 53 additions & 0 deletions docs/leaderboards/compare.md
@@ -0,0 +1,53 @@
---
hide:
- toc
---

# Leaderboard

[//]: # (A bit of explanation is required for this page)
[//]: # (There is an MkDocs hook (defined in `docs/hooks.py`) that will read the content of this page. Any line starting with `>>>` will be extracted and replaced with the scores found in the corresponding result file.)
[//]: # (The format to follow is: `>>>{name}|{result_file_name}|{optional_additional_fields}`)

| Keyboard | Overall score | Typo detection rate | Auto-correction frustration rate | Auto-completion success rate | Next-word prediction success rate | SDK available |
|---------:|:-------------:|:-------------------:|:--------------------------------:|:---------------:|:-------------------:|:-------------:|
>>>Fleksy|results/fleksy.json|:fontawesome-solid-circle-check:{ .v_icon }
>>>iOS keyboard|results/ios.json|:fontawesome-regular-circle-xmark:{ .x_icon }
>>>KeyboardKit Open-source|results/keyboardkit_oss.json|:fontawesome-solid-circle-check:{ .v_icon }
>>>KeyboardKit Pro|results/keyboardkit_pro.json|:fontawesome-solid-circle-check:{ .v_icon }
>>>Gboard|results/gboard.json|:fontawesome-regular-circle-xmark:{ .x_icon }
>>>Swiftkey|results/swiftkey.json|:fontawesome-regular-circle-xmark:{ .x_icon }
>>>Tappa|results/tappa.json|:fontawesome-solid-circle-check:{ .v_icon }
>>>Yandex|results/yandex.json|:fontawesome-regular-circle-xmark:{ .x_icon }
### Metrics

=== "Overall score"

A single, general score representing the performance of the keyboard across all tasks.

:material-trending-up: _Higher is better._

=== "Typo detection rate"

Percentage of typos detected and corrected by the keyboard.

:material-trending-up: _Higher is better._

=== "Auto-correction frustration rate"

Percentage of words correctly typed, but corrected to something else by the keyboard.

:material-trending-down: _Lower is better._

=== "Auto-completion success rate"

Percentage of words correctly auto-completed.

:material-trending-up: _Higher is better._

=== "Next-word prediction success rate"

Percentage of words correctly predicted from the context.

:material-trending-up: _Higher is better._
29 changes: 29 additions & 0 deletions docs/leaderboards/main.md
@@ -0,0 +1,29 @@
# Leaderboard

[//]: # (A bit of explanation is required for this page)
[//]: # (There is an MkDocs hook (defined in `docs/hooks.py`) that will read the content of this page. Any line starting with `>>>` will be extracted and replaced with the scores found in the corresponding result file.)
[//]: # (The format to follow is: `>>>{name}|{result_file_name}|{optional_additional_fields}`)

| Keyboard | Overall<br>score | Auto-correction | Auto-completion | Next-word prediction |
|---------:|:----------------:|:---------------:|:---------------:|:--------------------:|
>>>Fleksy|results/fleksy.json
>>>iOS keyboard|results/ios.json
>>>KeyboardKit Open-source|results/keyboardkit_oss.json
>>>KeyboardKit Pro|results/keyboardkit_pro.json
>>>Gboard|results/gboard.json
>>>Swiftkey|results/swiftkey.json
>>>Tappa|results/tappa.json
>>>Yandex|results/yandex.json
---

The metrics used in this leaderboard are:

* Auto-correction: _**F-score**_
* Auto-completion: _**top-3 accuracy**_
* Next-word prediction: _**top-3 accuracy**_

!!! tip
See [Understanding the metrics](../how_testing_is_done.md#understanding-the-metrics) for more details.

The overall score is a _**weighted sum**_ of all tasks.
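As a sketch, the computation could look like the following; the weights here are hypothetical placeholders (the real values live in the benchmark code, not in this diff):

```python
# Hypothetical weights for illustration only; the benchmark defines
# the actual weighting of the three task scores.
def overall_score(acr_fscore: float, acp_top3: float, nwp_top3: float) -> float:
    w_acr, w_acp, w_nwp = 0.4, 0.3, 0.3
    return w_acr * acr_fscore + w_acp * acp_top3 + w_nwp * nwp_top3
```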
12 changes: 7 additions & 5 deletions mkdocs.yml
@@ -34,6 +34,12 @@ markdown_extensions:
- attr_list
- pymdownx.highlight
- pymdownx.superfences
- pymdownx.emoji:
emoji_index: !!python/name:material.extensions.emoji.twemoji
emoji_generator: !!python/name:material.extensions.emoji.to_svg
- pymdownx.superfences
- pymdownx.tabbed:
alternate_style: true

nav:
- Welcome: "index.md"
@@ -42,7 +48,7 @@ nav:
- "emu_setup.md"
- "how_testing_is_done.md"
- "architecture.md"
- "leaderboard.md"
- "leaderboards/main.md"
- Code reference:
- "public_api.md"
- "internals.md"
@@ -68,7 +74,3 @@

extra_css:
- css/mkdocstrings.css

- extra_javascript:
-     - https://unpkg.com/[email protected]/dist/tablesort.min.js
-     - javascripts/tablesort.js
