Fixing up the figures for commit and pr stats #154

Merged · 2 commits · Aug 10, 2023
3 changes: 1 addition & 2 deletions .github/workflows/manubot.yaml
@@ -106,10 +106,9 @@ jobs:
env:
GH_TOKEN: ${{ github.token }}
run: |
python scripts/generate_yt_stats.py
gh api graphql --paginate -f query="`cat scripts/pull_request_graphql.gql`" > pull_requests.json
jq -s '[.[].data.repository.pullRequests.nodes[]]' < pull_requests.json > pr.json
python scripts/generate_pr_stats.py
python scripts/generate_stats.py
- name: Build Manuscript
run: bash build/build.sh
- name: Upload Artifacts
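The `gh api graphql --paginate` and `jq -s` lines in the workflow above collect the paginated GraphQL output into a single array of pull-request nodes. A minimal Python sketch of that flattening step, with the page structure assumed from the query path in the workflow (`data.repository.pullRequests.nodes`) and hypothetical node contents:

```python
import json

# Each page returned by `gh api graphql --paginate` is a separate JSON
# document; the jq step slurps them and concatenates every node list.
pages = [
    {"data": {"repository": {"pullRequests": {"nodes": [{"number": 1}]}}}},
    {"data": {"repository": {"pullRequests": {"nodes": [{"number": 2}]}}}},
]

# Flatten all pages into one list of pull-request nodes, mirroring
# jq's '[.[].data.repository.pullRequests.nodes[]]'.
nodes = [
    node
    for page in pages
    for node in page["data"]["repository"]["pullRequests"]["nodes"]
]

print(json.dumps(nodes))
```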
248 changes: 4 additions & 244 deletions content/15.development_procedure.md
@@ -18,11 +18,11 @@ We discuss the implications of this on sustainability in Section @sec:sustainabi
Of particular note is that the development history of yt is also highly bifurcated between version control systems and developer practice.
In the past, yt developers tended to commit frequently and to include the full development history of individual features or bug fixes.
Recent practice, however, is more inclined toward commit "squashing," where multiple commits are combined into a single commit with the same net effect, or commit rebasing, where changes are included linearly rather than through a branched history.
One result of this is in figures such as Figure @fig:commit-graph, some contributors appear to have made a smaller quantity of contributions than an informed observer would recognize.
One result of this is that, in figures such as the top row of Figure @fig:commit-graph, some contributors appear to have made fewer contributions than an informed observer would recognize.
Specifically, this applies to Clément Robert, who has contributed a considerable amount of change to the code base but has done so in a way that does not maximize the "statistics" presented below.
This particular bias, toward contributions measured in count, is one that affects other members of the community as well, especially those whose participation is through community engagement, documentation, tutorials, and mentoring, rather than through direct modifications of the code base.

To mitigate this shortcoming, we present the number of pull requests merged into the code base, as a function of time, as well as the time between their creation and their merge, in Figure @fig:pr-graph.
To mitigate this shortcoming, we present the number of pull requests merged into the code base as a function of time, as well as the time between their creation and their merge, in the lower row of Figure @fig:commit-graph.
This demonstrates that in many cases the number of discrete contributions to the code base varies greatly depending on the developer and, we believe, gives a more informed picture of the activity in the code base.
The longest time between opening a pull request and merging it was nearly four years; this was the addition of the `cf_radial` frontend, which occurred in fits and starts over a very long period of time.
The next longest pull request durations are for splitting the code used for bitmap indexing (see @sec:point_indexing) and a per-field configuration system.
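As a rough illustration of the time scales on the chart's logarithmic axis (the 1h/1d/1w/1m/1y reference lines), a merge duration can be computed from a pull request's GitHub ISO-8601 timestamps. The function and timestamps below are hypothetical, not part of the pipeline:

```python
from datetime import datetime, timezone

def merge_duration_seconds(created_at: str, closed_at: str) -> float:
    """Seconds between a pull request's creation and its merge."""
    fmt = "%Y-%m-%dT%H:%M:%SZ"
    created = datetime.strptime(created_at, fmt).replace(tzinfo=timezone.utc)
    closed = datetime.strptime(closed_at, fmt).replace(tzinfo=timezone.utc)
    return (closed - created).total_seconds()

# A pull request merged one day after it was opened:
print(merge_duration_seconds("2023-01-01T00:00:00Z", "2023-01-02T00:00:00Z"))
```

A nearly four-year pull request like the `cf_radial` one would land around 10^8 seconds, near the top of the log axis.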
@@ -31,251 +31,11 @@ This includes only those pull requests that occurred on GitHub.
<div id="figure-commit-graph"></div>

![
Commits to the source code as a function of time.
Commits and pull requests to the source code as a function of time.
](images/blank.svg){#fig:commit-graph width="1px"}

<script>
var commitGraphSpecification = {
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"data": {"url": "images/yt_repo.json"},
"transform": [
{
"timeUnit": "yearquarter",
"field": "committed_datetime",
"as": "commit_quarter"
},
{
"aggregate": [{"op": "count", "as": "count"}],
"groupby": ["commit_quarter", "author"]
}
],
"hconcat": [
{
"params": [{"name": "commit_range", "select": {"type": "interval", "encodings": ["x"]}}],
"mark": "bar",
"encoding": {
"y": {
"aggregate": "sum",
"field": "count",
"type": "quantitative",
"axis": {"title": "# commits"}
},
"x": {
"field": "commit_quarter",
"type": "temporal",
"scale": {"padding": 0}
}
}
},
{
"transform": [
{"filter": {"param": "commit_range"}},
{
"aggregate": [{"op": "sum", "field": "count", "as": "author_count"}],
"groupby": ["author"]
},
{
"window": [{"op": "rank", "as": "rank"}],
"sort": [{"order": "descending", "field": "author_count"}]
},
{"filter": {"field": "rank", "lte": 10}},
{"calculate": "split(datum['author'], ' <', 1)[0]", "as": "author_name"}
],
"mark": {"type": "bar"},
"encoding": {
"y": {
"field": "author_name",
"type": "nominal",
"axis": {"title": "top 10 authors"},
"sort": {"order": "ascending", "field": "rank"}
},
"x": {
"type": "quantitative",
"field": "author_count",
"scale": {"domain": [0, 9000]},
"axis": {"title": "number of commits"}
}
}
}
],
"config": {
"axis": {"labelFontSize": 16, "titleFontSize": 16},
"legend": {"labelFontSize": 16, "titleFontSize": 16}
}
};

vegaEmbed('#figure-commit-graph', commitGraphSpecification);

</script>

<div id="figure-pr-graph"></div>

![
Pull requests merged into the source code as a function of time.
](images/blank.svg){#fig:pr-graph width="1px"}

<script>
var prGraphSpecification = {
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"data": {
"url": "images/pr_stats.csv",
"name": "prs",
"format": {"parse": {"createdAt": "date", "closedAt": "date"}}
},
"transform": [
{"timeUnit": "yearquarter", "field": "closedAt", "as": "closed_quarter"}
],
"vconcat": [
{
"transform": [
{
"aggregate": [{"op": "count", "as": "count"}],
"groupby": ["closed_quarter", "author"]
}
],
"hconcat": [
{
"params": [
{
"name": "closed_range",
"select": {"type": "interval", "encodings": ["x"]}
}
],
"mark": "bar",
"encoding": {
"y": {
"aggregate": "sum",
"field": "count",
"type": "quantitative",
"axis": {"title": "# merged pull requests"}
},
"x": {
"field": "closed_quarter",
"type": "temporal",
"scale": {"padding": 0},
"axis": {"title": ""}
}
}
},
{
"transform": [
{"filter": {"param": "closed_range"}},
{
"aggregate": [
{"op": "sum", "field": "count", "as": "author_count"}
],
"groupby": ["author"]
},
{
"window": [{"op": "rank", "as": "rank"}],
"sort": [
{"order": "descending", "field": "author_count"},
{"order": "ascending", "field": "author"}
]
},
{"filter": {"field": "rank", "lte": 10}}
],
"mark": {"type": "bar"},
"encoding": {
"y": {
"field": "author",
"type": "nominal",
"axis": {"title": "top 10 authors"},
"sort": {"order": "ascending", "field": "rank"}
},
"x": {
"type": "quantitative",
"field": "author_count",
"scale": {
"domainMax": {
"expr": "max(extent(pluck(data('data_2'), 'author_count'))[1], 200)"
}
},
"axis": {"title": "# of closed pull requests"}
}
}
}
]
},
{
"transform": [
{"filter": {"param": "closed_range"}},
{"calculate": "log(datum.duration)/log(10)", "as": "log_duration"},
{
"bin": {"base": 10, "extent": [2, 8.2], "step": 0.25},
"field": "log_duration",
"as": "bin_log_duration"
},
{"calculate": "pow(10, datum.bin_log_duration)", "as": "x1"},
{"calculate": "pow(10, datum.bin_log_duration_end)", "as": "x2"}
],
"layer": [
{
"mark": "bar",
"encoding": {
"y": {
"field": "x1",
"scale": {"type": "log", "base": 10, "domain": [100, 100000000]},
"axis": {"tickCount": 5, "title": "Seconds to Merge"},
"type": "quantitative"
},
"y2": {"field": "x2"},
"x": {"aggregate": "count", "scale": {"type": "linear", "domain": [0, 250]}}
}
},
{
"data": {
"values": [
{"t": 3600, "label": "1h", "y": 1},
{"t": 86400, "label": "1d", "y": 1},
{"t": 604800, "label": "1w", "y": 1},
{"t": 2592000, "label": "1m", "y": 1},
{"t": 31536000, "label": "1y", "y": 1}
]
},
"resolve": {"scale": {"x": "independent"}},
"layer": [
{
"mark": "rule",
"encoding": {
"y": {"field": "t", "type": "quantitative"},
"color": {"value": "black"}
}
},
{
"mark": {
"type": "text",
"angle": 0,
"align": "left",
"dy": 0,
"dx": 5,
"baseline": "middle"
},
"encoding": {
"y": {"field": "t", "type": "quantitative"},
"text": {"field": "label"},
"x": {
"field": "y",
"scale": {"domain": [0, 1]},
"type": "quantitative",
"axis": {"title": "# of merged pull requests"}
}
}
}
]
}
]
}
],
"config": {
"axis": {"labelFontSize": 16, "titleFontSize": 16},
"legend": {"labelFontSize": 16, "titleFontSize": 16}
}
};

vegaEmbed('#figure-pr-graph', prGraphSpecification);


vegaEmbed('#figure-commit-graph', "images/yt_repo.vl");
</script>

### Unit Testing {#sec:unit_testing}
74 changes: 74 additions & 0 deletions scripts/generate_stats.py
@@ -0,0 +1,74 @@
import json

import git
import pandas as pd

# Commit info

mapping = {}

repo_loc = "./yt/"

with open(repo_loc + ".mailmap", "r") as mailmap:
    for line in mailmap:
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip blank lines and mailmap comments
        try:
            # "Proper Name <email> Commit Name <email>": map the commit
            # identity (right side) to the canonical identity (left side).
            val, key = line.split(">", 1)
            key = key.strip()
            val = val.strip() + ">"
            mapping[key] = val
        except ValueError:
            print("ValueError", line)

yt_repo = git.repo.Repo(repo_loc)

attributes = ("author", "committed_datetime", "hexsha")

data = {attr: [] for attr in attributes}

for c in yt_repo.iter_commits():
    for attr in attributes:
        data[attr].append(getattr(c, attr))

new_authors = []
for author in data["author"]:
    author_str = f"{author.name} <{author.email}>"
    # Canonicalize through the mailmap, then keep just the name.
    author_str = mapping.get(author_str, author_str)
    new_authors.append(author_str.split(" <")[0])


data["author"] = new_authors
df_commits = pd.DataFrame(data)
df_commits.rename(columns = {'committed_datetime':'datetime'}, inplace=True)
df_commits["type"] = "commit"
df_commits["duration"] = pd.NA

with open("pr.json") as f:
    pull_requests = json.load(f)
# We're going to use json_normalize, but it has fewer options, so we have to do
# more manipulation after we load it.
df_pr = pd.json_normalize(pull_requests)


# We only want the ones that are closed ...
df_pr = df_pr[(~df_pr["closedAt"].isna()) & (df_pr["state"] == "MERGED")]

df_pr[["createdAt", "closedAt"]] = df_pr[["createdAt", "closedAt"]].apply(pd.to_datetime)
df_pr["duration"] = (df_pr["closedAt"] - df_pr["createdAt"]).dt.total_seconds()

df_pr["author"] = df_pr["author.name"]
# fillna returns a new Series, so the result must be assigned back; the
# original chained calls discarded it and left the missing values in place.
df_pr["author"] = df_pr["author"].fillna(df_pr["author.login"])
df_pr["author"] = df_pr["author"].fillna(df_pr["author.id"])
df_pr["author"] = df_pr["author"].fillna("Automated Bot")
df_pr.loc[df_pr["author"] == "", "author"] = "Automated Bot"
df_pr["type"] = "pull-request"
df_pr.rename(columns = {'closedAt': 'datetime'}, inplace=True)

df_pr = df_pr[["author", "datetime", "type", "duration"]]

df = pd.concat([df_commits, df_pr])

df.to_csv("content/images/yt_repo.csv", columns=["author", "datetime", "type", "duration"])
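The mailmap canonicalization at the top of the script can be exercised on a tiny, hypothetical entry (the names and emails below are made up; the real file lives in the yt repository):

```python
# One hypothetical .mailmap entry mapping an old commit identity to a
# canonical one, parsed with the same split-on-'>' logic as the script.
mailmap_lines = ["Jane Doe <jane@example.com> J Doe <jdoe@old-host.example>"]

mapping = {}
for line in mailmap_lines:
    val, key = line.split(">", 1)
    mapping[key.strip()] = val.strip() + ">"

print(mapping["J Doe <jdoe@old-host.example>"])
```

Looking up the commit-time identity yields the canonical `Jane Doe <jane@example.com>`, which is why the per-author counts in the figures are deduplicated across email addresses.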