From b611fa5af2634cdfdf10644e9a344ef3a5e66d78 Mon Sep 17 00:00:00 2001
From: Alexandra Eliseeva <saridormi123@gmail.com>
Date: Wed, 5 Jun 2024 19:18:42 +0200
Subject: [PATCH] Fix formatting

---
 .../notebooks/cmg_data_stats.ipynb            | 118 +++++++++++-------
 1 file changed, 73 insertions(+), 45 deletions(-)

diff --git a/commit_message_generation/notebooks/cmg_data_stats.ipynb b/commit_message_generation/notebooks/cmg_data_stats.ipynb
index a52a47d..7df0754 100644
--- a/commit_message_generation/notebooks/cmg_data_stats.ipynb
+++ b/commit_message_generation/notebooks/cmg_data_stats.ipynb
@@ -35,7 +35,9 @@
    "source": [
     "from datasets import load_dataset\n",
     "\n",
-    "df = load_dataset(\"JetBrains-Research/lca-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\").to_pandas()\n",
+    "df = load_dataset(\n",
+    "    \"JetBrains-Research/lca-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\"\n",
+    ").to_pandas()\n",
     "df.head()"
    ]
   },
@@ -117,7 +119,9 @@
     "df[\"num_words_msg\"] = df.message.str.split(\" \").str.len()\n",
     "df[\"num_lines_msg\"] = df.message.str.split(\"\\n\").str.len()\n",
     "\n",
-    "df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
+    "df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(\n",
+    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
+    ").T"
    ],
    "metadata": {
     "collapsed": false,
@@ -158,7 +162,9 @@
     "df[\"num_words_diff\"] = [sum(len(mod[\"diff\"].split(\" \")) for mod in mods) for mods in df.mods]\n",
     "df[\"num_lines_diff\"] = [sum(len(mod[\"diff\"].split(\"\\n\")) for mod in mods) for mods in df.mods]\n",
     "\n",
-    "df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
+    "df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(\n",
+    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
+    ").T"
    ],
    "metadata": {
     "collapsed": false,
@@ -228,9 +234,14 @@
     "\n",
     "\n",
     "for repo_file in list_repo_tree(\"JetBrains-Research/lca-commit-message-generation\", \"repos\", repo_type=\"dataset\"):\n",
-    "    file_path = hf_hub_download(repo_id=\"JetBrains-Research/lca-commit-message-generation\", filename=repo_file.path, repo_type=\"dataset\", local_dir=data_dir)\n",
-    "    \n",
-    "    with tarfile.open(file_path, 'r:gz') as tar:\n",
+    "    file_path = hf_hub_download(\n",
+    "        repo_id=\"JetBrains-Research/lca-commit-message-generation\",\n",
+    "        filename=repo_file.path,\n",
+    "        repo_type=\"dataset\",\n",
+    "        local_dir=data_dir,\n",
+    "    )\n",
+    "\n",
+    "    with tarfile.open(file_path, \"r:gz\") as tar:\n",
     "        tar.extractall(path=os.path.join(data_dir, \"extracted_repos\"))"
    ],
    "metadata": {
@@ -259,22 +270,22 @@
     "\n",
     "\n",
     "def get_changed_files_before_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
-    "    repo = git.Repo(repo_path) \n",
-    "    repo.git.checkout('HEAD', '.')\n",
-    "    repo.git.clean('-fd')\n",
+    "    repo = git.Repo(repo_path)\n",
+    "    repo.git.checkout(\"HEAD\", \".\")\n",
+    "    repo.git.clean(\"-fd\")\n",
     "    commit = repo.commit(commit_hash)\n",
-    "    \n",
+    "\n",
     "    if len(commit.parents) > 1:\n",
     "        raise ValueError(\"More than one parent\")\n",
-    "    \n",
+    "\n",
     "    changed_files = list(commit.stats.files.keys())\n",
     "    try:\n",
     "        repo.git.checkout(commit.parents[0].hexsha)\n",
     "    except git.GitCommandError as e:\n",
     "        print(os.path.basename(repo_path), commit_hash, e)\n",
     "        if repo.is_dirty(untracked_files=True):\n",
-    "            repo.git.stash('save', '--include-untracked')\n",
-    "            repo.git.clean('-fd')\n",
+    "            repo.git.stash(\"save\", \"--include-untracked\")\n",
+    "            repo.git.clean(\"-fd\")\n",
     "            repo.git.checkout(commit.parents[0].hexsha)\n",
     "\n",
     "    stats = defaultdict(int)\n",
@@ -286,23 +297,26 @@
     "                stats[\"num_words\"] += len(content.split(\" \"))\n",
     "                stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
     "        except FileNotFoundError:\n",
-    "            print(f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\")\n",
+    "            print(\n",
+    "                f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n",
+    "            )\n",
     "\n",
-    "    repo.git.checkout('HEAD', '.')\n",
+    "    repo.git.checkout(\"HEAD\", \".\")\n",
     "    return stats\n",
     "\n",
+    "\n",
     "def get_changed_files_after_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
-    "    repo = git.Repo(repo_path) \n",
-    "    repo.git.checkout('HEAD', '.')\n",
-    "    repo.git.clean('-fd')\n",
+    "    repo = git.Repo(repo_path)\n",
+    "    repo.git.checkout(\"HEAD\", \".\")\n",
+    "    repo.git.clean(\"-fd\")\n",
     "    commit = repo.commit(commit_hash)\n",
     "    changed_files = list(commit.stats.files.keys())\n",
     "    try:\n",
     "        repo.git.checkout(commit_hash)\n",
     "    except git.GitCommandError as e:\n",
     "        print(os.path.basename(repo_path), commit_hash, e)\n",
-    "        repo.git.stash('save', '--include-untracked')\n",
-    "        repo.git.clean('-fd')\n",
+    "        repo.git.stash(\"save\", \"--include-untracked\")\n",
+    "        repo.git.clean(\"-fd\")\n",
     "        repo.git.checkout(commit_hash)\n",
     "\n",
     "    stats = defaultdict(int)\n",
@@ -314,9 +328,11 @@
     "                stats[\"num_words\"] += len(content.split(\" \"))\n",
     "                stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
     "        except FileNotFoundError:\n",
-    "            print(f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\")\n",
+    "            print(\n",
+    "                f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n",
+    "            )\n",
     "\n",
-    "    repo.git.checkout('HEAD', '.')\n",
+    "    repo.git.checkout(\"HEAD\", \".\")\n",
     "    return stats\n",
     "\n",
     "\n",
@@ -329,19 +345,19 @@
     "\n",
     "\n",
     "def get_all_files_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
-    "    repo = git.Repo(repo_path) \n",
-    "    repo.git.checkout('HEAD', '.')\n",
-    "    repo.git.clean('-fd')\n",
+    "    repo = git.Repo(repo_path)\n",
+    "    repo.git.checkout(\"HEAD\", \".\")\n",
+    "    repo.git.clean(\"-fd\")\n",
     "    commit = repo.commit(commit_hash)\n",
     "    try:\n",
     "        repo.git.checkout(commit_hash)\n",
     "    except git.GitCommandError as e:\n",
     "        print(os.path.basename(repo_path), commit_hash, e)\n",
     "        if repo.is_dirty(untracked_files=True):\n",
-    "            repo.git.stash('save', '--include-untracked')\n",
-    "            repo.git.clean('-fd')\n",
+    "            repo.git.stash(\"save\", \"--include-untracked\")\n",
+    "            repo.git.clean(\"-fd\")\n",
     "            repo.git.checkout(commit_hash)\n",
-    "    \n",
+    "\n",
     "    stats = defaultdict(int)\n",
     "\n",
     "    for blob in commit.tree.traverse():\n",
@@ -354,8 +370,8 @@
     "                    stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
     "            except:\n",
     "                continue\n",
-    "    \n",
-    "    repo.git.checkout('HEAD', '.')\n",
+    "\n",
+    "    repo.git.checkout(\"HEAD\", \".\")\n",
     "    return stats"
    ],
    "metadata": {
@@ -423,7 +439,7 @@
     "\n",
     "all_file_stats = []\n",
     "\n",
-    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
+    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
     "    all_file_stats.append(get_all_files_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))"
    ],
    "metadata": {
@@ -492,8 +508,10 @@
     "\n",
     "changed_files_before_commit_stats = []\n",
     "\n",
-    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
-    "    changed_files_before_commit_stats.append(get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))"
+    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
+    "    changed_files_before_commit_stats.append(\n",
+    "        get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n",
+    "    )"
    ],
    "metadata": {
     "collapsed": false,
@@ -560,9 +578,13 @@
     "\n",
     "changed_files_after_commit_stats = []\n",
     "\n",
-    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
+    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
     "    try:\n",
-    "        changed_files_after_commit_stats.append(get_changed_files_after_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))\n",
+    "        changed_files_after_commit_stats.append(\n",
+    "            get_changed_files_after_commit_stats(\n",
+    "                os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash\n",
+    "            )\n",
+    "        )\n",
     "    except git.GitCommandError:  # TODO: idk what's happening here\n",
     "        continue"
    ],
@@ -657,9 +679,11 @@
     "\n",
     "changed_files_full_stats = []\n",
     "\n",
-    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
+    "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
     "    try:\n",
-    "        changed_files_full_stats.append(get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))\n",
+    "        changed_files_full_stats.append(\n",
+    "            get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n",
+    "        )\n",
     "    except git.GitCommandError:\n",
     "        continue"
    ],
@@ -710,8 +734,8 @@
     "import pandas as pd\n",
     "\n",
     "\n",
-    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
-    "pd.DataFrame(all_file_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
+    "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
+    "pd.DataFrame(all_file_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T"
    ],
    "metadata": {
     "collapsed": false,
@@ -750,8 +774,10 @@
     "import pandas as pd\n",
     "\n",
     "\n",
-    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
-    "pd.DataFrame(changed_files_before_commit_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
+    "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
+    "pd.DataFrame(changed_files_before_commit_stats).describe(\n",
+    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
+    ").T"
    ],
    "metadata": {
     "collapsed": false,
@@ -790,8 +816,10 @@
     "import pandas as pd\n",
     "\n",
     "\n",
-    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
-    "pd.DataFrame(changed_files_after_commit_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
+    "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
+    "pd.DataFrame(changed_files_after_commit_stats).describe(\n",
+    "    percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
+    ").T"
    ],
    "metadata": {
     "collapsed": false,
@@ -830,8 +858,8 @@
     "import pandas as pd\n",
     "\n",
     "\n",
-    "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
-    "pd.DataFrame(changed_files_full_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
+    "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
+    "pd.DataFrame(changed_files_full_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T"
    ],
    "metadata": {
     "collapsed": false,