From b611fa5af2634cdfdf10644e9a344ef3a5e66d78 Mon Sep 17 00:00:00 2001 From: Alexandra Eliseeva Date: Wed, 5 Jun 2024 19:18:42 +0200 Subject: [PATCH] Fix formatting --- .../notebooks/cmg_data_stats.ipynb | 118 +++++++++++------- 1 file changed, 73 insertions(+), 45 deletions(-) diff --git a/commit_message_generation/notebooks/cmg_data_stats.ipynb b/commit_message_generation/notebooks/cmg_data_stats.ipynb index a52a47d..7df0754 100644 --- a/commit_message_generation/notebooks/cmg_data_stats.ipynb +++ b/commit_message_generation/notebooks/cmg_data_stats.ipynb @@ -35,7 +35,9 @@ "source": [ "from datasets import load_dataset\n", "\n", - "df = load_dataset(\"JetBrains-Research/lca-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\").to_pandas()\n", + "df = load_dataset(\n", + " \"JetBrains-Research/lca-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\"\n", + ").to_pandas()\n", "df.head()" ] }, @@ -117,7 +119,9 @@ "df[\"num_words_msg\"] = df.message.str.split(\" \").str.len()\n", "df[\"num_lines_msg\"] = df.message.str.split(\"\\n\").str.len()\n", "\n", - "df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T" + "df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(\n", + " percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n", + ").T" ], "metadata": { "collapsed": false, @@ -158,7 +162,9 @@ "df[\"num_words_diff\"] = [sum(len(mod[\"diff\"].split(\" \")) for mod in mods) for mods in df.mods]\n", "df[\"num_lines_diff\"] = [sum(len(mod[\"diff\"].split(\"\\n\")) for mod in mods) for mods in df.mods]\n", "\n", - "df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T" + "df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(\n", + " percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n", + ").T" ], "metadata": { "collapsed": false, @@ -228,9 +234,14 @@ "\n", "\n", "for repo_file in list_repo_tree(\"JetBrains-Research/lca-commit-message-generation\", \"repos\", repo_type=\"dataset\"):\n", - " file_path = hf_hub_download(repo_id=\"JetBrains-Research/lca-commit-message-generation\", filename=repo_file.path, repo_type=\"dataset\", local_dir=data_dir)\n", - " \n", - " with tarfile.open(file_path, 'r:gz') as tar:\n", + " file_path = hf_hub_download(\n", + " repo_id=\"JetBrains-Research/lca-commit-message-generation\",\n", + " filename=repo_file.path,\n", + " repo_type=\"dataset\",\n", + " local_dir=data_dir,\n", + " )\n", + "\n", + " with tarfile.open(file_path, \"r:gz\") as tar:\n", " tar.extractall(path=os.path.join(data_dir, \"extracted_repos\"))" ], "metadata": { @@ -259,22 +270,22 @@ "\n", "\n", "def get_changed_files_before_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n", - " repo = git.Repo(repo_path) \n", - " repo.git.checkout('HEAD', '.')\n", - " repo.git.clean('-fd')\n", + " repo = git.Repo(repo_path)\n", + " repo.git.checkout(\"HEAD\", \".\")\n", + " repo.git.clean(\"-fd\")\n", " commit = repo.commit(commit_hash)\n", - " \n", + "\n", " if len(commit.parents) > 1:\n", " raise ValueError(\"More than one parent\")\n", - " \n", + "\n", " changed_files = list(commit.stats.files.keys())\n", " try:\n", " repo.git.checkout(commit.parents[0].hexsha)\n", " except git.GitCommandError as e:\n", " print(os.path.basename(repo_path), commit_hash, e)\n", " if repo.is_dirty(untracked_files=True):\n", - " repo.git.stash('save', '--include-untracked')\n", - " repo.git.clean('-fd')\n", + " repo.git.stash(\"save\", \"--include-untracked\")\n", + " repo.git.clean(\"-fd\")\n", " repo.git.checkout(commit.parents[0].hexsha)\n", "\n", " stats = defaultdict(int)\n", @@ -286,23 +297,26 @@ " stats[\"num_words\"] += len(content.split(\" \"))\n", " stats[\"num_lines\"] += len(content.split(\"\\n\"))\n", " except FileNotFoundError:\n", - " print(f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\")\n", + " print(\n", + " f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n", + " )\n", "\n", - " repo.git.checkout('HEAD', '.')\n", + " repo.git.checkout(\"HEAD\", \".\")\n", " return stats\n", "\n", + "\n", "def get_changed_files_after_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n", - " repo = git.Repo(repo_path) \n", - " repo.git.checkout('HEAD', '.')\n", - " repo.git.clean('-fd')\n", + " repo = git.Repo(repo_path)\n", + " repo.git.checkout(\"HEAD\", \".\")\n", + " repo.git.clean(\"-fd\")\n", " commit = repo.commit(commit_hash)\n", " changed_files = list(commit.stats.files.keys())\n", " try:\n", " repo.git.checkout(commit_hash)\n", " except git.GitCommandError as e:\n", " print(os.path.basename(repo_path), commit_hash, e)\n", - " repo.git.stash('save', '--include-untracked')\n", - " repo.git.clean('-fd')\n", + " repo.git.stash(\"save\", \"--include-untracked\")\n", + " repo.git.clean(\"-fd\")\n", " repo.git.checkout(commit_hash)\n", "\n", " stats = defaultdict(int)\n", @@ -314,9 +328,11 @@ " stats[\"num_words\"] += len(content.split(\" \"))\n", " stats[\"num_lines\"] += len(content.split(\"\\n\"))\n", " except FileNotFoundError:\n", - " print(f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\")\n", + " print(\n", + " f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n", + " )\n", "\n", - " repo.git.checkout('HEAD', '.')\n", + " repo.git.checkout(\"HEAD\", \".\")\n", " return stats\n", "\n", "\n", @@ -329,19 +345,19 @@ "\n", "\n", "def get_all_files_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n", - " repo = git.Repo(repo_path) \n", - " repo.git.checkout('HEAD', '.')\n", - " repo.git.clean('-fd')\n", + " repo = git.Repo(repo_path)\n", + " repo.git.checkout(\"HEAD\", \".\")\n", + " repo.git.clean(\"-fd\")\n", " commit = repo.commit(commit_hash)\n", " try:\n", " repo.git.checkout(commit_hash)\n", " except git.GitCommandError as e:\n", " print(os.path.basename(repo_path), commit_hash, e)\n", " if repo.is_dirty(untracked_files=True):\n", - " repo.git.stash('save', '--include-untracked')\n", - " repo.git.clean('-fd')\n", + " repo.git.stash(\"save\", \"--include-untracked\")\n", + " repo.git.clean(\"-fd\")\n", " repo.git.checkout(commit_hash)\n", - " \n", + "\n", " stats = defaultdict(int)\n", "\n", " for blob in commit.tree.traverse():\n", @@ -354,8 +370,8 @@ " stats[\"num_lines\"] += len(content.split(\"\\n\"))\n", " except:\n", " continue\n", - " \n", - " repo.git.checkout('HEAD', '.')\n", + "\n", + " repo.git.checkout(\"HEAD\", \".\")\n", " return stats" ], "metadata": { @@ -423,7 +439,7 @@ "\n", "all_file_stats = []\n", "\n", - "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n", + "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n", " all_file_stats.append(get_all_files_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))" ], "metadata": { @@ -492,8 +508,10 @@ "\n", "changed_files_before_commit_stats = []\n", "\n", - "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n", - " changed_files_before_commit_stats.append(get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))" + "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n", + " changed_files_before_commit_stats.append(\n", + " get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n", + " )" ], "metadata": { "collapsed": false, @@ -560,9 +578,13 @@ "\n", "changed_files_after_commit_stats = []\n", "\n", - "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n", + "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n", " try:\n", - " changed_files_after_commit_stats.append(get_changed_files_after_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))\n", + " changed_files_after_commit_stats.append(\n", + " get_changed_files_after_commit_stats(\n", + " os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash\n", + " )\n", + " )\n", " except git.GitCommandError: # TODO: idk what's happening here\n", " continue" ], @@ -657,9 +679,11 @@ "\n", "changed_files_full_stats = []\n", "\n", - "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n", + "for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n", " try:\n", - " changed_files_full_stats.append(get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))\n", + " changed_files_full_stats.append(\n", + " get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n", + " )\n", " except git.GitCommandError:\n", " continue" ], @@ -710,8 +734,8 @@ "import pandas as pd\n", "\n", "\n", - "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", - "pd.DataFrame(all_file_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T" + "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n", + "pd.DataFrame(all_file_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T" ], "metadata": { "collapsed": false, @@ -750,8 +774,10 @@ "import pandas as pd\n", "\n", "\n", - "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", - "pd.DataFrame(changed_files_before_commit_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T" + "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n", + "pd.DataFrame(changed_files_before_commit_stats).describe(\n", + " percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n", + ").T" ], "metadata": { "collapsed": false, @@ -790,8 +816,10 @@ "import pandas as pd\n", "\n", "\n", - "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", - "pd.DataFrame(changed_files_after_commit_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T" + "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n", + "pd.DataFrame(changed_files_after_commit_stats).describe(\n", + " percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n", + ").T" ], "metadata": { "collapsed": false, @@ -830,8 +858,8 @@ "import pandas as pd\n", "\n", "\n", - "pd.set_option('display.float_format', lambda x: '%.2f' % x)\n", - "pd.DataFrame(changed_files_full_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T" + "pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n", + "pd.DataFrame(changed_files_full_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T" ], "metadata": { "collapsed": false,