Skip to content

Commit

Permalink
Fix formatting
Browse files (browse the repository at this point in the history)
  • Loading branch information
saridormi committed Jun 5, 2024
1 parent 6b2a99b commit b611fa5
Showing 1 changed file with 73 additions and 45 deletions.
118 changes: 73 additions & 45 deletions commit_message_generation/notebooks/cmg_data_stats.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
"source": [
"from datasets import load_dataset\n",
"\n",
"df = load_dataset(\"JetBrains-Research/lca-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\").to_pandas()\n",
"df = load_dataset(\n",
" \"JetBrains-Research/lca-commit-message-generation\", \"commitchronicle-py-long\", split=\"test\"\n",
").to_pandas()\n",
"df.head()"
]
},
Expand Down Expand Up @@ -117,7 +119,9 @@
"df[\"num_words_msg\"] = df.message.str.split(\" \").str.len()\n",
"df[\"num_lines_msg\"] = df.message.str.split(\"\\n\").str.len()\n",
"\n",
"df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
"df[[\"num_characters_msg\", \"num_words_msg\", \"num_lines_msg\"]].describe(\n",
" percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
").T"
],
"metadata": {
"collapsed": false,
Expand Down Expand Up @@ -158,7 +162,9 @@
"df[\"num_words_diff\"] = [sum(len(mod[\"diff\"].split(\" \")) for mod in mods) for mods in df.mods]\n",
"df[\"num_lines_diff\"] = [sum(len(mod[\"diff\"].split(\"\\n\")) for mod in mods) for mods in df.mods]\n",
"\n",
"df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
"df[[\"num_modified_files\", \"num_characters_diff\", \"num_words_diff\", \"num_lines_diff\"]].describe(\n",
" percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
").T"
],
"metadata": {
"collapsed": false,
Expand Down Expand Up @@ -228,9 +234,14 @@
"\n",
"\n",
"for repo_file in list_repo_tree(\"JetBrains-Research/lca-commit-message-generation\", \"repos\", repo_type=\"dataset\"):\n",
" file_path = hf_hub_download(repo_id=\"JetBrains-Research/lca-commit-message-generation\", filename=repo_file.path, repo_type=\"dataset\", local_dir=data_dir)\n",
" \n",
" with tarfile.open(file_path, 'r:gz') as tar:\n",
" file_path = hf_hub_download(\n",
" repo_id=\"JetBrains-Research/lca-commit-message-generation\",\n",
" filename=repo_file.path,\n",
" repo_type=\"dataset\",\n",
" local_dir=data_dir,\n",
" )\n",
"\n",
" with tarfile.open(file_path, \"r:gz\") as tar:\n",
" tar.extractall(path=os.path.join(data_dir, \"extracted_repos\"))"
],
"metadata": {
Expand Down Expand Up @@ -259,22 +270,22 @@
"\n",
"\n",
"def get_changed_files_before_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
" repo = git.Repo(repo_path) \n",
" repo.git.checkout('HEAD', '.')\n",
" repo.git.clean('-fd')\n",
" repo = git.Repo(repo_path)\n",
" repo.git.checkout(\"HEAD\", \".\")\n",
" repo.git.clean(\"-fd\")\n",
" commit = repo.commit(commit_hash)\n",
" \n",
"\n",
" if len(commit.parents) > 1:\n",
" raise ValueError(\"More than one parent\")\n",
" \n",
"\n",
" changed_files = list(commit.stats.files.keys())\n",
" try:\n",
" repo.git.checkout(commit.parents[0].hexsha)\n",
" except git.GitCommandError as e:\n",
" print(os.path.basename(repo_path), commit_hash, e)\n",
" if repo.is_dirty(untracked_files=True):\n",
" repo.git.stash('save', '--include-untracked')\n",
" repo.git.clean('-fd')\n",
" repo.git.stash(\"save\", \"--include-untracked\")\n",
" repo.git.clean(\"-fd\")\n",
" repo.git.checkout(commit.parents[0].hexsha)\n",
"\n",
" stats = defaultdict(int)\n",
Expand All @@ -286,23 +297,26 @@
" stats[\"num_words\"] += len(content.split(\" \"))\n",
" stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
" except FileNotFoundError:\n",
" print(f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\")\n",
" print(\n",
" f\"File {file_path} before commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n",
" )\n",
"\n",
" repo.git.checkout('HEAD', '.')\n",
" repo.git.checkout(\"HEAD\", \".\")\n",
" return stats\n",
"\n",
"\n",
"def get_changed_files_after_commit_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
" repo = git.Repo(repo_path) \n",
" repo.git.checkout('HEAD', '.')\n",
" repo.git.clean('-fd')\n",
" repo = git.Repo(repo_path)\n",
" repo.git.checkout(\"HEAD\", \".\")\n",
" repo.git.clean(\"-fd\")\n",
" commit = repo.commit(commit_hash)\n",
" changed_files = list(commit.stats.files.keys())\n",
" try:\n",
" repo.git.checkout(commit_hash)\n",
" except git.GitCommandError as e:\n",
" print(os.path.basename(repo_path), commit_hash, e)\n",
" repo.git.stash('save', '--include-untracked')\n",
" repo.git.clean('-fd')\n",
" repo.git.stash(\"save\", \"--include-untracked\")\n",
" repo.git.clean(\"-fd\")\n",
" repo.git.checkout(commit_hash)\n",
"\n",
" stats = defaultdict(int)\n",
Expand All @@ -314,9 +328,11 @@
" stats[\"num_words\"] += len(content.split(\" \"))\n",
" stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
" except FileNotFoundError:\n",
" print(f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\")\n",
" print(\n",
" f\"File {file_path} after commit not found for commit {commit_hash} in repo {os.path.basename(repo_path)}\"\n",
" )\n",
"\n",
" repo.git.checkout('HEAD', '.')\n",
" repo.git.checkout(\"HEAD\", \".\")\n",
" return stats\n",
"\n",
"\n",
Expand All @@ -329,19 +345,19 @@
"\n",
"\n",
"def get_all_files_stats(repo_path: str, commit_hash: str) -> Dict[str, int]:\n",
" repo = git.Repo(repo_path) \n",
" repo.git.checkout('HEAD', '.')\n",
" repo.git.clean('-fd')\n",
" repo = git.Repo(repo_path)\n",
" repo.git.checkout(\"HEAD\", \".\")\n",
" repo.git.clean(\"-fd\")\n",
" commit = repo.commit(commit_hash)\n",
" try:\n",
" repo.git.checkout(commit_hash)\n",
" except git.GitCommandError as e:\n",
" print(os.path.basename(repo_path), commit_hash, e)\n",
" if repo.is_dirty(untracked_files=True):\n",
" repo.git.stash('save', '--include-untracked')\n",
" repo.git.clean('-fd')\n",
" repo.git.stash(\"save\", \"--include-untracked\")\n",
" repo.git.clean(\"-fd\")\n",
" repo.git.checkout(commit_hash)\n",
" \n",
"\n",
" stats = defaultdict(int)\n",
"\n",
" for blob in commit.tree.traverse():\n",
Expand All @@ -354,8 +370,8 @@
" stats[\"num_lines\"] += len(content.split(\"\\n\"))\n",
" except:\n",
" continue\n",
" \n",
" repo.git.checkout('HEAD', '.')\n",
"\n",
" repo.git.checkout(\"HEAD\", \".\")\n",
" return stats"
],
"metadata": {
Expand Down Expand Up @@ -423,7 +439,7 @@
"\n",
"all_file_stats = []\n",
"\n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
" all_file_stats.append(get_all_files_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))"
],
"metadata": {
Expand Down Expand Up @@ -492,8 +508,10 @@
"\n",
"changed_files_before_commit_stats = []\n",
"\n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
" changed_files_before_commit_stats.append(get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))"
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
" changed_files_before_commit_stats.append(\n",
" get_changed_files_before_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n",
" )"
],
"metadata": {
"collapsed": false,
Expand Down Expand Up @@ -560,9 +578,13 @@
"\n",
"changed_files_after_commit_stats = []\n",
"\n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
" try:\n",
" changed_files_after_commit_stats.append(get_changed_files_after_commit_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))\n",
" changed_files_after_commit_stats.append(\n",
" get_changed_files_after_commit_stats(\n",
" os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash\n",
" )\n",
" )\n",
" except git.GitCommandError: # TODO: find out why GitCommandError is raised here; affected commits are skipped for now\n",
" continue"
],
Expand Down Expand Up @@ -657,9 +679,11 @@
"\n",
"changed_files_full_stats = []\n",
"\n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)): \n",
"for repo, hash in tqdm(zip(df.repo, df.hash), total=len(df)):\n",
" try:\n",
" changed_files_full_stats.append(get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash))\n",
" changed_files_full_stats.append(\n",
" get_changed_files_full_stats(os.path.join(data_dir, \"extracted_repos\", repo.replace(\"/\", \"__\")), hash)\n",
" )\n",
" except git.GitCommandError:\n",
" continue"
],
Expand Down Expand Up @@ -710,8 +734,8 @@
"import pandas as pd\n",
"\n",
"\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"pd.DataFrame(all_file_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
"pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
"pd.DataFrame(all_file_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T"
],
"metadata": {
"collapsed": false,
Expand Down Expand Up @@ -750,8 +774,10 @@
"import pandas as pd\n",
"\n",
"\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"pd.DataFrame(changed_files_before_commit_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
"pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
"pd.DataFrame(changed_files_before_commit_stats).describe(\n",
" percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
").T"
],
"metadata": {
"collapsed": false,
Expand Down Expand Up @@ -790,8 +816,10 @@
"import pandas as pd\n",
"\n",
"\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"pd.DataFrame(changed_files_after_commit_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
"pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
"pd.DataFrame(changed_files_after_commit_stats).describe(\n",
" percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]\n",
").T"
],
"metadata": {
"collapsed": false,
Expand Down Expand Up @@ -830,8 +858,8 @@
"import pandas as pd\n",
"\n",
"\n",
"pd.set_option('display.float_format', lambda x: '%.2f' % x)\n",
"pd.DataFrame(changed_files_full_stats).describe(percentiles=[.01, .05, .1, .25, .5, .75, .9, .95, .99]).T"
"pd.set_option(\"display.float_format\", lambda x: \"%.2f\" % x)\n",
"pd.DataFrame(changed_files_full_stats).describe(percentiles=[0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).T"
],
"metadata": {
"collapsed": false,
Expand Down

0 comments on commit b611fa5

Please sign in to comment.