From 1e34d10e237f78540c95b6f6f31e005c123f6a00 Mon Sep 17 00:00:00 2001 From: Farshad Kazemi Date: Fri, 1 Apr 2022 14:31:26 -0400 Subject: [PATCH] Updated readme file --- README.md | 26 ++++++++-------- R_scripts/conover_draw_plot.R | 40 ------------------------- R_scripts/friedman_draw_plot.R | 17 ----------- R_scripts/friedman_test.R | 30 ------------------- R_scripts/holm-bonferrani.R | 55 ---------------------------------- 5 files changed, 13 insertions(+), 155 deletions(-) delete mode 100644 R_scripts/conover_draw_plot.R delete mode 100644 R_scripts/friedman_draw_plot.R delete mode 100644 R_scripts/friedman_test.R delete mode 100644 R_scripts/holm-bonferrani.R diff --git a/README.md b/README.md index f8bef07..9b80d08 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Replication Package -This repository contains the necessary data for replicating the necessary information to replicate the study of "Exploring the Notion of Risk in Reviewer Recommendation". This code extends the Relationalgit package (https://github.com/CESEL/RelationalGit), and add some functionlities that is needed to incorporate the concept of fix-inducing likelihood of a project. +This repository contains the required information for replicating the necessary information to replicate the study of "Exploring the Notion of Risk in Reviewer Recommendation". This code extends the Relationalgit package (https://github.com/CESEL/RelationalGit), and add some functionalities that is needed to incorporate the concept of fix-inducing likelihood of a project. # Dependencies In order to run the code and replicate the results, you should first install the following tools: @@ -8,21 +8,21 @@ In order to run the code and replicate the results, you should first install the [.NET Core](https://www.microsoft.com/net/download). ## 2) SQL Server -[Sql Server Management Studio](https://docs.microsoft.com/en-us/sql/ssms/download-sql-server-management-studio-ssms) to import the databases. 
+[SQL Server Management Studio](https://docs.microsoft.com/en-us/sql/ssms/download-sql-server-management-studio-ssms) to import the databases. -Sql Server - [LocalDb, Express, and Developer Editions](https://www.microsoft.com/en-ca/sql-server/sql-server-downloads) +SQL Server - [LocalDb, Express, and Developer Editions](https://www.microsoft.com/en-ca/sql-server/sql-server-downloads) # Import data: -Once you install the necessary tools, you can download the dataset files from [here]() and import them using Sql Server Management Studio. -After importing the files, you have to update the database credntials in the files that needs connection to the datase. These files are locates inside: +Once you install the necessary tools, you can download the dataset files from [here](https://zenodo.org/record/6403760#.Ykc9zW7MJuU) and import them using SQL Server Management Studio. +After importing the files, you have to update the database credentials in the files that need connection to the database. These files are located inside: - /notebooks - /ReplicationPackage ## Step 1: Prepare the model and data (data exist in the replication package) -You start by running 'notebooks\RQ1_PRMetricExtraction.ipynb' to extract metrics from github, please fill the github id and token with your own token and id. Then, run the '/notebooks\RQ1_CreatePeriodicModels.ipynb' notebook and create the models for all the periods of the studied project. Then, you can run the other RQ1 notebook (notebooks\RQ1_Figure-balanced_accuracy.ipynb) to see the distribution of the predicted defect proness. +You start by running 'notebooks\RQ1_PRMetricExtraction.ipynb' to extract metrics from GitHub, please fill in the GitHub id and token with your own token and id. Then, run the '/notebooks\RQ1_CreatePeriodicModels.ipynb' notebook and create the models for all the periods of the studied project. 
Then, you can run the other RQ1 notebook (notebooks\RQ1_Figure-balanced_accuracy.ipynb) to see the distribution of the predicted defect proneness. ## Step 2: (RQ1) How do existing CRR approaches perform with respect to the risk of inducing future fixes? -Now you should run the simulator for differnt projects. You can open the visual studio.net and open the project properties. Under debug, run the following command: +Now you should run the simulator for different projects. You can open the visual studio.net and open the project properties. Under debug, run the following command: ``` --cmd simulate-recommender --recommendation-strategy Strategy --conf-path "Absolute/address/to/json/setting.json" ``` @@ -35,25 +35,25 @@ Where Strategy should be replaced by one of: - AuthorshipRec - RevOwnRec -And the json file is the one resides in /ReplicationPackage folder. +And the JSON file is the one that resides in /ReplicationPackage folder. Once the replication is finished, you can runthe following command to compare the CRR approach with reality: ``` --cmd analyze-simulations --analyze-result-path "absolute/path/to/save/results" --recommender-simulation recommendation_id --reality-simulation reality_simulation_id --conf-path "Absolute/address/to/json/setting.json" ``` -The recommendation_id and reality_simulation_id are ids for the CRR under analyze and the reality run in the database. +The recommendation_id and reality_simulation_id are ids for the CRR under analysis and the reality run in the database. you can see the results of this RQ1 available in '/notebooks/RQ1' folder. ## Step 3: (RQ2) How can the risk of fix-inducing code changes be effectively balanced with other quantities of interest? For the next research question, you can run the simulation using the strategy of 'RAR'. You can change the value of PD in line 69 of 'src\RelationalGit.Recommendation\Strategies\Spreading\JITSofiaRecommendationStrategy.cs' file. 
In this RQ we changed the value between 0.1 and 0.9 with 0.1 intervals. -After running all the experiments, you can compare the results against reality and put the in a format similar to '/notebooks/RQ2/'. The for the analysis, please run the RQ2 notebooks and see the resutls. +After running all the experiments, you can compare the results against reality and put them in a format similar to '/notebooks/RQ2/'. Then, for the analysis, please run the RQ2 notebooks and see the results. ## Step 4: (RQ3) How can we identify an effective fix-inducing likelihood threshold (PD ) interval for a given project? -In this research question, we wanted to see if we can suggest any method to help defining a rang depending on the risk levels of the stakeholders. In order to replicate this experiments, please first run '/notebooks/RQ3_calculate_boundaries.ipynb' notbook.This notebook, find the dynamic and normalized boundaries for different periods. Once it is done, you can comment line 69 and uncomment line 70 in 'src\RelationalGit.Recommendation\Strategies\Spreading\JITSofiaRecommendationStrategy.cs' file. +In this research question, we wanted to see if we can suggest any method to help define a range depending on the risk levels of the stakeholders. In order to replicate these experiments, please first run the '/notebooks/RQ3_calculate_boundaries.ipynb' notebook. This notebook finds the dynamic and normalized boundaries for different periods. Once it is done, you can comment line 69 and uncomment line 70 in 'src\RelationalGit.Recommendation\Strategies\Spreading\JITSofiaRecommendationStrategy.cs' file. After running the experiment for three Quartiles (Q1, Q2, and Q3) for each method, you can analyze the result and put them in the format of '/notebooks/RQ3/' folder. -Next steps, would be to run '/notebooks/RQ3_merge_csvs.ipynb' notebook to merge these data into one csv. This csv should then be placed in in the folder next to the R scripts (a sample merged files is already there). 
+Next steps, would be to run '/notebooks/RQ3_merge_csvs.ipynb' notebook to merge these data into one CSV. This CSV should then be placed in in the folder next to the R scripts (a sample merged files is already there). Then you can run the R script in the following order: - '/R_scripts_RQ3/friedman_test.R': Load files and run Friendman test. - '/R_scripts_RQ3/friedman_draw_plot.R': Draw the results of the Friedman test. - '/R_scripts_RQ3/Conover_with_holm-bonferrani_method.R': Run Conover test. -- '/R_scripts_RQ3/conover_draw_plot.R': Draw the results of the conover test. +- '/R_scripts_RQ3/conover_draw_plot.R': Draw the results of the Conover test. This concludes the experiments in this study! \ No newline at end of file diff --git a/R_scripts/conover_draw_plot.R b/R_scripts/conover_draw_plot.R deleted file mode 100644 index 55739a1..0000000 --- a/R_scripts/conover_draw_plot.R +++ /dev/null @@ -1,40 +0,0 @@ - - -#pvalues=c(dynamic.norm,dynamic.static,norm.static) -#comparison=c(c(rep("dynamic vs norm",length(dynamic.norm))), -#c(rep("dynamic vs static",length(dynamic.static))), -#c(rep("static vs norm",length(norm.static)))) - -#m <- as.data.frame(cbind(pvalues,comparison)) -#print(m) -#df$pvalue=as.numeric(df$pvalue) -base_breaks <- function(n = 3){ - function(x) { - axisTicks(range(x, na.rm = TRUE), log = FALSE, n = n) - } -} - -df$pvalue <- as.double(df$pvalue) - -ggplot(df, aes(type,pvalue),shape=factor(project)) + - facet_grid(vars(threshold), scales="free")+ - theme_bw()+ - geom_point(aes(shape = project),size=5,alpha=0.5)+ - #scale_y_continuous(trans = 'log2',labels = scales::scientific)+ - scale_y_continuous(trans=scales::pseudo_log_trans(base = 10), - labels = scales::number_format(accuracy = 0.01,decimal.mark = '.'), - breaks = base_breaks(), - limits = c(0, 0.2))+ - geom_hline(yintercept =0.05 ,alpha=0.5, color="red")+ - labs( x = "Method Pairs", y = "P-Values",shape = "Projects: \n")+ - theme( - panel.background = element_rect(fill = NA), - axis.text = 
element_text(size=20, hjust=1,color = "black"), - axis.title=element_text(size=24), - strip.text.x = element_text(size = 18), - strip.text.y = element_text(size = 18), - legend.text = element_text(size = 18), - legend.position = "top", - legend.title = element_text(size = 18), - axis.text.x=element_text(size=18, hjust=0.5,vjust=0.2) - ) diff --git a/R_scripts/friedman_draw_plot.R b/R_scripts/friedman_draw_plot.R deleted file mode 100644 index eae23b1..0000000 --- a/R_scripts/friedman_draw_plot.R +++ /dev/null @@ -1,17 +0,0 @@ -# draw plot -data=read.csv("./defect_expertiseloss.csv") -ggplot(data, aes(res, method)) + - geom_violin(aes(fill=res),draw_quantiles = c(0.25, 0.5, 0.75),fill='#e0e0e0') + - facet_grid(vars(threshold), vars(project), scales="free")+ - theme_bw()+ - geom_point(position = position_jitter(seed = 1, width = 0.1),size=0.5,alpha = 0.2 )+ - theme( - panel.background = element_rect(fill = NA), - axis.text = element_text(size=20, hjust=1,color = "black"), - axis.title=element_text(size=24), - strip.text.x = element_text(size = 18), - strip.text.y = element_text(size = 18), - - )+ - labs( x = "Performance Improvement (%)", y = "Methods") - diff --git a/R_scripts/friedman_test.R b/R_scripts/friedman_test.R deleted file mode 100644 index 2f14f93..0000000 --- a/R_scripts/friedman_test.R +++ /dev/null @@ -1,30 +0,0 @@ -library(magrittr) # needs to be run every time you start R and want to use %>% -library(dplyr) -library(tidyverse) -library(ggpubr) -library(rstatix) -library(stats) - -setwd("/projectfolder") -data=read.csv("./defect_expertiseloss.csv") # This file is the output of the notebook file -prjs <- c('Roslyn','Rust','Kubernetes') -thrs <- c('0.25','0.5','0.75') -for (prj in prjs) { - for (thr in thrs) { - print('------------------') - print(prj) - print(thr) - subData<- subset(data, (project==prj & threshold==thr)) - subData$method<-factor(subData$method) - subData$PeriodId<-factor(subData$PeriodId) - #res.aov <- subData %>% 
friedman.test(y=subData$res , group=subData$method, block=subData$PeriodId , data = subData) - res.friedman_test <- subData %>% friedman_test(res ~ method | PeriodId) - #res.friedman_test <- friedman.test(subData$res, subData$method, subData$PeriodId) - print(res.friedman_test) - res.KW<- subData %>%friedman_effsize(res ~ method | PeriodId) - print(res.KW) - - } -} - - diff --git a/R_scripts/holm-bonferrani.R b/R_scripts/holm-bonferrani.R deleted file mode 100644 index f928ecf..0000000 --- a/R_scripts/holm-bonferrani.R +++ /dev/null @@ -1,55 +0,0 @@ -library(magrittr) # needs to be run every time you start R and want to use %>% -library(dplyr) -library(tidyverse) -library(ggpubr) -library(rstatix) -library(stats) -library(DescTools) -library(PMCMRplus) -setwd("/Users/farshadkazemi/Downloads") - -data=read.csv("./defect_expertiseloss.csv") -prjs <- c('Roslyn','Rust','Kubernetes') -thrs <- c('0.25','0.5','0.75') -dynamic.norm <- c() -dynamic.static <- c() -norm.static <- c() - - -df <- data.frame(threshold=factor(levels = unique(data$threshold)), - project=factor( levels = unique(data$project)), - pvalue=double(), - type=factor(levels=c("dynamic vs norm","dynamic vs static","norm vs static")), - stringsAsFactors=FALSE) -for (prj in prjs) { - for (thr in thrs) { - print('------------------') - print(prj) - print(thr) - subData<- subset(data, (project==prj & threshold==thr)) - subData$method<-factor(subData$method) - subData$PeriodId<-factor(subData$PeriodId) - #res.friedman_test <- subData %>% friedman_test(res ~ method | PeriodId) - - #f=res.friedman_test$statistic - #k=dim(subData)[2] - #nf=length(subData[,1]) - #res.conover<-ConoverTest(subData$res ~ subData$method | subData$PeriodId,method = "holm") - res.conover<-frdAllPairsConoverTest(subData$res,subData$method,subData$PeriodId,p.adjust.method = "holm") - pvalues=res.conover$p.value - - df[nrow(df) + 1,] = c(thr,prj,pvalues[1,1],"dynamic vs norm") - df[nrow(df) + 1,] = c(thr,prj,pvalues[2,1],"dynamic vs static") 
- df[nrow(df) + 1,] = c(thr,prj,pvalues[2,2],"norm vs static") - - dynamic.norm <- c(dynamic.norm, pvalues[1,1]) - dynamic.static <- c(dynamic.static, pvalues[2,1]) - norm.static <- c(norm.static, pvalues[2,2]) - - print(res.conover$p.value) - - - } -} - -