Skip to content

Commit

Permalink
Merge pull request #15 from VIB-PSB/dev
Browse files Browse the repository at this point in the history
Release 1.1 - e2e test
  • Loading branch information
nicomaper authored Oct 6, 2023
2 parents 3cb9481 + db632dc commit 24eef94
Show file tree
Hide file tree
Showing 32 changed files with 101,140 additions and 112 deletions.
48 changes: 48 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Continuous-integration workflow: runs the MINI-AC nf-test suite on every
# push to, and every pull request against, the "main" and "dev" branches.
name: MINI-AC test suite

on:
  push:
    branches: [ "main", "dev" ]
  pull_request:
    branches: [ "main", "dev" ]

jobs:
  nf-test:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      # Replace the %TMP% placeholder in the test config with the runner's
      # temporary directory. '@' is used as the sed delimiter because the
      # substituted path contains '/'.
      - name: Prepare nf-test config file
        run: sed -i -e "s@%TMP%@${RUNNER_TEMP}@g" tests/nextflow.config

      - uses: actions/setup-java@v3
        with:
          distribution: oracle
          java-version: 17

      - name: Check Java version
        run: java -version

      # NOTE(review): the pinned version of this action was lost to e-mail
      # obfuscation ("[email protected]") in the copy this file was restored
      # from; "v1" is the action's major-version tag. Confirm the intended pin.
      - name: Setup Nextflow
        uses: nf-core/setup-nextflow@v1

      - name: Setup singularity
        uses: eWaterCycle/setup-singularity@v7
        with:
          singularity-version: 3.8.3

      # The installer drops the nf-test executable into the working directory
      # (it is invoked as ./nf-test below). "shell: bash" is set explicitly so
      # the step runs with pipefail: without it a failed wget would feed an
      # empty script to bash and the step would still succeed.
      - name: Setup nf-test
        shell: bash
        run: wget -qO- https://code.askimed.com/install/nf-test | bash

      # The motif mapping files are git-ignored (*motif_mappings*.bed) and are
      # therefore downloaded at test time.
      # --fail makes curl exit non-zero on an HTTP error instead of saving the
      # error page as a .bed file; --location follows redirects; --retry
      # covers transient network failures.
      # NOTE(review): --insecure (-k) disables TLS certificate verification.
      # It is kept to preserve existing behaviour; drop it if the server
      # presents a certificate chain the runner trusts.
      - name: Fetch motif mapping files
        run: |
          curl --insecure --fail --location --retry 3 -o tests/data/zma_v4_chr1/zma_v4_genome_wide_motif_mappings_chr1.bed https://floppy.psb.ugent.be/index.php/s/NekMYztyxEnsQiY/download/zma_v4_genome_wide_motif_mappings_chr1.bed
          curl --insecure --fail --location --retry 3 -o tests/data/zma_v4_chr1/zma_v4_locus_based_motif_mappings_5kbup_1kbdown_chr1.bed https://floppy.psb.ugent.be/index.php/s/r2wQmFjPy79qSp7/download/zma_v4_locus_based_motif_mappings_5kbup_1kbdown_chr1.bed
          curl --insecure --fail --location --retry 3 -o data/ath/ath_genome_wide_motif_mappings.bed https://floppy.psb.ugent.be/index.php/s/iaZPwdrRGe3YDdK/download/ath_genome_wide_motif_mappings.bed
          curl --insecure --fail --location --retry 3 -o data/ath/ath_locus_based_motif_mappings_5kbup_1kbdown.bed https://floppy.psb.ugent.be/index.php/s/qcQ7KndzHaSpd9e/download/ath_locus_based_motif_mappings_5kbup_1kbdown.bed

      - name: Run nf-test
        shell: bash
        run: ./nf-test test
29 changes: 29 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# ignore Nextflow cache and logs
.nextflow/
.nextflow.log*

# ignore Singularity cache
singularity_cache/

# ignore large motif mapping files
*motif_mappings*.bed

# ignore nf-test executable
nf-test

# ignore test cache
.nf-test/

# ignore test outputs
tests/outputs/

# ignore SLURM output and error files
slurm.*.out
slurm.*.err

# ignore jupyter notebook checkpoints
.ipynb_checkpoints/

# python cache and compiled files
__pycache__/
*.pyc
3 changes: 2 additions & 1 deletion bin/add_go_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
import go_manipulations

gene_go_file = argv[1]
ontology_file = argv[2]

go_tree = go_manipulations.GOtree(path.join(path.dirname(path.dirname(argv[0])), "ontologies", "go.obo"))
go_tree = go_manipulations.GOtree(ontology_file)

go_tree.add_descriptions(gene_go_file)
16 changes: 11 additions & 5 deletions bin/getGO_xlsx_gw.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def parseArgs():
parser.add_argument('-ex', '--expressed_genes_file', nargs = 1, type = str,
default = None, help = '',
metavar = 'List of genes expressed in biological context of experiment')

args = parser.parse_args()

return args
Expand Down Expand Up @@ -94,8 +94,11 @@ def parseArgs():

if not GO_info:
empty_table = pd.DataFrame(["### This dataset did not yield any GO enrichment"])
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
if(output_file.endswith('.csv')):
empty_table.to_csv(output_file, index = False, header = False)
else:
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
sys.exit()

### Integrating data ###
Expand Down Expand Up @@ -130,5 +133,8 @@ def parseArgs():

### Writing output file ###

with pd.ExcelWriter(output_file) as writer:
go_df.to_excel(writer, index = False)
if (output_file.endswith('.csv')):
go_df.to_csv(output_file, index = False)
else:
with pd.ExcelWriter(output_file) as writer:
go_df.to_excel(writer, index = False)
16 changes: 11 additions & 5 deletions bin/getGO_xlsx_lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def parseArgs():
parser.add_argument('-ex', '--expressed_genes_file', nargs = 1, type = str,
default = None, help = '',
metavar = 'List of genes expressed in biological context of experiment')

args = parser.parse_args()

return args
Expand Down Expand Up @@ -94,8 +94,11 @@ def parseArgs():

if not GO_info:
empty_table = pd.DataFrame(["### This dataset did not yield any GO enrichment"])
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
if(output_file.endswith('.csv')):
empty_table.to_csv(output_file, index = False, header = False)
else:
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
sys.exit()

### Integrating data ###
Expand Down Expand Up @@ -130,5 +133,8 @@ def parseArgs():

### Writing output file ###

with pd.ExcelWriter(output_file) as writer:
go_df.to_excel(writer, index = False)
if (output_file.endswith('.csv')):
go_df.to_csv(output_file, index = False)
else:
with pd.ExcelWriter(output_file) as writer:
go_df.to_excel(writer, index = False)
18 changes: 12 additions & 6 deletions bin/getMotifCentricOutput_gw.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,11 @@ def parseArgs():

if enr_stats.empty:
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
if(output_file.endswith('.csv')):
empty_table.to_csv(output_file, index = False, header = False)
else:
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
sys.exit()

for col in enr_stats.select_dtypes(include = ['float']).columns:
Expand All @@ -122,11 +125,11 @@ def parseArgs():
if expressed_genes_file:
enr_stats['Any expressed gene'] = enr_stats.gene_id.isin(exp_genes)

enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)

if not expressed_genes_file:

enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
enr_stats = enr_stats.groupby(['dataset', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)

enr_stats = enr_stats.merge(mot_tf, how = 'right', left_on = 'motif', right_on = 'motif_id').drop('motif_id', axis = 1)

Expand All @@ -146,5 +149,8 @@ def parseArgs():

### Writing output file ###

with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
if (output_file.endswith('.csv')):
enr_stats.to_csv(output_file, index = False)
else:
with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
18 changes: 12 additions & 6 deletions bin/getMotifCentricOutput_lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,11 @@ def parseArgs():

if enr_stats.empty:
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
if(output_file.endswith('.csv')):
empty_table.to_csv(output_file, index = False, header = False)
else:
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
sys.exit()

for col in enr_stats.select_dtypes(include = ['float']).columns:
Expand All @@ -122,11 +125,11 @@ def parseArgs():
if expressed_genes_file:
enr_stats['Any expressed gene'] = enr_stats.gene_id.isin(exp_genes)

enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif','real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif','real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x))), 'Any expressed gene': any}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)

if not expressed_genes_file:

enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(list(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)
enr_stats = enr_stats.groupby(['dataset', 'input_total_peaks', 'peaks_in_promoter', 'motif', 'real_int', 'shuffled_int', 'p_val', 'enr_fold', 'adj_pval', 'pi_value', 'rank_pi_val']).agg({'gene_id': ','.join, 'family': lambda x: ','.join(sorted(set(x)))}).reset_index().sort_values(by = 'rank_pi_val').drop('gene_id', axis = 1)

enr_stats = enr_stats.merge(mot_tf, how = 'right', left_on = 'motif', right_on = 'motif_id').drop('motif_id', axis = 1)

Expand All @@ -146,5 +149,8 @@ def parseArgs():

### Writing output file ###

with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
if (output_file.endswith('.csv')):
enr_stats.to_csv(output_file, index = False)
else:
with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
15 changes: 11 additions & 4 deletions bin/getTFCentricOutput_gw.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,8 +133,11 @@ def parseArgs():

if enr_stats.empty:
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
if(output_file.endswith('.csv')):
empty_table.to_csv(output_file, index = False, header = False)
else:
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
sys.exit()

### Reading and processing GO enrichment data ###
Expand Down Expand Up @@ -261,5 +264,9 @@ def parseArgs():

### Writing output file ###

with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
if (output_file.endswith('.csv')):
enr_stats.to_csv(output_file, index = False)
else:
with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)

16 changes: 11 additions & 5 deletions bin/getTFCentricOutput_lb.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def parseArgs():
parser.add_argument('-ex', '--expressed_genes_file', nargs = 1, type = str,
default = None, help = '',
metavar = 'List of genes expressed in biological context of experiment')

args = parser.parse_args()

return args
Expand Down Expand Up @@ -133,8 +133,11 @@ def parseArgs():

if enr_stats.empty:
empty_table = pd.DataFrame(["### This dataset did not yield any motif enrichment"])
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
if(output_file.endswith('.csv')):
empty_table.to_csv(output_file, index = False, header = False)
else:
with pd.ExcelWriter(output_file) as writer:
empty_table.to_excel(writer, index = False, header = False)
sys.exit()

### Reading and processing GO enrichment data ###
Expand Down Expand Up @@ -261,5 +264,8 @@ def parseArgs():

### Writing output file ###

with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
if (output_file.endswith('.csv')):
enr_stats.to_csv(output_file, index = False)
else:
with pd.ExcelWriter(output_file) as writer:
enr_stats.to_excel(writer, index = False)
23 changes: 15 additions & 8 deletions bin/processStats_bps_gw.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ def parseArgs():
parser.add_argument('output', nargs = 1, type = str,
help = '',
metavar = 'output_file')

parser.add_argument('shuffle_count', nargs = 1, type = int,
help = '',
metavar = 'Number of ACR shuffles that were performed for background generation')

args = parser.parse_args()

Expand All @@ -37,6 +41,7 @@ def parseArgs():
out_file = args.output[0]
cns_list_file = args.cns_sets_list[0]
total_peaks = args.num_peaks[0]
shuffle_count = args.shuffle_count[0]

file_name = "_".join(raw_file.split("/")[-1].split("_")[0:-5])

Expand Down Expand Up @@ -77,26 +82,28 @@ def parseArgs():
try:
shuff_overlap = np.array(shuff_dict[cns_set])
except KeyError:
shuff_overlap = np.zeros(1000, dtype = int)
shuff_overlap = np.zeros(shuffle_count, dtype = int)
try:
real_overlap = real_dict[cns_set]
except KeyError:
real_overlap = 0
if len(shuff_overlap) < 1000:
zero_pad = np.zeros(1000 - len(shuff_overlap), dtype = int)
if len(shuff_overlap) < shuffle_count:
zero_pad = np.zeros(shuffle_count - len(shuff_overlap), dtype = int)
shuff_overlap = np.concatenate([shuff_overlap, zero_pad])
else:
pass
p_val_1000 = len(shuff_overlap[shuff_overlap >= real_overlap])
if p_val_1000 == 0:
p_val_1000 = 0.9
times_above_real_overlap = len(shuff_overlap[shuff_overlap >= real_overlap])
if times_above_real_overlap == 0:
times_above_real_overlap = 0.9
median = np.median(shuff_overlap)
if real_overlap == 0 or median == 0:
enrichment_fold = 0
else:
enrichment_fold = real_overlap / median

p_val = times_above_real_overlap/shuffle_count

stats[(file_name, cns_set)] = [int(total_peaks), real_overlap, median, (p_val_1000/1000), enrichment_fold]
stats[(file_name, cns_set)] = [int(total_peaks), real_overlap, median, p_val, enrichment_fold]

data_df = pd.DataFrame.from_dict(stats).T
data_df = data_df.reset_index()
Expand All @@ -105,5 +112,5 @@ def parseArgs():
FDR = multipletests(data_df['p_val'], method = 'fdr_bh', alpha = 0.05)
data_df.insert(6, 'adj_pval', FDR[1])

data_df.to_csv(out_file, sep = "\t", index = None, na_rep = "nan")
data_df.sort_values(by = 'motif').to_csv(out_file, sep = "\t", index = None, na_rep = "nan")

Loading

0 comments on commit 24eef94

Please sign in to comment.