From a04cc45720aebddfaebd7ce7a41d9a2bf22bf833 Mon Sep 17 00:00:00 2001 From: Thomas Wayne McCarthy <10548984+twmccart@users.noreply.github.com> Date: Mon, 21 Jun 2021 13:32:15 -0400 Subject: [PATCH 1/5] Adjust filename cleaning regex so that chromosome names with periods do not cause inappropriate splitting. --- src/cineca/parallel_reditools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cineca/parallel_reditools.py b/src/cineca/parallel_reditools.py index eccfcf2..c99d2ba 100755 --- a/src/cineca/parallel_reditools.py +++ b/src/cineca/parallel_reditools.py @@ -566,7 +566,8 @@ def calculate_intervals(total_coverage, coverage_file, region): if little_file.endswith("groups.txt"): continue print(little_file) - pieces = re.sub("\..*", "", os.path.basename(little_file)).split("#") + # Strip the file ending and split by '#' + pieces = re.sub(r"\.[^\.#]*", "", os.path.basename(little_file)).split("#") pieces.insert(0, little_file) little_files.append(pieces) From b939b50e881deb003cd12172cdc318feaf4ab605 Mon Sep 17 00:00:00 2001 From: Thomas Wayne McCarthy <10548984+twmccart@users.noreply.github.com> Date: Mon, 21 Jun 2021 13:45:55 -0400 Subject: [PATCH 2/5] Also fixed the regex of filenames that have periods inside them in reditools2_multisample.py --- src/cineca/reditools2_multisample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cineca/reditools2_multisample.py b/src/cineca/reditools2_multisample.py index 99d8521..16814c2 100755 --- a/src/cineca/reditools2_multisample.py +++ b/src/cineca/reditools2_multisample.py @@ -515,7 +515,7 @@ def calculate_intervals(total_coverage, coverage_file, region): if little_file.endswith("groups.txt"): continue print(little_file) - pieces = re.sub("\..*", "", os.path.basename(little_file)).split("#") + pieces = re.sub(r"\.[^\.#]*", "", os.path.basename(little_file)).split("#") pieces.insert(0, little_file) little_files.append(pieces) From a48371f285f9a14eac4845b7606e1d36c90c17a6 Mon Sep 17 00:00:00 2001 From: Thomas Wayne McCarthy <10548984+twmccart@users.noreply.github.com> Date: Mon, 21 Jun 2021 15:37:05 -0400 Subject: [PATCH 3/5] Last fix did not work, this change should properly process filenames that have internal periods. --- src/cineca/parallel_reditools.py | 2 +- src/cineca/reditools2_multisample.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cineca/parallel_reditools.py b/src/cineca/parallel_reditools.py index c99d2ba..591ac5c 100755 --- a/src/cineca/parallel_reditools.py +++ b/src/cineca/parallel_reditools.py @@ -567,7 +567,7 @@ def calculate_intervals(total_coverage, coverage_file, region): print(little_file) # Strip the file ending and split by '#' - pieces = re.sub(r"\.[^\.#]*", "", os.path.basename(little_file)).split("#") + pieces = re.sub(r"\.[^\.]*$", "", os.path.basename(little_file)).split("#") pieces.insert(0, little_file) little_files.append(pieces) diff --git a/src/cineca/reditools2_multisample.py b/src/cineca/reditools2_multisample.py index 16814c2..5222c09 100755 --- a/src/cineca/reditools2_multisample.py +++ b/src/cineca/reditools2_multisample.py @@ -515,7 +515,7 @@ def calculate_intervals(total_coverage, coverage_file, region): if little_file.endswith("groups.txt"): continue print(little_file) - pieces = re.sub(r"\.[^\.#]*", "", os.path.basename(little_file)).split("#") + pieces = re.sub(r"\.[^\.]*$", "", os.path.basename(little_file)).split("#") pieces.insert(0, little_file) little_files.append(pieces) From 65f1c9699af6fe7b049fa6144fc04ef1941ed506 Mon Sep 17 00:00:00 2001 From: Thomas Wayne McCarthy <10548984+twmccart@users.noreply.github.com> Date: Thu, 24 Jun 2021 12:02:10 -0400 Subject: [PATCH 4/5] Stop setting option "remove_header" to always True. --- src/cineca/parallel_reditools.py | 2 +- src/cineca/reditools2_multisample.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cineca/parallel_reditools.py b/src/cineca/parallel_reditools.py index 591ac5c..6d8ff5f 100755 --- a/src/cineca/parallel_reditools.py +++ b/src/cineca/parallel_reditools.py @@ -165,7 +165,7 @@ def calculate_intervals(total_coverage, coverage_file, region): size = comm.Get_size() options = reditools.parse_options() - options["remove_header"] = True + #options["remove_header"] = True parser = argparse.ArgumentParser(description='REDItools 2.0') parser.add_argument('-G', '--coverage-file', help='The coverage file of the sample to analyze') diff --git a/src/cineca/reditools2_multisample.py b/src/cineca/reditools2_multisample.py index 5222c09..839e386 100755 --- a/src/cineca/reditools2_multisample.py +++ b/src/cineca/reditools2_multisample.py @@ -144,7 +144,7 @@ def calculate_intervals(total_coverage, coverage_file, region): size = comm.Get_size() options = reditools.parse_options() - options["remove_header"] = True + #options["remove_header"] = True parser = argparse.ArgumentParser(description='REDItools 2.0') parser.add_argument('-D', '--coverage-dir', help='The coverage directory containing the coverage file of the sample to analyze divided by chromosome') From f2ba6b472bad75873a2d8adcb141350eade284b5 Mon Sep 17 00:00:00 2001 From: Thomas Wayne McCarthy <10548984+twmccart@users.noreply.github.com> Date: Fri, 25 Jun 2021 22:18:50 -0400 Subject: [PATCH 5/5] Re-enabling headers was a mistake. Until tabix can index a non-standard tab-delimited file, the header causes a problem. The -S 1 argument to tabix in merge.sh should work, but does not with tabix v1.12. --- src/cineca/parallel_reditools.py | 3 ++- src/cineca/reditools2_multisample.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cineca/parallel_reditools.py b/src/cineca/parallel_reditools.py index 6d8ff5f..0eb9bf9 100755 --- a/src/cineca/parallel_reditools.py +++ b/src/cineca/parallel_reditools.py @@ -165,7 +165,8 @@ def calculate_intervals(total_coverage, coverage_file, region): size = comm.Get_size() options = reditools.parse_options() - #options["remove_header"] = True + # Until tabix can properly index the output, the header is disruptive. + options["remove_header"] = True parser = argparse.ArgumentParser(description='REDItools 2.0') parser.add_argument('-G', '--coverage-file', help='The coverage file of the sample to analyze') diff --git a/src/cineca/reditools2_multisample.py b/src/cineca/reditools2_multisample.py index 839e386..98a7265 100755 --- a/src/cineca/reditools2_multisample.py +++ b/src/cineca/reditools2_multisample.py @@ -144,7 +144,8 @@ def calculate_intervals(total_coverage, coverage_file, region): size = comm.Get_size() options = reditools.parse_options() - #options["remove_header"] = True + # Until tabix can properly index the output, the header is disruptive. + options["remove_header"] = True parser = argparse.ArgumentParser(description='REDItools 2.0') parser.add_argument('-D', '--coverage-dir', help='The coverage directory containing the coverage file of the sample to analyze divided by chromosome')