diff --git a/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py b/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py index 7ac2dd329..33171dab8 100644 --- a/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py +++ b/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py @@ -316,6 +316,88 @@ ) +def clusters_and_types(cos_angle_clusters, cc_clusters, methods): + if "cos_angle" in methods and "correlation" not in methods: + clusters = cos_angle_clusters + ctype = ["cos"] * len(clusters) + elif "correlation" in methods and "cos_angle" not in methods: + clusters = cc_clusters + ctype = ["cc"] * len(clusters) + elif "cos_angle" in methods and "correlation" in methods: + clusters = cos_angle_clusters + cc_clusters + ctype = ["cos"] * len(cos_angle_clusters) + ["cc"] * len(cc_clusters) + else: + raise ValueError("Invalid cluster method: %s" % methods) + + clusters.reverse() + ctype.reverse() + return clusters, ctype + + +def get_subclusters(params, ids_to_identifiers_map, cos_angle_clusters, cc_clusters): + subclusters = [] + + min_completeness = params.min_completeness + min_multiplicity = params.min_multiplicity + max_clusters = params.max_output_clusters + min_cluster_size = params.min_cluster_size + max_cluster_height_cos = params.hierarchical.max_cluster_height_cos + max_cluster_height_cc = params.hierarchical.max_cluster_height_cc + max_cluster_height = params.hierarchical.max_cluster_height + + clusters, ctype = clusters_and_types( + cos_angle_clusters, cc_clusters, params.hierarchical.method + ) + + n_processed_cos = 0 + n_processed_cc = 0 + + for c, cluster in zip(ctype, clusters): + # This simplifies max_cluster_height into cc and cos angle versions + # But still gives the user the option of just selecting max_cluster_height + # Which makes more sense when they only want one type of clustering + + if c == "cc" and max_cluster_height != 100 and max_cluster_height_cc == 100: + max_cluster_height_cc = max_cluster_height + # if user has weirdly set both max_cluster_height and max_cluster_height_cc + # will still default to max_cluster_height_cc as intended + if c == "cos" and max_cluster_height != 100 and max_cluster_height_cos == 100: + max_cluster_height_cos = max_cluster_height + + if n_processed_cos == max_clusters and c == "cos": + continue + if n_processed_cc == max_clusters and c == "cc": + continue + if cluster.completeness < min_completeness: + continue + if cluster.multiplicity < min_multiplicity: + continue + if ( + len(cluster.labels) + == len(ids_to_identifiers_map) # was len(data_manager_original.experiments) + and not params.hierarchical.distinct_clusters + ): + continue + if cluster.height > max_cluster_height_cc and c == "cc": + continue + if cluster.height > max_cluster_height_cos and c == "cos": + continue + if len(cluster.labels) < min_cluster_size: + continue + + cluster_identifiers = [ids_to_identifiers_map[l] for l in cluster.labels] + subclusters.append((c, cluster_identifiers, cluster)) + if ( + not params.hierarchical.distinct_clusters + ): # increment so that we only get up to N clusters + if c == "cos": + n_processed_cos += 1 + elif c == "cc": + n_processed_cc += 1 + + return subclusters + + class MultiCrystalScale: def __init__(self, experiments, reflections, params): self._data_manager = DataManager(experiments, reflections) @@ -472,119 +554,40 @@ def __init__(self, experiments, reflections, params): self._mca = self.multi_crystal_analysis() self.cluster_analysis() - min_completeness = self._params.clustering.min_completeness - min_multiplicity = self._params.clustering.min_multiplicity - max_clusters = self._params.clustering.max_output_clusters - min_cluster_size = self._params.clustering.min_cluster_size - max_cluster_height_cos = self._params.clustering.max_cluster_height_cos - max_cluster_height_cc = self._params.clustering.max_cluster_height_cc - max_cluster_height = self._params.clustering.max_cluster_height - - if ( - "cos_angle" in params.clustering.method - and "correlation" not in params.clustering.method - ): - clusters = self._cos_angle_clusters - ctype = ["cos" for i in clusters] - elif ( - "correlation" in params.clustering.method - and "cos_angle" not in params.clustering.method - ): - clusters = self._cc_clusters - ctype = ["cc" for i in clusters] - elif ( - "cos_angle" in params.clustering.method - and "correlation" in params.clustering.method - ): - clusters = self._cos_angle_clusters + self._cc_clusters - ctype = ["cos" for i in self._cos_angle_clusters] + [ - "cc" for i in self._cc_clusters - ] - else: - raise ValueError( - "Invalid cluster method: %s" % self._params.clustering.method - ) - - clusters.reverse() - ctype.reverse() - self.cos_clusters = [] - self.cc_clusters = [] - self.cos_cluster_ids = {} - self.cc_cluster_ids = {} + # now do the interesting cluster identification algorithm from xia2.cluster_analysis. + # but don't repeat the code. if self._params.clustering.output_clusters: + ## note this is all hierarchical stuff first - need an if statement self._data_manager_original = self._data_manager cwd = os.path.abspath(os.getcwd()) - n_processed_cos = 0 - n_processed_cc = 0 - - for c, cluster in zip(ctype, clusters): - # This simplifies max_cluster_height into cc and cos angle versions - # But still gives the user the option of just selecting max_cluster_height - # Which makes more sense when they only want one type of clustering - if ( - c == "cc" - and max_cluster_height != 100 - and max_cluster_height_cc == 100 - ): - max_cluster_height_cc = max_cluster_height - # if user has weirdly set both max_cluster_height and max_cluster_height_cc - # will still default to max_cluster_height_cc as intended - if ( - c == "cos" - and max_cluster_height != 100 - and max_cluster_height_cos == 100 - ): - max_cluster_height_cos = max_cluster_height - - if n_processed_cos == max_clusters and c == "cos": - continue - if n_processed_cc == max_clusters and c == "cc": - continue - if cluster.completeness < min_completeness: - continue - if cluster.multiplicity < min_multiplicity: - continue - if ( - len(cluster.labels) == len(self._data_manager_original.experiments) - and not params.clustering.find_distinct_clusters - ): - continue - if cluster.height > max_cluster_height_cc and c == "cc": - continue - if cluster.height > max_cluster_height_cos and c == "cos": - continue - if len(cluster.labels) < min_cluster_size: - continue - - data_manager = copy.deepcopy(self._data_manager_original) - cluster_identifiers = [ - data_manager.ids_to_identifiers_map[l] for l in cluster.labels - ] + subclusters = get_subclusters( + self._params.clustering, + data_manager.ids_to_identifiers_map, + self._cos_angle_clusters, + self._cc_clusters, + ) - if self._params.clustering.find_distinct_clusters: + if not self._params.clustering.hierarchical.distinct_clusters: + for c, cluster_identifiers, cluster in subclusters: + """if self._params.clustering.find_distinct_clusters: + # set things up for later, but don't scale now? if c == "cos": self.cos_clusters.append(cluster) self.cos_cluster_ids[cluster.cluster_id] = cluster_identifiers elif c == "cc": self.cc_clusters.append(cluster) - self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers + self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers""" - else: - if c == "cos": - n_processed_cos += 1 - elif c == "cc": - n_processed_cc += 1 - - if c == "cos": - logger.info("Scaling cos cluster %i:" % cluster.cluster_id) - logger.info(cluster) - cluster_dir = "cos_cluster_%i" % cluster.cluster_id - elif c == "cc": - logger.info("Scaling cc cluster %i:" % cluster.cluster_id) - logger.info(cluster) - cluster_dir = "cc_cluster_%i" % cluster.cluster_id + # else: + # if c == "cos": + # n_processed_cos += 1 + # elif c == "cc": + # n_processed_cc += 1 + logger.info(f"Scaling {c} cluster {cluster.cluster_id}:") + logger.info(cluster) + cluster_dir = f"{c}_cluster_{cluster.cluster_id}" if not os.path.exists(cluster_dir): os.mkdir(cluster_dir) @@ -600,28 +603,44 @@ def __init__(self, experiments, reflections, params): ) os.chdir(cwd) - if self._params.clustering.find_distinct_clusters: - for k, clusters in enumerate([self.cos_clusters, self.cc_clusters]): - if k == 0 and "cos_angle" in self._params.clustering.method: - cty = "cos" - elif k == 1 and "correlation" in self._params.clustering.method: - cty = "cc" - else: - cty = False - - if cty: - logger.info("----------------------") - logger.info(f"{cty} cluster analysis") - logger.info("----------------------") + if self._params.clustering.hierarchical.distinct_clusters: + self.cos_clusters = [] + self.cc_clusters = [] + self.cos_cluster_ids = {} + self.cc_cluster_ids = {} + for c, cluster_identifiers in subclusters: + if c == "cos": + self.cos_clusters.append(cluster) + self.cos_cluster_ids[cluster.cluster_id] = cluster_identifiers + elif c == "cc": + self.cc_clusters.append(cluster) + self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers - ( - file_data, - list_of_clusters, - ) = MultiCrystalAnalysis.interesting_cluster_identification( - clusters, self._params - ) + for k, clusters in enumerate([self.cos_clusters, self.cc_clusters]): + if ( + k == 0 + and "cos_angle" in self._params.clustering.hierarchical.method + ): + cty = "cos" + elif ( + k == 1 + and "correlation" in self._params.clustering.hierarchical.method + ): + cty = "cc" + else: + cty = False + + if cty: + logger.info("----------------------") + logger.info(f"{cty} cluster analysis") + logger.info("----------------------") + + _, list_of_clusters = ( + MultiCrystalAnalysis.interesting_cluster_identification( + clusters, self._params + ) + ) - if len(list_of_clusters) > 0: for item in list_of_clusters: if k == 0: cluster_dir = "cos_" + item diff --git a/src/xia2/cli/cluster_analysis.py b/src/xia2/cli/cluster_analysis.py index 511f8b1e5..89af1bfaa 100644 --- a/src/xia2/cli/cluster_analysis.py +++ b/src/xia2/cli/cluster_analysis.py @@ -2,7 +2,7 @@ import copy import logging -import os +import pathlib import random import sys @@ -34,16 +34,16 @@ .help = "Set this to true to enable scaling and merging of individual clusters" .short_caption = "Output individual clusters" - output_correlation_cluster_number = 0 + output_correlation_cluster_number = None .type = int .short_caption = "Option to output a specific correlation cluster when re-running the code" - output_cos_cluster_number = 0 + output_cos_cluster_number = None .type = int .short_caption = "Option to output a specific cos cluster when re-running the code" - exclude_correlation_cluster_number = 0 + exclude_correlation_cluster_number = None .type = int .short_caption = "Option to output all data excluding a specific correlation cluster" - exclude_cos_cluster_number = 0 + exclude_cos_cluster_number = None .type = int .short_caption = "option to output all data excluding a specific cos cluster" @@ -196,155 +196,22 @@ def run(args=sys.argv[1:]): logger.info("\nCos(angle) clustering summary:") logger.info(tabulate(MCA.cos_table, headers="firstrow", tablefmt="rst")) - min_completeness = params.clustering.min_completeness - min_multiplicity = params.clustering.min_multiplicity - max_clusters = params.clustering.max_output_clusters - min_cluster_size = params.clustering.min_cluster_size - max_cluster_height_cos = params.clustering.hierarchical.max_cluster_height_cos - max_cluster_height_cc = params.clustering.hierarchical.max_cluster_height_cc - max_cluster_height = params.clustering.hierarchical.max_cluster_height - - if not os.path.exists("cc_clusters"): - os.mkdir("cc_clusters") - if not os.path.exists("cos_angle_clusters"): - os.mkdir("cos_angle_clusters") - - if ( - "cos_angle" in params.clustering.hierarchical.method - and "correlation" not in params.clustering.hierarchical.method - ): - clusters = MCA.cos_angle_clusters - ctype = ["cos" for i in clusters] - elif ( - "correlation" in params.clustering.hierarchical.method - and "cos_angle" not in params.clustering.hierarchical.method - ): - clusters = MCA.correlation_clusters - ctype = ["cc" for i in clusters] - elif ( - "cos_angle" in params.clustering.hierarchical.method - and "correlation" in params.clustering.hierarchical.method - ): - clusters = MCA.cos_angle_clusters + MCA.correlation_clusters - ctype = ["cos" for i in MCA.cos_angle_clusters] + [ - "cc" for i in MCA.correlation_clusters - ] - - clusters.reverse() - ctype.reverse() - cos_clusters = [] - cc_clusters = [] - cos_cluster_ids = {} - cc_cluster_ids = {} + cwd = pathlib.Path.cwd() + if not pathlib.Path.exists(cwd / "cos_clusters"): + pathlib.Path.mkdir(cwd / "cos_clusters") + if not pathlib.Path.exists(cwd / "cc_clusters"): + pathlib.Path.mkdir(cwd / "cc_clusters") - if params.clustering.output_clusters: - n_processed_cos = 0 - n_processed_cc = 0 - - for c, cluster in zip(ctype, clusters): - # This simplifies max_cluster_height into cc and cos angle versions - # But still gives the user the option of just selecting max_cluster_height - # Which makes more sense when they only want one type of clustering + # First get any specific requested/excluded clusters - if ( - c == "cc" - and max_cluster_height != 100 - and max_cluster_height_cc == 100 - ): - max_cluster_height_cc = max_cluster_height - # if user has weirdly set both max_cluster_height and max_cluster_height_cc - # will still default to max_cluster_height_cc as intended - if ( - c == "cos" - and max_cluster_height != 100 - and max_cluster_height_cos == 100 - ): - max_cluster_height_cos = max_cluster_height - - if n_processed_cos == max_clusters and c == "cos": - continue - if n_processed_cc == max_clusters and c == "cc": - continue - if cluster.completeness < min_completeness: - continue - if cluster.multiplicity < min_multiplicity: - continue - if ( - len(cluster.labels) == len(experiments) - and not params.clustering.hierarchical.distinct_clusters - ): - continue - if cluster.height > max_cluster_height_cc and c == "cc": - continue - if cluster.height > max_cluster_height_cos and c == "cos": - continue - if len(cluster.labels) < min_cluster_size: - continue - - cluster_identifiers = [ - MCA.ids_to_identifiers_map[l] for l in cluster.labels - ] - - if params.clustering.hierarchical.distinct_clusters: - if c == "cos": - cos_clusters.append(cluster) - cos_cluster_ids[cluster.cluster_id] = cluster_identifiers - elif c == "cc": - cc_clusters.append(cluster) - cc_cluster_ids[cluster.cluster_id] = cluster_identifiers - - else: - if c == "cos": - n_processed_cos += 1 - cluster_dir = ( - "cos_angle_clusters/cluster_%i" % cluster.cluster_id - ) - if ( - params.clustering.output_cos_cluster_number == 0 - or params.clustering.output_cos_cluster_number - == cluster.cluster_id - ): - logger.info( - "Outputting cos cluster %i:" % cluster.cluster_id - ) - logger.info(cluster) - output_cluster( - cluster_dir, - experiments, - reflections, - cluster_identifiers, - cluster, - ) - elif c == "cc": - n_processed_cc += 1 - cluster_dir = "cc_clusters/cluster_%i" % cluster.cluster_id - if ( - params.clustering.output_correlation_cluster_number == 0 - or params.clustering.output_correlation_cluster_number - == cluster.cluster_id - ): - logger.info( - "Outputting cc cluster %i:" % cluster.cluster_id - ) - logger.info(cluster) - output_cluster( - cluster_dir, - experiments, - reflections, - cluster_identifiers, - cluster, - ) - - for c, cluster in zip(ctype, clusters): - # Specific Clusters to output - - if ( - params.clustering.output_correlation_cluster_number - == cluster.cluster_id - and c == "cc" - ): - logger.info("Outputting cluster number %i:" % cluster.cluster_id) - new_folder = "cc_clusters/cluster_%i" % cluster.cluster_id + # These are options that are only available to xia2.cluster_analysis + if params.clustering.output_cos_cluster_number: + for cluster in MCA.cos_angle_clusters: + if params.clustering.output_cos_cluster_number == cluster.cluster_id: + logger.info( + f"Outputting cos angle cluster number {cluster.cluster_id}" + ) + new_folder = cwd / "cos_clusters" / f"cluster_{cluster.cluster_id}" identifiers = [ MCA.ids_to_identifiers_map[l] for l in cluster.labels ] @@ -353,14 +220,15 @@ def run(args=sys.argv[1:]): experiments, reflections, identifiers, - cluster, ) + if params.clustering.output_correlation_cluster_number: + for cluster in MCA.correlation_clusters: if ( - params.clustering.output_cos_cluster_number == cluster.cluster_id - and c == "cos" + params.clustering.output_correlation_cluster_number + == cluster.cluster_id ): - logger.info("Outputting cluster number %i:" % cluster.cluster_id) - new_folder = "cos_angle_clusters/cluster_%i" % cluster.cluster_id + logger.info(f"Outputting cc cluster number {cluster.cluster_id}") + new_folder = cwd / "cc_clusters" / f"cluster_{cluster.cluster_id}" identifiers = [ MCA.ids_to_identifiers_map[l] for l in cluster.labels ] @@ -369,21 +237,19 @@ def run(args=sys.argv[1:]): experiments, reflections, identifiers, - cluster, ) - # Excluded Clusters - + if params.clustering.exclude_correlation_cluster_number: + for cluster in MCA.correlation_clusters: if ( params.clustering.exclude_correlation_cluster_number == cluster.cluster_id - and c == "cc" ): logger.info( - "Outputting data excluding cc cluster %i:" % cluster.cluster_id + f"Outputting data excluding cc cluster {cluster.cluster_id}" ) - new_folder = "cc_clusters/excluded_cluster_" + str( - cluster.cluster_id + new_folder = ( + cwd / "cc_clusters" / f"excluded_cluster_{cluster.cluster_id}" ) overall_cluster = MCA.correlation_clusters[-1] identifiers_overall_cluster = [ @@ -402,21 +268,17 @@ def run(args=sys.argv[1:]): experiments, reflections, identifiers_to_output, - cluster, ) - - if ( - params.clustering.exclude_cos_cluster_number == cluster.cluster_id - and c == "cos" - ): + if params.clustering.exclude_cos_cluster_number: + for cluster in MCA.cos_angle_clusters: + if params.clustering.exclude_cos_cluster_number == cluster.cluster_id: logger.info( - "Outputting data excluding cos angle cluster %i:" - % cluster.cluster_id + f"Outputting data excluding cos angle cluster {cluster.cluster_id}" ) - new_folder = "cos_angle_clusters/excluded_cluster_" + str( - cluster.cluster_id + new_folder = ( + cwd / "cos_clusters" / f"excluded_cluster_{cluster.cluster_id}" ) - overall_cluster = MCA._cluster_analysis.cos_angle_clusters[-1] + overall_cluster = MCA.cos_angle_clusters[-1] identifiers_overall_cluster = [ MCA.ids_to_identifiers_map[l] for l in overall_cluster.labels ] @@ -433,15 +295,51 @@ def run(args=sys.argv[1:]): experiments, reflections, identifiers_to_output, - cluster, ) + # End of include/exclude options that are only available to xia2.cluster_analysis + + # all under if params.clustering.output_clusters:? + from xia2.Modules.MultiCrystal.ScaleAndMerge import get_subclusters + + # First get subclusters that meet the required thresholds + # - min size, completeness, multiplciity, dendrogram height etc. + # subclusters will be of length max_output_clusters if distinct_clusters=False + subclusters = get_subclusters( + params.clustering, + MCA.ids_to_identifiers_map, + MCA.cos_angle_clusters, + MCA.correlation_clusters, + ) + + # if not doing distinct cluster analysis, can now output clusters + if not params.clustering.hierarchical.distinct_clusters: + for c, cluster_identifiers, cluster in subclusters: + cluster_dir = cwd / f"{c}_clusters/cluster_{cluster.cluster_id}" + logger.info(f"Outputting {c} cluster {cluster.cluster_id}:") + logger.info(cluster) + output_cluster( + cluster_dir, + experiments, + reflections, + cluster_identifiers, + ) + # if doing distinct cluster analysis, do the analysis and output clusters if params.clustering.hierarchical.distinct_clusters: + cos_clusters = [] + cc_clusters = [] + cos_cluster_ids = {} + cc_cluster_ids = {} + for c, cluster_identifiers, cluster in subclusters: + if c == "cos": + cos_clusters.append(cluster) + cos_cluster_ids[cluster.cluster_id] = cluster_identifiers + elif c == "cc": + cc_clusters.append(cluster) + cc_cluster_ids[cluster.cluster_id] = cluster_identifiers + for k, clusters in enumerate([cos_clusters, cc_clusters]): - if k == 0: - cty = "cos" - elif k == 1: - cty = "cc" + cty = "cc" if k == 1 else "cos" # cluster type as a string logger.info("----------------------") logger.info(f"{cty} cluster analysis") logger.info("----------------------") @@ -452,29 +350,24 @@ def run(args=sys.argv[1:]): ) = MultiCrystalAnalysis.interesting_cluster_identification( clusters, params ) - - if len(list_of_clusters) > 0: - for item in list_of_clusters: - if k == 0: - cluster_dir = "cos_angle_clusters/" + item - elif k == 1: - cluster_dir = "cc_clusters/" + item - logger.info("Outputting: %s" % cluster_dir) - - for cluster in clusters: - if "cluster_" + str(cluster.cluster_id) == item: - if k == 0: - ids = cos_cluster_ids[cluster.cluster_id] - elif k == 1: - ids = cc_cluster_ids[cluster.cluster_id] - - output_cluster( - cluster_dir, - experiments, - reflections, - ids, - cluster, - ) + for item in list_of_clusters: + cluster_dir = f"{cty}_clusters/{item}" + logger.info(f"Outputting: {cluster_dir}") + output_dir = cwd / cluster_dir + + for cluster in clusters: + if f"cluster_{cluster.cluster_id}" == item: + ids = ( + cc_cluster_ids[cluster.cluster_id] + if k + else cos_cluster_ids[cluster.cluster_id] + ) + output_cluster( + output_dir, + experiments, + reflections, + ids, + ) if params.clustering.hierarchical.distinct_clusters: logger.info(f"Clusters recommended for comparison in {params.output.log}") @@ -510,19 +403,19 @@ def run(args=sys.argv[1:]): MCA.output_json() -def output_cluster(new_folder, experiments, reflections, ids, cluster): +def output_cluster(new_folder, experiments, reflections, ids): expts = copy.deepcopy(experiments) expts.select_on_experiment_identifiers(ids) refl = [] - for idx, i in enumerate(reflections): - if idx in cluster.labels: - refl.append(i) + for table in reflections: + if table.experiment_identifiers().values()[0] in ids: + refl.append(table) joint_refl = flex.reflection_table.concat(refl) - if not os.path.exists(new_folder): - os.mkdir(new_folder) + if not pathlib.Path.exists(new_folder): + pathlib.Path.mkdir(new_folder) - expts.as_file(new_folder + "/cluster_" + str(cluster.cluster_id) + ".expt") - joint_refl.as_file(new_folder + "/cluster_" + str(cluster.cluster_id) + ".refl") + expts.as_file(new_folder / "cluster.expt") + joint_refl.as_file(new_folder / "cluster.refl") diff --git a/tests/regression/test_cluster_analysis.py b/tests/regression/test_cluster_analysis.py index f1ae41128..6cafd65d6 100644 --- a/tests/regression/test_cluster_analysis.py +++ b/tests/regression/test_cluster_analysis.py @@ -165,10 +165,10 @@ def test_rotation_data(dials_data, run_in_tmp_path): def check_output(main_dir, output_clusters=True, interesting_clusters=False): if output_clusters and not interesting_clusters: assert (main_dir / "cc_clusters" / "cluster_2").exists() - assert (main_dir / "cos_angle_clusters" / "cluster_2").exists() + assert (main_dir / "cos_clusters" / "cluster_2").exists() if output_clusters and interesting_clusters: assert (main_dir / "cc_clusters" / "cluster_2").exists() - assert (main_dir / "cos_angle_clusters" / "cluster_2").exists() + assert (main_dir / "cos_clusters" / "cluster_2").exists() assert (main_dir / "xia2.cluster_analysis.json").is_file() assert (main_dir / "xia2.cluster_analysis.log").is_file() assert (main_dir / "xia2.cluster_analysis.html").is_file()