diff --git a/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py b/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py index 73d05432b..f0f162771 100644 --- a/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py +++ b/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py @@ -475,20 +475,14 @@ def __init__(self, experiments, reflections, params): self.cluster_analysis() # now do the interesting cluster identification algorithm from xia2.cluster_analysis. - # but don't repeat the code. + # Same code structure as MultiCrystalAnalysis/cluster_analysis.py but changes the call + # from output_cluster to self._scale_and_report_cluster if ( self._params.clustering.output_clusters and "hierarchical" in self._params.clustering.method ): - ## note this is all hierarchical stuff first - need an if statement self._data_manager_original = self._data_manager - cwd = pathlib.Path.cwd() - """if not pathlib.Path.exists(cwd / "cos_clusters"): - pathlib.Path.mkdir(cwd / "cos_clusters") - if not pathlib.Path.exists(cwd / "cc_clusters"): - pathlib.Path.mkdir(cwd / "cc_clusters")""" - subclusters = get_subclusters( self._params.clustering, self._data_manager.ids_to_identifiers_map, @@ -498,7 +492,6 @@ def __init__(self, experiments, reflections, params): if not self._params.clustering.hierarchical.distinct_clusters: for c, cluster_identifiers, cluster in subclusters: - cluster_dir = cwd / f"{c}_clusters/cluster_{cluster.cluster_id}" logger.info(f"Scaling {c} cluster {cluster.cluster_id}:") logger.info(cluster) cluster_dir = f"{c}_cluster_{cluster.cluster_id}" @@ -532,7 +525,8 @@ def __init__(self, experiments, reflections, params): ) for item in list_of_clusters: - cluster_dir = f"{cty}_clusters/{item}" + cluster_no = item.split("_")[-1] + cluster_dir = f"{cty}_cluster_{cluster_no}" logger.info(f"Scaling : {cluster_dir}") for cluster in clusters: @@ -551,13 +545,9 @@ def __init__(self, experiments, reflections, params): and "coordinate" in self._params.clustering.method ): clusters = self._mca.significant_coordinate_clusters - # if not pathlib.Path.exists(cwd / "coordinate_clusters"): - # pathlib.Path.mkdir(cwd / "coordinate_clusters") for c in clusters: cluster_dir = f"coordinate_cluster_{c.cluster_id}" logger.info(f"Scaling: {cluster_dir}") - # if not pathlib.Path.exists(cwd / cluster_dir): - # pathlib.Path.mkdir(cwd / cluster_dir) cluster_identifiers = [ self._data_manager.ids_to_identifiers_map[l] for l in c.labels ] diff --git a/src/xia2/Modules/MultiCrystal/cluster_analysis.py b/src/xia2/Modules/MultiCrystal/cluster_analysis.py new file mode 100644 index 000000000..402aa1917 --- /dev/null +++ b/src/xia2/Modules/MultiCrystal/cluster_analysis.py @@ -0,0 +1,246 @@ +from __future__ import annotations + +import copy +import logging +import pathlib + +from dials.array_family import flex + +from xia2.Modules.MultiCrystalAnalysis import MultiCrystalAnalysis + +logger = logging.getLogger("") + + +cluster_phil_scope = """\ +clustering + .short_caption = "Clustering" +{ + output_clusters = False + .type = bool + .help = "Set this to true to enable scaling and merging of individual clusters" + .short_caption = "Output individual clusters" + + method = *hierarchical coordinate + .type = choice(multi=True) + .short_caption = "Clustering method to use - analyse the clusters generated from" + "the hierarchical dendrograms or the density based" + "clustering analysis of the cosym coordinates." + min_cluster_size = 5 + .type = int + .short_caption = "Minimum number of datasets for an output cluster" + min_completeness = 0 + .type = float(value_min=0, value_max=1) + .short_caption = "Minimum completeness" + min_multiplicity = 0 + .type = float(value_min=0) + .short_caption = "Minimum multiplicity" + max_output_clusters = 10 + .type = int(value_min=1) + .short_caption = "Maximum number of clusters to be output" + hierarchical + { + method = *cos_angle correlation + .type = choice(multi=True) + .short_caption = "Metric on which to perform hierarchical clustering" + max_cluster_height = 100 + .type = float + .short_caption = "Maximum height in dendrogram for clusters" + max_cluster_height_cc = 100 + .type = float + .short_caption = "Maximum height in correlation dendrogram for clusters" + max_cluster_height_cos = 100 + .type = float + .short_caption = "Maximum height in cos angle dendrogram for clusters" + distinct_clusters = False + .type = bool + .help = "This will determine whether optional cluster analysis is undertaken." + "To assist in decreasing computation time, only clusters that have" + "no datasets in common but eventually combine to form a joined cluster" + "in the output dendrogram will be scaled and merged." + "These may contain interesting differences worth comparing in" + "downstream analysis." + .short_caption = "Find distinct clusters" + } +} +""" + + +def clusters_and_types(cos_angle_clusters, cc_clusters, methods): + if "cos_angle" in methods and "correlation" not in methods: + clusters = cos_angle_clusters + ctype = ["cos"] * len(clusters) + elif "correlation" in methods and "cos_angle" not in methods: + clusters = cc_clusters + ctype = ["cc"] * len(clusters) + elif "cos_angle" in methods and "correlation" in methods: + clusters = cos_angle_clusters + cc_clusters + ctype = ["cos"] * len(cos_angle_clusters) + ["cc"] * len(cc_clusters) + else: + raise ValueError("Invalid cluster method: %s" % methods) + + clusters.reverse() + ctype.reverse() + return clusters, ctype + + +def get_subclusters(params, ids_to_identifiers_map, cos_angle_clusters, cc_clusters): + subclusters = [] + + min_completeness = params.min_completeness + min_multiplicity = params.min_multiplicity + max_clusters = params.max_output_clusters + min_cluster_size = params.min_cluster_size + max_cluster_height_cos = params.hierarchical.max_cluster_height_cos + max_cluster_height_cc = params.hierarchical.max_cluster_height_cc + max_cluster_height = params.hierarchical.max_cluster_height + + clusters, ctype = clusters_and_types( + cos_angle_clusters, cc_clusters, params.hierarchical.method + ) + + n_processed_cos = 0 + n_processed_cc = 0 + + for c, cluster in zip(ctype, clusters): + # This simplifies max_cluster_height into cc and cos angle versions + # But still gives the user the option of just selecting max_cluster_height + # Which makes more sense when they only want one type of clustering + + if c == "cc" and max_cluster_height != 100 and max_cluster_height_cc == 100: + max_cluster_height_cc = max_cluster_height + # if user has weirdly set both max_cluster_height and max_cluster_height_cc + # will still default to max_cluster_height_cc as intended + if c == "cos" and max_cluster_height != 100 and max_cluster_height_cos == 100: + max_cluster_height_cos = max_cluster_height + + if n_processed_cos == max_clusters and c == "cos": + continue + if n_processed_cc == max_clusters and c == "cc": + continue + if cluster.completeness < min_completeness: + continue + if cluster.multiplicity < min_multiplicity: + continue + if ( + len(cluster.labels) + == len(ids_to_identifiers_map) # was len(data_manager_original.experiments) + and not params.hierarchical.distinct_clusters + ): + continue + if cluster.height > max_cluster_height_cc and c == "cc": + continue + if cluster.height > max_cluster_height_cos and c == "cos": + continue + if len(cluster.labels) < min_cluster_size: + continue + + cluster_identifiers = [ids_to_identifiers_map[l] for l in cluster.labels] + subclusters.append((c, cluster_identifiers, cluster)) + if ( + not params.hierarchical.distinct_clusters + ): # increment so that we only get up to N clusters + if c == "cos": + n_processed_cos += 1 + elif c == "cc": + n_processed_cc += 1 + + return subclusters + + +def output_cluster(new_folder, experiments, reflections, ids): + expts = copy.deepcopy(experiments) + expts.select_on_experiment_identifiers(ids) + + refl = [] + for table in reflections: + if table.experiment_identifiers().values()[0] in ids: + refl.append(table) + + joint_refl = flex.reflection_table.concat(refl) + + if not pathlib.Path.exists(new_folder): + pathlib.Path.mkdir(new_folder) + + expts.as_file(new_folder / "cluster.expt") + joint_refl.as_file(new_folder / "cluster.refl") + + +def output_hierarchical_clusters(params, MCA, experiments, reflections): + cwd = pathlib.Path.cwd() + + # First get subclusters that meet the required thresholds + # - min size, completeness, multiplciity, dendrogram height etc. + # subclusters will be of length max_output_clusters if distinct_clusters=False + subclusters = get_subclusters( + params.clustering, + MCA.ids_to_identifiers_map, + MCA.cos_angle_clusters, + MCA.correlation_clusters, + ) + + # if not doing distinct cluster analysis, can now output clusters + if not params.clustering.hierarchical.distinct_clusters: + for c, cluster_identifiers, cluster in subclusters: + cluster_dir = cwd / f"{c}_clusters/cluster_{cluster.cluster_id}" + logger.info(f"Outputting {c} cluster {cluster.cluster_id}:") + logger.info(cluster) + output_cluster( + cluster_dir, + experiments, + reflections, + cluster_identifiers, + ) + + # if doing distinct cluster analysis, do the analysis and output clusters + if params.clustering.hierarchical.distinct_clusters: + cos_clusters = [] + cc_clusters = [] + cos_cluster_ids = {} + cc_cluster_ids = {} + for c, cluster_identifiers, cluster in subclusters: + if c == "cos": + cos_clusters.append(cluster) + cos_cluster_ids[cluster.cluster_id] = cluster_identifiers + elif c == "cc": + cc_clusters.append(cluster) + cc_cluster_ids[cluster.cluster_id] = cluster_identifiers + + for k, clusters in enumerate([cos_clusters, cc_clusters]): + cty = "cc" if k == 1 else "cos" # cluster type as a string + logger.info("-" * 22 + f"\n{cty} cluster analysis\n" + "-" * 22) + + _, list_of_clusters = ( + MultiCrystalAnalysis.interesting_cluster_identification( + clusters, params + ) + ) + for item in list_of_clusters: + cluster_dir = f"{cty}_clusters/{item}" + logger.info(f"Outputting: {cluster_dir}") + output_dir = cwd / cluster_dir + + for cluster in clusters: + if f"cluster_{cluster.cluster_id}" == item: + ids = ( + cc_cluster_ids[cluster.cluster_id] + if k + else cos_cluster_ids[cluster.cluster_id] + ) + output_cluster( + output_dir, + experiments, + reflections, + ids, + ) + break + + if params.clustering.hierarchical.distinct_clusters: + logger.info(f"Clusters recommended for comparison in {params.output.log}") + if params.clustering.output_clusters: + logger.info("----------------") + logger.info("Output given as DIALS .expt/.refl files:") + logger.info("To merge rotation data: use dials.merge") + logger.info( + "To merge still data: use xia2.ssx_reduce with the option steps=merge" + ) + logger.info("----------------") diff --git a/tests/regression/test_multiplex.py b/tests/regression/test_multiplex.py index 4f4ae093b..edd7ea329 100644 --- a/tests/regression/test_multiplex.py +++ b/tests/regression/test_multiplex.py @@ -233,12 +233,12 @@ def test_proteinase_k_dose( [ "clustering.output_clusters=True", "clustering.min_completeness=0.4", - "clustering.method=cos_angle", + "clustering.hierarchical.method=cos_angle", ], [ "clustering.output_clusters=True", "clustering.min_completeness=0.5", - "clustering.method=correlation", + "clustering.hierarchical.method=correlation", ], ), )