From 16ec9713a1d57732f4111d38c5786608dc0a881b Mon Sep 17 00:00:00 2001 From: James Beilsten-Edmands <30625594+jbeilstenedmands@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:58:10 +0100 Subject: [PATCH] Do coordinate clustering in multiplex --- .../Modules/MultiCrystal/ScaleAndMerge.py | 229 ++++++------------ src/xia2/Modules/MultiCrystalAnalysis.py | 1 + src/xia2/cli/cluster_analysis.py | 178 +++----------- src/xia2/cli/multiplex.py | 12 +- 4 files changed, 105 insertions(+), 315 deletions(-) diff --git a/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py b/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py index 33171dab8..73d05432b 100644 --- a/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py +++ b/src/xia2/Modules/MultiCrystal/ScaleAndMerge.py @@ -3,6 +3,7 @@ import copy import logging import os +import pathlib from collections import OrderedDict import iotbx.phil @@ -19,6 +20,7 @@ from xia2.Handlers.Phil import PhilIndex from xia2.lib.bits import auto_logfiler from xia2.Modules import Report +from xia2.Modules.MultiCrystal.cluster_analysis import get_subclusters from xia2.Modules.MultiCrystal.data_manager import DataManager from xia2.Modules.MultiCrystalAnalysis import MultiCrystalAnalysis from xia2.Modules.Scaler.DialsScaler import ( @@ -316,88 +318,6 @@ ) -def clusters_and_types(cos_angle_clusters, cc_clusters, methods): - if "cos_angle" in methods and "correlation" not in methods: - clusters = cos_angle_clusters - ctype = ["cos"] * len(clusters) - elif "correlation" in methods and "cos_angle" not in methods: - clusters = cc_clusters - ctype = ["cc"] * len(clusters) - elif "cos_angle" in methods and "correlation" in methods: - clusters = cos_angle_clusters + cc_clusters - ctype = ["cos"] * len(cos_angle_clusters) + ["cc"] * len(cc_clusters) - else: - raise ValueError("Invalid cluster method: %s" % methods) - - clusters.reverse() - ctype.reverse() - return clusters, ctype - - -def get_subclusters(params, ids_to_identifiers_map, cos_angle_clusters, cc_clusters): - subclusters = [] - - min_completeness = params.min_completeness - min_multiplicity = params.min_multiplicity - max_clusters = params.max_output_clusters - min_cluster_size = params.min_cluster_size - max_cluster_height_cos = params.hierarchical.max_cluster_height_cos - max_cluster_height_cc = params.hierarchical.max_cluster_height_cc - max_cluster_height = params.hierarchical.max_cluster_height - - clusters, ctype = clusters_and_types( - cos_angle_clusters, cc_clusters, params.hierarchical.method - ) - - n_processed_cos = 0 - n_processed_cc = 0 - - for c, cluster in zip(ctype, clusters): - # This simplifies max_cluster_height into cc and cos angle versions - # But still gives the user the option of just selecting max_cluster_height - # Which makes more sense when they only want one type of clustering - - if c == "cc" and max_cluster_height != 100 and max_cluster_height_cc == 100: - max_cluster_height_cc = max_cluster_height - # if user has weirdly set both max_cluster_height and max_cluster_height_cc - # will still default to max_cluster_height_cc as intended - if c == "cos" and max_cluster_height != 100 and max_cluster_height_cos == 100: - max_cluster_height_cos = max_cluster_height - - if n_processed_cos == max_clusters and c == "cos": - continue - if n_processed_cc == max_clusters and c == "cc": - continue - if cluster.completeness < min_completeness: - continue - if cluster.multiplicity < min_multiplicity: - continue - if ( - len(cluster.labels) - == len(ids_to_identifiers_map) # was len(data_manager_original.experiments) - and not params.hierarchical.distinct_clusters - ): - continue - if cluster.height > max_cluster_height_cc and c == "cc": - continue - if cluster.height > max_cluster_height_cos and c == "cos": - continue - if len(cluster.labels) < min_cluster_size: - continue - - cluster_identifiers = [ids_to_identifiers_map[l] for l in cluster.labels] - subclusters.append((c, cluster_identifiers, cluster)) - if ( - not params.hierarchical.distinct_clusters - ): # increment so that we only get up to N clusters - if c == "cos": - n_processed_cos += 1 - elif c == "cc": - n_processed_cc += 1 - - return subclusters - - class MultiCrystalScale: def __init__(self, experiments, reflections, params): self._data_manager = DataManager(experiments, reflections) @@ -557,58 +477,43 @@ def __init__(self, experiments, reflections, params): # now do the interesting cluster identification algorithm from xia2.cluster_analysis. # but don't repeat the code. - if self._params.clustering.output_clusters: + if ( + self._params.clustering.output_clusters + and "hierarchical" in self._params.clustering.method + ): ## note this is all hierarchical stuff first - need an if statement self._data_manager_original = self._data_manager - cwd = os.path.abspath(os.getcwd()) + cwd = pathlib.Path.cwd() + """if not pathlib.Path.exists(cwd / "cos_clusters"): + pathlib.Path.mkdir(cwd / "cos_clusters") + if not pathlib.Path.exists(cwd / "cc_clusters"): + pathlib.Path.mkdir(cwd / "cc_clusters")""" subclusters = get_subclusters( self._params.clustering, - data_manager.ids_to_identifiers_map, + self._data_manager.ids_to_identifiers_map, self._cos_angle_clusters, self._cc_clusters, ) if not self._params.clustering.hierarchical.distinct_clusters: for c, cluster_identifiers, cluster in subclusters: - """if self._params.clustering.find_distinct_clusters: - # set things up for later, but don't scale now? - if c == "cos": - self.cos_clusters.append(cluster) - self.cos_cluster_ids[cluster.cluster_id] = cluster_identifiers - elif c == "cc": - self.cc_clusters.append(cluster) - self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers""" - - # else: - # if c == "cos": - # n_processed_cos += 1 - # elif c == "cc": - # n_processed_cc += 1 + cluster_dir = cwd / f"{c}_clusters/cluster_{cluster.cluster_id}" logger.info(f"Scaling {c} cluster {cluster.cluster_id}:") logger.info(cluster) cluster_dir = f"{c}_cluster_{cluster.cluster_id}" - - if not os.path.exists(cluster_dir): - os.mkdir(cluster_dir) - os.chdir(cluster_dir) - - scaled = self.scale_cluster( - data_manager, + self._scale_and_report_cluster( + self._data_manager, + cluster_dir, cluster_identifiers, - free_flags_in_full_set, - ) - self._record_individual_report( - data_manager, scaled.report(), cluster_dir.replace("_", " ") ) - os.chdir(cwd) if self._params.clustering.hierarchical.distinct_clusters: self.cos_clusters = [] self.cc_clusters = [] self.cos_cluster_ids = {} self.cc_cluster_ids = {} - for c, cluster_identifiers in subclusters: + for c, cluster_identifiers, cluster in subclusters: if c == "cos": self.cos_clusters.append(cluster) self.cos_cluster_ids[cluster.cluster_id] = cluster_identifiers @@ -617,57 +522,48 @@ def __init__(self, experiments, reflections, params): self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers for k, clusters in enumerate([self.cos_clusters, self.cc_clusters]): - if ( - k == 0 - and "cos_angle" in self._params.clustering.hierarchical.method - ): - cty = "cos" - elif ( - k == 1 - and "correlation" in self._params.clustering.hierarchical.method - ): - cty = "cc" - else: - cty = False - - if cty: - logger.info("----------------------") - logger.info(f"{cty} cluster analysis") - logger.info("----------------------") - - _, list_of_clusters = ( - MultiCrystalAnalysis.interesting_cluster_identification( - clusters, self._params - ) + cty = "cc" if k == 1 else "cos" # cluster type as a string + logger.info("-" * 22 + f"\n{cty} cluster analysis\n" + "-" * 22) + + _, list_of_clusters = ( + MultiCrystalAnalysis.interesting_cluster_identification( + clusters, self._params ) + ) - for item in list_of_clusters: - if k == 0: - cluster_dir = "cos_" + item - elif k == 1: - cluster_dir = "cc_" + item - if not os.path.exists(cluster_dir): - os.mkdir(cluster_dir) - os.chdir(cluster_dir) - logger.info("Scaling: %s" % cluster_dir) - free_flags_in_full_set = True - - for cluster in clusters: - if "cluster_" + str(cluster.cluster_id) == item: - if k == 0: - ids = self.cos_cluster_ids[cluster.cluster_id] - elif k == 1: - ids = self.cc_cluster_ids[cluster.cluster_id] - - scaled = self.scale_cluster( - data_manager, ids, free_flags_in_full_set - ) - self._record_individual_report( - data_manager, - scaled.report(), - cluster_dir.replace("_", " "), - ) - os.chdir("..") + for item in list_of_clusters: + cluster_dir = f"{cty}_clusters/{item}" + logger.info(f"Scaling : {cluster_dir}") + + for cluster in clusters: + if f"cluster_{cluster.cluster_id}" == item: + ids = ( + self.cc_cluster_ids[cluster.cluster_id] + if k + else self.cos_cluster_ids[cluster.cluster_id] + ) + self._scale_and_report_cluster( + self._data_manager, cluster_dir, ids + ) + break + if ( + self._params.clustering.output_clusters + and "coordinate" in self._params.clustering.method + ): + clusters = self._mca.significant_coordinate_clusters + # if not pathlib.Path.exists(cwd / "coordinate_clusters"): + # pathlib.Path.mkdir(cwd / "coordinate_clusters") + for c in clusters: + cluster_dir = f"coordinate_cluster_{c.cluster_id}" + logger.info(f"Scaling: {cluster_dir}") + # if not pathlib.Path.exists(cwd / cluster_dir): + # pathlib.Path.mkdir(cwd / cluster_dir) + cluster_identifiers = [ + self._data_manager.ids_to_identifiers_map[l] for l in c.labels + ] + self._scale_and_report_cluster( + self._data_manager, cluster_dir, cluster_identifiers + ) if self._params.filtering.method: # Final round of scaling, this time filtering out any bad datasets @@ -789,6 +685,17 @@ def scale_cluster(self, data_manager_input, identifiers, free_flags_in_full_set) return scaled + def _scale_and_report_cluster(self, data_manager, cluster_dir, cluster_identifiers): + cwd = pathlib.Path.cwd() + if not os.path.exists(cluster_dir): + os.mkdir(cluster_dir) + os.chdir(cluster_dir) + scaled = self.scale_cluster(data_manager, cluster_identifiers, True) + self._record_individual_report( + "", scaled.report(), cluster_dir.replace("_", " ") + ) + os.chdir(cwd) + def _record_individual_report(self, data_manager, report, cluster_name): d = self._report_as_dict(report) diff --git a/src/xia2/Modules/MultiCrystalAnalysis.py b/src/xia2/Modules/MultiCrystalAnalysis.py index 796cfa809..186d0b9dd 100644 --- a/src/xia2/Modules/MultiCrystalAnalysis.py +++ b/src/xia2/Modules/MultiCrystalAnalysis.py @@ -149,6 +149,7 @@ def cluster_analysis(self): self._cc_cluster_table = matrices.cc_table self._cos_angle_cluster_table = matrices.cos_table self._cosym_graphs = matrices.rij_graphs + self.significant_coordinate_clusters = matrices.significant_clusters # Need this here or else cos-angle dendrogram does not replicate original multiplex output self._cluster_analysis = True diff --git a/src/xia2/cli/cluster_analysis.py b/src/xia2/cli/cluster_analysis.py index 89af1bfaa..302c912b2 100644 --- a/src/xia2/cli/cluster_analysis.py +++ b/src/xia2/cli/cluster_analysis.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import logging import pathlib import random @@ -20,20 +19,19 @@ from jinja2 import ChoiceLoader, Environment, PackageLoader import xia2.Handlers.Streams -from xia2.Modules.MultiCrystalAnalysis import MultiCrystalAnalysis +from xia2.Modules.MultiCrystal.cluster_analysis import ( + cluster_phil_scope, + output_cluster, + output_hierarchical_clusters, +) from xia2.XIA2Version import Version logger = logging.getLogger("xia2.cluster_analysis") -cluster_phil_scope = """\ +xia2_cluster_phil_scope = """\ clustering .short_caption = "Clustering" { - output_clusters = False - .type = bool - .help = "Set this to true to enable scaling and merging of individual clusters" - .short_caption = "Output individual clusters" - output_correlation_cluster_number = None .type = int .short_caption = "Option to output a specific correlation cluster when re-running the code" @@ -46,48 +44,6 @@ exclude_cos_cluster_number = None .type = int .short_caption = "option to output all data excluding a specific cos cluster" - - method = *hierarchical coordinate - .type = choice(multi=True) - .short_caption = "Clustering method to use - analyse the clusters generated from" - "the hierarchical dendrograms or the density based" - "clustering analysis of the cosym coordinates." - min_cluster_size = 5 - .type = int - .short_caption = "Minimum number of datasets for an output cluster" - min_completeness = 0 - .type = float(value_min=0, value_max=1) - .short_caption = "Minimum completeness" - min_multiplicity = 0 - .type = float(value_min=0) - .short_caption = "Minimum multiplicity" - max_output_clusters = 10 - .type = int(value_min=1) - .short_caption = "Maximum number of clusters to be output" - hierarchical - { - method = *cos_angle correlation - .type = choice(multi=True) - .short_caption = "Metric on which to perform hierarchical clustering" - max_cluster_height = 100 - .type = float - .short_caption = "Maximum height in dendrogram for clusters" - max_cluster_height_cc = 100 - .type = float - .short_caption = "Maximum height in correlation dendrogram for clusters" - max_cluster_height_cos = 100 - .type = float - .short_caption = "Maximum height in cos angle dendrogram for clusters" - distinct_clusters = False - .type = bool - .help = "This will determine whether optional cluster analysis is undertaken." - "To assist in decreasing computation time, only clusters that have" - "no datasets in common but eventually combine to form a joined cluster" - "in the output dendrogram will be scaled and merged." - "These may contain interesting differences worth comparing in" - "downstream analysis." - .short_caption = "Find distinct clusters" - } } """ @@ -106,6 +62,7 @@ .short_caption = "Maximum hight difference between clusters" %s +%s output { log = xia2.cluster_analysis.log @@ -114,7 +71,7 @@ .type = str } """ - % cluster_phil_scope, + % (cluster_phil_scope, xia2_cluster_phil_scope), process_includes=True, ) # batch_phil_scope @@ -299,86 +256,29 @@ def run(args=sys.argv[1:]): # End of include/exclude options that are only available to xia2.cluster_analysis # all under if params.clustering.output_clusters:? - from xia2.Modules.MultiCrystal.ScaleAndMerge import get_subclusters - - # First get subclusters that meet the required thresholds - # - min size, completeness, multiplciity, dendrogram height etc. - # subclusters will be of length max_output_clusters if distinct_clusters=False - subclusters = get_subclusters( - params.clustering, - MCA.ids_to_identifiers_map, - MCA.cos_angle_clusters, - MCA.correlation_clusters, - ) - - # if not doing distinct cluster analysis, can now output clusters - if not params.clustering.hierarchical.distinct_clusters: - for c, cluster_identifiers, cluster in subclusters: - cluster_dir = cwd / f"{c}_clusters/cluster_{cluster.cluster_id}" - logger.info(f"Outputting {c} cluster {cluster.cluster_id}:") - logger.info(cluster) - output_cluster( - cluster_dir, - experiments, - reflections, - cluster_identifiers, - ) - - # if doing distinct cluster analysis, do the analysis and output clusters - if params.clustering.hierarchical.distinct_clusters: - cos_clusters = [] - cc_clusters = [] - cos_cluster_ids = {} - cc_cluster_ids = {} - for c, cluster_identifiers, cluster in subclusters: - if c == "cos": - cos_clusters.append(cluster) - cos_cluster_ids[cluster.cluster_id] = cluster_identifiers - elif c == "cc": - cc_clusters.append(cluster) - cc_cluster_ids[cluster.cluster_id] = cluster_identifiers - - for k, clusters in enumerate([cos_clusters, cc_clusters]): - cty = "cc" if k == 1 else "cos" # cluster type as a string - logger.info("----------------------") - logger.info(f"{cty} cluster analysis") - logger.info("----------------------") - - ( - file_data, - list_of_clusters, - ) = MultiCrystalAnalysis.interesting_cluster_identification( - clusters, params - ) - for item in list_of_clusters: - cluster_dir = f"{cty}_clusters/{item}" - logger.info(f"Outputting: {cluster_dir}") - output_dir = cwd / cluster_dir - - for cluster in clusters: - if f"cluster_{cluster.cluster_id}" == item: - ids = ( - cc_cluster_ids[cluster.cluster_id] - if k - else cos_cluster_ids[cluster.cluster_id] - ) - output_cluster( - output_dir, - experiments, - reflections, - ids, - ) - - if params.clustering.hierarchical.distinct_clusters: - logger.info(f"Clusters recommended for comparison in {params.output.log}") if params.clustering.output_clusters: - logger.info("----------------") - logger.info("Output given as DIALS .expt/.refl files:") - logger.info("To merge rotation data: use dials.merge") - logger.info( - "To merge still data: use xia2.ssx_reduce with the option steps=merge" - ) - logger.info("----------------") + if "hierarchical" in params.clustering.method: + output_hierarchical_clusters(params, MCA, experiments, reflections) + if "coordinate" in params.clustering.method: + from dxtbx.model import ExperimentList + + clusters = MCA.significant_clusters + if not pathlib.Path.exists(cwd / "coordinate_clusters"): + pathlib.Path.mkdir(cwd / "coordinate_clusters") + for c in clusters: + cluster_dir = f"coordinate_clusters/cluster_{c.cluster_id}" + logger.info(f"Outputting: {cluster_dir}") + if not pathlib.Path.exists(cwd / cluster_dir): + pathlib.Path.mkdir(cwd / cluster_dir) + expts = ExperimentList() + tables = [] + print(dir(c)) + for idx in c.labels: + expts.append(MCA._experiments[idx]) + tables.append(MCA._reflections[idx]) + joint_refl = flex.reflection_table.concat(tables) + expts.as_file(cwd / cluster_dir / "cluster.expt") + joint_refl.as_file(cwd / cluster_dir / "cluster.refl") loader = ChoiceLoader( [PackageLoader("xia2", "templates"), PackageLoader("dials", "templates")] @@ -401,21 +301,3 @@ def run(args=sys.argv[1:]): f.write(html.encode("utf-8", "xmlcharrefreplace")) MCA.output_json() - - -def output_cluster(new_folder, experiments, reflections, ids): - expts = copy.deepcopy(experiments) - expts.select_on_experiment_identifiers(ids) - - refl = [] - for table in reflections: - if table.experiment_identifiers().values()[0] in ids: - refl.append(table) - - joint_refl = flex.reflection_table.concat(refl) - - if not pathlib.Path.exists(new_folder): - pathlib.Path.mkdir(new_folder) - - expts.as_file(new_folder / "cluster.expt") - joint_refl.as_file(new_folder / "cluster.refl") diff --git a/src/xia2/cli/multiplex.py b/src/xia2/cli/multiplex.py index f57116d21..3f5db9d16 100644 --- a/src/xia2/cli/multiplex.py +++ b/src/xia2/cli/multiplex.py @@ -207,19 +207,19 @@ def run(args=sys.argv[1:]): f"symmetry.space_group has been set to: {params.symmetry.space_group}" ) - if params.clustering.output_clusters and not params.reference: + """if params.clustering.output_clusters and not params.reference: logger.info( "WARNING: clustering selected but no reference given. " "Inconsistent settings may occur." ) logger.info( "For consistent settings, please provide a reference .pdb, .mtz or .cif." - ) + )""" if ( - len(params.clustering.method) == 2 - and params.clustering.max_cluster_height != 100 - and params.clustering.max_cluster_height_cc == 100 - and params.clustering.max_cluster_height_cos == 100 + len(params.clustering.hierarchical.method) == 2 + and params.clustering.hierarchicalmax_cluster_height != 100 + and params.clustering.hierarchicalmax_cluster_height_cc == 100 + and params.clustering.hierarchicalmax_cluster_height_cos == 100 ): # This means user has changed max_cluster_height from default # BUT wants both cos and cc clustering