Skip to content

Commit

Permalink
Do coordinate clustering in multiplex
Browse files Browse the repository at this point in the history
  • Loading branch information
jbeilstenedmands committed Oct 18, 2024
1 parent 79224fd commit 16ec971
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 315 deletions.
229 changes: 68 additions & 161 deletions src/xia2/Modules/MultiCrystal/ScaleAndMerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import copy
import logging
import os
import pathlib
from collections import OrderedDict

import iotbx.phil
Expand All @@ -19,6 +20,7 @@
from xia2.Handlers.Phil import PhilIndex
from xia2.lib.bits import auto_logfiler
from xia2.Modules import Report
from xia2.Modules.MultiCrystal.cluster_analysis import get_subclusters
from xia2.Modules.MultiCrystal.data_manager import DataManager
from xia2.Modules.MultiCrystalAnalysis import MultiCrystalAnalysis
from xia2.Modules.Scaler.DialsScaler import (
Expand Down Expand Up @@ -316,88 +318,6 @@
)


def clusters_and_types(cos_angle_clusters, cc_clusters, methods):
if "cos_angle" in methods and "correlation" not in methods:
clusters = cos_angle_clusters
ctype = ["cos"] * len(clusters)
elif "correlation" in methods and "cos_angle" not in methods:
clusters = cc_clusters
ctype = ["cc"] * len(clusters)
elif "cos_angle" in methods and "correlation" in methods:
clusters = cos_angle_clusters + cc_clusters
ctype = ["cos"] * len(cos_angle_clusters) + ["cc"] * len(cc_clusters)
else:
raise ValueError("Invalid cluster method: %s" % methods)

clusters.reverse()
ctype.reverse()
return clusters, ctype


def get_subclusters(params, ids_to_identifiers_map, cos_angle_clusters, cc_clusters):
subclusters = []

min_completeness = params.min_completeness
min_multiplicity = params.min_multiplicity
max_clusters = params.max_output_clusters
min_cluster_size = params.min_cluster_size
max_cluster_height_cos = params.hierarchical.max_cluster_height_cos
max_cluster_height_cc = params.hierarchical.max_cluster_height_cc
max_cluster_height = params.hierarchical.max_cluster_height

clusters, ctype = clusters_and_types(
cos_angle_clusters, cc_clusters, params.hierarchical.method
)

n_processed_cos = 0
n_processed_cc = 0

for c, cluster in zip(ctype, clusters):
# This simplifies max_cluster_height into cc and cos angle versions
# But still gives the user the option of just selecting max_cluster_height
# Which makes more sense when they only want one type of clustering

if c == "cc" and max_cluster_height != 100 and max_cluster_height_cc == 100:
max_cluster_height_cc = max_cluster_height
# if user has weirdly set both max_cluster_height and max_cluster_height_cc
# will still default to max_cluster_height_cc as intended
if c == "cos" and max_cluster_height != 100 and max_cluster_height_cos == 100:
max_cluster_height_cos = max_cluster_height

if n_processed_cos == max_clusters and c == "cos":
continue
if n_processed_cc == max_clusters and c == "cc":
continue
if cluster.completeness < min_completeness:
continue
if cluster.multiplicity < min_multiplicity:
continue
if (
len(cluster.labels)
== len(ids_to_identifiers_map) # was len(data_manager_original.experiments)
and not params.hierarchical.distinct_clusters
):
continue
if cluster.height > max_cluster_height_cc and c == "cc":
continue
if cluster.height > max_cluster_height_cos and c == "cos":
continue
if len(cluster.labels) < min_cluster_size:
continue

cluster_identifiers = [ids_to_identifiers_map[l] for l in cluster.labels]
subclusters.append((c, cluster_identifiers, cluster))
if (
not params.hierarchical.distinct_clusters
): # increment so that we only get up to N clusters
if c == "cos":
n_processed_cos += 1
elif c == "cc":
n_processed_cc += 1

return subclusters


class MultiCrystalScale:
def __init__(self, experiments, reflections, params):
self._data_manager = DataManager(experiments, reflections)
Expand Down Expand Up @@ -557,58 +477,43 @@ def __init__(self, experiments, reflections, params):
# now do the interesting cluster identification algorithm from xia2.cluster_analysis.
# but don't repeat the code.

if self._params.clustering.output_clusters:
if (
self._params.clustering.output_clusters
and "hierarchical" in self._params.clustering.method
):
## note this is all hierarchical stuff first - need an if statement
self._data_manager_original = self._data_manager
cwd = os.path.abspath(os.getcwd())
cwd = pathlib.Path.cwd()
"""if not pathlib.Path.exists(cwd / "cos_clusters"):
pathlib.Path.mkdir(cwd / "cos_clusters")
if not pathlib.Path.exists(cwd / "cc_clusters"):
pathlib.Path.mkdir(cwd / "cc_clusters")"""

subclusters = get_subclusters(
self._params.clustering,
data_manager.ids_to_identifiers_map,
self._data_manager.ids_to_identifiers_map,
self._cos_angle_clusters,
self._cc_clusters,
)

if not self._params.clustering.hierarchical.distinct_clusters:
for c, cluster_identifiers, cluster in subclusters:
"""if self._params.clustering.find_distinct_clusters:
# set things up for later, but don't scale now?
if c == "cos":
self.cos_clusters.append(cluster)
self.cos_cluster_ids[cluster.cluster_id] = cluster_identifiers
elif c == "cc":
self.cc_clusters.append(cluster)
self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers"""

# else:
# if c == "cos":
# n_processed_cos += 1
# elif c == "cc":
# n_processed_cc += 1
cluster_dir = cwd / f"{c}_clusters/cluster_{cluster.cluster_id}"
logger.info(f"Scaling {c} cluster {cluster.cluster_id}:")
logger.info(cluster)
cluster_dir = f"{c}_cluster_{cluster.cluster_id}"

if not os.path.exists(cluster_dir):
os.mkdir(cluster_dir)
os.chdir(cluster_dir)

scaled = self.scale_cluster(
data_manager,
self._scale_and_report_cluster(
self._data_manager,
cluster_dir,
cluster_identifiers,
free_flags_in_full_set,
)
self._record_individual_report(
data_manager, scaled.report(), cluster_dir.replace("_", " ")
)
os.chdir(cwd)

if self._params.clustering.hierarchical.distinct_clusters:
self.cos_clusters = []
self.cc_clusters = []
self.cos_cluster_ids = {}
self.cc_cluster_ids = {}
for c, cluster_identifiers in subclusters:
for c, cluster_identifiers, cluster in subclusters:
if c == "cos":
self.cos_clusters.append(cluster)
self.cos_cluster_ids[cluster.cluster_id] = cluster_identifiers
Expand All @@ -617,57 +522,48 @@ def __init__(self, experiments, reflections, params):
self.cc_cluster_ids[cluster.cluster_id] = cluster_identifiers

for k, clusters in enumerate([self.cos_clusters, self.cc_clusters]):
if (
k == 0
and "cos_angle" in self._params.clustering.hierarchical.method
):
cty = "cos"
elif (
k == 1
and "correlation" in self._params.clustering.hierarchical.method
):
cty = "cc"
else:
cty = False

if cty:
logger.info("----------------------")
logger.info(f"{cty} cluster analysis")
logger.info("----------------------")

_, list_of_clusters = (
MultiCrystalAnalysis.interesting_cluster_identification(
clusters, self._params
)
cty = "cc" if k == 1 else "cos" # cluster type as a string
logger.info("-" * 22 + f"\n{cty} cluster analysis\n" + "-" * 22)

_, list_of_clusters = (
MultiCrystalAnalysis.interesting_cluster_identification(
clusters, self._params
)
)

for item in list_of_clusters:
if k == 0:
cluster_dir = "cos_" + item
elif k == 1:
cluster_dir = "cc_" + item
if not os.path.exists(cluster_dir):
os.mkdir(cluster_dir)
os.chdir(cluster_dir)
logger.info("Scaling: %s" % cluster_dir)
free_flags_in_full_set = True

for cluster in clusters:
if "cluster_" + str(cluster.cluster_id) == item:
if k == 0:
ids = self.cos_cluster_ids[cluster.cluster_id]
elif k == 1:
ids = self.cc_cluster_ids[cluster.cluster_id]

scaled = self.scale_cluster(
data_manager, ids, free_flags_in_full_set
)
self._record_individual_report(
data_manager,
scaled.report(),
cluster_dir.replace("_", " "),
)
os.chdir("..")
for item in list_of_clusters:
cluster_dir = f"{cty}_clusters/{item}"
logger.info(f"Scaling : {cluster_dir}")

for cluster in clusters:
if f"cluster_{cluster.cluster_id}" == item:
ids = (
self.cc_cluster_ids[cluster.cluster_id]
if k
else self.cos_cluster_ids[cluster.cluster_id]
)
self._scale_and_report_cluster(
self._data_manager, cluster_dir, ids
)
break
if (
self._params.clustering.output_clusters
and "coordinate" in self._params.clustering.method
):
clusters = self._mca.significant_coordinate_clusters
# if not pathlib.Path.exists(cwd / "coordinate_clusters"):
# pathlib.Path.mkdir(cwd / "coordinate_clusters")
for c in clusters:
cluster_dir = f"coordinate_cluster_{c.cluster_id}"
logger.info(f"Scaling: {cluster_dir}")
# if not pathlib.Path.exists(cwd / cluster_dir):
# pathlib.Path.mkdir(cwd / cluster_dir)
cluster_identifiers = [
self._data_manager.ids_to_identifiers_map[l] for l in c.labels
]
self._scale_and_report_cluster(
self._data_manager, cluster_dir, cluster_identifiers
)

if self._params.filtering.method:
# Final round of scaling, this time filtering out any bad datasets
Expand Down Expand Up @@ -789,6 +685,17 @@ def scale_cluster(self, data_manager_input, identifiers, free_flags_in_full_set)

return scaled

def _scale_and_report_cluster(self, data_manager, cluster_dir, cluster_identifiers):
cwd = pathlib.Path.cwd()
if not os.path.exists(cluster_dir):
os.mkdir(cluster_dir)
os.chdir(cluster_dir)
scaled = self.scale_cluster(data_manager, cluster_identifiers, True)
self._record_individual_report(
"", scaled.report(), cluster_dir.replace("_", " ")
)
os.chdir(cwd)

def _record_individual_report(self, data_manager, report, cluster_name):
d = self._report_as_dict(report)

Expand Down
1 change: 1 addition & 0 deletions src/xia2/Modules/MultiCrystalAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def cluster_analysis(self):
self._cc_cluster_table = matrices.cc_table
self._cos_angle_cluster_table = matrices.cos_table
self._cosym_graphs = matrices.rij_graphs
self.significant_coordinate_clusters = matrices.significant_clusters

# Need this here or else cos-angle dendrogram does not replicate original multiplex output
self._cluster_analysis = True
Expand Down
Loading

0 comments on commit 16ec971

Please sign in to comment.