Get rid of trying to sync preferred label algorithm.

TranslatorSRI · Nov 7, 2024 · 67a46c9 · 67a46c9
1 parent 8f64ec3
commit 67a46c9
Showing 1 changed file with 21 additions and 27 deletions.
diff --git a/node_normalizer/normalizer.py b/node_normalizer/normalizer.py
@@ -708,33 +708,27 @@ async def create_node(canonical_id, equivalent_ids, types, info_contents, includ
     # identifier _except_ where one of the types is in preferred_name_boost_prefixes, in which case
     # we prefer the prefixes listed there.
     #
-    # Note that types[canonical_id] goes from most specific to least specific, so we
-    # need to reverse it in order to apply preferred_name_boost_prefixes for the most
-    # specific type.
-    possible_labels = []
-    for typ in types[canonical_id][::-1]:
-        if typ in config['preferred_name_boost_prefixes']:
-            # This is the most specific matching type, so we use this and then break.
-            possible_labels = list(map(lambda identifier: identifier.get('l', ''),
-                                  sort_identifiers_with_boosted_prefixes(
-                                      eids,
-                                      config['preferred_name_boost_prefixes'][typ]
-                                  )))
-
-            # Add in all the other labels -- we'd still like to consider them, but at a lower priority.
-            for eid in eids:
-                label = eid.get('l', '')
-                if label not in possible_labels:
-                    possible_labels.append(label)
-
-            # Since this is the most specific matching type, we shouldn't do other (presumably higher-level)
-            # categories: so let's break here.
-            break
-
-    # Step 1.2. If we didn't have a preferred_name_boost_prefixes, just use the identifiers in their
-    # Biolink prefix order.
-    if not possible_labels:
-        possible_labels = map(lambda eid: eid.get('l', ''), eids)
+    # HOWEVER, there are three reasons not to do that here:
+    # 1. For NameRes, it makes sense that we're trying to come up with the best label for a clique
+    #    so we can autocomplete to it. But for NodeNorm, users would be expecting the label that
+    #    goes with the identifier we've normalized to, so we should probably go with that label
+    #    unless that would be annoying (e.g. if it's very long).
+    # 2. It will be impossible to keep this in sync with NameRes for conflated names, since NameRes
+    #    conflation in Babel doesn't pick the preferred label across all possible labels within the
+    #    conflated clique, but instead picks the preferred label for each subclique, and then chooses
+    #    the first preferred label in order of conflation. Which is what we should be doing, but by
+    #    this point we've lost track of each subclique that went into this conflated clique.
+    # 3. Even in a best case scenario, we'd just be trying to replicate some pretty complicated code
+    #    in Babel -- the ideal solution here would be to use the preferred_name being generated by
+    #    Babel, but that will require some large changes to NodeNorm.
+    #
+    # For these reasons, I'm going to try to replace this with a simplified algorithm:
+    # - Order labels in clique identifier order.
+    # - Filter out blank or suspicious labels (e.g. labels that are just `CHEMBL...` identifiers).
+    # - Filter out labels longer than demote_labels_longer_than unless there are no labels under that size.
+    #
+    # Step 1. Get all possible labels.
+    possible_labels = map(lambda eid: eid.get('l', ''), eids)
 
     # Step 2. Filter out any suspicious labels.
     filtered_possible_labels = [l for l in possible_labels if