From cd31f501e72e735ccc1b6bc9f34ccc2269513bd2 Mon Sep 17 00:00:00 2001 From: Amy Slagle Date: Tue, 30 Jan 2018 12:49:49 -0500 Subject: [PATCH 1/2] Guess sort name based on display name in the metadata layer instead of leaving out the author. --- metadata_layer.py | 7 ++++++- tests/test_metadata.py | 9 ++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/metadata_layer.py b/metadata_layer.py index df5ced1d6..a2a74141e 100644 --- a/metadata_layer.py +++ b/metadata_layer.py @@ -44,6 +44,7 @@ ) from classifier import NO_VALUE, NO_NUMBER from analytics import Analytics +from util.personal_names import display_name_to_sort_name class ReplacementPolicy(object): """How serious should we be about overwriting old metadata with @@ -290,6 +291,7 @@ def find_sort_name(self, _db, identifiers, metadata_client): # Is there a contributor already in the database with this # exact sort name? If so, use their display name. + # If not, take our best guess based on the display name. sort_name = self.display_name_to_sort_name(_db, self.display_name) if sort_name: self.sort_name = sort_name @@ -302,6 +304,7 @@ def find_sort_name(self, _db, identifiers, metadata_client): _db, identifiers, metadata_client ) self.sort_name = sort_name + return (self.sort_name is not None) @classmethod @@ -329,7 +332,9 @@ def display_name_to_sort_name(self, _db, display_name): contributors[0].sort_name ) return contributors[0].sort_name - return None + # There weren't any matching contributors. Guess the sort name + # based on the display name instead. + return display_name_to_sort_name(display_name) def _display_name_to_sort_name( self, _db, metadata_client, identifier_obj diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 3330a418e..d53a510c6 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -689,7 +689,14 @@ def test_apply(self): contributor_new, changed = contributor_data.apply(contributor_new) eq_(changed, False) - + def test_display_name_to_sort_name(self): + # If there's an existing contributor with a matching display name, + # we'll use their sort name. + existing_contributor, ignore = self._contributor(sort_name="Sort, Name", display_name="John Doe") + eq_("Sort, Name", ContributorData.display_name_to_sort_name(self._db, "John Doe")) + + # Otherwise, we'll guess based on the display name. + eq_("Doe, Jane", ContributorData.display_name_to_sort_name(self._db, "Jane Doe")) class TestLinkData(DatabaseTest): From 77297bccb808076924dfb2031c785de01749fb1f Mon Sep 17 00:00:00 2001 From: Amy Slagle Date: Tue, 30 Jan 2018 16:04:38 -0500 Subject: [PATCH 2/2] Changed find_sort_name to still check the metadata client before guessing, and added tests. --- metadata_layer.py | 17 +++++++++----- tests/test_metadata.py | 51 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/metadata_layer.py b/metadata_layer.py index a2a74141e..d186ec722 100644 --- a/metadata_layer.py +++ b/metadata_layer.py @@ -292,7 +292,8 @@ def find_sort_name(self, _db, identifiers, metadata_client): # Is there a contributor already in the database with this # exact sort name? If so, use their display name. # If not, take our best guess based on the display name. - sort_name = self.display_name_to_sort_name(_db, self.display_name) + sort_name = self.display_name_to_sort_name_from_existing_contributor( + _db, self.display_name) if sort_name: self.sort_name = sort_name return True @@ -303,12 +304,18 @@ def find_sort_name(self, _db, identifiers, metadata_client): sort_name = self.display_name_to_sort_name_through_canonicalizer( _db, identifiers, metadata_client ) - self.sort_name = sort_name + if sort_name: + self.sort_name = sort_name + return True + + # If there's still no sort name, take our best guess based + # on the display name. + self.sort_name = display_name_to_sort_name(self.display_name) return (self.sort_name is not None) @classmethod - def display_name_to_sort_name(self, _db, display_name): + def display_name_to_sort_name_from_existing_contributor(self, _db, display_name): """Find the sort name for this book's author, assuming it's easy. 'Easy' means we already have an established sort name for a @@ -332,9 +339,7 @@ def display_name_to_sort_name(self, _db, display_name): contributors[0].sort_name ) return contributors[0].sort_name - # There weren't any matching contributors. Guess the sort name - # based on the display name instead. - return display_name_to_sort_name(display_name) + return None def _display_name_to_sort_name( self, _db, metadata_client, identifier_obj diff --git a/tests/test_metadata.py b/tests/test_metadata.py index d53a510c6..cfe7f1c87 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -41,6 +41,7 @@ from . import ( DatabaseTest, DummyHTTPClient, + DummyMetadataClient, ) from s3 import MockS3Uploader @@ -689,14 +690,56 @@ def test_apply(self): contributor_new, changed = contributor_data.apply(contributor_new) eq_(changed, False) - def test_display_name_to_sort_name(self): + def test_display_name_to_sort_name_from_existing_contributor(self): # If there's an existing contributor with a matching display name, # we'll use their sort name. existing_contributor, ignore = self._contributor(sort_name="Sort, Name", display_name="John Doe") - eq_("Sort, Name", ContributorData.display_name_to_sort_name(self._db, "John Doe")) + eq_("Sort, Name", ContributorData.display_name_to_sort_name_from_existing_contributor(self._db, "John Doe")) - # Otherwise, we'll guess based on the display name. - eq_("Doe, Jane", ContributorData.display_name_to_sort_name(self._db, "Jane Doe")) + # Otherwise, we don't know. + eq_(None, ContributorData.display_name_to_sort_name_from_existing_contributor(self._db, "Jane Doe")) + + def test_find_sort_name(self): + metadata_client = DummyMetadataClient() + metadata_client.lookups["Metadata Client Author"] = "Author, M. C." + existing_contributor, ignore = self._contributor(sort_name="Author, E.", display_name="Existing Author") + contributor_data = ContributorData() + + # If there's already a sort name, keep it. + contributor_data.sort_name = "Sort Name" + eq_(True, contributor_data.find_sort_name(self._db, [], metadata_client)) + eq_("Sort Name", contributor_data.sort_name) + + contributor_data.sort_name = "Sort Name" + contributor_data.display_name = "Existing Author" + eq_(True, contributor_data.find_sort_name(self._db, [], metadata_client)) + eq_("Sort Name", contributor_data.sort_name) + + contributor_data.sort_name = "Sort Name" + contributor_data.display_name = "Metadata Client Author" + eq_(True, contributor_data.find_sort_name(self._db, [], metadata_client)) + eq_("Sort Name", contributor_data.sort_name) + + # If there's no sort name but there's already an author with the same display name, + # use that author's sort name. + contributor_data.sort_name = None + contributor_data.display_name = "Existing Author" + eq_(True, contributor_data.find_sort_name(self._db, [], metadata_client)) + eq_("Author, E.", contributor_data.sort_name) + + # If there's no sort name and no existing author, check the metadata wrangler + # for a sort name. + contributor_data.sort_name = None + contributor_data.display_name = "Metadata Client Author" + eq_(True, contributor_data.find_sort_name(self._db, [], metadata_client)) + eq_("Author, M. C.", contributor_data.sort_name) + + # If there's no sort name, no existing author, and nothing from the metadata + # wrangler, guess the sort name based on the display name. + contributor_data.sort_name = None + contributor_data.display_name = "New Author" + eq_(True, contributor_data.find_sort_name(self._db, [], metadata_client)) + eq_("Author, New", contributor_data.sort_name) class TestLinkData(DatabaseTest):