Skip to content
This repository has been archived by the owner on Apr 11, 2022. It is now read-only.

Commit

Permalink
Merge pull request #789 from NYPL-Simplified/missing-authors
Browse files Browse the repository at this point in the history
Guess sort name based on display name in the metadata layer instead of leaving out the author.
  • Loading branch information
aslagle authored Jan 31, 2018
2 parents f30f02f + 77297bc commit 2024781
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 4 deletions.
16 changes: 13 additions & 3 deletions metadata_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
)
from classifier import NO_VALUE, NO_NUMBER
from analytics import Analytics
from util.personal_names import display_name_to_sort_name

class ReplacementPolicy(object):
"""How serious should we be about overwriting old metadata with
Expand Down Expand Up @@ -290,7 +291,9 @@ def find_sort_name(self, _db, identifiers, metadata_client):

# Is there a contributor already in the database with this
# exact display name? If so, use their sort name.
sort_name = self.display_name_to_sort_name(_db, self.display_name)
# If not, take our best guess based on the display name.
sort_name = self.display_name_to_sort_name_from_existing_contributor(
_db, self.display_name)
if sort_name:
self.sort_name = sort_name
return True
Expand All @@ -301,11 +304,18 @@ def find_sort_name(self, _db, identifiers, metadata_client):
sort_name = self.display_name_to_sort_name_through_canonicalizer(
_db, identifiers, metadata_client
)
self.sort_name = sort_name
if sort_name:
self.sort_name = sort_name
return True

# If there's still no sort name, take our best guess based
# on the display name.
self.sort_name = display_name_to_sort_name(self.display_name)

return (self.sort_name is not None)

@classmethod
def display_name_to_sort_name(self, _db, display_name):
def display_name_to_sort_name_from_existing_contributor(self, _db, display_name):
"""Find the sort name for this book's author, assuming it's easy.
'Easy' means we already have an established sort name for a
Expand Down
52 changes: 51 additions & 1 deletion tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from . import (
DatabaseTest,
DummyHTTPClient,
DummyMetadataClient,
)

from s3 import MockS3Uploader
Expand Down Expand Up @@ -689,7 +690,56 @@ def test_apply(self):
contributor_new, changed = contributor_data.apply(contributor_new)
eq_(changed, False)


def test_display_name_to_sort_name_from_existing_contributor(self):
    """A display name resolves to a sort name only if a contributor
    with that display name has already been created.
    """
    lookup = ContributorData.display_name_to_sort_name_from_existing_contributor

    # Create a contributor whose display name is "John Doe"; looking
    # that display name up should hand back this contributor's sort name.
    known_contributor, _ = self._contributor(
        sort_name="Sort, Name", display_name="John Doe"
    )
    found = lookup(self._db, "John Doe")
    eq_("Sort, Name", found)

    # No contributor was created with this display name, so the
    # lookup has nothing to offer.
    missing = lookup(self._db, "Jane Doe")
    eq_(None, missing)

def test_find_sort_name(self):
    """Walk find_sort_name() through each source of a sort name.

    The scenarios below show the order of precedence this test
    expects: a sort name that is already set, then an existing
    contributor with the same display name, then the metadata
    client, and finally a guess derived from the display name.
    Every scenario expects find_sort_name() to return True.
    """
    client = DummyMetadataClient()
    client.lookups["Metadata Client Author"] = "Author, M. C."
    known_contributor, _ = self._contributor(
        sort_name="Author, E.", display_name="Existing Author"
    )
    data = ContributorData()

    # A sort name that is already set is kept, even before any
    # display name has been assigned.
    data.sort_name = "Sort Name"
    eq_(True, data.find_sort_name(self._db, [], client))
    eq_("Sort Name", data.sort_name)

    # Each row is (starting sort name, display name, expected sort name).
    # The rows run in order against the same ContributorData object.
    scenarios = [
        # An existing sort name still wins when the display name
        # matches a contributor in the database...
        ("Sort Name", "Existing Author", "Sort Name"),
        # ...and when the metadata client knows the display name.
        ("Sort Name", "Metadata Client Author", "Sort Name"),
        # With no sort name, an existing author with the same
        # display name supplies theirs.
        (None, "Existing Author", "Author, E."),
        # With no sort name and no existing author, the metadata
        # wrangler is asked for a sort name.
        (None, "Metadata Client Author", "Author, M. C."),
        # With no sort name, no existing author, and nothing from the
        # metadata wrangler, the sort name is guessed from the
        # display name.
        (None, "New Author", "Author, New"),
    ]
    for starting_sort_name, display_name, expected_sort_name in scenarios:
        data.sort_name = starting_sort_name
        data.display_name = display_name
        succeeded = data.find_sort_name(self._db, [], client)
        eq_(True, succeeded)
        eq_(expected_sort_name, data.sort_name)

class TestLinkData(DatabaseTest):

Expand Down

0 comments on commit 2024781

Please sign in to comment.