-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
providers: provide DataCite-like DOI locally
- Generate a random, configurable length, base32, URI-friendly, hyphen-separated, optionally checksummed DOI
- Loading branch information
Showing
6 changed files
with
312 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# This file is part of Invenio. | ||
# Copyright (C) 2015-2018 CERN. | ||
# | ||
# Invenio is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Douglas Crockford Base32-URL encoder. | ||
This encoder/decoder: | ||
- uses Douglas Crockford Base32 encoding | ||
- allows for checksum | ||
- encodes the checksum using only characters in the base32 set | ||
(only digits in fact) | ||
- produces strings that are URI-friendly (no '=' or '/' for instance) | ||
This is based on: | ||
- https://github.com/datacite/base32-url | ||
- https://github.com/jbittel/base32-crockford | ||
""" | ||
import string | ||
|
||
import six | ||
|
||
# Douglas Crockford alphabet: the letters i, l, o and u are left out.
ENCODING_CHARS = '0123456789abcdefghjkmnpqrstvwxyz'
# Reverse lookup: symbol -> numeric value (its position in ENCODING_CHARS).
DECODING_CHARS = dict(zip(ENCODING_CHARS, range(len(ENCODING_CHARS))))
|
||
|
||
def encode(number, split_every=0, checksum=False):
    """Encode `number` to a URI-friendly Douglas Crockford base32 string.

    :param number: non-negative integer to encode
    :param split_every: if non-zero, insert '-' every `split_every`
        characters going from left to right (checksum digits included)
    :param checksum: append the two-digit modulo 97-10 (ISO 7064) checksum
        to the string
    :returns: A Douglas Crockford base32 encoded string composed only
        of valid URI characters.
    :raises ValueError: if `number` or `split_every` is negative.
    """
    # NOTE: kept as an assert so callers still see AssertionError on a
    # non-integer `number`; be aware it is stripped under `python -O`.
    assert isinstance(number, six.integer_types)

    if number < 0:
        raise ValueError("Invalid 'number'. 'number' must be >= 0.")

    if split_every < 0:
        raise ValueError(
            "Invalid 'split_every'. 'split_every' must be >= 0."
        )

    # Collect symbols least-significant first, then reverse once; this
    # avoids re-copying the whole string on every prepend.
    symbols = []
    remaining = number
    while remaining > 0:
        remaining, remainder = divmod(remaining, 32)
        symbols.append(ENCODING_CHARS[remainder])
    # Zero produces no symbol in the loop above; it is encoded as '0'
    # rather than as an empty string.
    encoded = ''.join(reversed(symbols)) or ENCODING_CHARS[0]

    if checksum:
        # NOTE: 100 * number is used because datacite also uses it
        computed_checksum = 97 - ((100 * number) % 97) + 1
        encoded += "{:02d}".format(computed_checksum)

    if split_every:
        encoded = '-'.join(
            encoded[start:start + split_every]
            for start in range(0, len(encoded), split_every)
        )

    return encoded
|
||
|
||
def normalize(encoded):
    """Return the canonical form of an encoded string.

    - '-' separators are removed
    - I, i, L and l become the digit 1
    - O and o become the digit 0
    - the remaining letters are lowercased

    :param encoded: string to normalize
    :returns: normalized string.
    :raises ValueError: if a character outside the encoding alphabet
        remains after normalization.
    """
    # `maketrans` lives on `str` in Python 3 and in `string` in Python 2.
    if six.PY3:
        lookalikes = ''.maketrans('IiLlOo', '111100')
    else:
        lookalikes = string.maketrans('IiLlOo', '111100')

    without_hyphens = encoded.replace('-', '')
    canonical = without_hyphens.translate(lookalikes).lower()

    for symbol in canonical:
        if symbol not in ENCODING_CHARS:
            raise ValueError("'encoded' contains undecodable characters")

    return canonical
|
||
|
||
def decode(encoded, checksum=False):
    """Decode an `encoded` string (as produced by `encode`) to a number.

    The string is normalized before decoding, so hyphens, upper-case
    letters and the look-alike symbols I, L and O are accepted.
    If `checksum` is enabled, raises a ValueError on checksum error.

    :param encoded: string to decode
    :param checksum: treat the last two symbols as a modulo 97-10
        (ISO 7064) checksum and validate it
    :returns: original number.
    :raises ValueError: if the string contains undecodable characters or,
        when `checksum` is enabled, the checksum is missing or wrong.
    """
    # Normalize *before* slicing off the checksum: `encode` hyphenates
    # after appending the checksum, so a separator may sit between or
    # right before the two check digits (e.g. '16-j8-2').
    encoded = normalize(encoded)

    if checksum:
        encoded_checksum = encoded[-2:]
        encoded = encoded[:-2]
        # The checksum is always rendered as exactly two decimal digits.
        if len(encoded_checksum) != 2 or not encoded_checksum.isdigit():
            raise ValueError("Invalid checksum.")

    # Horner's scheme: most significant symbol first, no repeated powers.
    number = 0
    for symbol in encoded:
        number = number * 32 + DECODING_CHARS[symbol]

    if checksum:
        # NOTE: 100 * number is used because datacite also uses it
        computed_checksum = 97 - ((100 * number) % 97) + 1
        if int(encoded_checksum, 10) != computed_checksum:
            raise ValueError("Invalid checksum.")

    return number
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
#!/usr/bin/env sh | ||
# -*- coding: utf-8 -*- | ||
# | ||
# This file is part of Invenio. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# -*- coding: utf-8 -*- | ||
# | ||
# This file is part of Invenio. | ||
# Copyright (C) 2015-2019 CERN. | ||
# Copyright (C) 2019 Northwestern University. | ||
# | ||
# Invenio is free software; you can redistribute it and/or modify it | ||
# under the terms of the MIT License; see LICENSE file for more details. | ||
|
||
"""Provider tests.""" | ||
|
||
import pytest | ||
|
||
from invenio_pidstore.providers import base32 | ||
|
||
|
||
def test_basic_encode():
    """Plain integers encode to the expected base32 strings."""
    expected_by_number = {32: "10", 1234: "16j"}
    for number, expected in expected_by_number.items():
        assert base32.encode(number) == expected
|
||
|
||
def test_basic_decode():
    """A plain base32 string decodes to the expected integer."""
    decoded = base32.decode("16j")
    assert decoded == 1234
|
||
|
||
def test_decode_normalizes_symbols():
    """Case, look-alike letters and hyphens do not affect decoding."""
    lowercase = "abcdefghijklmnopqrstvwxyz"
    uppercase = "ABCDEFGHIJKLMNOPQRSTVWXYZ"
    assert base32.decode(lowercase) == base32.decode(uppercase)

    # I/L/i/l read as 1 and O/o read as 0.
    lookalikes = base32.decode('IL1O0ilo')
    assert lookalikes == base32.decode('11100110')

    hyphenated = base32.decode('1-6-j')
    assert hyphenated == base32.decode('16j')
|
||
|
||
def test_decode_raises_for_invalid_string():
    """Characters outside the alphabet are rejected with ValueError."""
    undecodable = "Ü'+?"
    with pytest.raises(ValueError):
        base32.decode(undecodable)
|
||
|
||
def test_encode_hyphenates():
    """`split_every` inserts hyphens; a negative value is rejected."""
    assert base32.encode(1234, split_every=1) == "1-6-j"

    # No `assert` around the call: only the raised exception matters, and
    # asserting on a return value could mask a "did not raise" failure.
    with pytest.raises(ValueError):
        base32.encode(1234, split_every=-1)
|
||
|
||
def test_encode_checksum():
    """Enabling `checksum` appends the two check digits."""
    with_checksum = base32.encode(1234, checksum=True)
    assert with_checksum == "16j82"
|
||
|
||
def test_decode_checksum():
    """A string carrying a valid checksum decodes to the number."""
    decoded = base32.decode("16j82", checksum=True)
    assert decoded == 1234
|
||
|
||
def test_decode_invalid_checksum():
    """A wrong checksum is rejected with ValueError."""
    # No `assert` around the call: only the raised exception matters, and
    # asserting on a return value could mask a "did not raise" failure.
    with pytest.raises(ValueError):
        base32.decode("16j44", checksum=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters