Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix #88 and #93: multiple decimal places and multiple decimal numbers #91

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 89 additions & 33 deletions lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,17 @@
# limitations under the License.
#
from datetime import datetime, timedelta

from dateutil.relativedelta import relativedelta
from math import ceil, floor

import json
import re

from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer
from lingua_franca.lang.common_data_en import _ARTICLES_EN, _NUM_STRING_EN, \
_LONG_ORDINAL_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN

import re
import json
from lingua_franca import resolve_resource_file
from lingua_franca.time import now_local

Expand Down Expand Up @@ -77,14 +78,22 @@ def generate_plurals_en(originals):
_STRING_LONG_ORDINAL_EN = invert_dict(_LONG_ORDINAL_EN)


def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False,
decimal_places=None):
"""
Convert words in a string into their equivalent numbers.
Args:
text str:
short_scale boolean: True if short scale numbers should be used.
ordinals boolean: True if ordinals (e.g. first, second, third) should
text (str):
short_scale (bool): True if short scale numbers should be used.
ordinals (bool): True if ordinals (e.g. first, second, third) should
be parsed to their number values (1, 2, 3...)
decimal_places (int or None): Positive value will round to X places.
Val of 0 will round up to nearest int,
equivalent to `math.ceil(result)`
Val of -1 will round down to nearest int,
equivalent to `math.floor(result)`
Val of None will perform no rounding,
potentially returning a very long string.

Returns:
str
Expand All @@ -94,7 +103,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):
text = text.lower()
tokens = tokenize(text)
numbers_to_replace = \
_extract_numbers_with_text_en(tokens, short_scale, ordinals)
_extract_numbers_with_text_en(
tokens, short_scale, ordinals, places=decimal_places)
numbers_to_replace.sort(key=lambda number: number.start_index)

results = []
Expand All @@ -114,7 +124,8 @@ def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False):


def _extract_numbers_with_text_en(tokens, short_scale=True,
ordinals=False, fractional_numbers=True):
ordinals=False, fractional_numbers=True,
places=None):
"""
Extract all numbers from a list of Tokens, with the words that
represent them.
Expand All @@ -138,7 +149,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,
while True:
to_replace = \
_extract_number_with_text_en(tokens, short_scale,
ordinals, fractional_numbers)
ordinals, fractional_numbers,
places=places)

if not to_replace:
break
Expand All @@ -156,7 +168,8 @@ def _extract_numbers_with_text_en(tokens, short_scale=True,


def _extract_number_with_text_en(tokens, short_scale=True,
ordinals=False, fractional_numbers=True):
ordinals=False, fractional_numbers=True,
places=None):
"""
This function extracts a number from a list of Tokens.

Expand All @@ -172,15 +185,17 @@ def _extract_number_with_text_en(tokens, short_scale=True,
"""
number, tokens = \
_extract_number_with_text_en_helper(tokens, short_scale,
ordinals, fractional_numbers)
ordinals, fractional_numbers,
places=places)
while tokens and tokens[0].word in _ARTICLES_EN:
tokens.pop(0)
return ReplaceableNumber(number, tokens)


def _extract_number_with_text_en_helper(tokens,
short_scale=True, ordinals=False,
fractional_numbers=True):
fractional_numbers=True,
places=None):
"""
Helper for _extract_number_with_text_en.

Expand All @@ -205,7 +220,8 @@ def _extract_number_with_text_en_helper(tokens,
return fraction, fraction_text

decimal, decimal_text = \
_extract_decimal_with_text_en(tokens, short_scale, ordinals)
_extract_decimal_with_text_en(
tokens, short_scale, ordinals, places=places)
if decimal:
return decimal, decimal_text

Expand Down Expand Up @@ -254,7 +270,7 @@ def _extract_fraction_with_text_en(tokens, short_scale, ordinals):
return None, None


def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
def _extract_decimal_with_text_en(tokens, short_scale, ordinals, places=None):
"""
Extract decimal numbers from a string.

Expand All @@ -264,13 +280,16 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
While this is a helper for extractnumber_en, it also depends on
extractnumber_en, to parse out the components of the decimal.

This does not currently handle things like:
number dot number number number

Args:
tokens [Token]: The text to parse.
short_scale boolean:
ordinals boolean:
places [int] or None: Number of decimal places to return
None performs no rounding
Positive int rounds to so many places
0 value rounds up to nearest int
-1 value rounds down to nearest int
other values throw error

Returns:
(float, [Token])
Expand All @@ -281,24 +300,58 @@ def _extract_decimal_with_text_en(tokens, short_scale, ordinals):
for c in _DECIMAL_MARKER:
partitions = partition_list(tokens, lambda t: t.word == c)

if len(partitions) == 3:
if len(partitions) >= 3:
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
numbers1 = \
_extract_numbers_with_text_en(partitions[0], short_scale,
ordinals, fractional_numbers=False)
ordinals, fractional_numbers=False,
places=places)
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
numbers2 = \
_extract_numbers_with_text_en(partitions[2], short_scale,
ordinals, fractional_numbers=False)

ordinals, fractional_numbers=False,
places=places)
if not numbers1 or not numbers2:
return None, None

# `numbers2` may have caught numbers which are part of the
# input string, but which are not part of *this* number.
# For example, for the input string:
# "a ratio of one point five to one"
# `numbers2` might read, `numbers2 == [5, 1]`
#
# truncate `numbers2` to contain only those tokens which were
# adjacent in the input string.
idx = 1
stop = False
while idx < len(numbers2) and not stop:
if numbers2[idx].tokens[0].index != numbers2[idx-1].tokens[0].index + 1 or \
numbers2[idx].value is None:
ChanceNCounter marked this conversation as resolved.
Show resolved Hide resolved
stop = True
else:
idx += 1
numbers2 = numbers2[:idx]

number = numbers1[-1]
decimal = numbers2[0]

# TODO handle number dot number number number
if "." not in str(decimal.text):
return number.value + float('0.' + str(decimal.value)), \
number.tokens + partitions[1] + decimal.tokens

if "." not in str(numbers2[0].text):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be updated to check for the new _DECIMAL_MARKER?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By this point, _DECIMAL_MARKER has done its job. This bit is easier to look at in a debugger than it is to explain, but numbers2 is a list of ReplaceableNumber objects corresponding to the digits of our decimal part. It's going to be joined, then appended to a decimal point, then cast to a float and summed with the whole number part of our result.

The if statement here goes back to Core, and it makes sure the first element of numbers2 doesn't already have a decimal point. It's hypothetically possible, but I can't figure out how, and removing the if clause doesn't break any test cases. Still, I left it there; I just removed a superfluous variable.

I'll see if I can work backwards through git blame and ask whoever wrote the if statement. I was assuming it had something to do with the way the tokenizer handles certain input. If it's just a relic, I'm all for removing line 318 entirely.

Copy link
Collaborator

@JarbasAl JarbasAl Apr 28, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it doesn't break test cases, I say let's remove it or find a test case for it.

return_value = float('0.' + "".join([str(
decimal.value) for decimal in numbers2]))
return_value = number.value + return_value
if places is not None:
if places == 0:
return_value = ceil(return_value)
elif places == -1:
return_value = floor(return_value)
if places < 1:
return_value = int(return_value)
return_tokens = number.tokens + partitions[1]
for n in numbers2:
return_tokens += n.tokens
if not places:
return return_value, return_tokens

return (round(return_value, places) if places > 0
else return_value), return_tokens
return None, None


Expand All @@ -319,8 +372,8 @@ def _extract_whole_number_with_text_en(tokens, short_scale, ordinals):
The value parsed, and tokens that it corresponds to.

"""
multiplies, string_num_ordinal, string_num_scale = \
_initialize_number_data(short_scale)
multiplies, string_num_ordinal, string_num_scale = _initialize_number_data(
short_scale)

number_words = [] # type: [Token]
val = False
Expand Down Expand Up @@ -560,7 +613,7 @@ def _initialize_number_data(short_scale):
return multiplies, string_num_ordinal_en, string_num_scale_en


def extractnumber_en(text, short_scale=True, ordinals=False):
def extractnumber_en(text, short_scale=True, ordinals=False, decimal_places=None):
"""
This function extracts a number from a text string,
handles pronunciations in long scale and short scale
Expand All @@ -571,13 +624,15 @@ def extractnumber_en(text, short_scale=True, ordinals=False):
text (str): the string to normalize
short_scale (bool): use short scale if True, long scale if False
ordinals (bool): consider ordinal numbers, third=3 instead of 1/3
decimal_places (int or None): rounds to # decimal places. uses builtin round()
Returns:
(int) or (float) or False: The extracted number or False if no number
was found

"""
return _extract_number_with_text_en(tokenize(text.lower()),
short_scale, ordinals).value
short_scale, ordinals,
places=decimal_places).value


def extract_duration_en(text):
Expand Down Expand Up @@ -1476,7 +1531,7 @@ def isFractional_en(input_str, short_scale=True):
return False


def extract_numbers_en(text, short_scale=True, ordinals=False):
def extract_numbers_en(text, short_scale=True, ordinals=False, decimal_places=None):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -1487,11 +1542,12 @@ def extract_numbers_en(text, short_scale=True, ordinals=False):
is now common in most English speaking countries.
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
decimal_places (int or False): rounds to # decimal places. uses builtin round()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more `int or False` here; it should read `int or None` to match the other docstrings.

Returns:
list: list of extracted numbers as floats
"""
results = _extract_numbers_with_text_en(tokenize(text),
short_scale, ordinals)
short_scale, ordinals, places=decimal_places)
return [float(result.value) for result in results]


Expand Down
27 changes: 23 additions & 4 deletions lingua_franca/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ def match_one(query, choices):
else:
return best

# TODO update these docstrings when decimal_places has been implemented
# in all parsers

def extract_numbers(text, short_scale=True, ordinals=False, lang=None):

def extract_numbers(text, short_scale=True, ordinals=False, lang=None,
decimal_places=None):
"""
Takes in a string and extracts a list of numbers.

Expand All @@ -90,12 +94,19 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
lang (str): the BCP-47 code for the language to use, None uses default
decimal_places (int or None): Positive value will round to X places.
Val of 0 will round up to nearest int,
equivalent to `math.ceil(result)`
Val of -1 will round down to nearest int,
equivalent to `math.floor(result)`
Val of None will perform no rounding,
potentially returning a very long string.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this a previous idea for implementation? Assuming this will be the new behaviour you described where 0 is a regular rounding and there is no handling of a -1 input?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At this point, I can't decide which way is better. There are three things at play:

  • If these functions don't round, their output could be a string of stupendous length
  • You don't always want to round the same way
  • Sometimes you feel like an int, sometimes you don't

Returns:
list: list of extracted numbers as floats, or empty list if none found
"""
lang_code = get_primary_lang_code(lang)
if lang_code == "en":
return extract_numbers_en(text, short_scale, ordinals)
return extract_numbers_en(text, short_scale, ordinals, decimal_places)
elif lang_code == "de":
return extract_numbers_de(text, short_scale, ordinals)
elif lang_code == "fr":
Expand All @@ -112,7 +123,8 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None):
return []


def extract_number(text, short_scale=True, ordinals=False, lang=None):
def extract_number(text, short_scale=True, ordinals=False, lang=None,
decimal_places=None):
"""Takes in a string and extracts a number.

Args:
Expand All @@ -123,14 +135,21 @@ def extract_number(text, short_scale=True, ordinals=False, lang=None):
See https://en.wikipedia.org/wiki/Names_of_large_numbers
ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3
lang (str): the BCP-47 code for the language to use, None uses default
decimal_places (int or None): Positive value will round to X places.
Val of 0 will round up to nearest int,
equivalent to `math.ceil(result)`
Val of -1 will round down to nearest int,
equivalent to `math.floor(result)`
Val of None will perform no rounding,
potentially returning a very long string.
Returns:
(int, float or False): The number extracted or False if the input
text contains no numbers
"""
lang_code = get_primary_lang_code(lang)
if lang_code == "en":
return extractnumber_en(text, short_scale=short_scale,
ordinals=ordinals)
ordinals=ordinals, decimal_places=decimal_places)
elif lang_code == "es":
return extractnumber_es(text)
elif lang_code == "pt":
Expand Down
28 changes: 24 additions & 4 deletions test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,12 @@ def test_extract_number(self):
self.assertEqual(extract_number("eight hundred trillion two hundred \
fifty seven"), 800000000000257.0)

# TODO handle this case
# self.assertEqual(
# extract_number("6 dot six six six"),
# 6.666)
self.assertEqual(extract_number("6 dot six six six"), 6.666)
self.assertEqual(extract_number(
"6 dot six six six", decimal_places=2), round(6.666, 2))
self.assertEqual(extract_number(
"6 point seventy", decimal_places=2), 6.7)

self.assertTrue(extract_number("The tennis player is fast") is False)
self.assertTrue(extract_number("fraggle") is False)

Expand Down Expand Up @@ -735,6 +737,24 @@ def test_multiple_numbers(self):
self.assertEqual(extract_numbers("this is a seven eight nine and a"
" half test"),
[7.0, 8.0, 9.5])
self.assertEqual(extract_numbers("this is a six point five seven nine"
" bingo ten nancy forty six test"),
[6.579, 10.0, 46.0])
self.assertEqual(extract_numbers("this is a six point five seven nine"
" bingo ten nancy forty six test"
" with decimal rounding", decimal_places=2),
[round(6.579, 2), 10, 46])
# test integer rounding, multiple decimals in string
self.assertEqual(extract_numbers(
"five hundred seventy point seven two and thirty one point eight"),
[570.72, 31.8])
self.assertEqual(extract_numbers(
"five hundred seventy point seven two and thirty one point eight",
decimal_places=0), [571, 32])
self.assertEqual(extract_numbers(
"five hundred seventy point seven two and thirty one point eight",
decimal_places=-1), [570, 31])


def test_contractions(self):
self.assertEqual(normalize("ain't"), "is not")
Expand Down