From 00c88761090c3dcc0a121d1e27ea1d4eae4fb2d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=85ke=20Forslund?= Date: Tue, 15 Jun 2021 11:49:24 +0200 Subject: [PATCH 1/2] Add extract_duration for Swedish --- lingua_franca/lang/parse_sv.py | 141 ++++++++++++++++++++++++++++++++- test/test_parse_sv.py | 41 +++++++++- 2 files changed, 179 insertions(+), 3 deletions(-) diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py index 9a5f0928..d8aea266 100644 --- a/lingua_franca/lang/parse_sv.py +++ b/lingua_franca/lang/parse_sv.py @@ -13,10 +13,147 @@ # See the License for the specific language governing permissions and # limitations under the License. # -from datetime import datetime +from datetime import datetime, timedelta from dateutil.relativedelta import relativedelta + from lingua_franca.time import now_local -from .parse_common import is_numeric, look_for_fractions, Normalizer + +from .parse_common import (is_numeric, look_for_fractions, Normalizer, + tokenize, Token) + + +def _find_numbers_in_text(tokens): + """Finds duration related numbers in texts and makes a list of mappings. + + The mapping will be for number to token that created it, if no number was + created from the token the mapping will be from None to the token. + + The function is optimized to generate data that can be parsed to a duration + so it returns the list in reverse order to make the "size" (minutes/hours/ + etc.) come first and the related numbers afterwards. + + Args: + tokens: Tokens to parse + + Returns: + list of (number, token) tuples + """ + parts = [] + for tok in tokens: + res = extract_number_sv(tok.word) + if res: + parts.insert(0, (res, tok)) + # Special case for quarter of an hour + if tok.word == 'kvart': + parts.insert(0, (None, Token('timmar', index=-1))) + elif tok.word in ['halvtimme', 'halvtimma']: + parts.insert(0, (30, tok)) + parts.insert(0, (None, Token('minuter', index=-1))) + else: + parts.insert(0, (None, tok)) + return parts + + +def _combine_adjacent_numbers(number_map): + """Combine adjacent numbers through multiplication. + + Walks through a number map and joins adjasent numbers to handle cases + such as "en halvtimme" (one half hour). + + Returns: + (list): simplified number_map + """ + simplified = [] + skip = False + for i in range(len(number_map) - 1): + if skip: + skip = False + continue + if number_map[i][0] and number_map[i + 1][0]: + combined_number = number_map[i][0] * number_map[i + 1][0] + combined_tokens = (number_map[i][1], number_map[i + 1][1]) + simplified.append((combined_number, combined_tokens)) + skip = True + else: + simplified.append((number_map[i][0], (number_map[i][1],))) + + if not skip: + simplified.append((number_map[-1][0], (number_map[-1][1],))) + return simplified + + +def extract_duration_sv(text): + """ + Convert an swedish phrase into a number of seconds. + + The function handles durations from seconds up to days. + + Convert things like: + "10 minute" + "2 and a half hours" + "3 days 8 hours 10 minutes and 49 seconds" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. + """ + tokens = tokenize(text) + number_tok_map = _find_numbers_in_text(tokens) + # Combine adjacent numbers + simplified = _combine_adjacent_numbers(number_tok_map) + + states = { + 'days': 0, + 'hours': 0, + 'minutes': 0, + 'seconds': 0 + } + + # Parser state, mapping words that should set the parser to collect + # numbers to a specific time "size" + state_words = { + 'days': ('dygn', 'dag', 'dagar', 'dags'), + 'hours': ('timmar', 'timme', 'timma', 'timmes', 'timmas'), + 'minutes': ('minuter', 'minuters', 'minut', 'minuts'), + 'seconds': ('sekunder', 'sekunders', 'sekund', 'sekunds') + } + binding_words = ('och') + + consumed = [] + state = None + valid = False + + for num, toks in simplified: + if state and num: + states[state] += num + consumed.extend(toks) + valid = True # If a state field got set this is valid duration + elif num is None: + for s in state_words: + if toks[0].word in state_words[s]: + state = s + consumed.extend(toks) + break + else: + if toks[0].word not in binding_words: + state = None + + td = timedelta(**states) + remainder = ' '.join([t.word for t in tokens if t not in consumed]) + return (td, remainder) if valid else None def extract_number_sv(text, short_scale=True, ordinals=False): diff --git a/test/test_parse_sv.py b/test/test_parse_sv.py index 20b87f58..ecba3b63 100644 --- a/test/test_parse_sv.py +++ b/test/test_parse_sv.py @@ -14,11 +14,12 @@ # limitations under the License. # import unittest -from datetime import datetime, time +from datetime import datetime, time, timedelta from lingua_franca import load_language, unload_language from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_number +from lingua_franca.parse import extract_duration from lingua_franca.parse import normalize @@ -127,6 +128,44 @@ def test_numbers(self): lang='sv-se'), "det är 18 19 20 test") +class TestExtractDuration(unittest.TestCase): + def test_valid_extract_duration(self): + """Duration in sentence.""" + td, remains = extract_duration("5 minuter", lang='sv-se') + self.assertEqual(td, timedelta(seconds=300)) + self.assertEqual(remains, '') + + td, remains = extract_duration("om 2 och en halv timme", lang='sv-se') + self.assertEqual(td, timedelta(hours=2, minutes=30)) + self.assertEqual(remains, "om och") + + td, remains = extract_duration("starta en 9 minuters timer", + lang='sv-se') + self.assertEqual(td, timedelta(minutes=9)) + self.assertEqual(remains, "starta timer") + + # Extraction of things like "kvart" and "halvtimme" + td, remains = extract_duration("i en kvart", lang='sv-se') + self.assertEqual(td, timedelta(minutes=15)) + self.assertEqual(remains, "i") + + td, remains = extract_duration("hämta mig om två timmar och en kvart", + lang='sv-se') + self.assertEqual(td, timedelta(hours=2, minutes=15)) + self.assertEqual(remains, "hämta mig om och") + + td, remains = extract_duration("om en halvtimme", lang='sv-se') + self.assertEqual(td, timedelta(minutes=30)) + self.assertEqual(remains, "om") + + def test_invalid_extract_duration(self): + """No duration in sentence.""" + res = extract_duration("vad är en myrslok", lang='sv-se') + self.assertEqual(res, None) + + res = extract_duration("svaret är 42", lang='sv-se') + self.assertEqual(res, None) + if __name__ == "__main__": unittest.main() From bb6239c4bf308ea941a992d2ffc915bccec06d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=85ke=20Forslund?= Date: Tue, 7 Dec 2021 07:25:09 +0100 Subject: [PATCH 2/2] Fix flake-8 warning, line too long --- lingua_franca/lang/parse_sv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py index d8aea266..02164111 100644 --- a/lingua_franca/lang/parse_sv.py +++ b/lingua_franca/lang/parse_sv.py @@ -166,8 +166,8 @@ def extract_number_sv(text, short_scale=True, ordinals=False): (int) or (float): The value of extracted number """ # TODO: short_scale and ordinals don't do anything here. - # The parameters are present in the function signature for API compatibility - # reasons. + # The parameters are present in the function signature for API + # compatibility reasons. text = text.lower() aWords = text.split() and_pass = False