From 0a6a9f3a59720b5c1254cadd65de7c15e65ea935 Mon Sep 17 00:00:00 2001 From: Arnab Chakraborty Date: Tue, 5 Nov 2024 14:22:08 +0530 Subject: [PATCH 1/5] Add custom function for title case to handle apostrophe --- picard/track.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/picard/track.py b/picard/track.py index fbaae45601..fccd4e5e70 100644 --- a/picard/track.py +++ b/picard/track.py @@ -321,6 +321,29 @@ def _customize_metadata(self): @staticmethod def _genres_to_metadata(genres, limit=None, minusage=0, filters='', join_with=None): + def titlize(text): + """Converts text to title case using custom rules. + + Capitalizes the first character of each word while converting remaining + characters to lowercase. Handles contractions properly by keeping the + apostrophe intact. + + Args: + text (str): The input string to be converted to title case. + + Returns: + str: The input text converted to title case format. + + Examples: + >>> titlize("children's music") + 'Children's Music' + >>> titlize("blues") + "Blues" + """ + return re.sub(r"[A-Za-z]+('[A-Za-z]+)?", + lambda m: m.group(0)[0].upper() + m.group(0)[1:].lower(), + text) + if limit is not None and limit < 1: return [] @@ -335,7 +358,7 @@ def _genres_to_metadata(genres, limit=None, minusage=0, filters='', join_with=No # Find most common genres most_common_genres = genres.most_common(limit) - genres_list = [name.title() for name, _count in most_common_genres] + genres_list = [titlize(name) for name, _count in most_common_genres] genres_list.sort() # And generate the genre metadata tag From b23e3d653cca6fc25aaa95a3896021ea937e0f18 Mon Sep 17 00:00:00 2001 From: Arnab Chakraborty Date: Tue, 5 Nov 2024 14:22:08 +0530 Subject: [PATCH 2/5] Use titlecase util function instead of string title method Use titlecase util function instead of string title method --- picard/track.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/picard/track.py 
b/picard/track.py index fccd4e5e70..7707e27dfb 100644 --- a/picard/track.py +++ b/picard/track.py @@ -75,7 +75,10 @@ ScriptParser, iter_active_tagging_scripts, ) -from picard.util import pattern_as_regex +from picard.util import ( + pattern_as_regex, + titlecase, +) from picard.util.imagelist import ImageList from picard.util.textencoding import asciipunct @@ -358,7 +361,7 @@ def titlize(text): # Find most common genres most_common_genres = genres.most_common(limit) - genres_list = [titlize(name) for name, _count in most_common_genres] + genres_list = [titlecase(name) for name, _count in most_common_genres] genres_list.sort() # And generate the genre metadata tag From d11d201f640ada1d6e5cdf147a15b0ca3e5cf368 Mon Sep 17 00:00:00 2001 From: Arnab Chakraborty Date: Tue, 5 Nov 2024 17:58:19 +0530 Subject: [PATCH 3/5] Create titlecase util function Create titlecase util function --- picard/script/functions.py | 30 ++---------------------- picard/util/__init__.py | 48 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 28 deletions(-) diff --git a/picard/script/functions.py b/picard/script/functions.py index 94bc58d665..967c7997a1 100644 --- a/picard/script/functions.py +++ b/picard/script/functions.py @@ -41,7 +41,6 @@ from functools import reduce import operator import re -import unicodedata from picard.const.countries import RELEASE_COUNTRIES from picard.extension_points.script_functions import script_function @@ -57,6 +56,7 @@ ) from picard.util import ( pattern_as_regex, + titlecase, uniqify, ) @@ -962,33 +962,7 @@ def func_ne_any(parser, x, *args): _Since Picard 2.1_""" )) def func_title(parser, text): - # GPL 2.0 licensed code by Javier Kohen, Sambhav Kothari - # from https://github.com/metabrainz/picard-plugins/blob/2.0/plugins/titlecase/titlecase.py - if not text: - return text - capitalized = text[0].capitalize() - capital = False - for i in range(1, len(text)): - t = text[i] - if t in "’'" and text[i-1].isalpha(): - capital = False - elif 
iswbound(t): - capital = True - elif capital and t.isalpha(): - capital = False - t = t.capitalize() - else: - capital = False - capitalized += t - return capitalized - - -def iswbound(char): - # GPL 2.0 licensed code by Javier Kohen, Sambhav Kothari - # from https://github.com/metabrainz/picard-plugins/blob/2.0/plugins/titlecase/titlecase.py - """ Checks whether the given character is a word boundary """ - category = unicodedata.category(char) - return 'Zs' == category or 'Sk' == category or 'P' == category[0] + return titlecase(text) @script_function(documentation=N_( diff --git a/picard/util/__init__.py b/picard/util/__init__.py index 89b3c8de90..59968c7beb 100644 --- a/picard/util/__init__.py +++ b/picard/util/__init__.py @@ -1183,3 +1183,51 @@ def detect_file_encoding(path, max_bytes_to_read=1024*256): encoding = result['encoding'].lower() return encoding + + +def iswbound(char): + # GPL 2.0 licensed code by Javier Kohen, Sambhav Kothari + # from https://github.com/metabrainz/picard-plugins/blob/2.0/plugins/titlecase/titlecase.py + """ Checks whether the given character is a word boundary """ + category = unicodedata.category(char) + return 'Zs' == category or 'Sk' == category or 'P' == category[0] + + +def titlecase(text): + # GPL 2.0 licensed code by Javier Kohen, Sambhav Kothari + # from https://github.com/metabrainz/picard-plugins/blob/2.0/plugins/titlecase/titlecase.py + """Converts text to title case following word boundary rules. + + Capitalizes the first character of each word in the input text, where words + are determined by Unicode word boundaries. Preserves existing capitalization + after the first character of each word. + + Args: + text (str): The input text to convert to title case. + + Returns: + str: The text converted to title case. Returns empty string if input is empty. 
+ + Examples: + >>> titlecase("hello world") + 'Hello World' + >>> titlecase("children's music") + "Children's Music" + """ + if not text: + return text + capitalized = text[0].capitalize() + capital = False + for i in range(1, len(text)): + t = text[i] + if t in "’'" and text[i-1].isalpha(): + capital = False + elif iswbound(t): + capital = True + elif capital and t.isalpha(): + capital = False + t = t.capitalize() + else: + capital = False + capitalized += t + return capitalized From 4a6b642523df267037cd6aea7406773dc5a921a9 Mon Sep 17 00:00:00 2001 From: Arnab Chakraborty Date: Tue, 5 Nov 2024 18:37:11 +0530 Subject: [PATCH 4/5] Add test for titlecase function --- test/test_utils.py | 49 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 5d72cecfd9..d131d9689b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -74,6 +74,7 @@ pattern_as_regex, sort_by_similarity, system_supports_long_paths, + titlecase, tracknum_and_title_from_filename, tracknum_from_filename, uniqify, @@ -1019,3 +1020,51 @@ def test_detect_file_encoding_eac_windows_1251(self): expected_encoding = 'windows-1251' file_path = get_test_data_path('eac-windows1251.log') self.assertEqual(expected_encoding, detect_file_encoding(file_path)) + + +class TitlecaseTest(PicardTestCase): + + def test_titlecase(self): + tests = ( + # empty string + ('', ''), + # simple cases + ('hello world', 'Hello World'), + ('Hello World', 'Hello World'), + ('HELLO WORLD', 'HELLO WORLD'), + # contractions and possessives + ("children's music", "Children's Music"), + ("CHILDREN'S MUSIC", "CHILDREN'S MUSIC"), + ("don't stop", "Don't Stop"), + # hyphenated words + ('first-class ticket', 'First-Class Ticket'), + ('FIRST-CLASS ticket', 'FIRST-CLASS Ticket'), + # multiple spaces + ('hello world', 'Hello World'), + # punctuation + ('hello, world!', 'Hello, World!'), + ('hello... world', 'Hello... 
World'), + # special characters + ('über café', 'Über Café'), + ('españa', 'España'), + ('ñandu', 'Ñandu'), + # single character words + ('a b c', 'A B C'), + # numbers + ('2001 a space odyssey', '2001 A Space Odyssey'), + # preserves existing capitalization after first letter + ('MacDonald had a farm', 'MacDonald Had A Farm'), + ('LaTeX document', 'LaTeX Document'), + # mixed case + ('mIxEd CaSe', 'MIxEd CaSe'), + # unicode boundaries + ('hello—world', 'Hello—World'), + ('hello\u2014world', 'Hello\u2014World'), + # preserves all caps + ('IBM PC', 'IBM PC'), + # single letter + ('a', 'A'), + ('A', 'A'), + ) + for input, expected in tests: + self.assertEqual(expected, titlecase(input)) From fe9302aeb14b235ea87e140d175a433068fc1167 Mon Sep 17 00:00:00 2001 From: Arnab Chakraborty Date: Tue, 5 Nov 2024 22:40:31 +0530 Subject: [PATCH 5/5] Remove unused titlize function --- picard/track.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/picard/track.py b/picard/track.py index 7707e27dfb..cda740f54f 100644 --- a/picard/track.py +++ b/picard/track.py @@ -324,29 +324,6 @@ def _customize_metadata(self): @staticmethod def _genres_to_metadata(genres, limit=None, minusage=0, filters='', join_with=None): - def titlize(text): - """Converts text to title case using custom rules. - - Capitalizes the first character of each word while converting remaining - characters to lowercase. Handles contractions properly by keeping the - apostrophe intact. - - Args: - text (str): The input string to be converted to title case. - - Returns: - str: The input text converted to title case format. - - Examples: - >>> titlize("children's music") - 'Children's Music' - >>> titlize("blues") - "Blues" - """ - return re.sub(r"[A-Za-z]+('[A-Za-z]+)?", - lambda m: m.group(0)[0].upper() + m.group(0)[1:].lower(), - text) - if limit is not None and limit < 1: return []