Skip to content

Commit

Permalink
[Directories.sanitizeFilname] various fixes (#4060)
Browse files Browse the repository at this point in the history
Most Unix file systems typically allow filenames of up to 255 bytes. However, the actual number of characters allowed can vary due to the representation of Unicode characters. Therefore length checks must be done in bytes, not unicode.

Remove re.split(r"/|\\", filename) because we obviously can't split on character that have already been removed by the blacklist. Remember this function is supposed to process a prospective filename, not a path.

Add a max length argument to command lengths other than 255, e.g. when we are going to append ascii to the filename.

Simplify a lot of overcomplicated code.

Add comments to make easy to understand.

[Directories.getRecordingFilename] also here filename length was being calulated in unicode, not bytes.

Use sanitizeFilename to clean up recording filename rather than having code duplication.

Get rid of this bug: path += "_%03d" % i

>>> path = "abc.def"
>>> for i in range(5):
...     path += "_%03d" % i
...     print(path)
...
abc.def_000
abc.def_000_001
abc.def_000_001_002
abc.def_000_001_002_003
abc.def_000_001_002_003_004
>>>
  • Loading branch information
Huevos authored and WanWizard committed Oct 24, 2024
1 parent bbb988f commit 37a9db7
Showing 1 changed file with 27 additions and 44 deletions.
71 changes: 27 additions & 44 deletions lib/python/Tools/Directories.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from enigma import eEnv
from datetime import datetime
from re import compile, split
from re import compile
from stat import S_IMODE
from sys import _getframe as getframe
from unicodedata import normalize
Expand Down Expand Up @@ -361,19 +361,10 @@ def fileReadXML(filename, default=None, *args, **kwargs):


def getRecordingFilename(basename, dirname=None):
# Filter out non-allowed characters.
non_allowed_characters = "/.\\:*?<>|\""
basename = basename.replace("\xc2\x86", "").replace("\xc2\x87", "")
filename = ""
for c in basename:
if c in non_allowed_characters or ord(c) < 32:
c = "_"
filename += c
# Max filename length for ext4 is 255 (minus 8 characters for .ts.meta)
# but must not truncate in the middle of a multi-byte utf8 character!
# So convert the truncation to unicode and back, ignoring errors, the
# result will be valid utf8 and so xml parsing will be OK.
filename = filename[:247]
# The "replaces" remove dvb emphasis chars.
# Also, "." is replaced with "_" which respects the original code.
# Max filename length for ext4 is 255 bytes (minus 8 bytes for ".ts.meta", minus 4 bytes for "_%03d")
filename = sanitizeFilename(basename.replace("\xc2\x86", "").replace("\xc2\x87", "").replace(".", "_"), maxlen=243)
if dirname is not None:
if not dirname.startswith("/"):
dirname = os.path.join(defaultRecordingLocation(), dirname)
Expand All @@ -385,7 +376,7 @@ def getRecordingFilename(basename, dirname=None):
while True:
if not os.path.isfile(path + ".ts"):
return path
path += "_%03d" % i
path = "%s_%03d" % (filename, i)
i += 1

# This is clearly a hack:
Expand Down Expand Up @@ -570,12 +561,12 @@ def shellquote(s):
return "'%s'" % s.replace("'", "'\\''")


def sanitizeFilename(filename):
def sanitizeFilename(filename, maxlen=255): # 255 is max length in bytes in ext4 (and most other file systems)
"""Return a fairly safe version of the filename.
We don't limit ourselves to ascii, because we want to keep municipality
names, etc, but we do want to get rid of anything potentially harmful,
and make sure we do not exceed Windows filename length limits.
and make sure we do not exceed filename length limits.
Hence a less safe blacklist, rather than a whitelist.
"""
blacklist = ["\\", "/", ":", "*", "?", "\"", "<", ">", "|", "\0"]
Expand All @@ -584,34 +575,26 @@ def sanitizeFilename(filename):
"COM6", "COM7", "COM8", "COM9", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5",
"LPT6", "LPT7", "LPT8", "LPT9",
] # Reserved words on Windows
filename = "".join(c for c in filename if c not in blacklist)
# Remove all charcters below code point 32
filename = "".join(c for c in filename if 31 < ord(c))
filename = normalize("NFKD", filename)
filename = filename.rstrip(". ") # Windows does not allow these at end
filename = filename.strip()
if all([x == "." for x in filename]):
filename = "__" + filename
if filename in reserved:
# Remove any blacklisted chars. Remove all charcters below code point 32. Normalize. Strip.
filename = normalize("NFKD", "".join(c for c in filename if c not in blacklist and ord(c) > 31)).strip()
if all([x == "." for x in filename]) or filename in reserved: # if filename is a string of dots
filename = "__" + filename
# Most Unix file systems typically allow filenames of up to 255 bytes.
# However, the actual number of characters allowed can vary due to the
# representation of Unicode characters. Therefore length checks must
# be done in bytes, not unicode.
#
# Also we cannot leave the byte truncate in the middle of a multi-byte
# utf8 character! So, convert to bytes, truncate then get back to unicode,
# ignoring errors along the way, the result will be valid unicode.
# Prioritise maintaining the complete extension if possible.
# Any truncation of "root" or "ext" will be done at the end of the string
root, ext = os.path.splitext(filename.encode(encoding='utf-8', errors='ignore'))
if len(ext) > maxlen - (1 if root else 0): # leave at least one char for root if root
ext = ext[:maxlen - (1 if root else 0)]
# convert back to unicode, ignoring any incomplete utf8 multibyte chars
filename = root[:maxlen - len(ext)].decode(encoding='utf-8', errors='ignore') + ext.decode(encoding='utf-8', errors='ignore')
filename = filename.rstrip(". ") # Windows does not allow these at end
if len(filename) == 0:
filename = "__"
if len(filename) > 255:
parts = split(r"/|\\", filename)[-1].split(".")
if len(parts) > 1:
ext = "." + parts.pop()
filename = filename[:-len(ext)]
else:
ext = ""
if filename == "":
filename = "__"
if len(ext) > 254:
ext = ext[254:]
maxl = 255 - len(ext)
filename = filename[:maxl]
filename = filename + ext
# Re-check last character (if there was no extension)
filename = filename.rstrip(". ")
if len(filename) == 0:
filename = "__"
return filename

0 comments on commit 37a9db7

Please sign in to comment.