Skip to content

Commit

Permalink
feat: support raw pattern
Browse files (browse the repository at this point in the history)
  • Loading branch information
zhijianma committed Dec 19, 2023
1 parent 421c528 commit b3cc336
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 12 deletions.
8 changes: 4 additions & 4 deletions data_juicer/ops/mapper/clean_email_mapper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,10 +21,10 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'
else:
self.pattern = pattern
if pattern is not None and len(pattern) > 2:
if (pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"')):
self.pattern = pattern[2:-1]
if (len(pattern) > 2
and pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"')):
self.pattern = pattern[2:-1]

self.repl = repl

Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/mapper/clean_ip_mapper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -26,10 +26,10 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6
else:
self.pattern = pattern
if pattern is not None and len(pattern) > 2:
if (pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"')):
self.pattern = pattern[2:-1]
if (len(pattern) > 2
and pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"')):
self.pattern = pattern[2:-1]
self.repl = repl

def process(self, sample):
Expand Down
8 changes: 4 additions & 4 deletions data_juicer/ops/mapper/clean_links_mapper.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -32,10 +32,10 @@ def __init__(self, pattern: str = None, repl: str = '', *args, **kwargs):
self.pattern += r')'
else:
self.pattern = pattern
if pattern is not None and len(pattern) > 2:
if (pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"')):
self.pattern = pattern[2:-1]
if (len(pattern) > 2
and pattern.startswith("r'") and pattern.endswith("'")
or pattern.startswith('r"') and pattern.endswith('"')):
self.pattern = pattern[2:-1]
self.repl = repl

def process(self, sample):
Expand Down

0 comments on commit b3cc336

Please sign in to comment.