diff --git a/ansi/scanner.go b/ansi/scanner.go new file mode 100644 index 00000000..8f475b47 --- /dev/null +++ b/ansi/scanner.go @@ -0,0 +1,287 @@ +package ansi + +import ( + "unicode" + "unicode/utf8" + + "github.com/charmbracelet/x/ansi/parser" + "github.com/rivo/uniseg" +) + +// ScannerToken is the scanner type identifier +type ScannerToken int + +const ( + EmptyToken ScannerToken = -iota + EndToken + ErrorToken + ControlToken + RuneToken + TextToken + SpaceToken + LineToken +) + +// Scanner implements the reading of strings with ANSI escape and control codes and +// accounts for wide-characters (such as East Asians and emojis). Used to split the +// codes from the string without getting into the details of the codes. +// +// The default splitter ScanAll] will split the string into separate control codes +// and regular strings stripped of encoding. +type Scanner struct { + b []byte + width int + end int + start int + state byte + split ScannerSplit + token ScannerToken +} + +// ScannerSplit is the signature of the split function used to further tokenize +// the input. The arguments are the current substring of the remaining unprocessed +// data, the current width of the text and the current token. The return values +// are the number of bytes to advance the input and the next token to return to +// the user. +// +// The split function is called repeatedly as the current token is read. If the +// advance return value is 0 or less the next rune is read and the split is called +// again. +type ScannerSplit func(b []byte, width int, token ScannerToken) (advance int, newToken ScannerToken) + +// NewScanner creates a new Scanner for reading the string. +func NewScanner(s string, splitters ...ScannerSplit) *Scanner { + scanner := &Scanner{ + b: []byte(s), + state: parser.GroundState, + split: composeSplitters(splitters), + } + return scanner +} + +func composeSplitters(splitters []ScannerSplit) ScannerSplit { + switch len(splitters) { + case 0: + return ScanAll + case 1: + return splitters[0] + } + return func(b []byte, width int, token ScannerToken) (advance int, newToken ScannerToken) { + var w int + for _, split := range splitters { + w, token = split(b, width, token) + if w > 0 { + return w, token + } + } + return 0, token + } +} + +// Split sets the split function for the [Scanner]. +// The default split function is [ScanAll]. +func (s *Scanner) Split(f ScannerSplit) { + s.split = f +} + +// Text returns the string for current token. +func (s *Scanner) Text() string { + return string(s.data()) +} + +func (s *Scanner) data() []byte { + return s.b[s.start:s.end] +} + +// Len returns the length for current token. +func (s *Scanner) Len() int { + return s.end - s.start +} + +// Width returns the width for current token. +func (s *Scanner) Width() int { + return s.width +} + +// EOF returns true if at the end of the input string +func (s *Scanner) EOF() bool { + return s.end >= len(s.b) +} + +func (s *Scanner) advance(size, width int) bool { + s.end += size + s.width += width + n, tk := s.split(s.b[s.start:s.end], s.width, s.token) + s.token = tk + if n > s.Len() { + s.token = ErrorToken + return false + } + switch n { + case 0: + return false + case s.Len(): + return true + case s.Len() - size: + // can backup if completed without accepting the last rune + s.end -= size + s.width -= width + return true + default: + // not using the whole buffer, update the end + // and re-scan the string for the new width + s.end = s.start + n + s.width = stringWidth(s.Text()) + return true + } +} + +// Scan reads the next token from source and returns it. +func (s *Scanner) Scan() (ScannerToken, string) { + if s.token == ErrorToken { + return ErrorToken, "" + } + s.token = EmptyToken + s.start = s.end + s.width = 0 + if s.end >= len(s.b) { + return EndToken, "" + } + + // Here we iterate over the bytes of the string and collect characters + // and runes. + // On change of token we emit the current token. + for s.end < len(s.b) { + state, action := parser.Table.Transition(s.state, s.b[s.end]) + + if state == parser.Utf8State { + switch s.token { + case EmptyToken: + s.token = TextToken + case ControlToken: + // emit on a change from control type + if s.Len() > 0 { + return s.token, s.Text() + } + s.token = TextToken + } + // This action happens when we transition to the Utf8State. + cluster, _, width, _ := uniseg.FirstGraphemeCluster(s.b[s.end:], -1) + if s.advance(len(cluster), width) { + return s.token, s.Text() + } + // Done collecting, now we're back in the ground state. + s.state = parser.GroundState + continue + } + + switch action { + case parser.PrintAction, parser.ExecuteAction: + switch s.token { + case EmptyToken: + s.token = TextToken + case ControlToken: + // emit on a change from control type + if s.Len() > 0 { + return s.token, s.Text() + } + s.token = TextToken + } + if s.advance(1, 1) { + return s.token, s.Text() + } + + default: + if s.token != ControlToken && s.Len() > 0 { + return s.token, s.Text() + } + s.token = ControlToken + s.end++ + } + // Transition to the next state. + s.state = state + } + + return s.token, s.Text() +} + +// Splitter Functions + +// ScanAll is a split function for a [Scanner] that returns all data as Text. +func ScanAll(b []byte, width int, token ScannerToken) (int, ScannerToken) { + return 0, TextToken +} + +// ScanWords is a split function for a [Scanner] that returns each space +// separated word, and spaces as tokens. +func ScanWords(b []byte, width int, token ScannerToken) (int, ScannerToken) { + r0, _ := utf8.DecodeRune(b) + if len(b) == 1 { + if unicode.IsSpace(r0) { + return 0, SpaceToken + } + return 0, TextToken + } + r1, r1w := utf8.DecodeLastRune(b) + if unicode.IsSpace(r0) != unicode.IsSpace(r1) { + if unicode.IsSpace(r0) { + return len(b) - r1w, SpaceToken + } + return len(b) - r1w, TextToken + } + return 0, TextToken +} + +// ScanRunes is a split function for a [Scanner] that returns each rune. +func ScanRunes(b []byte, width int, token ScannerToken) (int, ScannerToken) { + return len(b), RuneToken +} + +// ScanWords is a split function for a [Scanner] that returns lines and +// and newlines as tokens. +func ScanLines(b []byte, width int, token ScannerToken) (int, ScannerToken) { + r0, r0w := utf8.DecodeRune(b) + if r0 == '\n' { + return r0w, LineToken + } + if len(b) == 1 { + if r0 == '\r' { + return 0, LineToken + } + return 0, token + } + r1, r1w := utf8.DecodeLastRune(b) + if r0 == '\r' { + switch r1 { + case '\r': + return 0, LineToken + case '\n': + return len(b), LineToken + default: + return len(b) - r1w, LineToken + } + } + switch r1 { + case '\r', '\n': + return len(b) - r1w, token + default: + return 0, token + } +} + +// utility functions + +// stringWidth returns the width of a string in cells. The argument is a string +// without ANSI escape sequences. The return value is the number of cells that +// the string will occupy when printed in a terminal. Wide characters (such as +// East Asians and emojis) are accounted for. +// +// ANSI escape not accounted for and not expected to be present in the input. +func stringWidth(s string) int { + width := 0 + g := uniseg.NewGraphemes(s) + for g.Next() { + width += g.Width() + } + return width +} diff --git a/ansi/scanner_test.go b/ansi/scanner_test.go new file mode 100644 index 00000000..1ea85152 --- /dev/null +++ b/ansi/scanner_test.go @@ -0,0 +1,344 @@ +package ansi + +import ( + "fmt" + "strings" + "testing" +) + +type scanResult struct { + kind ScannerToken + text string +} + +func (sr scanResult) String() string { + return fmt.Sprintf("{%d %q}", sr.kind, strings.ReplaceAll(sr.text, "\x1b", "\\x1b")) +} + +func TestScannerLinesWords(t *testing.T) { + + var testCases = []struct { + name string + input string + expected []scanResult + }{ + { + name: "simple", + input: "I really \x1B[38;2;249;38;114mlove\x1B[0m Go!", + expected: []scanResult{ + {TextToken, "I"}, + {SpaceToken, " "}, + {TextToken, "really"}, + {SpaceToken, " "}, + {ControlToken, "\x1B[38;2;249;38;114m"}, + {TextToken, "love"}, + {ControlToken, "\x1B[0m"}, + {SpaceToken, " "}, + {TextToken, "Go!"}, + }, + }, + { + name: "passthrough", + input: "hello world", + expected: []scanResult{ + {TextToken, "hello"}, + {SpaceToken, " "}, + {TextToken, "world"}, + }, + }, + { + name: "asian", + input: "こんにち", + expected: []scanResult{ + {TextToken, "こんにち"}}, + }, + { + name: "emoji", + input: "😃👰🏻‍♀️🫧", + expected: []scanResult{ + {TextToken, "😃👰🏻‍♀️🫧"}, + }, + }, + { + name: "long style", + input: "\x1B[38;2;249;38;114ma really long string\x1B[0m", + expected: []scanResult{ + {ControlToken, "\x1B[38;2;249;38;114m"}, + {TextToken, "a"}, + {SpaceToken, " "}, + {TextToken, "really"}, + {SpaceToken, " "}, + {TextToken, "long"}, + {SpaceToken, " "}, + {TextToken, "string"}, + {ControlToken, "\x1B[0m"}, + }, + }, + { + name: "long style nbsp", + input: "\x1B[38;2;249;38;114ma really\u00a0long string\x1B[0m", + expected: []scanResult{ + {ControlToken, "\x1b[38;2;249;38;114m"}, + {TextToken, "a"}, + {SpaceToken, " "}, + {TextToken, "really"}, + {SpaceToken, "\u00a0"}, + {TextToken, "long"}, + {SpaceToken, " "}, + {TextToken, "string"}, + {ControlToken, "\x1b[0m"}, + }, + }, + { + name: "exact", + input: "\x1b[91mfoo\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "newline", + input: "\x1b[91mfoo\nbar\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {LineToken, "\n"}, + {TextToken, "bar"}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "carriage return", + input: "\x1b[91mfoo\rbar\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {LineToken, "\r"}, + {TextToken, "bar"}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "return & newline", + input: "\x1b[91mfoo\r\nbar\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {LineToken, "\r\n"}, + {TextToken, "bar"}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "extra extra return & newline", + input: "\x1b[91mfoo\r\r\r\nbar\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {LineToken, "\r\r\r\n"}, + {TextToken, "bar"}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "spaces return & newline", + input: "\x1b[91mfoo \r\n bar \x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {SpaceToken, " "}, + {LineToken, "\r\n"}, + {SpaceToken, " "}, + {TextToken, "bar"}, + {SpaceToken, " "}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "multiple newlines emitted separately", + input: "\x1b[91mfoo\n\n\r\nbar\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {LineToken, "\n"}, + {LineToken, "\n"}, + {LineToken, "\r\n"}, + {TextToken, "bar"}, + {ControlToken, "\x1b[0"}, + }, + }, + } + + for i, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s := NewScanner(tc.input, ScanLines, ScanWords) + for j, expected := range tc.expected { + k, s := s.Scan() + if k != expected.kind || s != expected.text { + t.Errorf("case %d, input %q, expected %d %s, got %s", i+1, tc.input, j, expected, scanResult{k, s}) + } + } + }) + } + +} + +func TestScannerLines(t *testing.T) { + + var testCases = []struct { + name string + input string + expected []scanResult + }{ + { + name: "simple", + input: "I really \x1B[38;2;249;38;114mlove\x1B[0m Go!", + expected: []scanResult{ + {TextToken, "I really "}, + {ControlToken, "\x1B[38;2;249;38;114m"}, + {TextToken, "love"}, + {ControlToken, "\x1B[0m"}, + {TextToken, " Go!"}, + }, + }, + { + name: "passthrough", + input: "hello world", + expected: []scanResult{ + {TextToken, "hello world"}, + }, + }, + { + name: "long style", + input: "\x1B[38;2;249;38;114ma really long string\x1B[0m", + expected: []scanResult{ + {ControlToken, "\x1B[38;2;249;38;114m"}, + {TextToken, "a really long string"}, + {ControlToken, "\x1B[0m"}, + }, + }, + { + name: "long style nbsp", + input: "\x1B[38;2;249;38;114ma really\u00a0long string\x1B[0m", + expected: []scanResult{ + {ControlToken, "\x1b[38;2;249;38;114m"}, + {TextToken, "a really\u00a0long string"}, + {ControlToken, "\x1b[0m"}, + }, + }, + { + name: "newline", + input: "\x1b[91mfoo\nbar\x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo"}, + {LineToken, "\n"}, + {TextToken, "bar"}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "spaces return & newline", + input: "\x1b[91mfoo \r\n bar \x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo "}, + {LineToken, "\r\n"}, + {TextToken, " bar "}, + {ControlToken, "\x1b[0"}, + }, + }, + { + name: "spaces return", + input: "\x1b[91mfoo \r bar \x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo "}, + {LineToken, "\r"}, + {TextToken, " bar "}, + {ControlToken, "\x1b[0"}, + }, + }, + } + + for i, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s := NewScanner(tc.input, ScanLines) + for j, expected := range tc.expected { + k, s := s.Scan() + if k != expected.kind || s != expected.text { + t.Errorf("case %d, input %q, expected %d %s, got %s", i+1, tc.input, j, expected, scanResult{k, s}) + } + } + }) + } +} + +func TestScanner(t *testing.T) { + + var testCases = []struct { + name string + input string + expected []scanResult + }{ + { + name: "simple", + input: "I really \x1B[38;2;249;38;114mlove\x1B[0m Go!", + expected: []scanResult{ + {TextToken, "I really "}, + {ControlToken, "\x1B[38;2;249;38;114m"}, + {TextToken, "love"}, + {ControlToken, "\x1B[0m"}, + {TextToken, " Go!"}, + }, + }, + { + name: "passthrough", + input: "hello world", + expected: []scanResult{ + {TextToken, "hello world"}, + }, + }, + { + name: "long style", + input: "\x1B[38;2;249;38;114ma really long string\x1B[0m", + expected: []scanResult{ + {ControlToken, "\x1B[38;2;249;38;114m"}, + {TextToken, "a really long string"}, + {ControlToken, "\x1B[0m"}, + }, + }, + { + name: "long style nbsp", + input: "\x1B[38;2;249;38;114ma really\u00a0long string\x1B[0m", + expected: []scanResult{ + {ControlToken, "\x1b[38;2;249;38;114m"}, + {TextToken, "a really\u00a0long string"}, + {ControlToken, "\x1b[0m"}, + }, + }, + { + name: "spaces return & newline", + input: "\x1b[91mfoo \r\n bar \x1b[0", + expected: []scanResult{ + {ControlToken, "\x1b[91m"}, + {TextToken, "foo \r\n bar "}, + {ControlToken, "\x1b[0"}, + }, + }, + } + + for i, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + s := NewScanner(tc.input) + for j, expected := range tc.expected { + k, s := s.Scan() + if k != expected.kind || s != expected.text { + t.Errorf("case %d, input %q, expected %d %s, got %s", i+1, tc.input, j, expected, scanResult{k, s}) + } + } + }) + } +}