Skip to content

Commit

Permalink
feat(ansi): add Scanner for splitting ANSI encoded strings.
Browse files Browse the repository at this point in the history
  • Loading branch information
pachecot committed Oct 14, 2024
1 parent 4604409 commit b7f1dd5
Show file tree
Hide file tree
Showing 2 changed files with 631 additions and 0 deletions.
287 changes: 287 additions & 0 deletions ansi/scanner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
package ansi

import (
"unicode"
"unicode/utf8"

"github.com/charmbracelet/x/ansi/parser"
"github.com/rivo/uniseg"
)

// ScannerToken is the scanner type identifier
type ScannerToken int

const (
EmptyToken ScannerToken = -iota
EndToken
ErrorToken
ControlToken
RuneToken
TextToken
SpaceToken
LineToken
)

// Scanner implements the reading of strings with ANSI escape and control codes and
// accounts for wide-characters (such as East Asians and emojis). Used to split the
// codes from the string without getting into the details of the codes.
//
// The default splitter ScanAll] will split the string into separate control codes
// and regular strings stripped of encoding.
type Scanner struct {
b []byte
width int
end int
start int
state byte
split ScannerSplit
token ScannerToken
}

// ScannerSplit is the signature of the split function used to further tokenize
// the input. The arguments are the current substring of the remaining unprocessed
// data, the current width of the text and the current token. The return values
// are the number of bytes to advance the input and the next token to return to
// the user.
//
// The split function is called repeatedly as the current token is read. If the
// advance return value is 0 or less the next rune is read and the split is called
// again.
type ScannerSplit func(b []byte, width int, token ScannerToken) (advance int, newToken ScannerToken)

// NewScanner creates a new Scanner for reading the string.
func NewScanner(s string, splitters ...ScannerSplit) *Scanner {
scanner := &Scanner{
b: []byte(s),
state: parser.GroundState,
split: composeSplitters(splitters),
}
return scanner
}

func composeSplitters(splitters []ScannerSplit) ScannerSplit {
switch len(splitters) {
case 0:
return ScanAll
case 1:
return splitters[0]
}
return func(b []byte, width int, token ScannerToken) (advance int, newToken ScannerToken) {
var w int
for _, split := range splitters {
w, token = split(b, width, token)
if w > 0 {
return w, token
}
}
return 0, token
}
}

// Split sets the split function for the [Scanner].
// The default split function is [ScanAll].
func (s *Scanner) Split(f ScannerSplit) {
s.split = f
}

// Text returns the string for current token.
func (s *Scanner) Text() string {
return string(s.data())
}

func (s *Scanner) data() []byte {
return s.b[s.start:s.end]
}

// Len returns the length for current token.
func (s *Scanner) Len() int {
return s.end - s.start
}

// Width returns the width for current token.
func (s *Scanner) Width() int {
return s.width
}

// EOF returns true if at the end of the input string
func (s *Scanner) EOF() bool {
return s.end >= len(s.b)
}

func (s *Scanner) advance(size, width int) bool {
s.end += size
s.width += width
n, tk := s.split(s.b[s.start:s.end], s.width, s.token)
s.token = tk
if n > s.Len() {
s.token = ErrorToken
return false
}
switch n {
case 0:
return false
case s.Len():
return true
case s.Len() - size:
// can backup if completed without accepting the last rune
s.end -= size
s.width -= width
return true
default:
// not using the whole buffer, update the end
// and re-scan the string for the new width
s.end = s.start + n
s.width = stringWidth(s.Text())
return true
}
}

// Scan reads the next token from source and returns it.
func (s *Scanner) Scan() (ScannerToken, string) {
if s.token == ErrorToken {
return ErrorToken, ""
}
s.token = EmptyToken
s.start = s.end
s.width = 0
if s.end >= len(s.b) {
return EndToken, ""
}

// Here we iterate over the bytes of the string and collect characters
// and runes.
// On change of token we emit the current token.
for s.end < len(s.b) {
state, action := parser.Table.Transition(s.state, s.b[s.end])

if state == parser.Utf8State {
switch s.token {
case EmptyToken:
s.token = TextToken
case ControlToken:
// emit on a change from control type
if s.Len() > 0 {
return s.token, s.Text()
}
s.token = TextToken
}
// This action happens when we transition to the Utf8State.
cluster, _, width, _ := uniseg.FirstGraphemeCluster(s.b[s.end:], -1)
if s.advance(len(cluster), width) {
return s.token, s.Text()
}
// Done collecting, now we're back in the ground state.
s.state = parser.GroundState
continue
}

switch action {
case parser.PrintAction, parser.ExecuteAction:
switch s.token {
case EmptyToken:
s.token = TextToken
case ControlToken:
// emit on a change from control type
if s.Len() > 0 {
return s.token, s.Text()
}
s.token = TextToken
}
if s.advance(1, 1) {
return s.token, s.Text()
}

default:
if s.token != ControlToken && s.Len() > 0 {
return s.token, s.Text()
}
s.token = ControlToken
s.end++
}
// Transition to the next state.
s.state = state
}

return s.token, s.Text()
}

// Splitter Functions

// ScanAll is a split function for a [Scanner] that returns all data as Text.
func ScanAll(b []byte, width int, token ScannerToken) (int, ScannerToken) {
return 0, TextToken
}

// ScanWords is a split function for a [Scanner] that returns each space
// separated word, and spaces as tokens.
func ScanWords(b []byte, width int, token ScannerToken) (int, ScannerToken) {
r0, _ := utf8.DecodeRune(b)
if len(b) == 1 {
if unicode.IsSpace(r0) {
return 0, SpaceToken
}
return 0, TextToken
}
r1, r1w := utf8.DecodeLastRune(b)
if unicode.IsSpace(r0) != unicode.IsSpace(r1) {
if unicode.IsSpace(r0) {
return len(b) - r1w, SpaceToken
}
return len(b) - r1w, TextToken
}
return 0, TextToken
}

// ScanRunes is a split function for a [Scanner] that returns each rune.
func ScanRunes(b []byte, width int, token ScannerToken) (int, ScannerToken) {
return len(b), RuneToken
}

// ScanWords is a split function for a [Scanner] that returns lines and
// and newlines as tokens.
func ScanLines(b []byte, width int, token ScannerToken) (int, ScannerToken) {
r0, r0w := utf8.DecodeRune(b)
if r0 == '\n' {
return r0w, LineToken
}
if len(b) == 1 {
if r0 == '\r' {
return 0, LineToken
}
return 0, token
}
r1, r1w := utf8.DecodeLastRune(b)
if r0 == '\r' {
switch r1 {
case '\r':
return 0, LineToken
case '\n':
return len(b), LineToken
default:
return len(b) - r1w, LineToken
}
}
switch r1 {
case '\r', '\n':
return len(b) - r1w, token
default:
return 0, token
}
}

// utility functions

// stringWidth returns the width of a string in cells. The argument is a string
// without ANSI escape sequences. The return value is the number of cells that
// the string will occupy when printed in a terminal. Wide characters (such as
// East Asians and emojis) are accounted for.
//
// ANSI escape not accounted for and not expected to be present in the input.
func stringWidth(s string) int {
width := 0
g := uniseg.NewGraphemes(s)
for g.Next() {
width += g.Width()
}
return width
}
Loading

0 comments on commit b7f1dd5

Please sign in to comment.