Skip to content

Commit

Permalink
Merge pull request #1 from imbue11235/feature/rune-kind-exclusion
Browse files Browse the repository at this point in the history
[Feature] Added runekind exclusion
  • Loading branch information
imbue11235 authored Aug 30, 2023
2 parents be63287 + b1554d3 commit 89809d8
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 25 deletions.
2 changes: 2 additions & 0 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ type config struct {
includeSpaces bool
allowHyphenatedWords bool
ignoredRunes []rune
ignoredRunesKinds []RuneKind
}

// newDefaultConfig defines the standards
Expand All @@ -17,6 +18,7 @@ func newDefaultConfig() *config {
includeSpaces: false,
allowHyphenatedWords: false,
ignoredRunes: make([]rune, 0),
ignoredRunesKinds: make([]RuneKind, 0),
}
}

Expand Down
67 changes: 42 additions & 25 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,48 +17,50 @@ import (
// 5. If the current character is lowercase, and the last character of the previous word was uppercase,
// the uppercase letter will be moved to the lowercase string. E.g. "YAMLParser" => []{"YAML", "Parser"}

type RuneKind int

const (
symbol = 1 + iota
uppercase
lowercase
space
digit
punctuation
unknown
Symbol RuneKind = 1 + iota
Uppercase
Lowercase
Space
Digit
Punctuation
Unknown
)

const hyphen = rune(45)

// getRuneKind takes the rune and returns
// the RuneKind representation of its kind
func getRuneKind(r rune) int {
func getRuneKind(r rune) RuneKind {
switch {
case unicode.IsSymbol(r):
return symbol
return Symbol
case unicode.IsUpper(r):
return uppercase
return Uppercase
case unicode.IsLower(r):
return lowercase
return Lowercase
case unicode.IsSpace(r):
return space
return Space
case unicode.IsDigit(r):
return digit
return Digit
case unicode.IsPunct(r):
return punctuation
return Punctuation
}

return unknown
return Unknown
}

// shouldInclude checks if the kind of rune should be included
// in the word
func shouldInclude(runeKind int, config *config) bool {
func shouldInclude(runeKind RuneKind, config *config) bool {
switch runeKind {
case symbol:
case Symbol:
return config.includeSymbols
case punctuation:
case Punctuation:
return config.includePunctuation
case space:
case Space:
return config.includeSpaces
}

Expand All @@ -67,7 +69,7 @@ func shouldInclude(runeKind int, config *config) bool {

// isHyphenatedWord determines if the word is a hyphenated word
// by looking at adjacent rune kinds
func isHyphenatedWord(r rune, lastRuneKind, nextRuneKind int) bool {
func isHyphenatedWord(r rune, lastRuneKind, nextRuneKind RuneKind) bool {
if r != hyphen {
return false
}
Expand All @@ -79,7 +81,7 @@ func isHyphenatedWord(r rune, lastRuneKind, nextRuneKind int) bool {
return false
}

return slices.Contains([]int{lowercase, uppercase}, lastRuneKind) && slices.Contains([]int{lowercase, uppercase}, nextRuneKind)
return slices.Contains([]RuneKind{Lowercase, Uppercase}, lastRuneKind) && slices.Contains([]RuneKind{Lowercase, Uppercase}, nextRuneKind)
}

// extract by the defined rules
Expand All @@ -90,14 +92,29 @@ func extract(input string, config *config) []string {
}

var runes [][]rune
runeKind, lastRuneKind, runesLen := 0, 0, -1
var runeKind, lastRuneKind RuneKind
runesLen := -1

for i, r := range input {
// If the rune should be ignored, we will simply add it to
// the current word, and treat it as the same rune kind as the last
// added value
if slices.Contains(config.ignoredRunes, r) {
if slices.Contains(config.ignoredRunes, r) || slices.Contains(config.ignoredRunesKinds, getRuneKind(r)) {
// If the current rune is the first rune, we will append a new slice
if runesLen == -1 {
runes = append(runes, []rune{})
runesLen++
}

runes[runesLen] = append(runes[runesLen], r)

// If there is a next rune, we will set the last rune kind to the next rune kind
// to indicate that the next rune should be treated as the same kind as the last,
// even if it's not.
if len(input) > i+1 {
lastRuneKind = getRuneKind(rune(input[i+1]))
}

continue
}

Expand All @@ -106,7 +123,7 @@ func extract(input string, config *config) []string {
// if the adjacent runes of a hyphen are letters of the same kind (upper/lowercase),
// without keeping track of its rune type (Rule 2).
if config.allowHyphenatedWords {
var nextRuneKind int
var nextRuneKind RuneKind

if len(input) > i+1 {
nextRuneKind = getRuneKind(rune(input[i+1]))
Expand Down Expand Up @@ -143,7 +160,7 @@ func extract(input string, config *config) []string {
runesLen++

// Move an uppercase rune from the end of previous word, to this word (Rule 5).
if lastRuneKind == uppercase && runeKind == lowercase {
if lastRuneKind == Uppercase && runeKind == Lowercase {
// Prepend the last character of previous rune-slice
runes[runesLen] = append([]rune{runes[runesLen-1][len(runes[runesLen-1])-1]}, runes[runesLen]...)
// Remove the last character from the previous rune-slice
Expand Down
13 changes: 13 additions & 0 deletions extract_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,20 @@ func TestExtractWithIgnoredRunes(t *testing.T) {
{"ignored.runes", []string{"ignored.runes"}},
{"etc. and so on", []string{"etc.", "and", "so", "on"}},
{"etc! and so on", []string{"etc", "and", "so", "on"}},
{".start_and_end_with_periods.", []string{".start", "and", "end", "with", "periods."}},
}

runExtractTest(t, tests, words.WithIgnoredRunes('.'))
}

func TestExtractWithIgnoreNumbers(t *testing.T) {
tests := []testSet{
{"100cm", []string{"100cm"}},
{"QC35", []string{"QC35"}},
{"Win2000", []string{"Win2000"}},
{"100cm QC35 Win2000", []string{"100cm", "QC35", "Win2000"}},
{"100cmQC35Win2000", []string{"100cm", "QC35", "Win2000"}},
}

runExtractTest(t, tests, words.WithIgnoredRuneKinds(words.Digit))
}
9 changes: 9 additions & 0 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ func WithIgnoredRunes(runes ...rune) Option {
c.ignoredRunes = append(c.ignoredRunes, runes...)
}
}

// WithIgnoredRuneKinds returns an Option that registers the given rune kinds
// as ignored. When the extractor encounters a rune of an ignored kind, it
// adds the rune to the current word as if it were of the most recent rune
// kind. Kinds are appended to any kinds registered by earlier options.
func WithIgnoredRuneKinds(runeKinds ...RuneKind) Option {
	return func(cfg *config) {
		cfg.ignoredRunesKinds = append(cfg.ignoredRunesKinds, runeKinds...)
	}
}

0 comments on commit 89809d8

Please sign in to comment.