Skip to content

Commit

Permalink
updated options + added a new option for ignoring runes
Browse files Browse the repository at this point in the history
  • Loading branch information
imbue11235 committed Jun 27, 2022
1 parent d3e67b1 commit 5389ccb
Show file tree
Hide file tree
Showing 6 changed files with 70 additions and 69 deletions.
18 changes: 10 additions & 8 deletions config.go
Original file line number Diff line number Diff line change
@@ -1,27 +1,29 @@
package words

type config struct {
includeSymbols bool
includePunctuation bool
includeSpaces bool
includeSymbols bool
includePunctuation bool
includeSpaces bool
allowHyphenatedWords bool
ignoredRunes []rune
}

// newDefaultConfig returns the default configuration
// of the word extractor
func newDefaultConfig() *config {
return &config{
includeSymbols: false,
includePunctuation: false,
includeSpaces: false,
includeSymbols: false,
includePunctuation: false,
includeSpaces: false,
allowHyphenatedWords: false,
ignoredRunes: make([]rune, 0),
}
}

// apply applies all the options to the
// current config
func (c *config) apply(options ...Option) {
for _, opt := range options {
opt.apply(c)
opt(c)
}
}
}
33 changes: 15 additions & 18 deletions extract.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
package words

import (
"golang.org/x/exp/slices"
"unicode"
"unicode/utf8"
)
Expand Down Expand Up @@ -64,17 +65,6 @@ func shouldInclude(runeKind int, config *config) bool {
return true
}

// in reports whether runeKind occurs anywhere in runeKinds.
// A nil or empty slice never yields a match.
func in(runeKind int, runeKinds []int) bool {
	for idx := 0; idx < len(runeKinds); idx++ {
		if runeKinds[idx] == runeKind {
			return true
		}
	}

	return false
}

// isHyphenatedWord determines if the word is a hyphenated word
// by looking at adjacent rune kinds
func isHyphenatedWord(r rune, lastRuneKind, nextRuneKind int) bool {
Expand All @@ -89,7 +79,7 @@ func isHyphenatedWord(r rune, lastRuneKind, nextRuneKind int) bool {
return false
}

return in(lastRuneKind, []int{lowercase, uppercase}) && in(nextRuneKind, []int{lowercase, uppercase})
return slices.Contains([]int{lowercase, uppercase}, lastRuneKind) && slices.Contains([]int{lowercase, uppercase}, nextRuneKind)
}

// extract extracts the words by the defined rules
Expand All @@ -103,6 +93,14 @@ func extract(input string, config *config) []string {
runeKind, lastRuneKind, runesLen := 0, 0, -1

for i, r := range input {
// If the rune should be ignored, we simply add it to
// the current word and treat it as being of the same rune kind
// as the last added rune
if slices.Contains(config.ignoredRunes, r) {
runes[runesLen] = append(runes[runesLen], r)
continue
}

// If hyphenated words are allowed and current character is hyphenated,
// it'll get appended to the current rune slice,
// if the adjacent runes of a hyphen is a letter of same kind (upper/lowercase),
Expand Down Expand Up @@ -144,7 +142,7 @@ func extract(input string, config *config) []string {
// Keep track of the runes index, instead of using len(runes) to find current index
runesLen++

// Move a uppercase rune from the end of previous word, to this word (Rule 5).
// Move an uppercase rune from the end of previous word, to this word (Rule 5).
if lastRuneKind == uppercase && runeKind == lowercase {
// Prepend the last character of previous rune-slice
runes[runesLen] = append([]rune{runes[runesLen-1][len(runes[runesLen-1])-1]}, runes[runesLen]...)
Expand All @@ -153,7 +151,6 @@ func extract(input string, config *config) []string {
}

lastRuneKind = runeKind

}

// Convert the rune slices to strings
Expand All @@ -171,8 +168,8 @@ func extract(input string, config *config) []string {

// Extract extracts words from a given string with potential options.
func Extract(input string, options ...Option) []string {
config := newDefaultConfig()
config.apply(options...)
cfg := newDefaultConfig()
cfg.apply(options...)

return extract(input, config)
}
return extract(input, cfg)
}
16 changes: 13 additions & 3 deletions extract_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
)

type testSet struct {
input string
input string
expected []string
}

Expand All @@ -34,7 +34,7 @@ func TestExtract(t *testing.T) {
{"aeiouAreVowels", []string{"aeiou", "Are", "Vowels"}},
{"XmlHTTP", []string{"Xml", "HTTP"}},
{"isISO8601", []string{"is", "ISO", "8601"}},
{"Win2000", []string{"Win", "2000"} },
{"Win2000", []string{"Win", "2000"}},
{"Bose QC35", []string{"Bose", "QC", "35"}},
{"YAMLParser", []string{"YAML", "Parser"}},
{"SOME_CONSTANT_STRING_REPRESENTATION", []string{"SOME", "CONSTANT", "STRING", "REPRESENTATION"}},
Expand Down Expand Up @@ -93,4 +93,14 @@ func TestExtractWithOptionIncludePunctuation(t *testing.T) {
}

runExtractTest(t, tests, words.IncludePunctuation())
}
}

// TestExtractWithIgnoredRunes runs the extraction with '.' registered as an
// ignored rune: a '.' is expected to stay attached to its word, while
// other punctuation such as '!' is still dropped from the result.
func TestExtractWithIgnoredRunes(t *testing.T) {
	cases := []testSet{
		{input: "ignored.runes", expected: []string{"ignored.runes"}},
		{input: "etc. and so on", expected: []string{"etc.", "and", "so", "on"}},
		{input: "etc! and so on", expected: []string{"etc", "and", "so", "on"}},
	}

	ignoreDot := words.WithIgnoredRunes('.')
	runExtractTest(t, cases, ignoreDot)
}
4 changes: 3 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
module github.com/imbue11235/words

go 1.16
go 1.18

require golang.org/x/exp v0.0.0-20220613132600-b0d781184e0d // indirect
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
golang.org/x/exp v0.0.0-20220613132600-b0d781184e0d h1:vtUKgx8dahOomfFzLREU8nSv25YHnTgLBn4rDnWZdU0=
golang.org/x/exp v0.0.0-20220613132600-b0d781184e0d/go.mod h1:Kr81I6Kryrl9sr8s2FK3vxD90NdsKWRuOIl2O4CvYbA=
66 changes: 27 additions & 39 deletions options.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,43 @@ package words

// Option defines the interface for
// applying options to the extraction
type Option interface {
apply(c *config)
}

// Include symbols option
type includeSymbols bool
type Option func(c *config)

// IncludeSymbols includes symbols in the extraction. E.g. "beer>food" => []{"beer", ">", "food"}
func IncludeSymbols() includeSymbols {
return includeSymbols(true)
}

func (i includeSymbols) apply(c *config) {
c.includeSymbols = bool(i)
func IncludeSymbols() Option {
return func(c *config) {
c.includeSymbols = true
}
}

// Include punctuation option
type includePunctuation bool

// IncludePunctuation includes punctuation in extraction. E.g. "a.nested_path" => []{"a", ".", "nested", "_", "path"}
func IncludePunctuation() includePunctuation {
return includePunctuation(true)
}

func (i includePunctuation) apply(c *config) {
c.includePunctuation = bool(i)
func IncludePunctuation() Option {
return func(c *config) {
c.includePunctuation = true
}
}

// Include spaces option
type includeSpaces bool

// IncludeSpaces includes spaces in the extraction. E.g. "the moon" => []{"the", " ", "moon"}
func IncludeSpaces() includeSpaces {
return includeSpaces(true)
}

func (i includeSpaces) apply(c *config) {
c.includeSpaces = bool(i)
func IncludeSpaces() Option {
return func(c *config) {
c.includeSpaces = true
}
}

// Allow hyphenated words option
type allowHyphenatedWords bool

// Allow hyphenated words allows hyphenated words in the extraction.
// AllowHyphenatedWords allows hyphenated words in the extraction.
// E.g. "a family-sized pizza" => []{"a", "family-sized", "pizza"}
func AllowHyphenatedWords() allowHyphenatedWords {
return allowHyphenatedWords(true)
func AllowHyphenatedWords() Option {
return func(c *config) {
c.allowHyphenatedWords = true
}
}

func (a allowHyphenatedWords) apply(c *config) {
c.allowHyphenatedWords = true
}
// WithIgnoredRunes returns an Option that adds the given runes to the
// config's list of ignored runes. An ignored rune does not split words:
// when encountered, it is appended to the word currently being built,
// as if it were of the same kind as the most recent rune.
// E.g. with WithIgnoredRunes('.'), "Etc. and so on" becomes []{"Etc.", "and", "so", "on"}
func WithIgnoredRunes(runes ...rune) Option {
	addIgnored := func(cfg *config) {
		cfg.ignoredRunes = append(cfg.ignoredRunes, runes...)
	}

	return addIgnored
}

0 comments on commit 5389ccb

Please sign in to comment.