From 690cfad098ba5154fc48f2635a3b29e7a6300c24 Mon Sep 17 00:00:00 2001 From: Siavash Askari Nasr Date: Thu, 13 May 2021 11:50:47 +0430 Subject: [PATCH] Applied requested changes - Document MaintainCaptureOrder option - Use return in `assignNameSlots` and remove else - Add test with MaintainCaptureOrder not provided - Change the MaintainCaptureOrder value to `0x0400` - Remove the `o` inline option - Add comment to explain why `autocap` is consumed --- README.md | 14 ++++ regexp.go | 2 +- regexp_MaintainCaptureOrder_test.go | 34 ++++++--- syntax/parser.go | 111 ++++++++++++++-------------- 4 files changed, 94 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 9e448f4..c8166a4 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,20 @@ if isMatch, _ := re.MatchString(`Something to match`); isMatch { This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?). +## MaintainCaptureOrder mode +The default behavior of `regexp2` is to match the .NET regexp engine, which unlike PCRE, doesn't maintain the order of the captures and appends the named capture groups to the end of captured groups. Using the `MaintainCaptureOrder` option when compiling a regexp will keep the order of named and unnamed capture groups. 
+ ```go +re := regexp2.MustCompile(`(?<first>This) (is) a (?<last>test)`, regexp2.MaintainCaptureOrder) +if match, _ := re.FindStringMatch(`This is a test`); match != nil { + // match.Groups()[1].String() == "This" + // match.Groups()[1].Name == "first" + // match.Groups()[2].String() == "is" + // match.Groups()[2].Name == "2" + // match.Groups()[3].String() == "test" + // match.Groups()[3].Name == "last" +} +``` ## Library features that I'm still working on - Regex split diff --git a/regexp.go b/regexp.go index 60fce4a..5179468 100644 --- a/regexp.go +++ b/regexp.go @@ -121,7 +121,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 (regexp package) compatibility mode - MaintainCaptureOrder = 0x1000 // Maintain named and unnamed capture order + MaintainCaptureOrder = 0x0400 // Maintain named and unnamed capture order ) func (re *Regexp) RightToLeft() bool { diff --git a/regexp_MaintainCaptureOrder_test.go b/regexp_MaintainCaptureOrder_test.go index 071850b..4b708fe 100644 --- a/regexp_MaintainCaptureOrder_test.go +++ b/regexp_MaintainCaptureOrder_test.go @@ -37,6 +37,9 @@ func TestMaintainCaptureOrder_Basic(t *testing.T) { if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } + if want, got := `first`, m.regex.GroupNameFromNumber(1); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } if want, got := `testing`, groups[2].String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } @@ -54,13 +57,13 @@ func TestMaintainCaptureOrder_Basic(t *testing.T) { } } -func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { - r, err := Compile("(?si)(?this).+?\n(testing).+?(?stuff)", MaintainCaptureOrder) +func TestMaintainCaptureOrder_Mode_Not_Enabled(t *testing.T) { + r, err := Compile("(?this).+?(testing).+?(?stuff)", 0) // t.Logf("code dump: %v", r.code.Dump()) if err != nil { t.Errorf("unexpected compile err: %v", err) } - text := "This is a \ntesting stuff" + text := 
`this is a testing stuff` m, err := r.FindStringMatch(text) if err != nil { t.Errorf("unexpected match err: %v", err) @@ -78,16 +81,22 @@ func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { if want, got := text, groups[0].String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `This`, groups[1].String(); want != got { + if want, got := `testing`, groups[1].String(); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `first`, groups[1].Name; want != got { + if want, got := `1`, groups[1].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `testing`, groups[2].String(); want != got { + if want, got := `this`, string(m.GroupByName(`first`).Runes()); want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } - if want, got := `2`, groups[2].Name; want != got { + if want, got := `first`, m.regex.GroupNameFromNumber(2); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `this`, groups[2].String(); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } + if want, got := `first`, groups[2].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } if want, got := `stuff`, groups[3].String(); want != got { @@ -96,10 +105,13 @@ func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { if want, got := `last`, groups[3].Name; want != got { t.Fatalf("Wanted '%v'\nGot '%v'", want, got) } + if want, got := `stuff`, string(m.GroupByNumber(3).Runes()); want != got { + t.Fatalf("Wanted '%v'\nGot '%v'", want, got) + } } -func TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { - r, err := Compile("(?sio)(?this).+?\n(testing).+?(?stuff)", 0) +func TestMaintainCaptureOrder_With_Other_Options(t *testing.T) { + r, err := Compile("(?si)(?this).+?\n(testing).+?(?stuff)", MaintainCaptureOrder) // t.Logf("code dump: %v", r.code.Dump()) if err != nil { t.Errorf("unexpected compile err: %v", err) @@ -142,8 +154,8 @@ func 
TestMaintainCaptureOrder_Enable_Inline(t *testing.T) { } } -func TestMaintainCaptureOrder_Inline_No_Capture_Groups(t *testing.T) { - r, err := Compile("(?o)this.+?testing.+?stuff", 0) +func TestMaintainCaptureOrder_No_Capture_Groups(t *testing.T) { + r, err := Compile("this.+?testing.+?stuff", MaintainCaptureOrder) // t.Logf("code dump: %v", r.code.Dump()) if err != nil { t.Errorf("unexpected compile err: %v", err) diff --git a/syntax/parser.go b/syntax/parser.go index cf07420..d75ee90 100644 --- a/syntax/parser.go +++ b/syntax/parser.go @@ -22,7 +22,7 @@ const ( Debug = 0x0080 // "d" ECMAScript = 0x0100 // "e" RE2 = 0x0200 // RE2 compat mode - MaintainCaptureOrder = 0x1000 // "o" Maintain named and unnamed capture order + MaintainCaptureOrder = 0x0400 // Maintain named and unnamed capture order ) func optionFromCode(ch rune) RegexOptions { @@ -44,8 +44,6 @@ func optionFromCode(ch rune) RegexOptions { return Debug case 'e', 'E': return ECMAScript - case 'o', 'O': - return MaintainCaptureOrder default: return 0 } @@ -242,75 +240,76 @@ func (p *parser) assignNameSlots() { if len(p.capnamelist) == 0 || p.capnamelist[0] != `0` { p.capnamelist = append([]string{fmt.Sprint(0)}, p.capnamelist...) 
} - } else { - if p.capnames != nil { - for _, name := range p.capnamelist { - for p.isCaptureSlot(p.autocap) { - p.autocap++ - } - pos := p.capnames[name] - p.capnames[name] = p.autocap - p.noteCaptureSlot(p.autocap, pos) + return + } + if p.capnames != nil { + for _, name := range p.capnamelist { + for p.isCaptureSlot(p.autocap) { p.autocap++ } + pos := p.capnames[name] + p.capnames[name] = p.autocap + p.noteCaptureSlot(p.autocap, pos) + + p.autocap++ } - - // if the caps array has at least one gap, construct the list of used slots - if p.capcount < p.captop { - p.capnumlist = make([]int, p.capcount) - i := 0 + } - for k := range p.caps { - p.capnumlist[i] = k - i++ - } + // if the caps array has at least one gap, construct the list of used slots + if p.capcount < p.captop { + p.capnumlist = make([]int, p.capcount) + i := 0 - sort.Ints(p.capnumlist) + for k := range p.caps { + p.capnumlist[i] = k + i++ } - // merge capsnumlist into capnamelist - if p.capnames != nil || p.capnumlist != nil { - var oldcapnamelist []string - var next int - var k int + sort.Ints(p.capnumlist) + } - if p.capnames == nil { - oldcapnamelist = nil - p.capnames = make(map[string]int) - p.capnamelist = []string{} - next = -1 - } else { - oldcapnamelist = p.capnamelist - p.capnamelist = []string{} - next = p.capnames[oldcapnamelist[0]] - } + // merge capsnumlist into capnamelist + if p.capnames != nil || p.capnumlist != nil { + var oldcapnamelist []string + var next int + var k int - for i := 0; i < p.capcount; i++ { - j := i - if p.capnumlist != nil { - j = p.capnumlist[i] - } + if p.capnames == nil { + oldcapnamelist = nil + p.capnames = make(map[string]int) + p.capnamelist = []string{} + next = -1 + } else { + oldcapnamelist = p.capnamelist + p.capnamelist = []string{} + next = p.capnames[oldcapnamelist[0]] + } - if next == j { - p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) - k++ + for i := 0; i < p.capcount; i++ { + j := i + if p.capnumlist != nil { + j = p.capnumlist[i] + } 
- if k == len(oldcapnamelist) { - next = -1 - } else { - next = p.capnames[oldcapnamelist[k]] - } + if next == j { + p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) + k++ + if k == len(oldcapnamelist) { + next = -1 } else { - //feature: culture? - str := strconv.Itoa(j) - p.capnamelist = append(p.capnamelist, str) -p.capnames[str] = j - } + next = p.capnames[oldcapnamelist[k]] } + + } else { + //feature: culture? + str := strconv.Itoa(j) + p.capnamelist = append(p.capnamelist, str) + p.capnames[str] = j } } + } } func (p *parser) consumeAutocap() int { @@ -958,6 +957,8 @@ func (p *parser) scanGroupOpen() (*regexNode, error) { } if capnum != -1 && p.useMaintainCaptureOrder() { + // Successfully scanned a named capture group so we need to increment + // our cap number to maintain the order p.consumeAutocap() } } else if ch == '-' {