Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pull dedupe out of queue and into personality mixin #333

Merged
merged 2 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cmd/conformance/aws/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func main() {
if err != nil {
klog.Exitf("Failed to create new AWS storage: %v", err)
}
dedupe := tessera.NewInMemoryDedupe(storage.Add, 256)

// Expose a HTTP handler for the conformance test writes.
// This should accept arbitrary bytes POSTed to /add, and return an ascii
Expand All @@ -81,7 +82,7 @@ func main() {
return
}

idx, err := storage.Add(r.Context(), tessera.NewEntry(b))()
idx, err := dedupe.Add(r.Context(), tessera.NewEntry(b))()
mhutchinson marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
if errors.Is(err, tessera.ErrPushback) {
w.Header().Add("Retry-After", "1")
Expand Down
3 changes: 2 additions & 1 deletion cmd/conformance/gcp/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ func main() {
if err != nil {
klog.Exitf("Failed to create new GCP storage: %v", err)
}
dedupe := tessera.NewInMemoryDedupe(storage.Add, 256)

// Expose a HTTP handler for the conformance test writes.
// This should accept arbitrary bytes POSTed to /add, and return an ascii
Expand All @@ -76,7 +77,7 @@ func main() {
return
}

idx, err := storage.Add(r.Context(), tessera.NewEntry(b))()
idx, err := dedupe.Add(r.Context(), tessera.NewEntry(b))()
if err != nil {
if errors.Is(err, tessera.ErrPushback) {
w.Header().Add("Retry-After", "1")
Expand Down
3 changes: 2 additions & 1 deletion cmd/conformance/mysql/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ func main() {
if err != nil {
klog.Exitf("Failed to create new MySQL storage: %v", err)
}
dedupe := tessera.NewInMemoryDedupe(storage.Add, 256)

// Set up the handlers for the tlog-tiles GET methods, and a custom handler for HTTP POSTs to /add
configureTilesReadAPI(http.DefaultServeMux, storage)
Expand All @@ -72,7 +73,7 @@ func main() {
w.WriteHeader(http.StatusInternalServerError)
return
}
idx, err := storage.Add(r.Context(), tessera.NewEntry(b))()
idx, err := dedupe.Add(r.Context(), tessera.NewEntry(b))()
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
_, _ = w.Write([]byte(err.Error()))
Expand Down
3 changes: 2 additions & 1 deletion cmd/conformance/posix/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ func main() {
if err != nil {
klog.Exitf("Failed to construct storage: %v", err)
}
dedupe := tessera.NewInMemoryDedupe(storage.Add, 256)

// Define a handler for /add that accepts POST requests and adds the POST body to the log
http.HandleFunc("POST /add", func(w http.ResponseWriter, r *http.Request) {
Expand All @@ -70,7 +71,7 @@ func main() {
w.WriteHeader(http.StatusInternalServerError)
return
}
idx, err := storage.Add(r.Context(), tessera.NewEntry(b))()
idx, err := dedupe.Add(r.Context(), tessera.NewEntry(b))()
if err != nil {
w.WriteHeader(http.StatusInternalServerError)
_, _ = w.Write([]byte(err.Error()))
Expand Down
61 changes: 61 additions & 0 deletions dedupe.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Copyright 2024 The Tessera authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tessera

import (
"context"
"sync"

"github.com/hashicorp/golang-lru/v2/expirable"
)

// NewInMemoryDedupe returns an InMemoryDedupe that stores at most size entries in memory.
// If the entry being `Add`ed is not found in the cache, then it calls the delegate.
func NewInMemoryDedupe(delegate func(ctx context.Context, e *Entry) IndexFuture, size uint) *InMemoryDedupe {
return &InMemoryDedupe{
delegate: delegate,
cache: expirable.NewLRU[string, IndexFuture](int(size), nil, 0),
}
}

// InMemoryDedupe prevents duplicate entries being written to an underlying storage by keeping
// an in-memory cache of recently seen entries. Where an existing entry has already been `Add`ed,
// the previous `IndexFuture` will be returned. When no entry is found in the cache, the delegate
// method will be called to store the entry, and the result will be registered in the cache.
//
// This object can be used in isolation, or in conjunction with a persistent dedupe implementation.
// When using this with a persistent dedupe, the persistent layer should be the delegate of this
// InMemoryDedupe. This allows recent duplicates to be deduplicated in memory, reducing the need to
// make calls to a persistent storage.
type InMemoryDedupe struct {
delegate func(ctx context.Context, e *Entry) IndexFuture
mu sync.Mutex // cache is thread safe, but this mutex allows us to do conditional writes
cache *expirable.LRU[string, IndexFuture]
}

// Add adds the entry to the underlying delegate only if e hasn't been recently seen. In either case,
// an IndexFuture will be returned that the client can use to get the sequence number of this entry.
func (d *InMemoryDedupe) Add(ctx context.Context, e *Entry) IndexFuture {
id := string(e.Identity())
d.mu.Lock()
defer d.mu.Unlock()

f, ok := d.cache.Get(id)
if !ok {
f = d.delegate(ctx, e)
d.cache.Add(id, f)
}
return f
}
104 changes: 104 additions & 0 deletions dedupe_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
// Copyright 2024 The Tessera authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tessera_test

import (
"context"
"fmt"
"sync"
"testing"

tessera "github.com/transparency-dev/trillian-tessera"
)

func TestDedupe(t *testing.T) {
ctx := context.Background()
testCases := []struct {
desc string
newValue string
wantIdx uint64
}{
{
desc: "first element",
newValue: "foo",
wantIdx: 1,
},
{
desc: "third element",
newValue: "baz",
wantIdx: 3,
},
{
desc: "new element",
newValue: "omega",
wantIdx: 4,
},
}
for _, tC := range testCases {
t.Run(tC.desc, func(t *testing.T) {
idx := uint64(1)
delegate := func(ctx context.Context, e *tessera.Entry) tessera.IndexFuture {
thisIdx := idx
idx++
return func() (uint64, error) {
return thisIdx, nil
}
}
d := tessera.NewInMemoryDedupe(delegate, 256)

// Add foo, bar, baz to prime the cache to make things interesting
d.Add(ctx, tessera.NewEntry([]byte("foo")))
d.Add(ctx, tessera.NewEntry([]byte("bar")))
d.Add(ctx, tessera.NewEntry([]byte("baz")))

idx, err := d.Add(ctx, tessera.NewEntry([]byte(tC.newValue)))()
if err != nil {
t.Fatal(err)
}
if idx != tC.wantIdx {
t.Errorf("got != want (%d != %d)", idx, tC.wantIdx)
}
})
}
}

func BenchmarkDedupe(b *testing.B) {
ctx := context.Background()
// Outer loop is for benchmark calibration, inside here is each individual run of the benchmark
for i := 0; i < b.N; i++ {
idx := uint64(1)
delegate := func(ctx context.Context, e *tessera.Entry) tessera.IndexFuture {
thisIdx := idx
idx++
return func() (uint64, error) {
return thisIdx, nil
}
}
d := tessera.NewInMemoryDedupe(delegate, 256)
wg := &sync.WaitGroup{}
// Loop to create a bunch of leaves in parallel to test lock contention
for leafIndex := range 1024 {
wg.Add(1)
go func(index int) {
_, err := d.Add(ctx, tessera.NewEntry([]byte(fmt.Sprintf("leaf with value %d", index%32))))()
if err != nil {
b.Error(err)
}
wg.Done()
}(leafIndex)
}
wg.Wait()
}
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ require (
github.com/aws/aws-sdk-go-v2/service/sso v1.24.6 // indirect
github.com/aws/aws-sdk-go-v2/service/ssooidc v1.28.5 // indirect
github.com/aws/aws-sdk-go-v2/service/sts v1.33.1 // indirect
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect
go.opentelemetry.io/contrib/detectors/gcp v1.29.0 // indirect
go.opentelemetry.io/otel/sdk v1.29.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.29.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -892,6 +892,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.7.0/go.mod h1:hgWBS7lorOAVIJEQMi4Zs
github.com/grpc-ecosystem/grpc-gateway/v2 v2.11.3/go.mod h1:o//XUCC/F+yRGJoPO/VU0GSB0f8Nhgmxx0VIRUvaC0w=
github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8=
github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=
github.com/iancoleman/strcase v0.2.0/go.mod h1:iwCmte+B7n89clKwxIoIXy/HfoL7AsD47ZCWhYzw7ho=
github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
Expand Down
41 changes: 7 additions & 34 deletions storage/internal/queue.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,6 @@ import (
type Queue struct {
buf *buffer.Buffer
flush FlushFunc

inFlightMu sync.Mutex
inFlight map[string]*queueItem
}

// FlushFunc is the signature of a function which will receive the slice of queued entries.
Expand All @@ -58,8 +55,7 @@ type FlushFunc func(ctx context.Context, entries []*tessera.Entry) error
// for maxAge, or the size of the queue reaches maxSize.
func NewQueue(ctx context.Context, maxAge time.Duration, maxSize uint, f FlushFunc) *Queue {
q := &Queue{
flush: f,
inFlight: make(map[string]*queueItem, maxSize),
flush: f,
}

// The underlying queue implementation blocks additions during a flush.
Expand Down Expand Up @@ -97,32 +93,14 @@ func NewQueue(ctx context.Context, maxAge time.Duration, maxSize uint, f FlushFu
return q
}

// squashDupes keeps track of all in-flight requests, enabling dupe squashing for entries currently in the queue.
// Returns an entry struct, and a bool which is true if the provided entry is a dupe and should NOT be added to the queue.
func (q *Queue) squashDupes(e *tessera.Entry) (*queueItem, bool) {
q.inFlightMu.Lock()
defer q.inFlightMu.Unlock()

k := string(e.Identity())
entry, isKnown := q.inFlight[k]
if !isKnown {
entry = newEntry(e)
q.inFlight[k] = entry
}
return entry, isKnown
}

// Add places e into the queue, and returns a func which may be called to retrieve the assigned index.
func (q *Queue) Add(ctx context.Context, e *tessera.Entry) tessera.IndexFuture {
entry, isDupe := q.squashDupes(e)
if isDupe {
// This entry is already in the queue, so no need to add it again.
return entry.f
}
if err := q.buf.Push(entry); err != nil {
entry.notify(err)
qi := newEntry(e)

if err := q.buf.Push(qi); err != nil {
qi.notify(err)
}
return entry.f
return qi.f
}

// doFlush handles the queue flush, and sending notifications of assigned log indices.
Expand All @@ -134,14 +112,9 @@ func (q *Queue) doFlush(ctx context.Context, entries []*queueItem) {

err := q.flush(ctx, entriesData)

// Send assigned indices to all the waiting Add() requests, including dupes.
q.inFlightMu.Lock()
defer q.inFlightMu.Unlock()

// Send assigned indices to all the waiting Add() requests
for _, e := range entries {
e.notify(err)
k := string(e.entry.Identity())
delete(q.inFlight, k)
}
}

Expand Down
33 changes: 0 additions & 33 deletions storage/internal/queue_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,36 +95,3 @@ func TestQueue(t *testing.T) {
})
}
}

func TestDedup(t *testing.T) {
ctx := context.Background()
idx := uint64(0)

q := storage.NewQueue(ctx, time.Second, 10 /*maxSize*/, func(ctx context.Context, entries []*tessera.Entry) error {
for _, e := range entries {
_ = e.MarshalBundleData(idx)
idx++
}
return nil
})

numEntries := 10
adds := []tessera.IndexFuture{}
for i := 0; i < numEntries; i++ {
adds = append(adds, q.Add(ctx, tessera.NewEntry([]byte("Have I seen this before?"))))
}

firstN, err := adds[0]()
if err != nil {
t.Fatalf("Add: %v", err)
}
for i := 1; i < len(adds); i++ {
N, err := adds[i]()
if err != nil {
t.Errorf("[%d] got %v", i, err)
}
if N != firstN {
t.Errorf("[%d] got seq %d, want %d", i, N, firstN)
}
}
}
Loading