From 18609a29c99da2af93a050ed1de5b278463f185c Mon Sep 17 00:00:00 2001 From: Kolbe Kegel Date: Tue, 3 Dec 2024 11:01:52 -0800 Subject: [PATCH 1/3] use a hash for metadata lock name --- go.mod | 2 ++ pkg/dbconn/metadatalock.go | 31 ++++++++++++++++++------------- pkg/dbconn/metadatalock_test.go | 33 ++++++++++++++++++--------------- pkg/migration/runner.go | 2 +- pkg/migration/runner_test.go | 6 ++++-- 5 files changed, 43 insertions(+), 31 deletions(-) diff --git a/go.mod b/go.mod index 8590c2d..6365e6f 100644 --- a/go.mod +++ b/go.mod @@ -2,6 +2,8 @@ module github.com/cashapp/spirit go 1.22 +toolchain go1.22.9 + require ( github.com/alecthomas/kong v0.7.1 github.com/go-mysql-org/go-mysql v1.8.1-0.20240805131754-ccf204bf2b2a diff --git a/pkg/dbconn/metadatalock.go b/pkg/dbconn/metadatalock.go index 231bb9f..aaaba39 100644 --- a/pkg/dbconn/metadatalock.go +++ b/pkg/dbconn/metadatalock.go @@ -6,6 +6,8 @@ import ( "fmt" "time" + "github.com/cashapp/spirit/pkg/table" + "github.com/cashapp/spirit/pkg/dbconn/sqlescape" "github.com/siddontang/loggers" ) @@ -23,12 +25,9 @@ type MetadataLock struct { refreshInterval time.Duration } -func NewMetadataLock(ctx context.Context, dsn string, lockName string, logger loggers.Advanced, optionFns ...func(*MetadataLock)) (*MetadataLock, error) { - if len(lockName) == 0 { - return nil, errors.New("metadata lock name is empty") - } - if len(lockName) > 64 { - return nil, fmt.Errorf("metadata lock name is too long: %d, max length is 64", len(lockName)) +func NewMetadataLock(ctx context.Context, dsn string, table *table.TableInfo, logger loggers.Advanced, optionFns ...func(*MetadataLock)) (*MetadataLock, error) { + if table == nil { + return nil, errors.New("metadata lock table info is nil") } mdl := &MetadataLock{ @@ -52,28 +51,34 @@ func NewMetadataLock(ctx context.Context, dsn string, lockName string, logger lo getLock := func() error { // https://dev.mysql.com/doc/refman/8.0/en/locking-functions.html#function_get-lock var answer int - stmt := sqlescape.MustEscapeSQL("SELECT GET_LOCK(%?, %?)", lockName, getLockTimeout.Seconds()) + // Using the table name alone entails a maximum lock length and leads to conflicts + // between different tables with the same name in different schemas. + // We use the schema name and table name to create a unique lock name with a hash. + // The hash is truncated to 8 characters to avoid the maximum lock length. + // bizarrely_long_schema_name.thisisareallylongtablenamethisisareallylongtablename60charac ==> + // bizarrely_long_schem.thisisareallylongtablenamethisis-66fec116 + stmt := sqlescape.MustEscapeSQL("SELECT GET_LOCK( concat(left(%?,20),'.',left(%?,32),'-',left(sha1(concat(%?,%?)),8)), %?)", table.SchemaName, table.TableName, table.SchemaName, table.TableName, getLockTimeout.Seconds()) if err := dbConn.QueryRowContext(ctx, stmt).Scan(&answer); err != nil { return fmt.Errorf("could not acquire metadata lock: %s", err) } if answer == 0 { // 0 means the lock is held by another connection // TODO: we could lookup the connection that holds the lock and report details about it - return fmt.Errorf("could not acquire metadata lock: %s, lock is held by another connection", lockName) + return fmt.Errorf("could not acquire metadata lock for %s.%s, lock is held by another connection", table.SchemaName, table.TableName) } else if answer != 1 { // probably we never get here, but just in case - return fmt.Errorf("could not acquire metadata lock: %s, GET_LOCK returned: %d", lockName, answer) + return fmt.Errorf("could not acquire metadata lock for %s.%s, GET_LOCK returned: %d", table.SchemaName, table.TableName, answer) } return nil } // Acquire the lock or return an error immediately // We only Infof the initial acquisition. - logger.Infof("attempting to acquire metadata lock: %s", lockName) + logger.Infof("attempting to acquire metadata lock for %s.%s", table.SchemaName, table.TableName) if err = getLock(); err != nil { return nil, err } - logger.Infof("acquired metadata lock: %s", lockName) + logger.Infof("acquired metadata lock: %s.%s", table.SchemaName, table.TableName) // Setup background refresh runner ctx, mdl.cancel = context.WithCancel(ctx) @@ -85,7 +90,7 @@ func NewMetadataLock(ctx context.Context, dsn string, lockName string, logger lo select { case <-ctx.Done(): // Close the dedicated connection to release the lock - logger.Warnf("releasing metadata lock: %s", lockName) + logger.Warnf("releasing metadata lock for %s.%s", table.SchemaName, table.TableName) mdl.closeCh <- dbConn.Close() return case <-ticker.C: @@ -97,7 +102,7 @@ func NewMetadataLock(ctx context.Context, dsn string, lockName string, logger lo // and we can try again on the next tick interval. logger.Warnf("could not refresh metadata lock: %s", err) } else { - logger.Debugf("refreshed metadata lock: %s", lockName) + logger.Debugf("refreshed metadata lock for %s.%s", table.SchemaName, table.TableName) } } } diff --git a/pkg/dbconn/metadatalock_test.go b/pkg/dbconn/metadatalock_test.go index f3b6cb6..f9514e4 100644 --- a/pkg/dbconn/metadatalock_test.go +++ b/pkg/dbconn/metadatalock_test.go @@ -5,37 +5,39 @@ import ( "testing" "time" + "github.com/cashapp/spirit/pkg/table" + "github.com/cashapp/spirit/pkg/testutils" "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" ) func TestMetadataLock(t *testing.T) { - lockName := "test" + lockTableInfo := table.TableInfo{SchemaName: "test", TableName: "test"} logger := logrus.New() - mdl, err := NewMetadataLock(context.Background(), testutils.DSN(), lockName, logger) + mdl, err := NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger) assert.NoError(t, err) assert.NotNil(t, mdl) // Confirm a second lock cannot be acquired - _, err = NewMetadataLock(context.Background(), testutils.DSN(), lockName, logger) + _, err = NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger) assert.ErrorContains(t, err, "lock is held by another connection") // Close the original mdl assert.NoError(t, mdl.Close()) // Confirm a new lock can be acquired - mdl3, err := NewMetadataLock(context.Background(), testutils.DSN(), lockName, logger) + mdl3, err := NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger) assert.NoError(t, err) assert.NoError(t, mdl3.Close()) } func TestMetadataLockContextCancel(t *testing.T) { - lockName := "test-cancel" + lockTableInfo := table.TableInfo{SchemaName: "test", TableName: "test-cancel"} logger := logrus.New() ctx, cancel := context.WithCancel(context.Background()) - mdl, err := NewMetadataLock(ctx, testutils.DSN(), lockName, logger) + mdl, err := NewMetadataLock(ctx, testutils.DSN(), &lockTableInfo, logger) assert.NoError(t, err) assert.NotNil(t, mdl) @@ -46,16 +48,16 @@ func TestMetadataLockContextCancel(t *testing.T) { <-mdl.closeCh // Confirm the lock is released by acquiring a new one - mdl2, err := NewMetadataLock(context.Background(), testutils.DSN(), lockName, logger) + mdl2, err := NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger) assert.NoError(t, err) assert.NotNil(t, mdl2) assert.NoError(t, mdl2.Close()) } func TestMetadataLockRefresh(t *testing.T) { - lockName := "test-refresh" + lockTableInfo := table.TableInfo{SchemaName: "test", TableName: "test-refresh"} logger := logrus.New() - mdl, err := NewMetadataLock(context.Background(), testutils.DSN(), lockName, logger, func(mdl *MetadataLock) { + mdl, err := NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger, func(mdl *MetadataLock) { // override the refresh interval for faster testing mdl.refreshInterval = 2 * time.Second }) @@ -66,7 +68,7 @@ func TestMetadataLockRefresh(t *testing.T) { time.Sleep(5 * time.Second) // Confirm the lock is still held - _, err = NewMetadataLock(context.Background(), testutils.DSN(), lockName, logger) + _, err = NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger) assert.ErrorContains(t, err, "lock is held by another connection") // Close the lock @@ -74,14 +76,15 @@ func TestMetadataLockRefresh(t *testing.T) { } func TestMetadataLockLength(t *testing.T) { - long := "thisisareallylongtablenamethisisareallylongtablenamethisisareallylongtablename" - empty := "" + lockTableInfo := table.TableInfo{SchemaName: "test", TableName: "thisisareallylongtablenamethisisareallylongtablenamethisisareallylongtablename"} + var empty *table.TableInfo logger := logrus.New() - _, err := NewMetadataLock(context.Background(), testutils.DSN(), long, logger) - assert.ErrorContains(t, err, "metadata lock name is too long") + _, err := NewMetadataLock(context.Background(), testutils.DSN(), &lockTableInfo, logger) + // No error anymore after using a hash of the table name + assert.NoError(t, err) _, err = NewMetadataLock(context.Background(), testutils.DSN(), empty, logger) - assert.ErrorContains(t, err, "metadata lock name is empty") + assert.ErrorContains(t, err, "metadata lock table info is nil") } diff --git a/pkg/migration/runner.go b/pkg/migration/runner.go index 32d3b4c..df1080c 100644 --- a/pkg/migration/runner.go +++ b/pkg/migration/runner.go @@ -202,7 +202,7 @@ func (r *Runner) Run(originalCtx context.Context) error { } // Take a metadata lock to prevent other migrations from running concurrently. - r.metadataLock, err = dbconn.NewMetadataLock(ctx, r.dsn(), r.table.TableName, r.logger) + r.metadataLock, err = dbconn.NewMetadataLock(ctx, r.dsn(), r.table, r.logger) if err != nil { return err } diff --git a/pkg/migration/runner_test.go b/pkg/migration/runner_test.go index 2383158..b49244a 100644 --- a/pkg/migration/runner_test.go +++ b/pkg/migration/runner_test.go @@ -2711,6 +2711,8 @@ func TestResumeFromCheckpointE2EWithManualSentinel(t *testing.T) { statusInterval = 500 * time.Millisecond tableName := `resume_checkpoint_e2e_w_sentinel` + tableInfo := table.TableInfo{SchemaName: "test", TableName: tableName} + testutils.RunSQL(t, fmt.Sprintf(`DROP TABLE IF EXISTS %s, _%s_old, _%s_chkpnt, _%s_sentinel`, tableName, tableName, tableName, tableName)) table := fmt.Sprintf(`CREATE TABLE %s ( id int(11) NOT NULL AUTO_INCREMENT, @@ -2766,8 +2768,8 @@ func TestResumeFromCheckpointE2EWithManualSentinel(t *testing.T) { // Test that it's not possible to acquire metadata lock with name // as tablename while the migration is running. lock, err := dbconn.NewMetadataLock(ctx, testutils.DSN(), - tableName, &testLogger{}) - assert.ErrorContains(t, err, "could not acquire metadata lock: resume_checkpoint_e2e_w_sentinel, lock is held by another connection") + &tableInfo, &testLogger{}) + assert.ErrorContains(t, err, "could not acquire metadata lock for test.resume_checkpoint_e2e_w_sentinel, lock is held by another connection") assert.Nil(t, lock) break } From be3d5276bd01edb180c5f11bd3fa16c7840e7d00 Mon Sep 17 00:00:00 2001 From: Kolbe Kegel Date: Tue, 3 Dec 2024 11:07:34 -0800 Subject: [PATCH 2/3] revert change to go.mod --- go.mod | 2 -- 1 file changed, 2 deletions(-) diff --git a/go.mod b/go.mod index 6365e6f..8590c2d 100644 --- a/go.mod +++ b/go.mod @@ -2,8 +2,6 @@ module github.com/cashapp/spirit go 1.22 -toolchain go1.22.9 - require ( github.com/alecthomas/kong v0.7.1 github.com/go-mysql-org/go-mysql v1.8.1-0.20240805131754-ccf204bf2b2a From 3a1f4b93f4c8bf51a9b001f82da7497b1ea998df Mon Sep 17 00:00:00 2001 From: Kolbe Kegel Date: Tue, 3 Dec 2024 11:53:04 -0800 Subject: [PATCH 3/3] add comment about not logging lock name --- pkg/dbconn/metadatalock.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/dbconn/metadatalock.go b/pkg/dbconn/metadatalock.go index aaaba39..cd4c358 100644 --- a/pkg/dbconn/metadatalock.go +++ b/pkg/dbconn/metadatalock.go @@ -57,6 +57,11 @@ func NewMetadataLock(ctx context.Context, dsn string, table *table.TableInfo, lo // The hash is truncated to 8 characters to avoid the maximum lock length. // bizarrely_long_schema_name.thisisareallylongtablenamethisisareallylongtablename60charac ==> // bizarrely_long_schem.thisisareallylongtablenamethisis-66fec116 + // + // The computation of the hash is done server-side to simplify the whole process, + // but that means we can't easily log the actual lock name used. If you want to do that + // in the future, just add another MySQL round-trip to compute the lock name server-side + // and then use the returned string in the GET_LOCK call. stmt := sqlescape.MustEscapeSQL("SELECT GET_LOCK( concat(left(%?,20),'.',left(%?,32),'-',left(sha1(concat(%?,%?)),8)), %?)", table.SchemaName, table.TableName, table.SchemaName, table.TableName, getLockTimeout.Seconds()) if err := dbConn.QueryRowContext(ctx, stmt).Scan(&answer); err != nil { return fmt.Errorf("could not acquire metadata lock: %s", err)