Skip to content

Commit

Permalink
Handling the boltDB panic in case of database found to be corrupt. (#521
Browse files Browse the repository at this point in the history
)
  • Loading branch information
ishan16696 authored Sep 5, 2022
1 parent 4343a98 commit b68a54e
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
37 changes: 34 additions & 3 deletions pkg/initializer/validator/datavalidator.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ var (

// ErrCorrupt is returned when checking a data file finds errors.
ErrCorrupt = errors.New("invalid value")

// isBoltDBPanic is a flag which indicates whether a panic occurred while opening the Bolt database.
isBoltDBPanic = false
)

// memberDir returns the path obtained by joining the configured data
// directory (d.Config.DataDir) with the "member" sub-directory name.
func (d *DataValidator) memberDir() string {
	dataDir := d.Config.DataDir
	return filepath.Join(dataDir, "member")
}
Expand Down Expand Up @@ -141,6 +144,7 @@ func (d *DataValidator) sanityCheck(failBelowRevision int64) (DataDirStatus, err
return DataDirectoryValid, nil
}

d.Logger.Info("Checking for Etcd revision...")
etcdRevision, err := getLatestEtcdRevision(d.backendPath())
if err != nil && errors.Is(err, bolt.ErrTimeout) {
d.Logger.Errorf("another etcd process is using %v and holds the file lock", d.backendPath())
Expand All @@ -150,6 +154,13 @@ func (d *DataValidator) sanityCheck(failBelowRevision int64) (DataDirStatus, err
return DataDirectoryCorrupt, nil
}

if isBoltDBPanic {
d.Logger.Info("Bolt database panic: database file found to be invalid.")
// reset the isBoltDBPanic
isBoltDBPanic = false
return BoltDBCorrupt, nil
}

if d.OriginalClusterSize > 1 {
d.Logger.Info("Skipping check for revision consistency of etcd member as it will get in sync with etcd leader.")
return DataDirectoryValid, nil
Expand Down Expand Up @@ -206,6 +217,12 @@ func (d *DataValidator) checkForDataCorruption() error {
}
return fmt.Errorf("invalid db files: %v", err)
}
if isBoltDBPanic {
d.Logger.Info("Bolt database panic: database file found to be invalid.")
// reset the isBoltDBPanic
isBoltDBPanic = false
return fmt.Errorf("invalid db files")
}
return nil
}

Expand Down Expand Up @@ -273,6 +290,13 @@ func verifyDB(path string) error {
return ErrFileNotFound
}

defer func() {
if err := recover(); err != nil {
// set the flag: isBoltDBPanic
isBoltDBPanic = true
}
}()

// Open database.
db, err := bolt.Open(path, 0666, &bolt.Options{Timeout: timeoutToOpenBoltDB})
if err != nil {
Expand Down Expand Up @@ -376,7 +400,7 @@ waitLoop:
case <-timer.C:
break waitLoop
default:
latestSyncedEtcdRevision, _ = getLatestSyncedRevision(clientKV)
latestSyncedEtcdRevision, _ = getLatestSyncedRevision(clientKV, d.Logger)
if latestSyncedEtcdRevision >= latestSnapshotRevision {
d.Logger.Infof("After starting embeddedEtcd backend DB file revision (%d) is greater than or equal to latest snapshot revision (%d): no data loss", latestSyncedEtcdRevision, latestSnapshotRevision)
break waitLoop
Expand All @@ -400,6 +424,13 @@ func getLatestEtcdRevision(path string) (int64, error) {
return -1, fmt.Errorf("unable to stat backend db file: %v", err)
}

defer func() {
if err := recover(); err != nil {
// set the flag: isBoltDBPanic
isBoltDBPanic = true
}
}()

db, err := bolt.Open(path, 0400, &bolt.Options{Timeout: timeoutToOpenBoltDB, ReadOnly: true})
if err != nil {
return -1, err
Expand Down Expand Up @@ -433,14 +464,14 @@ func getLatestEtcdRevision(path string) (int64, error) {
}

// getLatestSyncedRevision finds out the latest revision on etcd db file when embedded etcd is started to double check the latest revision of etcd db file.
func getLatestSyncedRevision(client client.KVCloser) (int64, error) {
func getLatestSyncedRevision(client client.KVCloser, logger *logrus.Logger) (int64, error) {
var latestSyncedRevision int64

ctx, cancel := context.WithTimeout(context.TODO(), connectionTimeout)
defer cancel()
resp, err := client.Get(ctx, "", clientv3.WithLastRev()...)
if err != nil {
fmt.Printf("Failed to get the latest etcd revision: %v\n", err)
logger.Errorf("failed to get the latest etcd revision: %v\n", err)
return latestSyncedRevision, err
}
latestSyncedRevision = resp.Header.Revision
Expand Down
2 changes: 2 additions & 0 deletions pkg/initializer/validator/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ const (
DataDirectoryInvStruct
// DataDirectoryCorrupt indicates data directory is corrupt.
DataDirectoryCorrupt
// BoltDBCorrupt indicates Bolt database is corrupt.
BoltDBCorrupt
// DataDirectoryStatusUnknown indicates validator failed to check the data directory status.
DataDirectoryStatusUnknown
// DataDirStatusInvalidInMultiNode indicates validator failed to check the data directory status in multi-node etcd cluster.
Expand Down
6 changes: 0 additions & 6 deletions pkg/initializer/validator/validator_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,6 @@ var (
etcdRevision int64
)

// fileInfo holds file information such as file name and file path
type fileInfo struct {
name string
path string
}

func TestValidator(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Validator Suite")
Expand Down

0 comments on commit b68a54e

Please sign in to comment.