Unverified commit fadc5a80, authored by Hussein Galal, committed by GitHub
Browse files

Add tombstone file to etcd and catch errc etcd channel (#2592)



* Add tombstone file to embedded etcd
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* go mod update
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* fixes
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* more fixes
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* more changes
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* gofmt and goimports
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* go mod update
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* go lint
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* go lint
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>

* go mod tidy
Signed-off-by: galal-hussein <hussein.galal.ahmed.11@gmail.com>
parent 10b43c8f
......@@ -25,7 +25,7 @@ replace (
github.com/matryer/moq => github.com/rancher/moq v0.0.0-20190404221404-ee5226d43009
github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc92
github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
go.etcd.io/etcd => github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008 // v3.4.13-k3s1
go.etcd.io/etcd => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8
golang.org/x/crypto => golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2
golang.org/x/net => golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7
golang.org/x/sys => golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456
......
......@@ -3,8 +3,14 @@
package executor
import (
"io/ioutil"
"path/filepath"
"strings"
"github.com/rancher/k3s/pkg/version"
"github.com/sirupsen/logrus"
"go.etcd.io/etcd/embed"
"go.etcd.io/etcd/etcdserver"
)
func (e Embedded) CurrentETCDOptions() (InitialOptions, error) {
......@@ -27,8 +33,18 @@ func (e Embedded) ETCD(args ETCDConfig) error {
go func() {
select {
case err := <-etcd.Server.ErrNotify():
if strings.Contains(err.Error(), etcdserver.ErrMemberRemoved.Error()) {
tombstoneFile := filepath.Join(args.DataDir, "tombstone")
if err := ioutil.WriteFile(tombstoneFile, []byte{}, 0600); err != nil {
logrus.Fatalf("failed to write tombstone file to %s", tombstoneFile)
}
logrus.Infof("this node has been removed from the cluster please restart %s to rejoin the cluster", version.Program)
return
}
case <-etcd.Server.StopNotify():
logrus.Fatalf("etcd stopped - if this node was removed from the cluster, you must backup and delete %s before rejoining", args.DataDir)
logrus.Fatalf("etcd stopped")
case err := <-etcd.Err():
logrus.Fatalf("etcd exited: %v", err)
}
......
......@@ -72,6 +72,8 @@ const (
// other defaults from k8s.io/apiserver/pkg/storage/storagebackend/factory/etcd3.go
defaultKeepAliveTime = 30 * time.Second
defaultKeepAliveTimeout = 10 * time.Second
maxBackupRetention = 5
)
// Members contains a slice that holds all
......@@ -323,6 +325,13 @@ func (e *ETCD) Register(ctx context.Context, config *config.Control, handler htt
return nil, err
}
tombstoneFile := filepath.Join(etcdDBDir(e.config), "tombstone")
if _, err := os.Stat(tombstoneFile); err == nil {
logrus.Infof("tombstone file has been detected, removing data dir to rejoin the cluster")
if _, err := backupDirWithRetention(etcdDBDir(e.config), maxBackupRetention); err != nil {
return nil, err
}
}
return e.handler(handler), err
}
......@@ -512,7 +521,7 @@ func (e *ETCD) removePeer(ctx context.Context, id, address string) error {
}
if u.Hostname() == address {
if e.address == address {
logrus.Fatalf("node has been delete from the cluster. Backup and delete ${datadir}/server/db if you like to rejoin the node")
return errors.New("node has been deleted from the cluster")
}
logrus.Infof("Removing name=%s id=%d address=%s from etcd", member.Name, member.ID, address)
_, err := e.client.MemberRemove(ctx, member.ID)
......@@ -802,3 +811,35 @@ func snapshotRetention(retention int, snapshotDir string) error {
})
return os.Remove(filepath.Join(snapshotDir, snapshotFiles[0].Name()))
}
// backupDirWithRetention moves dir to a sibling directory named
// "<dir>-backup-<unix timestamp>" and prunes older backups so that, counting
// the one just created, at most maxBackupRetention backup directories remain.
//
// It returns the path of the new backup directory. If dir does not exist there
// is nothing to back up and ("", nil) is returned; any other failure to stat,
// list, prune, or rename is returned as an error. A maxBackupRetention of zero
// or less removes every previous backup and keeps only the new one.
func backupDirWithRetention(dir string, maxBackupRetention int) (string, error) {
	if _, err := os.Stat(dir); err != nil {
		if os.IsNotExist(err) {
			return "", nil
		}
		// Permission or I/O problems must not be reported as success.
		return "", err
	}

	parent := filepath.Dir(dir)
	files, err := ioutil.ReadDir(parent)
	if err != nil {
		return "", err
	}
	// Newest first, so the backups past the retention limit are the oldest.
	sort.Slice(files, func(i, j int) bool {
		return files[i].ModTime().After(files[j].ModTime())
	})

	backupPrefix := filepath.Base(dir) + "-backup"
	kept := 0
	for _, f := range files {
		if !f.IsDir() || !strings.HasPrefix(f.Name(), backupPrefix) {
			continue
		}
		// Reserve one slot for the backup created below.
		if kept+1 < maxBackupRetention {
			kept++
			continue
		}
		if err := os.RemoveAll(filepath.Join(parent, f.Name())); err != nil {
			return "", err
		}
	}

	// FormatInt avoids narrowing the 64-bit timestamp on 32-bit platforms.
	backupDir := dir + "-backup-" + strconv.FormatInt(time.Now().Unix(), 10)
	if err := os.Rename(dir, backupDir); err != nil {
		return "", err
	}
	return backupDir, nil
}
......@@ -39,6 +39,7 @@ var (
ErrKeyNotFound = errors.New("etcdserver: key not found")
ErrCorrupt = errors.New("etcdserver: corrupt cluster")
ErrBadLeaderTransferee = errors.New("etcdserver: bad leader transferee")
ErrMemberRemoved = errors.New("etcdserver: the member has been permanently removed from the cluster")
)
type DiscoveryError struct {
......
......@@ -1388,7 +1388,7 @@ func (s *EtcdServer) applyEntries(ep *etcdProgress, apply *apply) {
}
var shouldstop bool
if ep.appliedt, ep.appliedi, shouldstop = s.apply(ents, &ep.confState); shouldstop {
go s.stopWithDelay(10*100*time.Millisecond, fmt.Errorf("the member has been permanently removed from the cluster"))
go s.stopWithDelay(10*100*time.Millisecond, ErrMemberRemoved)
}
}
......@@ -1551,6 +1551,8 @@ func (s *EtcdServer) stopWithDelay(d time.Duration, err error) {
// when the server is stopped.
func (s *EtcdServer) StopNotify() <-chan struct{} {
	// Hand out the done channel as receive-only.
	return s.done
}
// ErrNotify returns the server's errorc channel as a receive-only channel of
// errors.
func (s *EtcdServer) ErrNotify() <-chan error {
	return s.errorc
}
// SelfStats returns the bytes produced by calling JSON on the server's stats
// field.
func (s *EtcdServer) SelfStats() []byte {
	return s.stats.JSON()
}
func (s *EtcdServer) LeaderStats() []byte {
......
......@@ -1007,7 +1007,7 @@ github.com/willf/bitset
github.com/xiang90/probing
# go.etcd.io/bbolt v1.3.5
go.etcd.io/bbolt
# go.etcd.io/etcd v0.5.0-alpha.5.0.20200819165624-17cef6e3e9d5 => github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008
# go.etcd.io/etcd v0.5.0-alpha.5.0.20200819165624-17cef6e3e9d5 => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8
## explicit
go.etcd.io/etcd/auth
go.etcd.io/etcd/auth/authpb
......@@ -2949,7 +2949,7 @@ vbom.ml/util/sortorder
# github.com/matryer/moq => github.com/rancher/moq v0.0.0-20190404221404-ee5226d43009
# github.com/opencontainers/runc => github.com/opencontainers/runc v1.0.0-rc92
# github.com/opencontainers/runtime-spec => github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
# go.etcd.io/etcd => github.com/k3s-io/etcd v0.0.0-20200911210206-f8fde3601008
# go.etcd.io/etcd => github.com/k3s-io/etcd v0.5.0-alpha.5.0.20201204203317-251ee41536d8
# golang.org/x/crypto => golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2
# golang.org/x/net => golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7
# golang.org/x/sys => golang.org/x/sys v0.0.0-20190826190057-c7b8b68b1456
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment