Unexpected result when DeleteRange at range [nil, nil)
It does not delete all data in tikv when i called DeleteRange(ctx, nil, nil, 10)
Reproduce code:
package main
import (
"context"
"fmt"
"github.com/alecthomas/kong"
"github.com/sirupsen/logrus"
"github.com/tikv/client-go/v2/tikv"
)
// clean Tikv command clean all data in tikv, be careful to use.
type CleanTikvCmd struct {
Concurrency int32 `short:"c" default:"10" help:"clean concurrency"`
PDEndpoints []string `required:"" help:"endpoints of tikv's pd"`
UseMaxEndkey bool `default:"false" help:"use []byte{255...} as end key when DeleteRange"`
}
func CountKey(tikvClient *tikv.KVStore, begin, end []byte) error {
numberOfKeys := 0
txn, err := tikvClient.Begin()
if err != nil {
return err
}
txn.GetSnapshot().SetKeyOnly(true)
// Here use [nil, nil] can count all of keys.
iter, err := txn.Iter(begin, end)
if err != nil {
return err
}
for iter.Valid() {
numberOfKeys += 1
if err := iter.Next(); err != nil {
return err
}
}
logrus.Infof("tikv: number of keys in range [%v, %v] is %v", begin, end, numberOfKeys)
return nil
}
func PutKeys(ctx context.Context, tikvClient *tikv.KVStore, count int) error {
txn, err := tikvClient.Begin()
if err != nil {
return err
}
for i := 0; i < 10; i++ {
err := txn.Set([]byte(fmt.Sprint(i)), []byte("balala"))
if err != nil {
return err
}
}
err = txn.Commit(ctx)
return err
}
func (cmd *CleanTikvCmd) Run() error {
ctx := context.Background()
tikvClient, err := tikv.NewTxnClient(cmd.PDEndpoints)
if err != nil {
return err
}
if err := PutKeys(ctx, tikvClient, 10); err != nil {
return err
}
var (
begin []byte = nil
end []byte = nil
)
if cmd.UseMaxEndkey {
for i := 0; i < 32; i++ {
end = append(end, 255)
}
}
logrus.Infof("count key before DeleteRange")
if err := CountKey(tikvClient, begin, end); err != nil {
return err
}
// FIXME: figured out why use [nil, nil] can not delete all range.
count, err := tikvClient.DeleteRange(ctx, begin, end, int(cmd.Concurrency))
if err != nil {
return err
}
logrus.Infof("clean tikv done, region number is %v", count)
logrus.Infof("count key after DeleteRange")
if err := CountKey(tikvClient, begin, end); err != nil {
return err
}
return nil
}
func main() {
test := &CleanTikvCmd{}
ctx := kong.Parse(test)
err := ctx.Run()
ctx.FatalIfErrorf(err)
}
Result in my tikv(v5.3.0):
- Without flag use-max-endkey
[2022/02/24 10:52:20.847 +08:00] [INFO] [client.go:355] ["[pd] create pd client with endpoints"] [pd-address="[http://security-167:17520]"]
[2022/02/24 10:52:20.856 +08:00] [INFO] [base_client.go:331] ["[pd] update member urls"] [old-urls="[http://security-167:17520]"] [new-urls="[http://10.172.198.14:17520]"]
[2022/02/24 10:52:20.856 +08:00] [INFO] [base_client.go:349] ["[pd] switch leader"] [new-leader=http://10.172.198.14:17520] [old-leader=]
[2022/02/24 10:52:20.856 +08:00] [INFO] [base_client.go:104] ["[pd] init cluster id"] [cluster-id=7065581661513030660]
[2022/02/24 10:52:20.856 +08:00] [INFO] [client.go:651] ["[pd] tso dispatcher created"] [dc-location=global]
INFO[0000] count key before DeleteRange
INFO[0000] tikv: number of keys in range [[], []] is 10
[2022/02/24 10:52:20.879 +08:00] [INFO] [range_task.go:137] ["range task started"] [name=delete-range] [startKey=] [endKey=] [concurrency=10]
[2022/02/24 10:52:20.881 +08:00] [INFO] [range_task.go:243] ["range task finished"] [name=delete-range] [startKey=] [endKey=] ["cost time"=975.753µs] ["completed regions"=0]
INFO[0000] clean tikv done, region number is 0
INFO[0000] count key after DeleteRange
INFO[0000] tikv: number of keys in range [[], []] is 10
- With flag use-max-endkey
[2022/02/24 10:52:49.252 +08:00] [INFO] [client.go:355] ["[pd] create pd client with endpoints"] [pd-address="[http://security-167:17520]"]
[2022/02/24 10:52:49.257 +08:00] [INFO] [base_client.go:331] ["[pd] update member urls"] [old-urls="[http://security-167:17520]"] [new-urls="[http://10.172.198.14:17520]"]
[2022/02/24 10:52:49.257 +08:00] [INFO] [base_client.go:349] ["[pd] switch leader"] [new-leader=http://10.172.198.14:17520] [old-leader=]
[2022/02/24 10:52:49.257 +08:00] [INFO] [base_client.go:104] ["[pd] init cluster id"] [cluster-id=7065581661513030660]
[2022/02/24 10:52:49.257 +08:00] [INFO] [client.go:651] ["[pd] tso dispatcher created"] [dc-location=global]
INFO[0000] count key before DeleteRange
INFO[0000] tikv: number of keys in range [[], [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255]] is 10
[2022/02/24 10:52:49.290 +08:00] [INFO] [range_task.go:137] ["range task started"] [name=delete-range] [startKey=] [endKey=ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff] [concurrency=10]
[2022/02/24 10:52:49.308 +08:00] [INFO] [range_task.go:243] ["range task finished"] [name=delete-range] [startKey=] [endKey=ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff] ["cost time"=18.117433ms] ["completed regions"=1]
INFO[0000] clean tikv done, region number is 1
INFO[0000] count key after DeleteRange
INFO[0000] tikv: number of keys in range [[], [255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255]] is 0
It can see that DeleteRange(ctx, nil, nil,10) can not delete data in all range.
I'll take a look.
When i debug source code in tikv/client, i found code here may not correct when rangeEndKey = nil.
I try to hard code in sendReqOnRange to fix like this:
// Execute performs the delete range operation.
func (t *DeleteRangeTask) sendReqOnRange(ctx context.Context, r kv.KeyRange) (TaskStat, error) {
startKey, rangeEndKey := r.StartKey, r.EndKey
var stat TaskStat
for {
select {
case <-ctx.Done():
return stat, errors.WithStack(ctx.Err())
default:
}
// if bytes.Compare(startKey, rangeEndKey) >= 0 {
// break
// }
// hard code part begin
// normal case: startKey >= rangeEndKey
if len(rangeEndKey) > 0 && bytes.Compare(startKey, rangeEndKey) >= 0 {
break
}
// hard code part end
bo := retry.NewBackofferWithVars(ctx, deleteRangeOneRegionMaxBackoff, nil)
loc, err := t.store.GetRegionCache().LocateKey(bo, startKey)
if err != nil {
return stat, err
}
// Delete to the end of the region, except if it's the last region overlapping the range
endKey := loc.EndKey
// If it is the last region
if loc.Contains(rangeEndKey) {
endKey = rangeEndKey
}
// log begin
logutil.Logger(ctx).Info("send on range info",
zap.String("startKey", kv.StrKey(startKey)),
zap.String("range endKey", kv.StrKey(rangeEndKey)),
zap.String("endKey", kv.StrKey(endKey)))
// log end
req := tikvrpc.NewRequest(tikvrpc.CmdDeleteRange, &kvrpcpb.DeleteRangeRequest{
StartKey: startKey,
EndKey: endKey,
NotifyOnly: t.notifyOnly,
})
resp, err := t.store.SendReq(bo, req, loc.Region, client.ReadTimeoutMedium)
if err != nil {
return stat, err
}
regionErr, err := resp.GetRegionError()
if err != nil {
return stat, err
}
if regionErr != nil {
err = bo.Backoff(retry.BoRegionMiss, errors.New(regionErr.String()))
if err != nil {
return stat, err
}
// log begin
logutil.Logger(ctx).Info("send on range error",
zap.String("err", regionErr.String()))
// log end
continue
}
if resp.Resp == nil {
return stat, errors.WithStack(tikverr.ErrBodyMissing)
}
deleteRangeResp := resp.Resp.(*kvrpcpb.DeleteRangeResponse)
if err := deleteRangeResp.GetError(); err != "" {
return stat, errors.Errorf("unexpected delete range err: %v", err)
}
stat.CompletedRegions++
startKey = endKey
// hard code begin
// case rangeEndKey = nil: will find the end of tikv data.
if len(endKey) == 0 {
break
}
// hard code end
}
return stat, nil
}
Result:
[2022/02/24 11:21:45.701 +08:00] [INFO] [client.go:355] ["[pd] create pd client with endpoints"] [pd-address="[http://security-167:17520]"]
[2022/02/24 11:21:45.750 +08:00] [INFO] [base_client.go:331] ["[pd] update member urls"] [old-urls="[http://security-167:17520]"] [new-urls="[http://10.172.198.14:17520]"]
[2022/02/24 11:21:45.751 +08:00] [INFO] [base_client.go:349] ["[pd] switch leader"] [new-leader=http://10.172.198.14:17520] [old-leader=]
[2022/02/24 11:21:45.751 +08:00] [INFO] [base_client.go:104] ["[pd] init cluster id"] [cluster-id=7065581661513030660]
[2022/02/24 11:21:45.751 +08:00] [INFO] [client.go:651] ["[pd] tso dispatcher created"] [dc-location=global]
INFO[0000] count key before DeleteRange
INFO[0000] tikv: number of keys in range [[], []] is 10
[2022/02/24 11:21:45.781 +08:00] [INFO] [range_task.go:137] ["range task started"] [name=delete-range] [startKey=] [endKey=] [concurrency=10]
[2022/02/24 11:21:45.782 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
[2022/02/24 11:21:45.815 +08:00] [INFO] [delete_range.go:178] ["send on range error"] [err="epoch_not_match:<> "]
[2022/02/24 11:21:45.816 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
[2022/02/24 11:21:45.836 +08:00] [INFO] [delete_range.go:178] ["send on range error"] [err="epoch_not_match:<> "]
[2022/02/24 11:21:45.836 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
[2022/02/24 11:21:45.856 +08:00] [INFO] [delete_range.go:178] ["send on range error"] [err="epoch_not_match:<> "]
[2022/02/24 11:21:45.856 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
[2022/02/24 11:21:45.894 +08:00] [INFO] [delete_range.go:178] ["send on range error"] [err="epoch_not_match:<> "]
[2022/02/24 11:21:45.894 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
[2022/02/24 11:21:45.928 +08:00] [INFO] [delete_range.go:178] ["send on range error"] [err="epoch_not_match:<> "]
[2022/02/24 11:21:45.929 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
[2022/02/24 11:21:45.945 +08:00] [INFO] [delete_range.go:178] ["send on range error"] [err="epoch_not_match:<> "]
[2022/02/24 11:21:45.946 +08:00] [INFO] [delete_range.go:153] ["send on range info"] [startKey=] ["range endKey"=] [endKey=]
^C
tikv log error when test:
[2022/02/24 11:26:29.533 +08:00] [ERROR] [apply.rs:1248] ["execute raft command"] [err_code=KV:Raftstore:Unknown] [err="Other(\"[components/raftstore/src/store/fsm/apply.rs:1602]: invalid delete range command, start_key: [0, 0, 0, 0, 0, 0, 0, 0, 247], end_key: [0, 0, 0, 0, 0, 0, 0, 0, 247]\")"] [peer_id=3] [region_id=2]
[2022/02/24 11:26:29.535 +08:00] [INFO] [kv.rs:1023] ["kv rpc failed"] [err=RemoteStopped] [request=batch_commands]
It keeps reporting error when send request to tikv and i don't know what's going on.
@Jy-poi I found this issue too. I think there are bug in tikv side too and it prevents using nil as end key of delete range.
I have send a PR. You can help to review if it is convenient. Later TiKV shoule apply this change to make finally work.
Ok, I will review this after a while.