blog icon indicating copy to clipboard operation
blog copied to clipboard

Redis 中的字符串对象

Open junnplus opened this issue 6 years ago • 1 comments

字符串

Redis 中的字符串用内置的简单动态字符串(Simple Dynamic String,SDS)来表示。 在 3.2 之前的版本用结构体 sdshdr 表示 SDS。

// 3.2.0/src/sds.h

typedef char *sds;

struct sdshdr {
    unsigned int len;
    unsigned int free;
    char buf[];
};

3.2 之后的版本优化了 SDS 的内存空间占用,

// 5.0.3/src/sds.h

typedef char *sds;

/* Note: sdshdr5 is never used, we just access the flags byte directly.
 * However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

上面的 sdshdr* 可以抽象成下面的伪代码:

struct sdshdr<T> {
  T len;
  T alloc;
  unsigned char flags;
  char buf[];
}

范型 T 来表示 len 和 alloc 的类型。 使用范型 T 可以根据字符串的长度使用不同类型来表示 len 和 alloc 成员,优化内存空间。

len、alloc 和 flags 成员是 SDS 的元数据,分别表示字符串长度、分配的字符串长度、标记位。 buf 成员是实际的字符串内容,buf[] 是一个柔性数组,不占内存。

sdshdr 结构体指定了 __attribute__ ((__packed__)) ,GCC 编译器将会按照紧凑排列的方式使用内存,也就是说用 packed 修饰之后,结构体将按 1 字节对齐,作为一个优化内存占用。

创建 sds

// 5.0.3/src/sds.c

/* Create a new sds string with the content specified by the 'init' pointer
 * and 'initlen'.
 * If NULL is used for 'init' the string is initialized with zero bytes.
 * If SDS_NOINIT is used, the buffer is left uninitialized;
 *
 * The string is always null-termined (all the sds strings are, always) so
 * even if you create an sds string with:
 *
 * mystring = sdsnewlen("abc",3);
 *
 * You can print the string with printf() as there is an implicit \0 at the
 * end of the string. However the string is binary safe and can contain
 * \0 characters in the middle, as the length is stored in the sds header. */
sds sdsnewlen(const void *init, size_t initlen) {
    void *sh;
    sds s;
    char type = sdsReqType(initlen);
    /* Empty strings are usually created in order to append. Use type 8
     * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    int hdrlen = sdsHdrSize(type);
    unsigned char *fp; /* flags pointer. */

    sh = s_malloc(hdrlen+initlen+1);
    if (init==SDS_NOINIT)
        init = NULL;
    else if (!init)
        memset(sh, 0, hdrlen+initlen+1);
    if (sh == NULL) return NULL;
    s = (char*)sh+hdrlen;
    fp = ((unsigned char*)s)-1;
    switch(type) {
        case SDS_TYPE_5: {
            *fp = type | (initlen << SDS_TYPE_BITS);
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_16: {
            SDS_HDR_VAR(16,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_32: {
            SDS_HDR_VAR(32,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        case SDS_TYPE_64: {
            SDS_HDR_VAR(64,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
    }
    if (initlen && init)
        memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
}

// 5.0.3/src/sds.h
#define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = (void*)((s)-(sizeof(struct sdshdr##T)));
  • sdsReqType 根据字符串长度 initlen 选择 sds 类型
  • sdsHdrSize 计算类型所需要的结构体头部长度 hdrlen

sdsnewlen 函数创建一个长度为 hdrlen + initlen + 1 的 SDS,返回的字符串指针 sds 实际上指向 SDS 的 buf 成员,其中 sds 字符串长度为 initlen + 1,最后一个字节保存了空字符 '\0'

image

SDS 遵循 C 字符串以空字符结尾的惯例,保存空字符的 1 字节空间不计算在 SDS 的 len 属性里面,并且为空字符分配额外的 1 字节空间,以及添加空字符到字符串末尾等操作,都是由 SDS 函数自动完成的,所以这个空字符对于 SDS 的使用者来说是完全透明的。遵循空字符结尾这一惯例的好处是,SDS 可以直接重用一部分 C 字符串函数库里面的函数。 摘自:《Redis设计与实现》 — 黄健宏

根据字符串的长度计算需要占用的 SDS 元数据内存大小 hdrlen。

// 5.0.3/src/sds.c

static inline char sdsReqType(size_t string_size) {
    if (string_size < 1<<5)
        return SDS_TYPE_5;
    if (string_size < 1<<8)
        return SDS_TYPE_8;
    if (string_size < 1<<16)
        return SDS_TYPE_16;
#if (LONG_MAX == LLONG_MAX)
    if (string_size < 1ll<<32)
        return SDS_TYPE_32;
    return SDS_TYPE_64;
#else
    return SDS_TYPE_32;
#endif
}

static inline int sdsHdrSize(char type) {
    switch(type&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            return sizeof(struct sdshdr5);
        case SDS_TYPE_8:
            return sizeof(struct sdshdr8);
        case SDS_TYPE_16:
            return sizeof(struct sdshdr16);
        case SDS_TYPE_32:
            return sizeof(struct sdshdr32);
        case SDS_TYPE_64:
            return sizeof(struct sdshdr64);
    }
    return 0;
}

由于 buf[] 不占内存,所以 sdshdr* 结构体大小就是 SDS 元数据的内存大小。

拼接 sds

// 5.0.3/src/sds.c
/* Append the specified sds 't' to the existing sds 's'.
 *
 * After the call, the modified sds string is no longer valid and all the
 * references must be substituted with the new pointer returned by the call. */
sds sdscatsds(sds s, const sds t) {
    return sdscatlen(s, t, sdslen(t));
}

/* Destructively modify the sds string 's' to hold the specified binary
 * safe string pointed by 't' of length 'len' bytes. */
sds sdscpylen(sds s, const char *t, size_t len) {
    if (sdsalloc(s) < len) {
        s = sdsMakeRoomFor(s,len-sdslen(s));
        if (s == NULL) return NULL;
    }
    memcpy(s, t, len);
    s[len] = '\0';
    sdssetlen(s, len);
    return s;
}

sdscpylen 调用 sdsMakeRoomFor 选择性对 sds 扩容,然后通过 memcpy 拼接字符串。

// 5.0.3/src/sds.c
/* Enlarge the free space at the end of the sds string so that the caller
 * is sure that after calling this function can overwrite up to addlen
 * bytes after the end of the string, plus one more byte for nul term.
 *
 * Note: this does not change the *length* of the sds string as returned
 * by sdslen(), but only the free buffer space we have. */
sds sdsMakeRoomFor(sds s, size_t addlen) {
    void *sh, *newsh;
    size_t avail = sdsavail(s);
    size_t len, newlen;
    char type, oldtype = s[-1] & SDS_TYPE_MASK;
    int hdrlen;

    /* Return ASAP if there is enough space left. */
    if (avail >= addlen) return s;

    len = sdslen(s);
    sh = (char*)s-sdsHdrSize(oldtype);
    newlen = (len+addlen);
    if (newlen < SDS_MAX_PREALLOC)
        newlen *= 2;
    else
        newlen += SDS_MAX_PREALLOC;

    type = sdsReqType(newlen);

    /* Don't use type 5: the user is appending to the string and type 5 is
     * not able to remember empty space, so sdsMakeRoomFor() must be called
     * at every appending operation. */
    if (type == SDS_TYPE_5) type = SDS_TYPE_8;

    hdrlen = sdsHdrSize(type);
    if (oldtype==type) {
        newsh = s_realloc(sh, hdrlen+newlen+1);
        if (newsh == NULL) return NULL;
        s = (char*)newsh+hdrlen;
    } else {
        /* Since the header size changes, need to move the string forward,
         * and can't use realloc */
        newsh = s_malloc(hdrlen+newlen+1);
        if (newsh == NULL) return NULL;
        memcpy((char*)newsh+hdrlen, s, len+1);
        s_free(sh);
        s = (char*)newsh+hdrlen;
        s[-1] = type;
        sdssetlen(s, len);
    }
    sdssetalloc(s, newlen);
    return s;
}
  • sdsavail 计算 sds 空闲长度,如果空闲长度大于新增长度 addlen,就不需要扩容;
  • 计算新 sds 长度 newlen 以及 sds 类型;
  • 如果 sds 类型没有改变,通过 s_realloc 来扩容 buf,否则重新 s_malloc 申请内存并移动原字符串内容,这是因为头部的大小被改变了。

junnplus avatar Feb 16 '19 10:02 junnplus

字符串对象

前一篇介绍过,Redis 的字符串对象有三种编码使用不同底层数据结构来实现: OBJ_ENCODING_INTOBJ_ENCODING_EMBSTROBJ_ENCODING_RAW

int 编码

如果存储的是可以用 long 类型来表示整数值,在 Redis 中会用 int 编码来表示字符串对象。

127.0.0.1:6379> SET a 1
OK
127.0.0.1:6379> TYPE a
string
127.0.0.1:6379> OBJECT ENCODING a
"int"
// 5.0.3/src/object.c

/* Create a string object from a long long value. When possible returns a
 * shared integer object, or at least an integer encoded one.
 *
 * If valueobj is non zero, the function avoids returning a a shared
 * integer, because the object is going to be used as value in the Redis key
 * space (for instance when the INCR command is used), so we want LFU/LRU
 * values specific for each key. */
robj *createStringObjectFromLongLongWithOptions(long long value, int valueobj) {
    robj *o;
    ...

    if (value >= 0 && value < OBJ_SHARED_INTEGERS && valueobj == 0) {
        incrRefCount(shared.integers[value]);
        o = shared.integers[value];
    } else {
        if (value >= LONG_MIN && value <= LONG_MAX) {
            o = createObject(OBJ_STRING, NULL);
            o->encoding = OBJ_ENCODING_INT;
            o->ptr = (void*)((long)value);
        } else {
            o = createObject(OBJ_STRING,sdsfromlonglong(value));
        }
    }
    return o;
}

// 5.0.3/src/server.h
#define OBJ_SHARED_INTEGERS 10000

Redis 中共享 [0, OBJ_SHARED_INTEGERS) 整数之间的 int 编码字符串对象。 数值大小范围不在共享的对象内,如果这个整数值在 long 大小以内,robj.ptr 直接指向这个整数且设置为 int 编码,

image

大于 long 大小的数值是用 sds 表示的 raw 编码字符串对象。

// 5.0.3/src/sds.c

#define SDS_LLSTR_SIZE 21

/* Create an sds string from a long long value. It is much faster than:
 *
 * sdscatprintf(sdsempty(),"%lld\n", value);
 */
sds sdsfromlonglong(long long value) {
    char buf[SDS_LLSTR_SIZE];
    int len = sdsll2str(buf,value);

    return sdsnewlen(buf,len);
}

raw 编码

// 5.0.3/src/object.c

/* Create a string object with encoding OBJ_ENCODING_RAW, that is a plain
 * string object where o->ptr points to a proper sds string. */
robj *createRawStringObject(const char *ptr, size_t len) {
    return createObject(OBJ_STRING, sdsnewlen(ptr,len));
}

raw 编码使用 SDS 来保存字符串,robj.ptr 指向 sds 字符串。 image

embstr 编码

// 5.0.3/src/object.c

/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is
 * an object where the sds string is actually an unmodifiable string
 * allocated in the same chunk as the object itself. */
robj *createEmbeddedStringObject(const char *ptr, size_t len) {
    robj *o = zmalloc(sizeof(robj)+sizeof(struct sdshdr8)+len+1);
    struct sdshdr8 *sh = (void*)(o+1);

    o->type = OBJ_STRING;
    o->encoding = OBJ_ENCODING_EMBSTR;
    o->ptr = sh+1;
    o->refcount = 1;
    ...

    sh->len = len;
    sh->alloc = len;
    sh->flags = SDS_TYPE_8;
    if (ptr == SDS_NOINIT)
        sh->buf[len] = '\0';
    else if (ptr) {
        memcpy(sh->buf,ptr,len);
        sh->buf[len] = '\0';
    } else {
        memset(sh->buf,0,len+1);
    }
    return o;
}

embstr 编码会一次内存分配一块连续的空间用来表示 redisObject 和 SDS 结构。

image

那什么时候使用 embstr 编码呢?(´・_・`)

// 5.0.3/src/object.c

/* Create a string object with EMBSTR encoding if it is smaller than
 * OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is
 * used.
 *
 * The current limit of 44 is chosen so that the biggest string object
 * we allocate as EMBSTR will still fit into the 64 byte arena of jemalloc. */
#define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 44
robj *createStringObject(const char *ptr, size_t len) {
    if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT)
        return createEmbeddedStringObject(ptr,len);
    else
        return createRawStringObject(ptr,len);
}

字符串长度小于 44 字节就会用 embstr 编码来存储字符串对象,这是因为内存分配器分配内存的大小单位都是 2 的指数字节,embstr 编码的字符串对象最小的空间占用为 19 字节,所以最少分配的内存是 32 字节的空间,为了存储稍大一点的 embstr 编码字符串对象,分配器分配了 64 字节空间,那么要求字符串长度不超过 44 字节(字符串以 NULL 结尾占用一个字节)。

junnplus avatar Apr 30 '20 14:04 junnplus