blog
blog copied to clipboard
Redis 中的字符串对象
字符串
Redis 中的字符串用内置的简单动态字符串(Simple Dynamic String,SDS)来表示。 在 3.2 之前的版本用结构体 sdshdr 表示 SDS。
// 3.2.0/src/sds.h
typedef char *sds;
struct sdshdr {
unsigned int len;
unsigned int free;
char buf[];
};
3.2 之后的版本优化了 SDS 的内存空间占用,
// 5.0.3/src/sds.h
typedef char *sds;
/* Note: sdshdr5 is never used, we just access the flags byte directly.
* However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
uint8_t len; /* used */
uint8_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
uint16_t len; /* used */
uint16_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
uint32_t len; /* used */
uint32_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
uint64_t len; /* used */
uint64_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
上面的 sdshdr* 可以抽象成下面的伪代码:
struct sdshdr<T> {
T len;
T alloc;
unsigned char flags;
char buf[];
}
范型 T 来表示 len 和 alloc 的类型。 使用范型 T 可以根据字符串的长度使用不同类型来表示 len 和 alloc 成员,优化内存空间。
len、alloc 和 flags 成员是 SDS 的元数据,分别表示字符串长度、分配的字符串长度、标记位。
buf 成员是实际的字符串内容,buf[] 是一个柔性数组,不占内存。
sdshdr 结构体指定了 __attribute__ ((__packed__)) ,GCC 编译器将会按照紧凑排列的方式使用内存,也就是说用 packed 修饰之后,结构体将按 1 字节对齐,作为一个优化内存占用。
创建 sds
// 5.0.3/src/sds.c
/* Create a new sds string with the content specified by the 'init' pointer
* and 'initlen'.
* If NULL is used for 'init' the string is initialized with zero bytes.
* If SDS_NOINIT is used, the buffer is left uninitialized;
*
* The string is always null-termined (all the sds strings are, always) so
* even if you create an sds string with:
*
* mystring = sdsnewlen("abc",3);
*
* You can print the string with printf() as there is an implicit \0 at the
* end of the string. However the string is binary safe and can contain
* \0 characters in the middle, as the length is stored in the sds header. */
sds sdsnewlen(const void *init, size_t initlen) {
void *sh;
sds s;
char type = sdsReqType(initlen);
/* Empty strings are usually created in order to append. Use type 8
* since type 5 is not good at this. */
if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
int hdrlen = sdsHdrSize(type);
unsigned char *fp; /* flags pointer. */
sh = s_malloc(hdrlen+initlen+1);
if (init==SDS_NOINIT)
init = NULL;
else if (!init)
memset(sh, 0, hdrlen+initlen+1);
if (sh == NULL) return NULL;
s = (char*)sh+hdrlen;
fp = ((unsigned char*)s)-1;
switch(type) {
case SDS_TYPE_5: {
*fp = type | (initlen << SDS_TYPE_BITS);
break;
}
case SDS_TYPE_8: {
SDS_HDR_VAR(8,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
case SDS_TYPE_16: {
SDS_HDR_VAR(16,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
case SDS_TYPE_32: {
SDS_HDR_VAR(32,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
case SDS_TYPE_64: {
SDS_HDR_VAR(64,s);
sh->len = initlen;
sh->alloc = initlen;
*fp = type;
break;
}
}
if (initlen && init)
memcpy(s, init, initlen);
s[initlen] = '\0';
return s;
}
// 5.0.3/src/sds.h
#define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = (void*)((s)-(sizeof(struct sdshdr##T)));
- sdsReqType 根据字符串长度 initlen 选择 sds 类型
- sdsHdrSize 计算类型所需要的结构体头部长度 hdrlen
sdsnewlen 函数创建一个长度为 hdrlen + initlen + 1 的 SDS,返回的字符串指针 sds 实际上指向 SDS 的 buf 成员,其中 sds 字符串长度为 initlen + 1,最后一个字节保存了空字符 '\0'。

SDS 遵循 C 字符串以空字符结尾的惯例,保存空字符的 1 字节空间不计算在 SDS 的 len 属性里面,并且为空字符分配额外的 1 字节空间,以及添加空字符到字符串末尾等操作,都是由 SDS 函数自动完成的,所以这个空字符对于 SDS 的使用者来说是完全透明的。遵循空字符结尾这一惯例的好处是,SDS 可以直接重用一部分 C 字符串函数库里面的函数。 摘自:《Redis设计与实现》 — 黄健宏
根据字符串的长度计算需要占用的 SDS 元数据内存大小 hdrlen。
// 5.0.3/src/sds.c
static inline char sdsReqType(size_t string_size) {
if (string_size < 1<<5)
return SDS_TYPE_5;
if (string_size < 1<<8)
return SDS_TYPE_8;
if (string_size < 1<<16)
return SDS_TYPE_16;
#if (LONG_MAX == LLONG_MAX)
if (string_size < 1ll<<32)
return SDS_TYPE_32;
return SDS_TYPE_64;
#else
return SDS_TYPE_32;
#endif
}
static inline int sdsHdrSize(char type) {
switch(type&SDS_TYPE_MASK) {
case SDS_TYPE_5:
return sizeof(struct sdshdr5);
case SDS_TYPE_8:
return sizeof(struct sdshdr8);
case SDS_TYPE_16:
return sizeof(struct sdshdr16);
case SDS_TYPE_32:
return sizeof(struct sdshdr32);
case SDS_TYPE_64:
return sizeof(struct sdshdr64);
}
return 0;
}
由于 buf[] 不占内存,所以 sdshdr* 结构体大小就是 SDS 元数据的内存大小。
拼接 sds
// 5.0.3/src/sds.c
/* Append the specified sds 't' to the existing sds 's'.
*
* After the call, the modified sds string is no longer valid and all the
* references must be substituted with the new pointer returned by the call. */
sds sdscatsds(sds s, const sds t) {
return sdscatlen(s, t, sdslen(t));
}
/* Destructively modify the sds string 's' to hold the specified binary
* safe string pointed by 't' of length 'len' bytes. */
sds sdscpylen(sds s, const char *t, size_t len) {
if (sdsalloc(s) < len) {
s = sdsMakeRoomFor(s,len-sdslen(s));
if (s == NULL) return NULL;
}
memcpy(s, t, len);
s[len] = '\0';
sdssetlen(s, len);
return s;
}
sdscpylen 调用 sdsMakeRoomFor 选择性对 sds 扩容,然后通过 memcpy 拼接字符串。
// 5.0.3/src/sds.c
/* Enlarge the free space at the end of the sds string so that the caller
* is sure that after calling this function can overwrite up to addlen
* bytes after the end of the string, plus one more byte for nul term.
*
* Note: this does not change the *length* of the sds string as returned
* by sdslen(), but only the free buffer space we have. */
sds sdsMakeRoomFor(sds s, size_t addlen) {
void *sh, *newsh;
size_t avail = sdsavail(s);
size_t len, newlen;
char type, oldtype = s[-1] & SDS_TYPE_MASK;
int hdrlen;
/* Return ASAP if there is enough space left. */
if (avail >= addlen) return s;
len = sdslen(s);
sh = (char*)s-sdsHdrSize(oldtype);
newlen = (len+addlen);
if (newlen < SDS_MAX_PREALLOC)
newlen *= 2;
else
newlen += SDS_MAX_PREALLOC;
type = sdsReqType(newlen);
/* Don't use type 5: the user is appending to the string and type 5 is
* not able to remember empty space, so sdsMakeRoomFor() must be called
* at every appending operation. */
if (type == SDS_TYPE_5) type = SDS_TYPE_8;
hdrlen = sdsHdrSize(type);
if (oldtype==type) {
newsh = s_realloc(sh, hdrlen+newlen+1);
if (newsh == NULL) return NULL;
s = (char*)newsh+hdrlen;
} else {
/* Since the header size changes, need to move the string forward,
* and can't use realloc */
newsh = s_malloc(hdrlen+newlen+1);
if (newsh == NULL) return NULL;
memcpy((char*)newsh+hdrlen, s, len+1);
s_free(sh);
s = (char*)newsh+hdrlen;
s[-1] = type;
sdssetlen(s, len);
}
sdssetalloc(s, newlen);
return s;
}
- sdsavail 计算 sds 空闲长度,如果空闲长度大于新增长度 addlen,就不需要扩容;
- 计算新 sds 长度 newlen 以及 sds 类型;
- 如果 sds 类型没有改变,通过 s_realloc 来扩容 buf,否则重新 s_malloc 申请内存并移动原字符串内容,这是因为头部的大小被改变了。
字符串对象
前一篇介绍过,Redis 的字符串对象有三种编码使用不同底层数据结构来实现:
OBJ_ENCODING_INT、OBJ_ENCODING_EMBSTR和OBJ_ENCODING_RAW。
int 编码
如果存储的是可以用 long 类型来表示整数值,在 Redis 中会用 int 编码来表示字符串对象。
127.0.0.1:6379> SET a 1
OK
127.0.0.1:6379> TYPE a
string
127.0.0.1:6379> OBJECT ENCODING a
"int"
// 5.0.3/src/object.c
/* Create a string object from a long long value. When possible returns a
* shared integer object, or at least an integer encoded one.
*
* If valueobj is non zero, the function avoids returning a a shared
* integer, because the object is going to be used as value in the Redis key
* space (for instance when the INCR command is used), so we want LFU/LRU
* values specific for each key. */
robj *createStringObjectFromLongLongWithOptions(long long value, int valueobj) {
robj *o;
...
if (value >= 0 && value < OBJ_SHARED_INTEGERS && valueobj == 0) {
incrRefCount(shared.integers[value]);
o = shared.integers[value];
} else {
if (value >= LONG_MIN && value <= LONG_MAX) {
o = createObject(OBJ_STRING, NULL);
o->encoding = OBJ_ENCODING_INT;
o->ptr = (void*)((long)value);
} else {
o = createObject(OBJ_STRING,sdsfromlonglong(value));
}
}
return o;
}
// 5.0.3/src/server.h
#define OBJ_SHARED_INTEGERS 10000
Redis 中共享 [0, OBJ_SHARED_INTEGERS) 整数之间的 int 编码字符串对象。 数值大小范围不在共享的对象内,如果这个整数值在 long 大小以内,robj.ptr 直接指向这个整数且设置为 int 编码,

大于 long 大小的数值是用 sds 表示的 raw 编码字符串对象。
// 5.0.3/src/sds.c
#define SDS_LLSTR_SIZE 21
/* Create an sds string from a long long value. It is much faster than:
*
* sdscatprintf(sdsempty(),"%lld\n", value);
*/
sds sdsfromlonglong(long long value) {
char buf[SDS_LLSTR_SIZE];
int len = sdsll2str(buf,value);
return sdsnewlen(buf,len);
}
raw 编码
// 5.0.3/src/object.c
/* Create a string object with encoding OBJ_ENCODING_RAW, that is a plain
* string object where o->ptr points to a proper sds string. */
robj *createRawStringObject(const char *ptr, size_t len) {
return createObject(OBJ_STRING, sdsnewlen(ptr,len));
}
raw 编码使用 SDS 来保存字符串,robj.ptr 指向 sds 字符串。

embstr 编码
// 5.0.3/src/object.c
/* Create a string object with encoding OBJ_ENCODING_EMBSTR, that is
* an object where the sds string is actually an unmodifiable string
* allocated in the same chunk as the object itself. */
robj *createEmbeddedStringObject(const char *ptr, size_t len) {
robj *o = zmalloc(sizeof(robj)+sizeof(struct sdshdr8)+len+1);
struct sdshdr8 *sh = (void*)(o+1);
o->type = OBJ_STRING;
o->encoding = OBJ_ENCODING_EMBSTR;
o->ptr = sh+1;
o->refcount = 1;
...
sh->len = len;
sh->alloc = len;
sh->flags = SDS_TYPE_8;
if (ptr == SDS_NOINIT)
sh->buf[len] = '\0';
else if (ptr) {
memcpy(sh->buf,ptr,len);
sh->buf[len] = '\0';
} else {
memset(sh->buf,0,len+1);
}
return o;
}
embstr 编码会一次内存分配一块连续的空间用来表示 redisObject 和 SDS 结构。

那什么时候使用 embstr 编码呢?(´・_・`)
// 5.0.3/src/object.c
/* Create a string object with EMBSTR encoding if it is smaller than
* OBJ_ENCODING_EMBSTR_SIZE_LIMIT, otherwise the RAW encoding is
* used.
*
* The current limit of 44 is chosen so that the biggest string object
* we allocate as EMBSTR will still fit into the 64 byte arena of jemalloc. */
#define OBJ_ENCODING_EMBSTR_SIZE_LIMIT 44
robj *createStringObject(const char *ptr, size_t len) {
if (len <= OBJ_ENCODING_EMBSTR_SIZE_LIMIT)
return createEmbeddedStringObject(ptr,len);
else
return createRawStringObject(ptr,len);
}
字符串长度小于 44 字节就会用 embstr 编码来存储字符串对象,这是因为内存分配器分配内存的大小单位都是 2 的指数字节,embstr 编码的字符串对象最小的空间占用为 19 字节,所以最少分配的内存是 32 字节的空间,为了存储稍大一点的 embstr 编码字符串对象,分配器分配了 64 字节空间,那么要求字符串长度不超过 44 字节(字符串以 NULL 结尾占用一个字节)。