memset
memset copied to clipboard
Some example implementations of memset
/* This file contains a variety of implementations of memset. If you don't know
- what memset is
man memset
may enlighten you. Its definition is - void* memset(void* b, int c, size_t len);
- I wrote this code partly as a gentle introduction to memset and partly as a
- reminder to myself of how memset works. This code is in the public domain and
- you are free to use it for any purpose (commercial or non-commercial) with or
- without attribution to me. I don't guarantee the correctness of any of the
- code herein and if you spot a mistake please let me know and I'll correct it.
- I wrote this code during my free time, but at a period during which I was
- employed by National ICT Australia and it was heavily related to my work. If
- push comes to shove it may be considered their intellectual property, but as
- it is not functional freestanding code and (better) implementations of memset
- are widely available, I hope they won't mind :)
- Matthew Fernandez, 2011 */
#include <string.h> #include <stdio.h> #include <stdint.h> #include <assert.h>
/* Just for convenience let's setup a type for bytes. */ typedef unsigned char byte;
/* Let's start off with a fairly naive implementation of memset. This sets
-
memory byte-by-byte. While not being particularly efficient and being
-
slightly braindead, it does have the advantage of being readily
-
understandable and reasonably straightforward to implement without making
-
mistakes. / void bytewise_memset(void* s, int c, size_t sz) { byte* p = (byte*)s;
/* c should only be a byte's worth of information anyway, but let's mask out
- everything else just in case. */ byte x = c & 0xff;
while (sz--) *p++ = x; return s; }
/* A smarter way of doing memset is word-by-word. Your architecture will have a
-
native word size (32 bits on x86) and reads/writes at this granularity will
-
be more efficient than those at finer granularities. Let's take a look at
-
memset for a 32-bit architecture. / void wordwise_32_memset(void* s, int c, size_t sz) { uint32_t* p = (uint32_t*)s;
/* In this case the masking is actually important. */ uint32_t x = c & 0xff;
/* Construct a word's worth of the value we're supposed to be setting. */ x |= x << 8; x |= x << 16;
/* This technique (without a prologue and epilogue) will only cope with
- sizes that are word-aligned. For example, you cannot use this function to
- set a region 7 bytes long. Let's do some checks to make sure the size
- passed is actually something we can cope with. It is worth noting that in
- practice you would actually want to check the alignment of the start
- pointer as well. Doing a word-wise memset on an unaligned pointer gains
- you nothing and may even hurt performance. / assert(!(sz & 3)); / Check sz is 4-byte-aligned. / sz >>= 2; / Divide by number of bytes in a word. */
while (sz--) *p++ = x; return s; }
/* Now let's do an architecture-independent word-by-word memset. You would never
-
want to implement this in practice, as you are relying on the compiler to
-
optimise the statically determinable loops that could be manually written
-
much more efficiently if you know the target architecture, but this is a
-
useful thought exercise. Note that GCC defines the handy constant __WORDSIZE
-
that tells us the size (in bits) of words on this architecture. / void wordwise_memset(void* s, int c, size_t sz) { uintptr_t* p = (uintptr_t*)s; uintptr_t x = c & 0xff; int i; int bytes_per_word;
/* Construct a word's worth of the byte value we need to set. / for (i = 3; (1<<i) < __WORDSIZE; ++i) x |= x << (1<<i); / At this point i is the power of 2 which is equal to the word size. */
/* 1<<i would be the number of bits per word, therefore (1<<i)/8 == 1<<(i-3)
- is the number of bytes per word. */ bytes_per_word = 1<<(i-3);
/* Check sz is bytes_per_word-byte-aligned. */ assert(!(sz & (bytes_per_word-1))); sz >>= i-3;
while (sz--) *p++ = x; return s; }
/* Let's return to the 32-bit word-wise version briefly to implement a version
- that handles unaligned (with respect to word size) pointers and size. To
- understand what's going on here it's best to look at an example:
- Calling wordwise_32_unaligned_memset(2, 0, 7)...
- Initial: +-+-+-+-+-+-+-+
-
|2|3|4|5|6|7|8| pp = 2, p = ?
-
+-+-+-+-+-+-+-+ sz = 7, tail = ?
-
|?|?|?|?|?|?|?|
-
+-+-+-+-+-+-+-+
- After the prologue: +-+-+-+-+-+-+-+ Now the pointer (pp/p) is aligned.
-
|2|3|4|5|6|7|8| pp = 4, p = 4
-
+-+-+-+-+-+-+-+ sz = 5, tail = 1
-
|0|0|?|?|?|?|?| (then we adjust sz to 5>>2 == 1)
-
+-+-+-+-+-+-+-+
- After the main loop: +-+-+-+-+-+-+-+ We can't set any more word length
-
|2|3|4|5|6|7|8| regions, but there's still one byte
-
+-+-+-+-+-+-+-+ remaining.
-
|0|0|0|0|0|0|?| pp = 8, p = 8
-
+-+-+-+-+-+-+-+ sz = 0, tail = 1
- After the epilogue: +-+-+-+-+-+-+-+ Now we're done.
-
|2|3|4|5|6|7|8| pp = 9, p = 8
-
+-+-+-+-+-+-+-+ sz = 0, tail = 0
-
|0|0|0|0|0|0|0|
-
+-+-+-+-+-+-+-+
/ void wordwise_32_unaligned_memset(void* s, int c, size_t sz) { uint32_t* p; uint32_t x = c & 0xff; byte xx = c & 0xff; byte* pp = (byte*)s; size_t tail;
/* Let's introduce a prologue to bump the starting location forward to the
* next alignment boundary.
*/
while (((unsigned int)pp & 3) && sz--)
*pp++ = xx;
p = (uint32_t*)pp;
/* Let's figure out the number of bytes that will be trailing when the
* word-wise loop taps out.
*/
tail = sz & 3;
/* The middle of this function is identical to the wordwise_32_memset minus
* the alignment checks.
*/
x |= x << 8;
x |= x << 16;
sz >>= 2;
while (sz--)
*p++ = x;
/* Now we introduce an epilogue to account for the trailing bytes. */
pp = (byte*)p;
while (tail--)
*pp++ = xx;
return s;
}
/* Let's take the final logical step and implement an architecture-independent
-
memset that can cope with unaligned pointers and sizes. Note, the same
-
caveat as for wordwise_memset applies; you wouldn't write code like this in
-
real life. / void wordwise_unaligned_memset(void* s, int c, size_t sz) { uintptr_t* p; uintptr_t x = c & 0xff; byte* pp = (byte*)s; byte xx = c & 0xff; size_t tail; int i; int bytes_per_word;
for (i = 3; (1<<i) < __WORDSIZE; ++i) x |= x << (1<<i); bytes_per_word = 1<<(i-3);
/* Prologue. */ while (((unsigned int)pp & (bytes_per_word-1)) && sz--) pp++ = xx; tail = sz & (bytes_per_word-1); p = (uintptr_t)pp;
/* Main loop. */ sz >>= i-3; while (sz--) *p++ = x;
/* Epilogue. / pp = (byte)p; while (tail--) *pp++ = xx;
return s; }
/* Finally, just for fun, let's look at memset using Duff's Device
-
(http://www.lysator.liu.se/c/duffs-device.html). The original Duff's Device
-
was not intended to target memset, but was aimed at solving the problem of
-
copying data from an array into a memory mapped hardware register. As a
-
result, it's not particularly suited to memset. Of course there's a big
-
"but" with this. As with all the algorithms in this file, the relative
-
performance is highly architecture- and compiler-dependent. If you want to
-
know the fastest algorithm for a given scenario you really have no choice
-
but to look at the generated assembly code. / void duffs_device_memset(void* s, int c, size_t sz) { byte* p = (byte*)s; byte x = c & 0xff; unsigned int leftover = sz & 0x7;
/* Catch the pathological case of 0. */ if (!sz) return s;
/* To understand what's going on here, take a look at the original
- bytewise_memset and consider unrolling the loop. For this situation
- we'll unroll the loop 8 times (assuming a 32-bit architecture). Choosing
- the level to which to unroll the loop can be a fine art... */ sz = (sz + 7) >> 3; switch (leftover) { case 0: do { *p++ = x; case 7: *p++ = x; case 6: *p++ = x; case 5: *p++ = x; case 4: *p++ = x; case 3: *p++ = x; case 2: *p++ = x; case 1: *p++ = x; } while (--sz > 0); } return s; }
/* Lines below here are instrumentation for testing your implementation. */
#define CHECK(f, unaligned)
do {
int fail_byte = check_memset((f), (unaligned));
if (fail_byte)
printf("%s %s check failed on byte %d.\n",
(unaligned) ? "Unaligned" : "Aligned", #f, fail_byte - 1);
} while(0)
#define BUFFER_LEN 4096
/* This function does some very basic checking of your memset function. If you
-
really want to comprehensively test your function you should check the
-
regions either side of the memory you are setting to make sure your function
-
is actually obeying the limits passed to it.
-
Note, the argument unaligned determines whether we pass an unaligned pointer
-
and size or not. Only certain of the above implementations will pass this
-
test. / int check_memset(void f(), int unaligned) { char buffer[BUFFER_LEN]; int i; char set;
for (set = 0; (unsigned int)set <= 0xff; ++set) { f(buffer + unaligned, set, BUFFER_LEN - 2unaligned); for (i = unaligned; i < BUFFER_LEN - 2unaligned; ++i) if (buffer[i] != set) return i + 1; }
return 0; }
/* When executed, this program will just validate the implementations in this
-
file. Note that the unaligned tests are only run on the functions that can
-
cope with unaligned values. / int main(int argc, char* argv) {
/* Use GCC's built-in memset to validate our checking function. */ CHECK(memset, 0); CHECK(memset, 1);
/* Check our implementations. */ CHECK(bytewise_memset, 0); CHECK(bytewise_memset, 1); CHECK(wordwise_32_memset, 0); CHECK(wordwise_memset, 0); CHECK(wordwise_32_unaligned_memset, 0); CHECK(wordwise_32_unaligned_memset, 1); CHECK(wordwise_unaligned_memset, 0); CHECK(wordwise_unaligned_memset, 1); CHECK(duffs_device_memset, 0); CHECK(duffs_device_memset, 1);
return 0; }