memset icon indicating copy to clipboard operation
memset copied to clipboard

Some example implementations of memset

/* This file contains a variety of implementations of memset. If you don't know

  • what memset is man memset may enlighten you. Its definition is
  • void* memset(void* b, int c, size_t len);
  • I wrote this code partly as a gentle introduction to memset and partly as a
  • reminder to myself of how memset works. This code is in the public domain and
  • you are free to use it for any purpose (commercial or non-commercial) with or
  • without attribution to me. I don't guarantee the correctness of any of the
  • code herein and if you spot a mistake please let me know and I'll correct it.
  • I wrote this code during my free time, but at a period during which I was
  • employed by National ICT Australia and it was heavily related to my work. If
  • push comes to shove it may be considered their intellectual property, but as
  • it is not functional freestanding code and (better) implementations of memset
  • are widely available, I hope they won't mind :)
  • Matthew Fernandez, 2011 */

#include <string.h> #include <stdio.h> #include <stdint.h> #include <assert.h>

/* Just for convenience let's setup a type for bytes. */ typedef unsigned char byte;

/* Let's start off with a fairly naive implementation of memset. This sets

  • memory byte-by-byte. While not being particularly efficient and being

  • slightly braindead, it does have the advantage of being readily

  • understandable and reasonably straightforward to implement without making

  • mistakes. / void bytewise_memset(void* s, int c, size_t sz) { byte* p = (byte*)s;

    /* c should only be a byte's worth of information anyway, but let's mask out

    • everything else just in case. */ byte x = c & 0xff;

    while (sz--) *p++ = x; return s; }

/* A smarter way of doing memset is word-by-word. Your architecture will have a

  • native word size (32 bits on x86) and reads/writes at this granularity will

  • be more efficient than those at finer granularities. Let's take a look at

  • memset for a 32-bit architecture. / void wordwise_32_memset(void* s, int c, size_t sz) { uint32_t* p = (uint32_t*)s;

    /* In this case the masking is actually important. */ uint32_t x = c & 0xff;

    /* Construct a word's worth of the value we're supposed to be setting. */ x |= x << 8; x |= x << 16;

    /* This technique (without a prologue and epilogue) will only cope with

    • sizes that are word-aligned. For example, you cannot use this function to
    • set a region 7 bytes long. Let's do some checks to make sure the size
    • passed is actually something we can cope with. It is worth noting that in
    • practice you would actually want to check the alignment of the start
    • pointer as well. Doing a word-wise memset on an unaligned pointer gains
    • you nothing and may even hurt performance. / assert(!(sz & 3)); / Check sz is 4-byte-aligned. / sz >>= 2; / Divide by number of bytes in a word. */

    while (sz--) *p++ = x; return s; }

/* Now let's do an architecture-independent word-by-word memset. You would never

  • want to implement this in practice, as you are relying on the compiler to

  • optimise the statically determinable loops that could be manually written

  • much more efficiently if you know the target architecture, but this is a

  • useful thought exercise. Note that GCC defines the handy constant __WORDSIZE

  • that tells us the size (in bits) of words on this architecture. / void wordwise_memset(void* s, int c, size_t sz) { uintptr_t* p = (uintptr_t*)s; uintptr_t x = c & 0xff; int i; int bytes_per_word;

    /* Construct a word's worth of the byte value we need to set. / for (i = 3; (1<<i) < __WORDSIZE; ++i) x |= x << (1<<i); / At this point i is the power of 2 which is equal to the word size. */

    /* 1<<i would be the number of bits per word, therefore (1<<i)/8 == 1<<(i-3)

    • is the number of bytes per word. */ bytes_per_word = 1<<(i-3);

    /* Check sz is bytes_per_word-byte-aligned. */ assert(!(sz & (bytes_per_word-1))); sz >>= i-3;

    while (sz--) *p++ = x; return s; }

/* Let's return to the 32-bit word-wise version briefly to implement a version

  • that handles unaligned (with respect to word size) pointers and size. To
  • understand what's going on here it's best to look at an example:
  • Calling wordwise_32_unaligned_memset(2, 0, 7)...
  • Initial: +-+-+-+-+-+-+-+
  •                  |2|3|4|5|6|7|8|  pp = 2, p = ?
    
  •                  +-+-+-+-+-+-+-+  sz = 7, tail = ?
    
  •                  |?|?|?|?|?|?|?|
    
  •                  +-+-+-+-+-+-+-+
    
  • After the prologue: +-+-+-+-+-+-+-+ Now the pointer (pp/p) is aligned.
  •                  |2|3|4|5|6|7|8|  pp = 4, p = 4
    
  •                  +-+-+-+-+-+-+-+  sz = 5, tail = 1
    
  •                  |0|0|?|?|?|?|?| (then we adjust sz to 5>>2 == 1)
    
  •                  +-+-+-+-+-+-+-+
    
  • After the main loop: +-+-+-+-+-+-+-+ We can't set any more word length
  •                  |2|3|4|5|6|7|8| regions, but there's still one byte
    
  •                  +-+-+-+-+-+-+-+ remaining.
    
  •                  |0|0|0|0|0|0|?|  pp = 8, p = 8
    
  •                  +-+-+-+-+-+-+-+  sz = 0, tail = 1
    
  • After the epilogue: +-+-+-+-+-+-+-+ Now we're done.
  •                  |2|3|4|5|6|7|8|  pp = 9, p = 8
    
  •                  +-+-+-+-+-+-+-+  sz = 0, tail = 0
    
  •                  |0|0|0|0|0|0|0|
    
  •                  +-+-+-+-+-+-+-+
    

/ void wordwise_32_unaligned_memset(void* s, int c, size_t sz) { uint32_t* p; uint32_t x = c & 0xff; byte xx = c & 0xff; byte* pp = (byte*)s; size_t tail;

/* Let's introduce a prologue to bump the starting location forward to the
 * next alignment boundary.
 */
while (((unsigned int)pp & 3) && sz--)
    *pp++ = xx;
p = (uint32_t*)pp;

/* Let's figure out the number of bytes that will be trailing when the
 * word-wise loop taps out.
 */
tail = sz & 3;

/* The middle of this function is identical to the wordwise_32_memset minus
 * the alignment checks.
 */
x |= x << 8;
x |= x << 16;

sz >>= 2;

while (sz--)
    *p++ = x;

/* Now we introduce an epilogue to account for the trailing bytes. */
pp = (byte*)p;
while (tail--)
    *pp++ = xx;

return s;

}

/* Let's take the final logical step and implement an architecture-independent

  • memset that can cope with unaligned pointers and sizes. Note, the same

  • caveat as for wordwise_memset applies; you wouldn't write code like this in

  • real life. / void wordwise_unaligned_memset(void* s, int c, size_t sz) { uintptr_t* p; uintptr_t x = c & 0xff; byte* pp = (byte*)s; byte xx = c & 0xff; size_t tail; int i; int bytes_per_word;

    for (i = 3; (1<<i) < __WORDSIZE; ++i) x |= x << (1<<i); bytes_per_word = 1<<(i-3);

    /* Prologue. */ while (((unsigned int)pp & (bytes_per_word-1)) && sz--) pp++ = xx; tail = sz & (bytes_per_word-1); p = (uintptr_t)pp;

    /* Main loop. */ sz >>= i-3; while (sz--) *p++ = x;

    /* Epilogue. / pp = (byte)p; while (tail--) *pp++ = xx;

    return s; }

/* Finally, just for fun, let's look at memset using Duff's Device

  • (http://www.lysator.liu.se/c/duffs-device.html). The original Duff's Device

  • was not intended to target memset, but was aimed at solving the problem of

  • copying data from an array into a memory mapped hardware register. As a

  • result, it's not particularly suited to memset. Of course there's a big

  • "but" with this. As with all the algorithms in this file, the relative

  • performance is highly architecture- and compiler-dependent. If you want to

  • know the fastest algorithm for a given scenario you really have no choice

  • but to look at the generated assembly code. / void duffs_device_memset(void* s, int c, size_t sz) { byte* p = (byte*)s; byte x = c & 0xff; unsigned int leftover = sz & 0x7;

    /* Catch the pathological case of 0. */ if (!sz) return s;

    /* To understand what's going on here, take a look at the original

    • bytewise_memset and consider unrolling the loop. For this situation
    • we'll unroll the loop 8 times (assuming a 32-bit architecture). Choosing
    • the level to which to unroll the loop can be a fine art... */ sz = (sz + 7) >> 3; switch (leftover) { case 0: do { *p++ = x; case 7: *p++ = x; case 6: *p++ = x; case 5: *p++ = x; case 4: *p++ = x; case 3: *p++ = x; case 2: *p++ = x; case 1: *p++ = x; } while (--sz > 0); } return s; }

/* Lines below here are instrumentation for testing your implementation. */

#define CHECK(f, unaligned)
do {
int fail_byte = check_memset((f), (unaligned));
if (fail_byte)
printf("%s %s check failed on byte %d.\n",
(unaligned) ? "Unaligned" : "Aligned", #f, fail_byte - 1);
} while(0)

#define BUFFER_LEN 4096

/* This function does some very basic checking of your memset function. If you

  • really want to comprehensively test your function you should check the

  • regions either side of the memory you are setting to make sure your function

  • is actually obeying the limits passed to it.

  • Note, the argument unaligned determines whether we pass an unaligned pointer

  • and size or not. Only certain of the above implementations will pass this

  • test. / int check_memset(void f(), int unaligned) { char buffer[BUFFER_LEN]; int i; char set;

    for (set = 0; (unsigned int)set <= 0xff; ++set) { f(buffer + unaligned, set, BUFFER_LEN - 2unaligned); for (i = unaligned; i < BUFFER_LEN - 2unaligned; ++i) if (buffer[i] != set) return i + 1; }

    return 0; }

/* When executed, this program will just validate the implementations in this

  • file. Note that the unaligned tests are only run on the functions that can

  • cope with unaligned values. / int main(int argc, char* argv) {

    /* Use GCC's built-in memset to validate our checking function. */ CHECK(memset, 0); CHECK(memset, 1);

    /* Check our implementations. */ CHECK(bytewise_memset, 0); CHECK(bytewise_memset, 1); CHECK(wordwise_32_memset, 0); CHECK(wordwise_memset, 0); CHECK(wordwise_32_unaligned_memset, 0); CHECK(wordwise_32_unaligned_memset, 1); CHECK(wordwise_unaligned_memset, 0); CHECK(wordwise_unaligned_memset, 1); CHECK(duffs_device_memset, 0); CHECK(duffs_device_memset, 1);

    return 0; }