[haiku-commits] BRANCH pdziepak-github.memcpy-v2 [bb159c7] src/system/libroot/posix/string/arch/x86_64

  • From: pdziepak-github.memcpy-v2 <community@xxxxxxxxxxxx>
  • To: haiku-commits@xxxxxxxxxxxxx
  • Date: Sun, 7 Sep 2014 22:31:33 +0200 (CEST)

added 1 changeset to branch 'refs/remotes/pdziepak-github/memcpy-v2'
old head: 4f79f4ff256e5e4222647dd6cafaff7bc95b4e2e
new head: bb159c73fb99410fb591ee25b1fe0b92cd5870a5
overview: https://github.com/pdziepak/Haiku/compare/4f79f4f...bb159c7

----------------------------------------------------------------------------

bb159c7: libroot/x86_64: new memset implementation
  
  This patch introduces a new memset() implementation that improves
  performance when the buffer is small. It was written for processors that
  support ERMSB, but it performs reasonably well on older CPUs too.
  
  The following benchmarks were done on a Haswell i7 running Debian Jessie
  with Linux 3.16.1. In each iteration a 64MB buffer was memset()ed; the
  parameter "size" is the size of the buffer passed in a single call (i.e.
  for "size: 2" memset() was called ~32 million times to memset the whole
  64MB).
  
  f - original implementation, g - new implementation, all buffers 16 byte
  aligned
  
  set, size:        8, f:    66885 µs, g:    17768 µs, ∆:   73.44%
  set, size:       32, f:    17123 µs, g:     9163 µs, ∆:   46.49%
  set, size:      128, f:     6677 µs, g:     6919 µs, ∆:   -3.62%
  set, size:      512, f:    11656 µs, g:     7715 µs, ∆:   33.81%
  set, size:     1024, f:     9156 µs, g:     7359 µs, ∆:   19.63%
  set, size:     4096, f:     4936 µs, g:     5159 µs, ∆:   -4.52%
  
  f - glibc 2.19 implementation, g - new implementation, all buffers 16 byte
  aligned
  
  set, size:        8, f:    19631 µs, g:    17828 µs, ∆:    9.18%
  set, size:       32, f:     8545 µs, g:     9047 µs, ∆:   -5.87%
  set, size:      128, f:     8304 µs, g:     6874 µs, ∆:   17.22%
  set, size:      512, f:     7373 µs, g:     7486 µs, ∆:   -1.53%
  set, size:     1024, f:     9007 µs, g:     7344 µs, ∆:   18.46%
  set, size:     4096, f:     8169 µs, g:     5146 µs, ∆:   37.01%
  
  Apparently, glibc uses SSE even for large buffers and therefore does not
  take advantage of ERMSB:
  
  set, size:    16384, f:     7007 µs, g:     3223 µs, ∆:   54.00%
  set, size:    32768, f:     6979 µs, g:     2930 µs, ∆:   58.02%
  set, size:    65536, f:     6907 µs, g:     2826 µs, ∆:   59.08%
  set, size:   131072, f:     6919 µs, g:     2752 µs, ∆:   60.23%
  
  The new implementation handles unaligned buffers quite well:
  
  f - glibc 2.19 implementation, g - new implementation, all buffers unaligned
  
  set, size:       16, f:    10045 µs, g:    10498 µs, ∆:   -4.51%
  set, size:       32, f:     8590 µs, g:     9358 µs, ∆:   -8.94%
  set, size:       64, f:     8618 µs, g:     8585 µs, ∆:    0.38%
  set, size:      128, f:     8393 µs, g:     6893 µs, ∆:   17.87%
  set, size:      256, f:     8042 µs, g:     7621 µs, ∆:    5.24%
  set, size:      512, f:     9661 µs, g:     7738 µs, ∆:   19.90%
  
  Signed-off-by: Paweł Dziepak <pdziepak@xxxxxxxxxxx>

                                    [ Paweł Dziepak <pdziepak@xxxxxxxxxxx> ]

----------------------------------------------------------------------------

Commit:      bb159c73fb99410fb591ee25b1fe0b92cd5870a5
Author:      Paweł Dziepak <pdziepak@xxxxxxxxxxx>
Date:        Sun Sep  7 19:43:28 2014 UTC

----------------------------------------------------------------------------

1 file changed, 70 insertions(+), 4 deletions(-)
.../posix/string/arch/x86_64/arch_string.cpp     | 74 ++++++++++++++++++--

----------------------------------------------------------------------------

diff --git a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp
index b83376c..33fca22 100644
--- a/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp
+++ b/src/system/libroot/posix/string/arch/x86_64/arch_string.cpp
@@ -5,6 +5,9 @@
 
 
 #include <cstddef>
+#include <cstdint>
+
+#include <x86intrin.h>
 
 
 extern "C" void*
@@ -18,14 +21,77 @@ memcpy(void* destination, const void* source, size_t length)
 }
 
 
-extern "C" void*
-memset(void* destination, int value, size_t length)
+static inline void
+memset_repstos(uint8_t* destination, uint8_t value, size_t length)
 {
-       auto returnValue = destination;
        __asm__ __volatile__("rep stosb"
                : "+D" (destination), "+c" (length)
                : "a" (value)
                : "memory");
-       return returnValue;
+}
+
+
+static inline void
+memset_sse(uint8_t* destination, uint8_t value, size_t length)
+{
+       __m128i packed = _mm_set1_epi8(value);
+       auto end = reinterpret_cast<__m128i*>(destination + length - 16);
+       auto diff = reinterpret_cast<uintptr_t>(destination) % 16;
+       if (diff) {
+               diff = 16 - diff;
+               length -= diff;
+               _mm_storeu_si128(reinterpret_cast<__m128i*>(destination), packed);
+       }
+       auto ptr = reinterpret_cast<__m128i*>(destination + diff);
+       while (length >= 64) {
+               _mm_store_si128(ptr++, packed);
+               _mm_store_si128(ptr++, packed);
+               _mm_store_si128(ptr++, packed);
+               _mm_store_si128(ptr++, packed);
+               length -= 64;
+       }
+       while (length >= 16) {
+               _mm_store_si128(ptr++, packed);
+               length -= 16;
+       }
+       _mm_storeu_si128(end, packed);
+}
+
+
+static inline void
+memset_small(uint8_t* destination, uint8_t value, size_t length)
+{
+       if (length >= 8) {
+               auto packed = value * 0x101010101010101ul;
+               auto ptr = reinterpret_cast<uint64_t*>(destination);
+               auto end = reinterpret_cast<uint64_t*>(destination + length - 8);
+               while (length >= 8) {
+                       *ptr++ = packed;
+                       length -= 8;
+               }
+               *end = packed;
+       } else {
+               while (length--) {
+                       *destination++ = value;
+               }
+       }
+}
+
+
+extern "C" void*
+memset(void* ptr, int chr, size_t length)
+{
+       auto value = static_cast<unsigned char>(chr);
+       auto destination = static_cast<uint8_t*>(ptr);
+       if (length < 32) {
+               memset_small(destination, value, length);
+               return ptr;
+       }
+       if (length < 2048) {
+               memset_sse(destination, value, length);
+               return ptr;
+       }
+       memset_repstos(destination, value, length);
+       return ptr;
 }
 


Other related posts:

  • » [haiku-commits] BRANCH pdziepak-github.memcpy-v2 [bb159c7] src/system/libroot/posix/string/arch/x86_64 - pdziepak-github . memcpy-v2