skip to Main Content

uint8_t data[] = "mykeyxyz:1234nky:123n...";.
My lines of string has format key:value, where each line has len(key) <= 16 guaranteed. I want to load mykeyxyz into a __m128i, but fill out the higher position with 0.

The easiest way is to have an array of 255 or 0 masks, but that requires another memory load.
Is there anyway to do this faster?

Edit: preferably without AVX512

Edit 2: I need the variable len so I can start parsing the value part.

Edit 3: the function will be used in a loop (for example to parse 1 million lines of text). But strcmp_mask will basically always be inside L1 cache

Edit 4: I benchmark the functions by parsing 1 billion lines of (key,value) and process them. You can download the code/data and replicate the results in my repo: https://github.com/lehuyduc/1brc-simd . Also the discussion post will contain more info

The accepted answer gives ~2% faster total program time. To compare, test 1brc_valid13.cpp against 1brc_valid14.cpp (which uses the accepted answer). Hardware: AMD 2950X, Ubuntu 18.04, g++ 11.4, compile command: g++ -o main 1brc_final_valid.cpp -O3 -std=c++17 -march=native -m64 -lpthread

#include <iostream>
#include <immintrin.h>
#include <string>
#include <cstring>
using namespace std;

alignas(4096) const uint8_t strcmp_mask[32] = {
  255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};

int main()
{
  uint8_t data[] = "mykeyxyz:1234naaaaaaaaaaa";
  __m128i chars = _mm_loadu_si128((__m128i*)data);
  __m128i separators = _mm_set1_epi8(':');
  __m128i compared = _mm_cmpeq_epi8(chars, separators);
  uint32_t separator_mask = _mm_movemask_epi8(compared);
  uint32_t len = __builtin_ctz(separator_mask);
  
  cout << "len = " << len << "n";
  __m128i mask = _mm_loadu_si128((__m128i*)(strcmp_mask + 16 - len));
  __m128i key_chars = _mm_and_si128(chars, mask);
  
  uint8_t res[16];
  memcpy(res, (char*)&key_chars, 16);
  for (int i = 0; i < 16; i++) cout << int(res[i]) << " ";
  cout << "n";
}
// len = 8
// 109 121 107 101 121 120 121 122 0 0 0 0 0 0 0 0

3

Answers


  1. I’m not aware of an efficient, load-free way of doing it without AVX-512.
    See is there an inverse instruction to the movemask instruction in intel avx2? for various approaches.

    Using AVX-512, you can generate a mask using _mm_mask_broadcastb_epi8 for later use with _mm_and_si128.

    Even more simply, you can mask the input characters with _mm_maskz_mov_epi8 (1):

    __m128i mask_first_n(__m128i string, int n) {
        __mmask16 mask = (1u << n) - 1;
        return _mm_maskz_mov_epi8(mask, string);
    }
    
    mask_first_n(long long __vector(2), int):
            mov     eax, 1
            shlx    eax, eax, edi
            sub     eax, 1
            kmovw   k1, eax
            vmovdqu8        xmm0{k1}{z}, xmm0
            ret
    

    Using this, you can mask your string like this:

    int find(__m128i string, char c) {
        __m128i compareMask = _mm_cmpeq_epi8(string, _mm_set1_epi8(c));
        int mask = _mm_movemask_epi8(compareMask);
        // TODO: error handling if mask == 0, i.e. character not found
        return __builtin_ctz(mask);
    }
    
    int main() {
        unsigned char data[] = "mykeyxyz:1234naaaaaaaaaaa";
        __m128i chars = _mm_loadu_si128((__m128i*)data);
    
        int pos = find(chars, ':');
        __m128i result = mask_first_n(chars, pos);
    
        puts((char*) &result);
    }
    

    This code prints mykeyxyz.
    See live code at Compiler Explorer.


    (1) thanks to @PeterCordes for the suggestion

    Login or Signup to reply.
  2. The following code (requiring only SSE 4.1), masks out every byte following the first occurrence of char c in string:

    __m128i maskString(__m128i string, char c) {
        // find every occurrence of `c`:
        __m128i compareMask = _mm_cmpeq_epi8(string, _mm_set1_epi8(c));
        // always generate a `-1` for the lower half, 
        // and a `-1` for the upper half if the lower half is zero (i.e. if it would carry):
        __m128i minus_one = _mm_cmpeq_epi64(_mm_bslli_si128(compareMask, 8), _mm_setzero_si128());
        // `~compareMask & (compareMask-1)` as if computed as 128bit subtraction.
        // Subtraction makes `0xff` of every byte below the first occurrence of `c` in `string`, anding with `~compareMask` clears every remaining `0xff` and `0xfe`
        __m128i andMask = _mm_andnot_si128(compareMask, _mm_add_epi64(compareMask, minus_one));
    
        // apply mask to string and return:
        return _mm_and_si128(andMask, string);
    }
    

    Godbolt demo: https://godbolt.org/z/Tnsj1sf46

    N.B.: If you need the length of the key anyways, OP’s original code is probably fine (loading data from cache is usually cheap).

    Login or Signup to reply.
  3. I often find it interesting to see how others approach a problem, so here’s my version. It only requires SSE2, but benefits from BMI1 for the trailing zeros calculation.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <immintrin.h>
    
    // gcc maskafterc.c -o maskafterc.bin -O3 -Wall
    
    __m128i maskafterc(__m128i input, uint8_t c, uint8_t *pos) {
      // Finds first occurance of c in input and takes its position pos. 
      // Returns mask of 255s before pos, 0s on and after.
      // e.g. maskafterc([5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20], 9, uint8_t *pos)
      // sets pos = 4 and returns [255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0].
      __m128i index = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
      __m128i cmp = _mm_cmpeq_epi8(input, _mm_set1_epi8(c));
      uint16_t mmask = _mm_movemask_epi8(cmp);
      *pos = (mmask ? __builtin_ctz(mmask) : 16); // Return all -1s if c not found
      return _mm_cmplt_epi8(index, _mm_set1_epi8(*pos));
    }
    
    int main(int argc, char **argv)
    {
      unsigned char data[] = "mykeyxyz:98765:211234naaaaaaaaaaa";
      uint8_t pos;
      __m128i chars = _mm_loadu_si128((__m128i*)data);
      __m128i res =_mm_and_si128(maskafterc(chars, ':', &pos), chars);
      if (pos < 16) puts((char*) &res);
      printf("keylen = %in", pos);
    }
    //mykeyxyz
    //keylen = 8
    
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search