这个文件主要讲述一些宏和哈希函数。
关于宏,其实不用细看。
关于哈希函数,看懂用途即可。其转换的细节不用看,不然肯定受不了。
- /* The cached cardinality MSB is used to signal validity of the cached value. */
- #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[7] |= (1<<7)
- #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
-
- #define HLL_P 14 /* The greater is P, the smaller the error. */
- #define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for
- determining the number of leading zeros. */
- #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
- #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
- #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
- #define HLL_REGISTER_MAX ((1<<HLL_BITS)-1)
- #define HLL_HDR_SIZE sizeof(struct hllhdr)
- #define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8))
- #define HLL_DENSE 0 /* Dense encoding. */
- #define HLL_SPARSE 1 /* Sparse encoding. */
- #define HLL_RAW 255 /* Only used internally, never exposed. */
- #define HLL_MAX_ENCODING 1
-
- static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected";
-
- /* =========================== Low level bit macros ========================= */
-
- /* Macros to access the dense representation.
- *
- * We need to get and set 6 bit counters in an array of 8 bit bytes.
- * We use macros to make sure the code is inlined since speed is critical
- * especially in order to compute the approximated cardinality in
- * HLLCOUNT where we need to access all the registers at once.
- * For the same reason we also want to avoid conditionals in this code path.
- *
- * +--------+--------+--------+------//
- * |11000000|22221111|33333322|55444444
- * +--------+--------+--------+------//
- *
- * Note: in the above representation the most significant bit (MSB)
- * of every byte is on the left. We start using bits from the LSB to MSB,
- * and so forth passing to the next byte.
- *
- * Example, we want to access to counter at pos = 1 ("111111" in the
- * illustration above).
- *
- * The index of the first byte b0 containing our data is:
- *
- * b0 = 6 * pos / 8 = 0
- *
- * +--------+
- * |11000000| <- Our byte at b0
- * +--------+
- *
- * The position of the first bit (counting from the LSB = 0) in the byte
- * is given by:
- *
- * fb = 6 * pos % 8 -> 6
- *
- * Right shift b0 of 'fb' bits.
- *
- * +--------+
- * |11000000| <- Initial value of b0
- * |00000011| <- After right shift of 6 pos.
- * +--------+
- *
- * Left shift b1 of bits 8-fb bits (2 bits)
- *
- * +--------+
- * |22221111| <- Initial value of b1
- * |22111100| <- After left shift of 2 bits.
- * +--------+
- *
- * OR the two bits, and finally AND with 111111 (63 in decimal) to
- * clean the higher order bits we are not interested in:
- *
- * +--------+
- * |00000011| <- b0 right shifted
- * |22111100| <- b1 left shifted
- * |22111111| <- b0 OR b1
- * | 111111| <- (b0 OR b1) AND 63, our value.
- * +--------+
- *
- * We can try with a different example, like pos = 0. In this case
- * the 6-bit counter is actually contained in a single byte.
- *
- * b0 = 6 * pos / 8 = 0
- *
- * +--------+
- * |11000000| <- Our byte at b0
- * +--------+
- *
- * fb = 6 * pos % 8 = 0
- *
- * So we right shift of 0 bits (no shift in practice) and
- * left shift the next byte of 8 bits, even if we don't use it,
- * but this has the effect of clearing the bits so the result
- * will not be affected after the OR.
- *
- * -------------------------------------------------------------------------
- *
- * Setting the register is a bit more complex, let's assume that 'val'
- * is the value we want to set, already in the right range.
- *
- * We need two steps, in one we need to clear the bits, and in the other
- * we need to bitwise-OR the new bits.
- *
- * Let's try with 'pos' = 1, so our first byte at 'b' is 0,
- *
- * "fb" is 6 in this case.
- *
- * +--------+
- * |11000000| <- Our byte at b0
- * +--------+
- *
- * To create an AND-mask to clear the bits about this position, we just
- * initialize the mask with the value 63, left shift it of "fs" bits,
- * and finally invert the result.
- *
- * +--------+
- * |00111111| <- "mask" starts at 63
- * |11000000| <- "mask" after left shift of "ls" bits.
- * |00111111| <- "mask" after invert.
- * +--------+
- *
- * Now we can bitwise-AND the byte at "b" with the mask, and bitwise-OR
- * it with "val" left-shifted of "ls" bits to set the new bits.
- *
- * Now let's focus on the next byte b1:
- *
- * +--------+
- * |22221111| <- Initial value of b1
- * +--------+
- *
- * To build the AND mask we start again with the 63 value, right shift
- * it by 8-fb bits, and invert it.
- *
- * +--------+
- * |00111111| <- "mask" set at 2&6-1
- * |00001111| <- "mask" after the right shift by 8-fb = 2 bits
- * |11110000| <- "mask" after bitwise not.
- * +--------+
- *
- * Now we can mask it with b+1 to clear the old bits, and bitwise-OR
- * with "val" left-shifted by "rs" bits to set the new value.
- */
-
- /* Note: if we access the last counter, we will also access the b+1 byte
- * that is out of the array, but sds strings always have an implicit null
- * term, so the byte exists, and we can skip the conditional (or the need
- * to allocate 1 byte more explicitly). */
-
- /* Store the value of the register at position 'regnum' into variable 'target'.
- * 'p' is an array of unsigned bytes. */
- #define HLL_DENSE_GET_REGISTER(target,p,regnum) do { \
- uint8_t *_p = (uint8_t*) p; \
- unsigned long _byte = regnum*HLL_BITS/8; \
- unsigned long _fb = regnum*HLL_BITS&7; \
- unsigned long _fb8 = 8 - _fb; \
- unsigned long b0 = _p[_byte]; \
- unsigned long b1 = _p[_byte+1]; \
- target = ((b0 >> _fb) | (b1 << _fb8)) & HLL_REGISTER_MAX; \
- } while(0)
-
- /* Set the value of the register at position 'regnum' to 'val'.
- * 'p' is an array of unsigned bytes. */
- #define HLL_DENSE_SET_REGISTER(p,regnum,val) do { \
- uint8_t *_p = (uint8_t*) p; \
- unsigned long _byte = regnum*HLL_BITS/8; \
- unsigned long _fb = regnum*HLL_BITS&7; \
- unsigned long _fb8 = 8 - _fb; \
- unsigned long _v = val; \
- _p[_byte] &= ~(HLL_REGISTER_MAX << _fb); \
- _p[_byte] |= _v << _fb; \
- _p[_byte+1] &= ~(HLL_REGISTER_MAX >> _fb8); \
- _p[_byte+1] |= _v >> _fb8; \
- } while(0)
-
- /* Macros to access the sparse representation.
- * The macros parameter is expected to be an uint8_t pointer. */
- #define HLL_SPARSE_XZERO_BIT 0x40 /* 01xxxxxx */
- #define HLL_SPARSE_VAL_BIT 0x80 /* 1vvvvvxx */
- #define HLL_SPARSE_IS_ZERO(p) (((*(p)) & 0xc0) == 0) /* 00xxxxxx */
- #define HLL_SPARSE_IS_XZERO(p) (((*(p)) & 0xc0) == HLL_SPARSE_XZERO_BIT)
- #define HLL_SPARSE_IS_VAL(p) ((*(p)) & HLL_SPARSE_VAL_BIT)
- #define HLL_SPARSE_ZERO_LEN(p) (((*(p)) & 0x3f)+1)
- #define HLL_SPARSE_XZERO_LEN(p) (((((*(p)) & 0x3f) << 8) | (*((p)+1)))+1)
- #define HLL_SPARSE_VAL_VALUE(p) ((((*(p)) >> 2) & 0x1f)+1)
- #define HLL_SPARSE_VAL_LEN(p) (((*(p)) & 0x3)+1)
- #define HLL_SPARSE_VAL_MAX_VALUE 32
- #define HLL_SPARSE_VAL_MAX_LEN 4
- #define HLL_SPARSE_ZERO_MAX_LEN 64
- #define HLL_SPARSE_XZERO_MAX_LEN 16384
- #define HLL_SPARSE_VAL_SET(p,val,len) do { \
- *(p) = (((val)-1)<<2|((len)-1))|HLL_SPARSE_VAL_BIT; \
- } while(0)
- #define HLL_SPARSE_ZERO_SET(p,len) do { \
- *(p) = (len)-1; \
- } while(0)
- #define HLL_SPARSE_XZERO_SET(p,len) do { \
- int _l = (len)-1; \
- *(p) = (_l>>8) | HLL_SPARSE_XZERO_BIT; \
- *((p)+1) = (_l&0xff); \
- } while(0)
- #define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */
-
- /* ========================= HyperLogLog algorithm ========================= */
-
- /* Our hash function is MurmurHash2, 64 bit version.
- * It was modified for Redis in order to provide the same result in
- * big and little endian archs (endian neutral). */
- uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) {
- const uint64_t m = 0xc6a4a7935bd1e995;
- const int r = 47;
- uint64_t h = seed ^ (len * m);
- const uint8_t *data = (const uint8_t *)key;
- const uint8_t *end = data + (len-(len&7));
-
- while(data != end) {
- uint64_t k;
-
- #if (BYTE_ORDER == LITTLE_ENDIAN)
- #ifdef USE_ALIGNED_ACCESS
- memcpy(&k,data,sizeof(uint64_t));
- #else
- k = *((uint64_t*)data);
- #endif
- #else
- k = (uint64_t) data[0];
- k |= (uint64_t) data[1] << 8;
- k |= (uint64_t) data[2] << 16;
- k |= (uint64_t) data[3] << 24;
- k |= (uint64_t) data[4] << 32;
- k |= (uint64_t) data[5] << 40;
- k |= (uint64_t) data[6] << 48;
- k |= (uint64_t) data[7] << 56;
- #endif
-
- k *= m;
- k ^= k >> r;
- k *= m;
- h ^= k;
- h *= m;
- data += 8;
- }
-
- switch(len & 7) {
- case 7: h ^= (uint64_t)data[6] << 48; /* fall-thru */
- case 6: h ^= (uint64_t)data[5] << 40; /* fall-thru */
- case 5: h ^= (uint64_t)data[4] << 32; /* fall-thru */
- case 4: h ^= (uint64_t)data[3] << 24; /* fall-thru */
- case 3: h ^= (uint64_t)data[2] << 16; /* fall-thru */
- case 2: h ^= (uint64_t)data[1] << 8; /* fall-thru */
- case 1: h ^= (uint64_t)data[0];
- h *= m; /* fall-thru */
- };
-
- h ^= h >> r;
- h *= m;
- h ^= h >> r;
- return h;
- }
-
- /* Given a string element to add to the HyperLogLog, returns the length
- * of the pattern 000..1 of the element hash. As a side effect 'regp' is
- * set to the register index this element hashes to. */
- int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
- uint64_t hash, bit, index;
- int count;
-
- /* Count the number of zeroes starting from bit HLL_REGISTERS
- * (that is a power of two corresponding to the first bit we don't use
- * as index). The max run can be 64-P+1 = Q+1 bits.
- *
- * Note that the final "1" ending the sequence of zeroes must be
- * included in the count, so if we find "001" the count is 3, and
- * the smallest count possible is no zeroes at all, just a 1 bit
- * at the first position, that is a count of 1.
- *
- * This may sound like inefficient, but actually in the average case
- * there are high probabilities to find a 1 after a few iterations. */
- hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
- index = hash & HLL_P_MASK; /* Register index. */
- hash >>= HLL_P; /* Remove bits used to address the register. */
- hash |= ((uint64_t)1<<HLL_Q); /* Make sure the loop terminates
- and count will be <= Q+1. */
- bit = 1;
- count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
- while((hash & bit) == 0) {
- count++;
- bit <<= 1;
- }
- *regp = (int) index;
- return count;
- }