diff options
author | bfredl <bjorn.linse@gmail.com> | 2023-05-17 16:08:06 +0200 |
---|---|---|
committer | bfredl <bjorn.linse@gmail.com> | 2023-09-08 12:48:46 +0200 |
commit | 5970157e1d22fd5e05ae5d3bd949f807fb7a744c (patch) | |
tree | 59bd8e97d87fa5da7cc95f290f47b93eaf8dd953 /src/nvim/map_key_impl.c.h | |
parent | 6a8b48e24cbe070846dd1d234553b3fdeb19460e (diff) | |
download | rneovim-5970157e1d22fd5e05ae5d3bd949f807fb7a744c.tar.gz rneovim-5970157e1d22fd5e05ae5d3bd949f807fb7a744c.tar.bz2 rneovim-5970157e1d22fd5e05ae5d3bd949f807fb7a744c.zip |
refactor(map): enhanced implementation, Clean Code™, etc etc
This involves two redesigns of the map.c implementations:
1. Change of macro style and code organization
The old khash.h and map.c implementation used huge #define blocks with a
lot of backslash line continuations.
This instead uses the "implementation file" .c.h pattern. Such a file is
meant to be included multiple times, with different macros set prior to
inclusion as parameters. we already use this pattern e.g. for
eval/typval_encode.c.h to implement different typval encoders reusing a
similar structure.
We can structure this code into two parts. one that only depends on key
type and is enough to implement sets, and one which depends on both key
and value to implement maps (as a wrapper around sets, with an added
value[] array)
2. Separate the main hash buckets from the key / value arrays
Change the hash buckets to only contain an index into separate key /
value arrays
This is a common pattern in modern, state of the art hashmap
implementations. Even though this leads to one more allocated array, it
often is still a net reduction of memory consumption. Consider
key+value consuming at least 12 bytes per pair. On average, we will have
twice as many buckets per item.
Thus old implementation:
2*12 = 24 bytes per item
New implementation
1*12 + 2*4 = 20 bytes per item
And the difference gets bigger with larger items.
One might think we have pulled a fast one here, as wouldn't the average size of
the new key/value arrays be 1.5 slots per item due to amortized growth?
But remember, these arrays are fully dense, and thus the accessed memory,
measured in _cache lines_, the unit which actually matters, will be the
fully used memory but just rounded up to the nearest cache line
boundary.
This has some other interesting properties, such as an insert-only
set/map will be fully ordered by insert only. Preserving this ordering
in face of deletions is more tricky tho. As we currently don't use
ordered maps, the "delete" operation maintains compactness of the item
arrays in the simplest way by breaking the ordering. It would be
possible to implement an order-preserving delete although at some cost,
like allowing the items array to become non-dense until the next rehash.
Finally, in face of these two major changes, all code used in khash.h
has been integrated into map.c and friends. Given the heavy edits it
makes no sense to "layer" the code into a vendored and a wrapper part.
Rather, the layered cake follows the specialization depth: code shared
for all maps, code specialized to a key type (and its equivalence
relation), and finally code specialized to value+key type.
Diffstat (limited to 'src/nvim/map_key_impl.c.h')
-rw-r--r-- | src/nvim/map_key_impl.c.h | 159 |
1 files changed, 159 insertions, 0 deletions
diff --git a/src/nvim/map_key_impl.c.h b/src/nvim/map_key_impl.c.h new file mode 100644 index 0000000000..7e7b2f74fe --- /dev/null +++ b/src/nvim/map_key_impl.c.h @@ -0,0 +1,159 @@ +#include "nvim/map.h" +#include "nvim/memory.h" + +#ifndef KEY_NAME +// Don't error out. it is nice to type-check the file in isolation, in clangd or otherwise +# define KEY_NAME(x) x##int +# define hash_int(x) ((uint32_t)x) +# define equal_int(x, y) ((x) == (y)) +#endif + +#define SET_TYPE KEY_NAME(Set_) +#define KEY_TYPE KEY_NAME() + +/// find bucket to get or put "key" +/// +/// set->h.hash assumed already allocated! +/// +/// @return bucket index, or MH_TOMBSTONE if not found and `put` was false +/// mh_is_either(hash[rv]) : not found, but this is the place to put +/// otherwise: hash[rv]-1 is index into key/value arrays +uint32_t KEY_NAME(mh_find_bucket_)(SET_TYPE *set, KEY_TYPE key, bool put) +{ + MapHash *h = &set->h; + uint32_t step = 0; + uint32_t mask = h->n_buckets - 1; + uint32_t k = KEY_NAME(hash_)(key); + uint32_t i = k & mask; + uint32_t last = i; + uint32_t site = put ? last : MH_TOMBSTONE; + while (!mh_is_empty(h, i)) { + if (mh_is_del(h, i)) { + if (site == last) { + site = i; + } + } else if (KEY_NAME(equal_)(set->keys[h->hash[i] - 1], key)) { + return i; + } + i = (i + (++step)) & mask; + if (i == last) { + abort(); + } + } + if (site == last) { + site = i; + } + return site; +} + +/// @return index into set->keys if found, MH_TOMBSTONE otherwise +uint32_t KEY_NAME(mh_get_)(SET_TYPE *set, KEY_TYPE key) +{ + if (set->h.n_buckets == 0) { + return MH_TOMBSTONE; + } + uint32_t idx = KEY_NAME(mh_find_bucket_)(set, key, false); + return (idx != MH_TOMBSTONE) ? set->h.hash[idx] - 1 : MH_TOMBSTONE; +} + +/// Rebuild hash from keys[] array +/// +/// set->h.hash must be allocated and empty before&alling! 
+void KEY_NAME(mh_rehash_)(SET_TYPE *set) +{ + for (uint32_t k = 0; k < set->h.n_keys; k++) { + uint32_t idx = KEY_NAME(mh_find_bucket_)(set, set->keys[k], true); + // there must be tombstones when we do a rehash + if (!mh_is_empty((&set->h), idx)) { + abort(); + } + set->h.hash[idx] = k + 1; + } + set->h.n_occupied = set->h.size = set->h.n_keys; +} + +/// Put a key. Return the existing item if found +/// +/// Allocates/resizes the hash table and/or keys[] table if needed. +/// +/// @param[out] new mandatory. Reveals if an existing key was found. In addition, +/// if new item, indicates if keys[] was resized. +/// +/// @return keys index +uint32_t KEY_NAME(mh_put_)(SET_TYPE *set, KEY_TYPE key, MhPutStatus *new) +{ + MapHash *h = &set->h; + // Might rehash ahead of time if "key" already existed. But it was + // going to happen soon anyway. + if (h->n_occupied >= h->upper_bound) { + // If we likely were to resize soon, do it now to avoid extra rehash + // TODO(bfredl): we never shrink. but maybe that's fine + if (h->size >= h->upper_bound * 0.9) { + mh_realloc(h, h->n_buckets + 1); + } else { + // Just a lot of tombstones from deleted items, start all over again + memset(h->hash, 0, h->n_buckets * sizeof(*h->hash)); + h->size = h->n_occupied = 0; + } + KEY_NAME(mh_rehash_)(set); + } + + uint32_t idx = KEY_NAME(mh_find_bucket_)(set, key, true); + + if (mh_is_either(h, idx)) { + h->size++; + if (mh_is_empty(h, idx)) { + h->n_occupied++; + } + + uint32_t pos = h->n_keys++; + if (pos >= h->keys_capacity) { + h->keys_capacity = MAX(h->keys_capacity * 2, 8); + set->keys = xrealloc(set->keys, h->keys_capacity * sizeof(KEY_TYPE)); + *new = kMHNewKeyRealloc; + } else { + *new = kMHNewKeyDidFit; + } + set->keys[pos] = key; + h->hash[idx] = pos + 1; + return pos; + } else { + *new = kMHExisting; + uint32_t pos = h->hash[idx] - 1; + if (!KEY_NAME(equal_)(set->keys[pos], key)) { + abort(); + } + return pos; + } +} + +/// Deletes `*key` if found, do nothing otherwise +/// +/// 
@param[in, out] key modified to the value contained in the set +/// @return the index the item used to have in keys[] +/// MH_TOMBSTONE if key was not found +uint32_t KEY_NAME(mh_delete_)(SET_TYPE *set, KEY_TYPE *key) +{ + if (set->h.size == 0) { + return MH_TOMBSTONE; + } + uint32_t idx = KEY_NAME(mh_find_bucket_)(set, *key, false); + if (idx != MH_TOMBSTONE) { + uint32_t k = set->h.hash[idx] - 1; + set->h.hash[idx] = MH_TOMBSTONE; + + uint32_t last = --set->h.n_keys; + *key = set->keys[k]; + set->h.size--; + if (last != k) { + uint32_t idx2 = KEY_NAME(mh_find_bucket_)(set, set->keys[last], false); + if (set->h.hash[idx2] != last + 1) { + abort(); + } + set->h.hash[idx2] = k + 1; + set->keys[k] = set->keys[last]; + } + return k; + } + return MH_TOMBSTONE; +} |