#! /usr/bin/env python3
# -*- coding: utf-8 -*-
from collections import Counter
from math import ceil
import random
NUM_ELEMENTS = 10 ** 6
LOAD_FACTOR = 1.0
def bucket_histogram(num_buckets, num_insertions):
entries_per_bucket = [0] * num_buckets
for _ in range(num_insertions):
bucket = random.randrange(num_buckets)
entries_per_bucket[bucket] += 1
return Counter(entries_per_bucket)
def get_space_per_bucket(elems_in_bucket):
"""Return how many bytes beyond the 4 bytes for the basic entry it takes
to represent a bucket with `elems_in_bucket` entries."""
if elems_in_bucket == 0:
# Just the space for the bucket.
return 4
elif elems_in_bucket == 1:
# Single elements are represented directly, so just the space
# for the bucket.
return 4
elif elems_in_bucket == 2:
# Buckets with two elements receive special IDs that index
# into a deque (or vector) of pairs.
return 4 + 8
else:
# Space for the bucket plus a compact linked list
# representation for the elements it contains.
#
# We might also consider special-casing buckets with 3
# elements, but these are already somewhat rare. So we assume
# here that everything larger is stored as a singly linked
# list or similar data structure, for which we assume a cost
# of 8 bytes per entry. This assumes using ints for pointers
# and managing memory ourselves with near-zero overhead, which
# is hopefully feasible if we never delete. (However, we do
# have to worry about what happens when a 2-element bucket
# grows into a larger one and hence the 2-element bucket
# is released. One idea is to store a linked list of unused
# 8-byte chunks within these 8-byte chunks.)
return 4 + 8 * elems_in_bucket
def print_space_per_entry(num_buckets, num_elements, histogram):
total_space = 0
for elems_in_bucket in sorted(histogram):
num_buckets = histogram[elems_in_bucket]
space_per_bucket = get_space_per_bucket(elems_in_bucket) * num_buckets
print("space for buckets with {} entries: {}".format(
elems_in_bucket, space_per_bucket))
total_space += space_per_bucket
print("space per entry: {}".format(total_space / num_elements))
def main():
num_buckets = ceil(NUM_ELEMENTS / LOAD_FACTOR)
for _ in range(10):
histogram = bucket_histogram(num_buckets, NUM_ELEMENTS)
print_space_per_entry(num_buckets, NUM_ELEMENTS, histogram)
print()
if __name__ == "__main__":
main()