/*
 * Patch summary (touches Include/dictobject.h and Objects/dictobject.c).
 *
 * Adds a per-dict collision heuristic that can switch hash randomization on
 * at runtime:
 *
 *   - struct _dictobject gains two Py_ssize_t fields: ma_collisions (the
 *     heuristic counter) and ma_randomized (whether this dict was created
 *     under, or later rebuilt for, hash randomization).
 *   - INIT_NONZERO_DICT_SLOTS copies Py_HashRandomizationFlag into
 *     ma_randomized; EMPTY_TO_MINSIZE and dictresize reset ma_collisions to 0.
 *   - lookdict and lookdict_string call update_collision_heuristic around the
 *     first probe: an empty slot, a direct hit, or a dummy first slot passes
 *     0 (decrement, never below 0); a first slot holding a different live
 *     key passes 1 (increment).
 *   - dict_set_item_by_hash_or_entry calls check_collisions after inserting.
 *     If Py_HashRandomizationFlag is already set and the dict has
 *     ma_randomized == 0, the dict is rebuilt through rehash_dict.  Otherwise,
 *     once ma_mask > PyDict_MINSIZE and ma_collisions > COLLISION_THRESHOLD
 *     (50), it increments Py_HashRandomizationFlag, calls _PyRandom_Init()
 *     and rebuilds the dict.
 *   - While copying entries, dictresize uses PyObject_Hash(key) instead of
 *     the stored me_hash when Py_HashRandomizationFlag is set and the dict is
 *     not yet marked randomized.
 *
 * NOTE(review): rehash_dict discards the int result of dictresize and sets
 *   ma_randomized = 1 regardless; presumably a failed resize would leave the
 *   dict marked randomized while still holding old hashes -- confirm against
 *   dictresize's failure contract.
 * NOTE(review): the PyObject_Hash result is handed straight to
 *   insertdict_clean; no error check is visible in this hunk -- confirm a
 *   failure value cannot reach the table.
 * NOTE(review): lookdict and lookdict_string now write ma_collisions, so
 *   lookups modify the dict object -- confirm callers are fine with that.
 * NOTE(review): in this copy the original line breaks are collapsed and the
 *   final hunk is cut off; restore the original layout before applying.
 */
diff --git a/Include/dictobject.h b/Include/dictobject.h index 5a1e9feea1..e6676c2d82 100644 --- a/Include/dictobject.h +++ b/Include/dictobject.h @@ -78,6 +78,13 @@ struct _dictobject { */ Py_ssize_t ma_mask; + Py_ssize_t ma_collisions; /* Collisions heuristic */ + + /* Indicates whether the dictionary was created under hash + randomization state or if it was randomized as a result of collision + threshold breach. */ + Py_ssize_t ma_randomized; + /* ma_table points to ma_smalltable for small tables, else to * additional malloc'ed memory. ma_table is never NULL! This rule * saves repeated runtime null-tests in the workhorse getitem and diff --git a/Objects/dictobject.c b/Objects/dictobject.c index a792b2dfa2..f38277c2b1 100644 --- a/Objects/dictobject.c +++ b/Objects/dictobject.c @@ -148,6 +148,8 @@ _PyDict_Dummy(void) /* forward declarations */ static PyDictEntry * lookdict_string(PyDictObject *mp, PyObject *key, long hash); +static int +dictresize(PyDictObject *mp, Py_ssize_t minused); #ifdef SHOW_CONVERSION_COUNTS static long created = 0L; @@ -210,11 +212,12 @@ show_track(void) #define INIT_NONZERO_DICT_SLOTS(mp) do { \ (mp)->ma_table = (mp)->ma_smalltable; \ (mp)->ma_mask = PyDict_MINSIZE - 1; \ + (mp)->ma_randomized = Py_HashRandomizationFlag; \ } while(0) #define EMPTY_TO_MINSIZE(mp) do { \ memset((mp)->ma_smalltable, 0, sizeof((mp)->ma_smalltable)); \ - (mp)->ma_used = (mp)->ma_fill = 0; \ + (mp)->ma_used = (mp)->ma_fill = (mp)->ma_collisions = 0; \ INIT_NONZERO_DICT_SLOTS(mp); \ } while(0) @@ -292,6 +295,54 @@ PyDict_New(void) return (PyObject *)mp; } +#define COLLISION_THRESHOLD 50 +/* + * Update the collision heuristic on the dict. If there was a collision, + * increment the collisions counter. Otherwise, decrement it. If the + * collisions counter is already at 0, do not decrement it further. 
+ */ +static void +update_collision_heuristic(PyDictObject *mp, int collision) +{ + if (collision) { + mp->ma_collisions++; + } + else if (mp->ma_collisions) { + mp->ma_collisions--; + } +} + +static void +rehash_dict(PyDictObject *mp) +{ + /* Use dictresize with the same current size so that the table + gets rebuilt with randomized hashes. */ + dictresize(mp, mp->ma_used); + mp->ma_randomized = 1; +} + +static void +check_collisions(PyDictObject *mp) +{ + if (Py_HashRandomizationFlag) { + /* Collision in another dict was detected which enabled + randomization, but this dict is not yet randomized */ + if (!mp->ma_randomized) + rehash_dict(mp); + } + else { + /* Prevent the detector from going off on the small table, which will + quickly collide too frequently on its way to resizing up. */ + if (mp->ma_mask > PyDict_MINSIZE) { + if (mp->ma_collisions > COLLISION_THRESHOLD) { + Py_HashRandomizationFlag++; + _PyRandom_Init(); + rehash_dict(mp); + } + } + } +} + /* The basic lookup function used by all operations. This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4. @@ -330,8 +381,10 @@ lookdict(PyDictObject *mp, PyObject *key, register long hash) i = (size_t)hash & mask; ep = &ep0[i]; - if (ep->me_key == NULL || ep->me_key == key) + if (ep->me_key == NULL || ep->me_key == key) { + update_collision_heuristic(mp, 0); return ep; + } if (ep->me_key == dummy) freeslot = ep; @@ -344,8 +397,10 @@ lookdict(PyDictObject *mp, PyObject *key, register long hash) if (cmp < 0) return NULL; if (ep0 == mp->ma_table && ep->me_key == startkey) { - if (cmp > 0) + if (cmp > 0) { + update_collision_heuristic(mp, 0); return ep; + } } else { /* The compare did major nasty stuff to the @@ -359,6 +414,8 @@ lookdict(PyDictObject *mp, PyObject *key, register long hash) freeslot = NULL; } + update_collision_heuristic(mp, (freeslot == NULL ? 1 : 0)); + /* In the loop, me_key == dummy is by far (factor of 100s) the least likely outcome, so test for that last. 
*/ for (perturb = hash; ; perturb >>= PERTURB_SHIFT) { @@ -427,16 +484,22 @@ lookdict_string(PyDictObject *mp, PyObject *key, register long hash) } i = hash & mask; ep = &ep0[i]; - if (ep->me_key == NULL || ep->me_key == key) + if (ep->me_key == NULL || ep->me_key == key) { + update_collision_heuristic(mp, 0); return ep; + } if (ep->me_key == dummy) freeslot = ep; else { - if (ep->me_hash == hash && _PyString_Eq(ep->me_key, key)) + if (ep->me_hash == hash && _PyString_Eq(ep->me_key, key)) { + update_collision_heuristic(mp, 0); return ep; + } freeslot = NULL; } + update_collision_heuristic(mp, (freeslot == NULL ? 1 : 0)); + /* In the loop, me_key == dummy is by far (factor of 100s) the least likely outcome, so test for that last. */ for (perturb = hash; ; perturb >>= PERTURB_SHIFT) { @@ -655,13 +718,19 @@ dictresize(PyDictObject *mp, Py_ssize_t minused) mp->ma_used = 0; i = mp->ma_fill; mp->ma_fill = 0; + mp->ma_collisions = 0; /* Copy the data over; this is refcount-neutral for active entries; dummy entries aren't copied over, of course */ for (ep = oldtable; i > 0; ep++) { if (ep->me_value != NULL) { /* active entry */ --i; - insertdict_clean(mp, ep->me_key, (long)ep->me_hash, + insertdict_clean(mp, + ep->me_key, + ((Py_HashRandomizationFlag && + !mp->ma_randomized) ? + PyObject_Hash(ep->me_key) : + (long)ep->me_hash), ep->me_value); } else if (ep->me_key != NULL) { /* dummy entry */ @@ -799,6 +868,8 @@ dict_set_item_by_hash_or_entry(register PyObject *op, PyObject *key, if (insertdict_by_entry(mp, key, hash, ep, value) != 0) return -1; } + check_collisions(mp); + /* If we added a key, we can safely resize. Otherwise just return! * If fill >= 2/3 size, adjust size. Normally, this doubles or * quaduples the size, but it's also possible for the dict to shrink