STXXL  1.4-dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
hash_map.h
Go to the documentation of this file.
1 /***************************************************************************
2  * include/stxxl/bits/containers/hash_map/hash_map.h
3  *
4  * Part of the STXXL. See http://stxxl.sourceforge.net
5  *
6  * Copyright (C) 2007 Markus Westphal <[email protected]>
7  * Copyright (C) 2014 Timo Bingmann <[email protected]>
8  *
9  * Distributed under the Boost Software License, Version 1.0.
10  * (See accompanying file LICENSE_1_0.txt or copy at
11  * http://www.boost.org/LICENSE_1_0.txt)
12  **************************************************************************/
13 
14 #ifndef STXXL_CONTAINERS_HASH_MAP_HASH_MAP_HEADER
15 #define STXXL_CONTAINERS_HASH_MAP_HASH_MAP_HEADER
16 
17 #include <functional>
18 
19 #include <stxxl/bits/noncopyable.h>
20 #include <stxxl/bits/namespace.h>
25 
30 
32 
33 #define STXXL_VERBOSE_HASH_MAP(m) \
34  STXXL_VERBOSE1("hash_map[" << static_cast<const void*>(this) << "]::" << m)
35 
36 //! External memory hash-map
37 namespace hash_map {
38 
39 /*!
40  * Main implementation of external memory hash map.
41  *
42  * \tparam KeyType the key type
43  * \tparam MappedType the mapped type associated with a key
44  * \tparam HashType a hash functional
45  * \tparam CompareType a less comparison relation for KeyType
46  * \tparam SubBlockSize the raw size of a subblock (caching granularity)
47  * (default: 8192)
48  * \tparam SubBlocksPerBlock the number of subblocks per external block
49  * (default: 256 -> 2MB blocks)
50  * \tparam AllocType allocator for internal-memory buffer
51  */
52 template <class KeyType,
53  class MappedType,
54  class HashType,
55  class KeyCompareType,
56  unsigned SubBlockSize = 4*1024,
57  unsigned SubBlocksPerBlock = 256,
58  class AllocatorType = std::allocator<std::pair<const KeyType, MappedType> >
59  >
60 class hash_map : private noncopyable
61 {
62 protected:
63  typedef hash_map<KeyType, MappedType, HashType, KeyCompareType,
64  SubBlockSize, SubBlocksPerBlock> self_type;
65 
66 public:
67  //! type of the keys being used
68  typedef KeyType key_type;
69  //! type of the data to be stored
70  typedef MappedType mapped_type;
71  //! actually store (key-data)-pairs
72  typedef std::pair<KeyType, MappedType> value_type;
73  //! type for value-references
75  //! type for constant value-references
76  typedef const value_type& const_reference;
77  //! pointer to type of keys
78  typedef value_type* pointer;
79  //! const pointer to type of keys
80  typedef value_type const* const_pointer;
81 
85 
86  //! type of (mother) hash-function
87  typedef HashType hasher;
88  //! functor that imposes a ordering on keys (but see _lt())
89  typedef KeyCompareType key_compare;
90  //! allocator template type
91  typedef AllocatorType allocator_type;
92 
95 
96  //! subblock- and block-size in bytes
97  enum {
98  block_raw_size = SubBlocksPerBlock * SubBlockSize,
99  subblock_raw_size = SubBlockSize
100  };
101 
102  //! Subblock-size as number of elements, block-size as number of subblocks
103  enum {
104  subblocks_per_block = SubBlocksPerBlock,
105  subblock_size = SubBlockSize / sizeof(value_type)
106  };
107 
108  //! a subblock consists of subblock_size values
110  //! a block consists of block_size subblocks
112 
113  //! block-identifier for subblocks
115  //! block-identifier for blocks
116  typedef typename block_type::bid_type bid_type;
117  //! container for block-bids
118  typedef std::vector<bid_type> bid_container_type;
119  //! iterator for block-bids
120  typedef typename bid_container_type::iterator bid_iterator_type;
121 
122  enum source_type { src_internal, src_external, src_unknown };
123 
124  //! nodes for internal-memory buffer
126  //! buckets
128 
129  typedef std::vector<bucket_type> buckets_container_type;
130 
131  //! for tracking active iterators
133 
135 
137 
138  typedef typename allocator_type::template rebind<node_type>::other node_allocator_type;
139 
140 protected:
141  //! user supplied mother hash-function
143  //! user supplied strict-weak-ordering for keys
145  //! array of bucket
146  buckets_container_type buckets_;
147  //! blocks-ids of allocated blocks
148  bid_container_type bids_;
149  //! size of internal-memory buffer in number of entries
151  //! maximum size for internal-memory buffer
153  //! keeps track of all active iterators
154  iterator_map_type iterator_map_;
155 
156  mutable block_cache_type block_cache_;
157  //! used to allocate new nodes for internal buffer
158  node_allocator_type node_allocator_;
159  //! false if the total-number of values is correct (false) or true if
160  //! estimated (true); see *oblivious_-methods
161  mutable bool oblivious_;
162  //! (estimated) number of values
164  //! desired load factor after rehashing
166 
167 public:
168  /*!
169  * Construct a new hash-map
170  * \param n initial number of buckets
171  * \param hf hash-function
172  * \param cmp comparator-object
173  * \param buffer_size size of internal-memory buffer in bytes
174  * \param a allocation-strategory for internal-memory buffer
175  */
177  const hasher& hf = hasher(),
178  const key_compare& cmp = key_compare(),
179  internal_size_type buffer_size = 128*1024*1024,
180  const allocator_type& a = allocator_type())
181  : hash_(hf),
182  cmp_(cmp),
183  buckets_(n),
184  bids_(0),
185  buffer_size_(0),
186  iterator_map_(this),
187  block_cache_(tuning::get_instance()->blockcache_size),
188  node_allocator_(a),
189  oblivious_(false),
190  num_total_(0),
191  opt_load_factor_(0.875)
192  {
193  max_buffer_size_ = buffer_size / sizeof(node_type);
194  }
195 
196  /*!
197  * Construct a new hash-map and insert all values in the range [f,l)
198  *
199  * \param begin beginning of the range
200  * \param end end of the range
201  * \param mem_to_sort internal memory that may be used for bulk-construction (not
202  * to be confused with the buffer-memory)
203  * \param n initial number of buckets
204  * \param hf hash-function
205  * \param cmp comparator-object
206  * \param buffer_size size of internal-memory buffer in bytes
207  * \param a allocation-strategory for internal-memory buffer
208  */
209  template <class InputIterator>
210  hash_map(InputIterator begin, InputIterator end,
211  internal_size_type mem_to_sort = 256*1024*1024,
212  internal_size_type n = 0,
213  const hasher& hf = hasher(),
214  const key_compare& cmp = key_compare(),
215  internal_size_type buffer_size = 128*1024*1024,
216  const allocator_type& a = allocator_type())
217  : hash_(hf),
218  cmp_(cmp),
219  buckets_(n), // insert will determine a good size
220  bids_(0),
221  buffer_size_(0),
222  iterator_map_(this),
223  block_cache_(tuning::get_instance()->blockcache_size),
224  node_allocator_(a),
225  oblivious_(false),
226  num_total_(0),
227  opt_load_factor_(0.875)
228  {
229  max_buffer_size_ = buffer_size / sizeof(node_type);
230  insert(begin, end, mem_to_sort);
231  }
232 
234  {
235  clear();
236  }
237 
238 public:
239  //! Hash-function used by this hash-map
241  { return hash_; }
242 
243  //! Strict-weak-ordering used by this hash-map
245  { return cmp_; }
246 
247  //! Get node memory allocator
249  { return node_allocator_; }
250 
251 protected:
252  /*!
253  * After using *oblivious_-methods only an estimate for the total number of
254  * elements can be given. This method accesses external memory to
255  * calculate the exact number.
256  */
258  { /* const */ //! TODO: make const again
259  if (!oblivious_)
260  return;
261 
262  typedef HashedValuesStream<self_type, reader_type> values_stream_type;
263 
264  // this will start prefetching automatically
265  reader_type reader(bids_.begin(), bids_.end(), block_cache_);
266  values_stream_type values(buckets_.begin(), buckets_.end(),
267  reader, bids_.begin(), *this);
268 
269  num_total_ = 0;
270  while (!values.empty())
271  {
272  ++num_total_;
273  ++values;
274  }
275  oblivious_ = false;
276  }
277 
278 public:
279  //! Number of values currently stored. Note: If the correct number is
280  //! currently unknown (because *_oblivous-methods were used), external
281  //! memory will be scanned.
283  {
284  if (oblivious_)
285  ((self_type*)this)->_make_conscious();
286  return num_total_;
287  }
288 
289  //! The hash-map may store up to this number of values
291  {
293  }
294 
295  //! Check if container is empty.
296  bool empty() const
297  {
298  return size() == 0;
299  }
300 
301  /*!
302  * Insert a new value if no value with the same key is already present;
303  * external memory must therefore be accessed
304  *
305  * \param value what to insert
306  * \return a tuple whose second part is true iff the value was actually
307  * added (no value with the same key present); the first part is an
308  * iterator pointing to the newly inserted or already stored value
309  */
310  std::pair<iterator, bool> insert(const value_type& value)
311  {
312  if (buckets_.size() == 0)
313  _rebuild_buckets(128);
314 
315  internal_size_type i_bucket = _bkt_num(value.first);
316  bucket_type& bucket = buckets_[i_bucket];
317  node_type* node = _find_key_internal(bucket, value.first);
318 
319  // found value in internal memory
320  if (node && _eq(node->value_.first, value.first))
321  {
322  bool old_deleted = node->deleted();
323  if (old_deleted)
324  {
325  node->set_deleted(false);
326  node->value_ = value;
327  ++num_total_;
328  }
329  return std::pair<iterator, bool>(
330  iterator(this, i_bucket, node,
331  0, src_internal, false, value.first), old_deleted);
332  }
333  // search external memory ...
334  else
335  {
337  = _find_key_external(bucket, value.first);
338 
339  external_size_type i_external = result.first;
340  value_type ext_value = result.second;
341 
342  // ... if found, return iterator pointing to external position ...
343  if (i_external < bucket.n_external_ && _eq(ext_value.first, value.first))
344  {
345  return std::pair<iterator, bool>(
346  iterator(this, i_bucket, node,
347  i_external, src_external, true, value.first), false);
348  }
349  // ... otherwise create a new buffer-node to add the value
350  else
351  {
352  ++num_total_;
353  node_type* new_node =
354  node
355  ? node->set_next(_new_node(value, node->next(), false))
356  : (bucket.list_ = _new_node(value, bucket.list_, false));
357 
358  iterator it(this, i_bucket, new_node,
359  0, src_internal, false, value.first);
360 
361  ++buffer_size_;
362  if (buffer_size_ >= max_buffer_size_)
363  _rebuild_buckets(); // will fix it as well
364 
365  return std::pair<iterator, bool>(it, true);
366  }
367  }
368  }
369 
370  //! Insert a value; external memory is not accessed so that another value
371  //! with the same key may be overwritten
372  //! \param value what to insert
373  //! \return iterator pointing to the inserted value
375  {
376  internal_size_type i_bucket = _bkt_num(value.first);
377  bucket_type& bucket = buckets_[i_bucket];
378  node_type* node = _find_key_internal(bucket, value.first);
379 
380  // found value in internal memory
381  if (node && _eq(node->value_.first, value.first))
382  {
383  if (node->deleted())
384  ++num_total_;
385 
386  node->set_deleted(false);
387  node->value_ = value;
388  return iterator(this, i_bucket, node,
389  0, src_internal, false, value.first);
390  }
391  // not found; ignore external memory and add a new node to the
392  // internal-memory buffer
393  else
394  {
395  oblivious_ = true;
396  ++num_total_;
397  node_type* new_node =
398  node
399  ? node->set_next(_new_node(value, node->next(), false))
400  : (bucket.list_ = _new_node(value, bucket.list_, false));
401 
402  // there may be some iterators that reference the newly inserted
403  // value in external memory these need to be fixed (make them point
404  // to new_node)
405  iterator_map_.fix_iterators_2int(i_bucket, value.first, new_node);
406 
407  iterator it(this, i_bucket, new_node,
408  0, src_internal, false, value.first);
409 
410  ++buffer_size_;
411  if (buffer_size_ >= max_buffer_size_)
412  _rebuild_buckets();
413 
414  return it;
415  }
416  }
417 
418  //! Erase value by iterator
419  //! \param it iterator pointing to the value to erase
421  {
422  --num_total_;
423  bucket_type& bucket = buckets_[it.i_bucket_];
424 
425  if (it.source_ == src_internal)
426  {
427  it.node_->set_deleted(true);
428  iterator_map_.fix_iterators_2end(it.i_bucket_, it.key_);
429  }
430  else {
431  // find biggest value < iterator's value
432  node_type* node = _find_key_internal(bucket, it.key_);
433  assert(!node || !_eq(node->value_.first, it.key_));
434 
435  // add delete-node to buffer
436  if (node)
437  node->set_next(_new_node(value_type(it.key_, mapped_type()), node->next(), true));
438  else
439  bucket.list_ = _new_node(value_type(it.key_, mapped_type()), bucket.list_, true);
440 
441  iterator_map_.fix_iterators_2end(it.i_bucket_, it.key_);
442 
443  ++buffer_size_;
444  if (buffer_size_ >= max_buffer_size_)
445  _rebuild_buckets();
446  }
447  }
448 
449  //! Erase value by key; check external memory
450  //! \param key key of value to erase
451  //! \return number of values actually erased (0 or 1)
453  {
454  internal_size_type i_bucket = _bkt_num(key);
455  bucket_type& bucket = buckets_[i_bucket];
456  node_type* node = _find_key_internal(bucket, key);
457 
458  // found in internal memory
459  if (node && _eq(node->value_.first, key))
460  {
461  if (!node->deleted())
462  {
463  node->set_deleted(true);
464  --num_total_;
465  iterator_map_.fix_iterators_2end(i_bucket, key);
466  return 1;
467  }
468  else
469  return 0; // already deleted
470  }
471  // check external memory
472  else
473  {
475  = _find_key_external(bucket, key);
476 
477  external_size_type i_external = result.first;
478  value_type ext_value = result.second;
479 
480  // found in external memory; add delete-node
481  if (i_external < bucket.n_external_ && _eq(ext_value.first, key))
482  {
483  --num_total_;
484 
485  if (node)
486  node->set_next(_new_node(value_type(key, mapped_type()), node->next(), true));
487  else
488  bucket.list_ = _new_node(value_type(key, mapped_type()), bucket.list_, true);
489 
490  iterator_map_.fix_iterators_2end(i_bucket, key);
491 
492  ++buffer_size_;
493  if (buffer_size_ >= max_buffer_size_)
494  _rebuild_buckets();
495 
496  return 1;
497  }
498  // no value with given key
499  else
500  return 0;
501  }
502  }
503 
504  //! Erase value by key but without looking at external memory
505  //! \param key key for value to release
506  void erase_oblivious(const key_type& key)
507  {
508  internal_size_type i_bucket = _bkt_num(key);
509  bucket_type& bucket = buckets_[i_bucket];
510  node_type* node = _find_key_internal(bucket, key);
511 
512  // found value in internal-memory
513  if (node && _eq(node->value_.first, key))
514  {
515  if (!node->deleted())
516  {
517  --num_total_;
518  node->set_deleted(true);
519  iterator_map_.fix_iterators_2end(i_bucket, key);
520  }
521  }
522  // not found; ignore external memory and add delete-node
523  else
524  {
525  oblivious_ = true;
526  --num_total_;
527 
528  if (node)
529  node->set_next(_new_node(value_type(key, mapped_type()), node->next(), true));
530  else
531  bucket.list_ = _new_node(value_type(key, mapped_type()), bucket.list_, true);
532 
533  iterator_map_.fix_iterators_2end(i_bucket, key);
534 
535  ++buffer_size_;
536  if (buffer_size_ >= max_buffer_size_)
537  _rebuild_buckets();
538  }
539  }
540 
541  //! Reset hash-map: erase all values, invalidate all iterators
542  void clear()
543  {
544  STXXL_VERBOSE_HASH_MAP("clear()");
545 
546  iterator_map_.fix_iterators_all2end();
547  block_cache_.flush();
548  block_cache_.clear();
549 
550  // reset buckets and release buffer-memory
551  for (internal_size_type i_bucket = 0;
552  i_bucket < buckets_.size(); i_bucket++)
553  {
554  _erase_nodes(buckets_[i_bucket].list_, NULL);
555  buckets_[i_bucket] = bucket_type();
556  }
557  oblivious_ = false;
558  num_total_ = 0;
559  buffer_size_ = 0;
560 
561  // free external memory
562  block_manager* bm = block_manager::get_instance();
563  bm->delete_blocks(bids_.begin(), bids_.end());
564  bids_.clear();
565  }
566 
567  //! Exchange stored values with another hash-map
568  //! \param obj hash-map to swap values with
569  void swap(self_type& obj)
570  {
571  std::swap(buckets_, obj.buckets_);
572  std::swap(bids_, obj.bids_);
573 
574  std::swap(oblivious_, obj.oblivious_);
575  std::swap(num_total_, obj.num_total_);
576 
577  std::swap(node_allocator_, obj.node_allocator_);
578 
579  std::swap(hash_, obj.hash_);
580  std::swap(cmp_, obj.cmp_);
581 
582  std::swap(buffer_size_, obj.buffer_size_);
583  std::swap(max_buffer_size_, obj.max_buffer_size_);
584 
585  std::swap(opt_load_factor_, obj.opt_load_factor_);
586 
587  std::swap(iterator_map_, obj.iterator_map_);
588 
589  std::swap(block_cache_, obj.block_cache_);
590  }
591 
592 protected:
593  // find statistics
598 
599 public:
600  //! Reset hash-map statistics
602  {
603  block_cache_.reset_statistics();
604  n_subblocks_loaded = n_found_external = n_found_internal = n_not_found = 0;
605  }
606 
607  //! Print short general statistics to output stream
608  void print_statistics(std::ostream& o = std::cout) const
609  {
610  o << "Find-statistics:" << std::endl;
611  o << " Found internal : " << n_found_internal << std::endl;
612  o << " Found external : " << n_found_external << std::endl;
613  o << " Not found : " << n_not_found << std::endl;
614  o << " Subblocks searched : " << n_subblocks_loaded << std::endl;
615 
616  iterator_map_.print_statistics(o);
617  block_cache_.print_statistics(o);
618  }
619 
620  //! Look up value by key. Non-const access.
621  //! \param key key for value to look up
622  iterator find(const key_type& key)
623  {
624  if (buffer_size_ + 1 >= max_buffer_size_) // (*)
625  _rebuild_buckets();
626 
627  internal_size_type i_bucket = _bkt_num(key);
628  bucket_type& bucket = buckets_[i_bucket];
629  node_type* node = _find_key_internal(bucket, key);
630 
631  // found in internal-memory buffer
632  if (node && _eq(node->value_.first, key)) {
633  n_found_internal++;
634  if (node->deleted())
635  return this->_end<iterator>();
636  else
637  return iterator(this, i_bucket, node, 0, src_internal, false, key);
638  }
639  // search external elements
640  else {
642  = _find_key_external(bucket, key);
643 
644  external_size_type i_external = result.first;
645  value_type value = result.second;
646 
647  // found in external memory
648  if (i_external < bucket.n_external_ && _eq(value.first, key)) {
649  n_found_external++;
650 
651  // we ultimately expect the user to de-reference the returned
652  // iterator to change its value (non-const!). to prevent an
653  // additional disk-access, we create a new node in the
654  // internal-memory buffer overwriting the external value.
655  // note: by checking and rebuilding (if neccessary) in (*) we
656  // made sure that the new node will fit into the buffer and no
657  // rebuild is neccessary here.
658  node_type* new_node =
659  node
660  ? node->set_next(_new_node(value, node->next(), false))
661  : (bucket.list_ = _new_node(value, bucket.list_, false));
662 
663  ++buffer_size_;
664 
665  iterator_map_.fix_iterators_2int(i_bucket, value.first, new_node);
666 
667  return iterator(this, i_bucket, new_node, i_external + 1, src_internal, true, key);
668  }
669  // not found in external memory
670  else {
671  n_not_found++;
672  return this->_end<iterator>();
673  }
674  }
675  }
676 
677  //! Look up value by key. Const access.
678  //! \param key key for value to look up
679  const_iterator find(const key_type& key) const
680  {
681  internal_size_type i_bucket = _bkt_num(key);
682  const bucket_type& bucket = buckets_[i_bucket];
683  node_type* node = _find_key_internal(bucket, key);
684 
685  // found in internal-memory buffer
686  if (node && _eq(node->value_.first, key)) {
687  n_found_internal++;
688  if (node->deleted())
689  return this->_end<const_iterator>();
690  else
691  return const_iterator((self_type*)this, i_bucket, node, 0, src_internal, false, key);
692  }
693  // search external elements
694  else {
696  = _find_key_external(bucket, key);
697 
698  external_size_type i_external = result.first;
699  value_type value = result.second;
700 
701  // found in external memory
702  if (i_external < bucket.n_external_ && _eq(value.first, key)) {
703  n_found_external++;
704  return const_iterator((self_type*)this, i_bucket, node, i_external, src_external, true, key);
705  }
706  // not found in external memory
707  else {
708  n_not_found++;
709  return this->_end<const_iterator>();
710  }
711  }
712  }
713 
714  //! Number of values with given key
715  //! \param k key for value to look up
716  //! \return 0 or 1 depending on the presence of a value with the given key
718  {
719  const_iterator cit = find(k);
720  return (cit == end()) ? 0 : 1;
721  }
722 
723  //! Finds a range containing all values with given key. Non-const access
724  //! \param key key to look for#
725  //! \return range may be empty or contains exactly one value
726  std::pair<iterator, iterator> equal_range(const key_type& key)
727  {
728  iterator it = find(key);
729  return std::pair<iterator, iterator>(it, it);
730  }
731 
732  //! Finds a range containing all values with given key. Const access
733  //! \param key key to look for#
734  //! \return range may be empty or contains exactly one value
735  std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const
736  {
737  const_iterator cit = find(key);
738  return std::pair<const_iterator, const_iterator>(cit, cit);
739  }
740 
741  //! Convenience operator to quickly insert or find values. Use with caution
742  //! since using this operator will check external-memory.
743  mapped_type& operator [] (const key_type& key)
744  {
745  if (buffer_size_ + 1 >= max_buffer_size_) // (*)
746  _rebuild_buckets();
747 
748  internal_size_type i_bucket = _bkt_num(key);
749  bucket_type& bucket = buckets_[i_bucket];
750  node_type* node = _find_key_internal(bucket, key);
751 
752  // found in internal-memory buffer
753  if (node && _eq(node->value_.first, key)) {
754  if (node->deleted()) {
755  node->set_deleted(false);
756  node->value_.second = mapped_type();
757  ++num_total_;
758  }
759  return node->value_.second;
760  }
761  // search external elements
762  else {
764  = _find_key_external(bucket, key);
765 
766  external_size_type i_external = result.first;
767  value_type found_value = result.second;
768 
769  value_type buffer_value =
770  (i_external < bucket.n_external_ && _eq(found_value.first, key))
771  ? found_value
772  : value_type(key, mapped_type());
773 
774  // add a new node to the buffer. this new node's value overwrites
775  // the external value if it was found and otherwise is set to (key,
776  // mapped_type())
777  node_type* new_node =
778  node
779  ? node->set_next(_new_node(buffer_value, node->next(), false))
780  : (bucket.list_ = _new_node(buffer_value, bucket.list_, false));
781 
782  ++buffer_size_;
783  // note that we already checked the buffer-size in (*)
784 
785  return new_node->value_.second;
786  }
787  }
788 
789  //! Number of buckets
791  { return buckets_.size(); }
792 
793  //! Maximum number of buckets
795  { return (internal_size_type)(max_size() / subblock_size); }
796 
797  //! Bucket-index for values with given key.
799  { return _bkt_num(k); }
800 
801 public:
802  //! Average number of (sub)blocks occupied by a bucket.
803  float load_factor() const
804  { return (float)num_total_ / ((float)subblock_size * (float)buckets_.size()); }
805 
806  //! Get desired load-factor
807  float opt_load_factor() const { return opt_load_factor_; }
808 
809  //! Set desired load-factor
810  void opt_load_factor(float z)
811  {
812  opt_load_factor_ = z;
813  if (load_factor() > opt_load_factor_)
814  _rebuild_buckets();
815  }
816 
817  //! Rehash with (at least) n buckets
819  {
820  _rebuild_buckets(n);
821  }
822 
823  //! Number of bytes occupied by buffer
825  {
826  // buffer-size internally stored as number of nodes
827  return buffer_size_ * sizeof(node_type);
828  }
829 
830  //! Maximum buffer size in byte
832  {
833  return max_buffer_size_ * sizeof(node_type);
834  }
835 
836  //! Set maximum buffer size
837  //! \param buffer_size new size in byte
839  {
840  max_buffer_size_ = buffer_size / sizeof(node_type);
841  if (buffer_size_ >= max_buffer_size_)
842  _rebuild_buckets();
843  }
844 
845 protected:
846  //! iterator pointing to the beginnning of the hash-map
847  template <class Iterator>
848  Iterator _begin() const
849  {
850  self_type* non_const_this = (self_type*)this;
851 
852  if (buckets_.size() == 0)
853  return _end<Iterator>();
854 
855  // correct key will be set by find_next()
856  Iterator it(non_const_this, 0, buckets_[0].list_,
857  0, src_unknown, true, key_type());
858  it.find_next();
859 
860  return it;
861  }
862 
863  //! iterator pointing to the end of the hash-map (iterator-type as
864  //! template-parameter)
865  template <class Iterator>
866  Iterator _end() const
867  {
868  self_type* non_const_this = (self_type*)this;
869  return Iterator(non_const_this);
870  }
871 
872 public:
873  //! Returns an iterator pointing to the beginning of the hash-map
874  iterator begin() { return _begin<iterator>(); }
875 
876  //! Returns a const_interator pointing to the beginning of the hash-map
877  const_iterator begin() const { return _begin<const_iterator>(); }
878 
879  //! Returns an iterator pointing to the end of the hash-map
880  iterator end() { return _end<iterator>(); }
881 
882  //! Returns a const_iterator pointing to the end of the hash-map
883  const_iterator end() const { return _end<const_iterator>(); }
884 
885 protected:
886  //! Allocate a new buffer-node
887  node_type * _get_node()
888  {
889  return node_allocator_.allocate(1);
890  }
891 
892  //! Free given node
893  void _put_node(node_type* node)
894  {
895  node_allocator_.deallocate(node, 1);
896  }
897 
898  //! Allocate a new buffer-node and initialize with given value, node and
899  //! deleted-flag
900  node_type * _new_node(const value_type& value, node_type* nxt, bool del)
901  {
902  node_type* node = _get_node();
903  node->value_ = value;
904  node->set_next(nxt);
905  node->set_deleted(del);
906  return node;
907  }
908 
909  //! Free nodes in range [first, last). If last is NULL all nodes will be
910  //! freed.
911  void _erase_nodes(node_type* first, node_type* last)
912  {
913  node_type* curr = first;
914  while (curr != last)
915  {
916  node_type* next = curr->next();
917  _put_node(curr);
918  curr = next;
919  }
920  }
921 
922  //! Bucket-index for values with given key
924  {
925  return _bkt_num(key, buckets_.size());
926  }
927 
928  /*!
929  * Bucket-index for values with given key. The total number of buckets has
930  * to be specified as well. The bucket number is determined by \f$
931  * bucket_num = (hash/max_hash)*n_buckets \f$ max_hash is in fact 2^63-1
932  * (internal_size_type=uint64 (or uint32)) but we rather divide by 2^64, so
933  * we can use plain integer arithmetic easily (there should be only a small
934  * difference): this way we must only calculate the upper 64 bits of the
935  * product hash*n_buckets and we're done. See
936  * http://www.cs.uaf.edu/~cs301/notes/Chapter5/node5.html
937  */
939  {
940  //! TODO maybe specialize double arithmetic to integer. the old code
941  //! was faulty -tb.
942  return (internal_size_type)(
943  (double)n * ((double)hash_(key) / (double)std::numeric_limits<internal_size_type>::max())
944  );
945  }
946 
947  /*!
948  * Locate the given key in the internal-memory chained list. If the key is
949  * not present, the node with the biggest key smaller than the given key is
950  * returned. Note that the returned value may be zero: either because the
951  * chained list is empty or because the given key is smaller than all other
952  * keys in the chained list.
953  */
954  node_type*
955  _find_key_internal(const bucket_type& bucket, const key_type& key) const
956  {
957  node_type* old = NULL;
958  for (node_type* curr = bucket.list_;
959  curr && _leq(curr->value_.first, key);
960  curr = curr->next())
961  {
962  old = curr;
963  }
964  return old;
965  }
966 
967  /*!
968  * Search for key in external part of bucket. Return value is (i_external,
969  * value), where i_ext = bucket._num_external if key could not be found.
970  */
972  _find_key_external(const bucket_type& bucket, const key_type& key) const
973  {
974  subblock_type* subblock;
975 
976  // number of subblocks occupied by bucket
977  internal_size_type n_subblocks = (internal_size_type)(
978  bucket.n_external_ / subblock_size
979  );
980  if (bucket.n_external_ % subblock_size != 0)
981  n_subblocks++;
982 
983  for (internal_size_type i_subblock = 0;
984  i_subblock < n_subblocks; i_subblock++)
985  {
986  subblock = _load_subblock(bucket, i_subblock);
987  // number of values in i-th subblock
988  internal_size_type n_values =
989  (i_subblock + 1 < n_subblocks)
990  ? (internal_size_type)subblock_size
991  : (internal_size_type)(
992  bucket.n_external_ - i_subblock * subblock_size
993  );
994 
995  //! TODO: replace with bucket.n_external_ % subblock_size
996 
997  // biggest key in current subblock still too small => next subblock
998  if (_lt((*subblock)[n_values - 1].first, key))
999  continue;
1000 
1001  // binary search in current subblock
1002  internal_size_type i_lower = 0, i_upper = n_values;
1003  while (i_lower + 1 != i_upper)
1004  {
1005  internal_size_type i_middle = (i_lower + i_upper) / 2;
1006  if (_leq((*subblock)[i_middle].first, key))
1007  i_lower = i_middle;
1008  else
1009  i_upper = i_middle;
1010  }
1011 
1012  value_type value = (*subblock)[i_lower];
1013 
1014  if (_eq(value.first, key))
1016  (i_subblock * subblock_size + i_lower, value);
1017  else
1019  (bucket.n_external_, value_type());
1020  }
1021 
1023  (bucket.n_external_, value_type());
1024  }
1025 
1026  /*!
1027  * Load the given bucket's i-th subblock.
1028  * Since a bucket may be spread over several blocks, we must
1029  * 1. determine in which block the requested subblock is located
1030  * 2. at which position within the obove-mentioned block the questioned subblock is located
1031  */
1032  subblock_type*
1033  _load_subblock(const bucket_type& bucket, internal_size_type which_subblock) const
1034  {
1035  n_subblocks_loaded++;
1036 
1037  // index of the requested subblock counted from the very beginning of
1038  // the bucket's first block
1039  external_size_type i_abs_subblock = bucket.i_subblock_ + which_subblock;
1040 
1041  /* 1. */
1042  bid_type bid = bids_[bucket.i_block_ + (internal_size_type)(i_abs_subblock / subblocks_per_block)];
1043  /* 2. */
1044  internal_size_type i_subblock_within = (internal_size_type)(i_abs_subblock % subblocks_per_block);
1045 
1046  return block_cache_.get_subblock(bid, i_subblock_within);
1047  }
1048 
1050 
1051  //! Functor to extracts the actual value from a HashedValue-struct
1053  {
1054  value_type& operator () (hashed_value_type& hvalue)
1055  { return hvalue.value_; }
1056  };
1057 
1058  /*!
1059  * Will return from its input-stream all values that are to be stored in
1060  * the given bucket. Those values must appear in consecutive order
1061  * beginning with the input-stream's current value.
1062  */
1063  template <class InputStream, class ValueExtractor>
1065  {
1066  typedef typename InputStream::value_type value_type;
1067 
1069  InputStream& input_;
1073  bool empty_;
1074  ValueExtractor vextract_;
1075 
1076  HashingStream(InputStream& input, internal_size_type i_bucket,
1077  ValueExtractor vextract, self_type* map)
1078  : map_(map),
1079  input_(input),
1080  i_bucket_(i_bucket),
1081  bucket_size_(0),
1082  vextract_(vextract)
1083  {
1084  empty_ = find_next();
1085  }
1086 
1087  const value_type& operator * () { return value_; }
1088 
1089  bool empty() const { return empty_; }
1090 
1092  {
1093  ++input_;
1094  empty_ = find_next();
1095  }
1096 
1097  bool find_next()
1098  {
1099  if (input_.empty())
1100  return true;
1101  value_ = *input_;
1102  if (map_->_bkt_num(vextract_(value_).first) != i_bucket_)
1103  return true;
1104 
1105  ++bucket_size_;
1106  return false;
1107  }
1108  };
1109 
1110  /* Rebuild hash-map. The desired number of buckets may be supplied. */
1112  {
1113  STXXL_VERBOSE_HASH_MAP("_rebuild_buckets()");
1114 
1116  typedef HashedValuesStream<self_type, reader_type> values_stream_type;
1117  typedef HashingStream<values_stream_type, HashedValueExtractor> hashing_stream_type;
1118 
1119  const int_type write_buffer_size = config::get_instance()->disks_number() * 4;
1120 
1121  // determine new number of buckets from desired load_factor ...
1122  internal_size_type n_new;
1123  n_new = (internal_size_type)ceil((double)num_total_ / ((double)subblock_size * (double)opt_load_factor()));
1124 
1125  // ... but give the user the chance to request even more buckets
1126  if (n_desired > n_new)
1127  n_new = std::min<internal_size_type>(n_desired, max_bucket_count());
1128 
1129  // allocate new buckets and bids
1130  buckets_container_type old_buckets(n_new);
1131  std::swap(buckets_, old_buckets);
1132 
1133  bid_container_type old_bids;
1134  std::swap(bids_, old_bids);
1135 
1136  // read stored values in consecutive order
1137 
1138  // use new to control point of destruction (see below)
1139  reader_type* reader
1140  = new reader_type(old_bids.begin(), old_bids.end(), block_cache_);
1141 
1142  values_stream_type values_stream(old_buckets.begin(), old_buckets.end(),
1143  *reader, old_bids.begin(), *this);
1144 
1145  writer_type writer(&bids_, write_buffer_size, write_buffer_size / 2);
1146 
1147  // re-distribute values among new buckets.
1148 
1149  // this makes use of the fact that if value1 preceeds value2 before
1150  // resizing, value1 will preceed value2 after resizing as well (uniform
1151  // rehashing)
1152  num_total_ = 0;
1153  for (internal_size_type i_bucket = 0;
1154  i_bucket < buckets_.size(); i_bucket++)
1155  {
1156  buckets_[i_bucket] = bucket_type();
1157  buckets_[i_bucket].i_block_ = writer.i_block();
1158  buckets_[i_bucket].i_subblock_ = writer.i_subblock();
1159 
1160  hashing_stream_type hasher(values_stream, i_bucket, HashedValueExtractor(), this);
1161  external_size_type i_ext = 0;
1162  while (!hasher.empty())
1163  {
1164  const hashed_value_type& hvalue = *hasher;
1165  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, i_ext);
1166 
1167  writer.append(hvalue.value_);
1168  ++hasher;
1169  ++i_ext;
1170  }
1171 
1172  writer.finish_subblock();
1173  buckets_[i_bucket].n_external_ = hasher.bucket_size_;
1174  num_total_ += hasher.bucket_size_;
1175  }
1176  writer.flush();
1177  // reader must be deleted before deleting old_bids because its
1178  // destructor will dereference the bid-iterator
1179  delete reader;
1180  block_cache_.clear();
1181 
1182  // get rid of old blocks and buckets
1184  bm->delete_blocks(old_bids.begin(), old_bids.end());
1185 
1186  for (internal_size_type i_bucket = 0;
1187  i_bucket < old_buckets.size(); i_bucket++)
1188  {
1189  _erase_nodes(old_buckets[i_bucket].list_, NULL);
1190  old_buckets[i_bucket] = bucket_type();
1191  }
1192 
1193  buffer_size_ = 0;
1194  oblivious_ = false;
1195  }
1196 
1197  /*!
1198  * Stream for filtering duplicates. Used to eliminate duplicated values
1199  * when bulk-inserting Note: input has to be sorted, so that duplicates
1200  * will occure in row
1201  */
1202  template <class InputStream>
1204  {
1205  typedef typename InputStream::value_type value_type;
1207  InputStream& in_;
1208 
1209  UniqueValueStream(InputStream& input, self_type& map)
1210  : map_(map), in_(input)
1211  { }
1212 
1213  bool empty() const { return in_.empty(); }
1214 
1215  const value_type& operator * () { return *in_; }
1216 
1218  {
1219  value_type v_old = *in_;
1220  ++in_;
1221  while (!in_.empty() && v_old.first == (*in_).first)
1222  ++in_;
1223  }
1224  };
1225 
1226  template <class InputStream>
1228  {
1229  //! (hash,value)
1230  typedef std::pair<internal_size_type, typename InputStream::value_type> value_type;
1232  InputStream& in_;
1233 
1234  AddHashStream(InputStream& input, self_type& map)
1235  : map_(map), in_(input)
1236  { }
1237 
1238  bool empty() const { return in_.empty(); }
1239 
1240  value_type operator * ()
1241  { return value_type(map_.hash_((*in_).first), *in_); }
1242 
1243  void operator ++ () { ++in_; }
1244  };
1245 
1246  /*!
1247  * Extracts the value-part (ignoring the hashvalue); required by
1248  * HashingStream (see above)
1249  */
1251  {
1252  const value_type& operator () (std::pair<internal_size_type, value_type>& v)
1253  { return v.second; }
1254  };
1255 
1256  /*!
1257  * Comparator object for values as required by stxxl::sort. Sorting is done
1258  * lexicographically by <hash-value, key> Note: the hash-value has already
1259  * been computed.
1260  */
1261  struct Cmp : public std::binary_function<
1262  std::pair<internal_size_type, value_type>,
1263  std::pair<internal_size_type, value_type>, bool
1264  >
1265  {
1267  Cmp(self_type& map) : map_(map) { }
1268 
1269  bool operator () (const std::pair<internal_size_type, value_type>& a,
1270  const std::pair<internal_size_type, value_type>& b) const
1271  {
1272  return (a.first < b.first) ||
1273  ((a.first == b.first) && map_.cmp_(a.second.first, b.second.first));
1274  }
1275  std::pair<internal_size_type, value_type> min_value() const
1276  {
1277  return std::pair<internal_size_type, value_type>(
1279  value_type(map_.cmp_.min_value(), mapped_type())
1280  );
1281  }
1282  std::pair<internal_size_type, value_type> max_value() const
1283  {
1284  return std::pair<internal_size_type, value_type>(
1286  value_type(map_.cmp_.max_value(), mapped_type())
1287  );
1288  }
1289  };
1290 
1291 public:
1292  //! Bulk-insert of values in the range [f, l)
1293  //! \param f beginning of the range
1294  //! \param l end of the range
1295  //! \param mem internal memory that may be used (note: this memory will be used additionally to the buffer). The more the better
1296  template <class InputIterator>
1297  void insert(InputIterator f, InputIterator l, internal_size_type mem)
1298  {
1299  //! values already stored in the hashtable ("old values")
1300  typedef HashedValuesStream<self_type, reader_type> old_values_stream;
1301  //! old values, that are to be stored in a certain (new) bucket
1302  typedef HashingStream<old_values_stream, HashedValueExtractor> old_hashing_stream;
1303 
1304  //! values to insert ("new values")
1306 
1307  //! new values with added hash: (hash, (key, mapped))
1308  typedef AddHashStream<input_stream> new_values_stream;
1309  //! new values sorted by <hash-value, key>
1310  typedef stxxl::stream::sort<new_values_stream, Cmp> new_sorted_values_stream;
1311  //! new values sorted by <hash-value, key> with duplicates eliminated
1312  typedef UniqueValueStream<new_sorted_values_stream> new_unique_values_stream;
1313  //! new values, that are to be stored in a certain bucket
1314  typedef HashingStream<new_unique_values_stream, StripHashFunctor> new_hashing_stream;
1315 
1317 
1318  int_type write_buffer_size = config::get_instance()->disks_number() * 2;
1319 
1320  // calculate new number of buckets
1321  external_size_type num_total_new = num_total_ + (l - f); // estimated number of elements
1322  external_size_type n_buckets_new = (external_size_type)ceil((double)num_total_new / ((double)subblock_size * (double)opt_load_factor()));
1323  if (n_buckets_new > max_bucket_count())
1324  n_buckets_new = max_bucket_count();
1325 
1326  STXXL_VERBOSE_HASH_MAP("insert() items=" << (l - f) << " buckets_new=" << n_buckets_new);
1327 
1328  // prepare new buckets and bids
1329  buckets_container_type old_buckets((internal_size_type)n_buckets_new);
1330  std::swap(buckets_, old_buckets);
1331  // writer will allocate new blocks as necessary
1332  bid_container_type old_bids;
1333  std::swap(bids_, old_bids);
1334 
1335  // already stored values ("old values")
1336  reader_type* reader = new reader_type(old_bids.begin(), old_bids.end(),
1337  block_cache_);
1338  old_values_stream old_values(old_buckets.begin(), old_buckets.end(),
1339  *reader, old_bids.begin(), *this);
1340 
1341  // values to insert ("new values")
1342  input_stream input = stxxl::stream::streamify(f, l);
1343  new_values_stream new_values(input, *this);
1344  new_sorted_values_stream new_sorted_values(new_values, Cmp(*this), mem);
1345  new_unique_values_stream new_unique_values(new_sorted_values, *this);
1346 
1347  writer_type writer(&bids_, write_buffer_size, write_buffer_size / 2);
1348 
1349  num_total_ = 0;
1350  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++)
1351  {
1352  buckets_[i_bucket] = bucket_type();
1353  buckets_[i_bucket].i_block_ = writer.i_block();
1354  buckets_[i_bucket].i_subblock_ = writer.i_subblock();
1355 
1356  old_hashing_stream old_hasher(old_values, i_bucket, HashedValueExtractor(), this);
1357  new_hashing_stream new_hasher(new_unique_values, i_bucket, StripHashFunctor(), this);
1358  internal_size_type bucket_size = 0;
1359 
1360  // more old and new values for the current bucket => choose smallest
1361  while (!old_hasher.empty() && !new_hasher.empty())
1362  {
1363  internal_size_type old_hash = hash_((*old_hasher).value_.first);
1364  internal_size_type new_hash = (*new_hasher).first;
1365  key_type old_key = (*old_hasher).value_.first;
1366  key_type new_key = (*new_hasher).second.first;
1367 
1368  // old value wins
1369  if ((old_hash < new_hash) || (old_hash == new_hash && cmp_(old_key, new_key))) // (_lt((*old_hasher)._value.first, (*new_hasher).second.first))
1370  {
1371  const hashed_value_type& hvalue = *old_hasher;
1372  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, bucket_size);
1373  writer.append(hvalue.value_);
1374  ++old_hasher;
1375  }
1376  // new value smaller or equal => new value wins
1377  else
1378  {
1379  if (_eq(old_key, new_key))
1380  {
1381  const hashed_value_type& hvalue = *old_hasher;
1382  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, bucket_size);
1383  ++old_hasher;
1384  }
1385  writer.append((*new_hasher).second);
1386  ++new_hasher;
1387  }
1388  ++bucket_size;
1389  }
1390  // no more new values for the current bucket
1391  while (!old_hasher.empty())
1392  {
1393  const hashed_value_type& hvalue = *old_hasher;
1394  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, bucket_size);
1395  writer.append(hvalue.value_);
1396  ++old_hasher;
1397  ++bucket_size;
1398  }
1399  // no more old values for the current bucket
1400  while (!new_hasher.empty())
1401  {
1402  writer.append((*new_hasher).second);
1403  ++new_hasher;
1404  ++bucket_size;
1405  }
1406 
1407  writer.finish_subblock();
1408  buckets_[i_bucket].n_external_ = bucket_size;
1409  num_total_ += bucket_size;
1410  }
1411  writer.flush();
1412  delete reader;
1413  block_cache_.clear();
1414 
1415  // release old blocks
1417  bm->delete_blocks(old_bids.begin(), old_bids.end());
1418 
1419  // free nodes in old bucket lists
1420  for (internal_size_type i_bucket = 0;
1421  i_bucket < old_buckets.size(); i_bucket++)
1422  {
1423  _erase_nodes(old_buckets[i_bucket].list_, NULL);
1424  old_buckets[i_bucket] = bucket_type();
1425  }
1426 
1427  buffer_size_ = 0;
1428  oblivious_ = false;
1429  }
1430 
1431 protected:
1432  /* 1 iff a < b
1433  The comparison is done lexicographically by (hash-value, key)
1434  */
1435  bool _lt(const key_type& a, const key_type& b) const
1436  {
1437  internal_size_type hash_a = hash_(a);
1438  internal_size_type hash_b = hash_(b);
1439 
1440  return (hash_a < hash_b) ||
1441  ((hash_a == hash_b) && cmp_(a, b));
1442  }
1443 
1444  //! true iff a > b
1445  bool _gt(const key_type& a, const key_type& b) const { return _lt(b, a); }
1446  //! true iff a <= b
1447  bool _leq(const key_type& a, const key_type& b) const { return !_gt(a, b); }
1448  //! true iff a >= b
1449  bool _geq(const key_type& a, const key_type& b) const { return !_lt(a, b); }
1450 
1451  //! true iff a == b. note: it is mandatory that equal keys yield equal
1452  //! hash-values => hashing not neccessary for equality-testing.
1453  bool _eq(const key_type& a, const key_type& b) const
1454  { return !cmp_(a, b) && !cmp_(b, a); }
1455 
1459  friend class iterator_map<self_type>;
1460  friend class block_cache<block_type>;
1461  friend struct HashedValuesStream<self_type, reader_type>;
1462 
1463 #if 1
1465  {
1466  reader_type reader(bids_.begin(), bids_.end(), &block_cache_);
1467 
1468  for (internal_size_type i_block = 0; i_block < bids_.size(); i_block++) {
1469  std::cout << "block " << i_block << ":\n";
1470 
1471  for (internal_size_type i_subblock = 0; i_subblock < subblocks_per_block; i_subblock++) {
1472  std::cout << " subblock " << i_subblock << ":\n ";
1473 
1474  for (external_size_type i_element = 0; i_element < subblocks_per_block; i_element++) {
1475  std::cout << reader.const_value().first << ", ";
1476  ++reader;
1477  }
1478  std::cout << std::endl;
1479  }
1480  }
1481  }
1482 
1484  {
1485  reader_type reader(bids_.begin(), bids_.end(), &block_cache_);
1486 
1487  std::cout << "number of buckets: " << buckets_.size() << std::endl;
1488  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++) {
1489  const bucket_type& bucket = buckets_[i_bucket];
1490  reader.skip_to(bids_.begin() + bucket.i_block_, bucket.i_subblock_);
1491 
1492  std::cout << " bucket " << i_bucket << ": block=" << bucket.i_block_ << ", subblock=" << bucket.i_subblock_ << ", external=" << bucket.n_external_ << std::endl;
1493 
1494  node_type* node = bucket.list_;
1495  std::cout << " internal_list=";
1496  while (node) {
1497  std::cout << node->value_.first << " (del=" << node->deleted() << "), ";
1498  node = node->next();
1499  }
1500  std::cout << std::endl;
1501 
1502  std::cout << " external=";
1503  for (external_size_type i_element = 0; i_element < bucket.n_external_; i_element++) {
1504  std::cout << reader.const_value().first << ", ";
1505  ++reader;
1506  }
1507  std::cout << std::endl;
1508  }
1509  }
1510 
1512  {
1513  std::cout << "number of buckets: " << buckets_.size() << std::endl;
1514  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++) {
1515  const bucket_type& bucket = buckets_[i_bucket];
1516  std::cout << " bucket " << i_bucket << ": block=" << bucket.i_block_ << ", subblock=" << bucket.i_subblock_ << ", external=" << bucket.n_external_ << ", list=" << bucket.list_ << std::endl;
1517  }
1518  }
1519 #endif
1520 
1521 public:
1522  //! Construct an equality predicate from the comparison operator
1523  struct equal_to : public std::binary_function<key_type, key_type, bool>
1524  {
1525  //! reference to hash_map
1527 
1528  //! constructor requires reference to hash_map
1529  equal_to(const self_type& map) : m_map(map) { }
1530 
1531  //! return whether the arguments compare equal (x==y).
1532  bool operator () (const key_type& x, const key_type& y) const
1533  {
1534  return m_map._eq(x, y);
1535  }
1536 
1537  //! C++11 required type
1539  //! C++11 required type
1541  //! C++11 required type
1542  typedef bool result_type;
1543  };
1544 
1545  //! Type of constructed equality predicate
1546  typedef equal_to key_equal;
1547 
1548  //! Constructed equality predicate used by this hash-map
1549  key_equal key_eq() const
1550  {
1551  return equal_to(*this);
1552  }
1553 
1554 public:
1555  //! Even more statistics: Number of buckets, number of values, buffer-size,
1556  //! values per bucket
1557  void print_load_statistics(std::ostream& o = std::cout) const
1558  {
1559  external_size_type sum_external = 0;
1560  external_size_type square_sum_external = 0;
1561  external_size_type max_external = 0;
1562 
1563  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++)
1564  {
1565  const bucket_type& b = buckets_[i_bucket];
1566 
1567  sum_external += b.n_external_;
1568  square_sum_external += b.n_external_ * b.n_external_;
1569  if (b.n_external_ > max_external)
1570  max_external = b.n_external_;
1571  }
1572 
1573  double avg_external = (double)sum_external / (double)buckets_.size();
1574  double std_external = sqrt(((double)square_sum_external / (double)buckets_.size()) - (avg_external * avg_external));
1575 
1576  o << "Bucket count : " << buckets_.size() << std::endl;
1577  o << "Values total : " << num_total_ << std::endl;
1578  o << "Values buffered : " << buffer_size_ << std::endl;
1579  o << "Max Buffer-Size : " << max_buffer_size_ << std::endl;
1580  o << "Max external/bucket : " << max_external << std::endl;
1581  o << "Avg external/bucket : " << avg_external << std::endl;
1582  o << "Std external/bucket : " << std_external << std::endl;
1583  o << "Load-factor : " << load_factor() << std::endl;
1584  o << "Blocks allocated : " << bids_.size() << " => " << (bids_.size() * block_type::raw_size) << " bytes" << std::endl;
1585  o << "Bytes per value : " << ((double)(bids_.size() * block_type::raw_size) / (double)num_total_) << std::endl;
1586  }
1587 }; /* end of class hash_map */
1588 
1589 } // namespace hash_map
1590 
1592 
1593 namespace std {
1594 
1595 template <class KeyType, class MappedType, class HashType, class KeyCompareType,
1596  unsigned SubBlockSize, unsigned SubBlocksPerBlock, class AllocType>
1597 void swap(stxxl::hash_map::hash_map<KeyType, MappedType, HashType, KeyCompareType,
1598  SubBlockSize, SubBlocksPerBlock, AllocType>& a,
1599  stxxl::hash_map::hash_map<KeyType, MappedType, HashType, KeyCompareType,
1600  SubBlockSize, SubBlocksPerBlock, AllocType>& b)
1601 {
1602  if (&a != &b)
1603  a.swap(b);
1604 }
1605 
1606 } // namespace std
1607 
1608 #endif // !STXXL_CONTAINERS_HASH_MAP_HASH_MAP_HEADER
UniqueValueStream(InputStream &input, self_type &map)
Definition: hash_map.h:1209
iterator2stream< InputIterator > streamify(InputIterator begin, InputIterator end)
Input iterator range to stream converter.
Definition: stream.h:93
void print_statistics(std::ostream &o=std::cout) const
Print short general statistics to output stream.
Definition: hash_map.h:608
external_size_type n_not_found
Definition: hash_map.h:597
Additional information about a stored value:
Definition: util.h:432
internal_size_type i_bucket_
index of current bucket
Definition: iterator.h:65
KeyType key_type
type of the keys being used
Definition: hash_map.h:68
AddHashStream(InputStream &input, self_type &map)
Definition: hash_map.h:1234
void _put_node(node_type *node)
Free given node.
Definition: hash_map.h:893
void rehash(internal_size_type n=0)
Rehash with (at least) n buckets.
Definition: hash_map.h:818
void erase(const_iterator it)
Erase value by iterator.
Definition: hash_map.h:420
second_type second
Second tuple component.
Definition: tuple.h:98
void fix_iterators_all2end()
Update all iterators and make them point to the end of the hash-map (used by clear()) ...
Definition: iterator_map.h:206
ValueType value_
Definition: util.h:34
void reset_statistics()
Reset all counters to zero.
Definition: block_cache.h:518
iterator_map< self_type > iterator_map_type
for tracking active iterators
Definition: hash_map.h:132
Block manager class.
Definition: block_manager.h:61
void insert(InputIterator f, InputIterator l, internal_size_type mem)
Bulk-insert of values in the range [f, l)
Definition: hash_map.h:1297
internal_size_type buffer_size_
size of internal-memory buffer in number of entries
Definition: hash_map.h:150
void print_statistics(std::ostream &o=std::cout) const
Definition: iterator_map.h:246
long long int int64
Definition: types.h:38
key_type first_argument_type
C++11 required type.
Definition: hash_map.h:1538
std::vector< bid_type > bid_container_type
container for block-bids
Definition: hash_map.h:118
A model of stream that retrieves the data from an input iterator. For convenience use streamify funct...
Definition: stream.h:46
internal_size_type buffer_size() const
Number of bytes occupied by buffer.
Definition: hash_map.h:824
float opt_load_factor() const
Get desired load-factor.
Definition: hash_map.h:807
iterator begin()
Returns an iterator pointing to the beginning of the hash-map.
Definition: hash_map.h:874
Will return from its input-stream all values that are to be stored in the given bucket.
Definition: hash_map.h:1064
Iterator _begin() const
iterator pointing to the beginnning of the hash-map
Definition: hash_map.h:848
void print_load_statistics(std::ostream &o=std::cout) const
Even more statistics: Number of buckets, number of values, buffer-size, values per bucket...
Definition: hash_map.h:1557
bool deleted()
check if the next node is deleted.
Definition: util.h:37
node< value_type > node_type
nodes for internal-memory buffer
Definition: hash_map.h:125
key_type second_argument_type
C++11 required type.
Definition: hash_map.h:1540
equal_to(const self_type &map)
constructor requires reference to hash_map
Definition: hash_map.h:1529
hash_map_const_iterator< self_type > const_iterator
Definition: hash_map.h:94
external_size_type max_size() const
The hash-map may store up to this number of values.
Definition: hash_map.h:290
node_type * _find_key_internal(const bucket_type &bucket, const key_type &key) const
Locate the given key in the internal-memory chained list.
Definition: hash_map.h:955
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.h:234
subblock_type * get_subblock(const bid_type &bid, unsigned_type i_subblock)
Retrieve a subblock from the cache. If not yet cached, only the subblock will be loaded.
Definition: block_cache.h:361
const self_type & m_map
reference to hash_map
Definition: hash_map.h:1526
void max_buffer_size(internal_size_type buffer_size)
Set maximum buffer size.
Definition: hash_map.h:838
key_equal key_eq() const
Constructed equality predicate used by this hash-map.
Definition: hash_map.h:1549
hash_map(internal_size_type n=0, const hasher &hf=hasher(), const key_compare &cmp=key_compare(), internal_size_type buffer_size=128 *1024 *1024, const allocator_type &a=allocator_type())
Construct a new hash-map.
Definition: hash_map.h:176
first_type first
First tuple component.
Definition: tuple.h:96
void fix_iterators_2ext(internal_size_type i_bucket_old, const key_type &key, internal_size_type i_bucket_new, external_size_type i_ext)
Update iterators with given key and bucket and make them point to the specified location in external ...
Definition: iterator_map.h:127
bool _gt(const key_type &a, const key_type &b) const
true iff a &gt; b
Definition: hash_map.h:1445
hasher hash_function() const
Hash-function used by this hash-map.
Definition: hash_map.h:240
uint_pair & operator++()
prefix increment operator (directly manipulates the integer parts)
Definition: uint_types.h:163
node_allocator_type node_allocator_
used to allocate new nodes for internal buffer
Definition: hash_map.h:158
block_type::bid_type bid_type
block-identifier for blocks
Definition: hash_map.h:116
tuple< external_size_type, value_type > _find_key_external(const bucket_type &bucket, const key_type &key) const
Search for key in external part of bucket.
Definition: hash_map.h:972
external_size_type num_total_
(estimated) number of values
Definition: hash_map.h:163
stxxl::internal_size_type internal_size_type
Definition: hash_map.h:83
iterator end()
Returns an iterator pointing to the end of the hash-map.
Definition: hash_map.h:880
void opt_load_factor(float z)
Set desired load-factor.
Definition: hash_map.h:810
HashedValue< self_type > hashed_value_type
Definition: hash_map.h:1049
void fix_iterators_2int(internal_size_type i_bucket, const key_type &key, node_type *node)
Update iterators with given key and bucket and make them point to the specified node in internal memo...
Definition: iterator_map.h:160
buffered_reader< block_cache_type, bid_iterator_type > reader_type
Definition: hash_map.h:136
Comparator object for values as required by stxxl::sort.
Definition: hash_map.h:1261
static long long curr
block_cache_type block_cache_
Definition: hash_map.h:156
internal_size_type i_subblock_
index of first subblock
Definition: util.h:75
bool result_type
C++11 required type.
Definition: hash_map.h:1542
const_iterator begin() const
Returns a const_interator pointing to the beginning of the hash-map.
Definition: hash_map.h:877
void delete_blocks(const BIDIteratorClass &bidbegin, const BIDIteratorClass &bidend)
Deallocates blocks.
Tuning parameters for external memory hash map.
Definition: tuning.h:25
std::pair< internal_size_type, typename InputStream::value_type > value_type
(hash,value)
Definition: hash_map.h:1230
Extracts the value-part (ignoring the hashvalue); required by HashingStream (see above) ...
Definition: hash_map.h:1250
block_cache< block_type > block_cache_type
Definition: hash_map.h:134
Functor to extracts the actual value from a HashedValue-struct.
Definition: hash_map.h:1052
bool _leq(const key_type &a, const key_type &b) const
true iff a &lt;= b
Definition: hash_map.h:1447
Block containing elements of fixed length.
Definition: typed_block.h:237
key_compare key_cmp() const
Strict-weak-ordering used by this hash-map.
Definition: hash_map.h:244
Stream for filtering duplicates.
Definition: hash_map.h:1203
internal_size_type _bkt_num(const key_type &key, internal_size_type n) const
Bucket-index for values with given key.
Definition: hash_map.h:938
const_iterator end() const
Returns a const_iterator pointing to the end of the hash-map.
Definition: hash_map.h:883
bool oblivious_
false if the total-number of values is correct (false) or true if estimated (true); see *oblivious_-m...
Definition: hash_map.h:161
external_size_type erase(const key_type &key)
Erase value by key; check external memory.
Definition: hash_map.h:452
void swap(self_type &obj)
Exchange stored values with another hash-map.
Definition: hash_map.h:569
equal_to key_equal
Type of constructed equality predicate.
Definition: hash_map.h:1546
void _make_conscious()
After using *oblivious_-methods only an estimate for the total number of elements can be given...
Definition: hash_map.h:257
HashType hasher
type of (mother) hash-function
Definition: hash_map.h:87
void _rebuild_buckets(internal_size_type n_desired=0)
Definition: hash_map.h:1111
choose_int_types< my_pointer_size >::int_type int_type
Definition: types.h:63
AllocatorType allocator_type
allocator template type
Definition: hash_map.h:91
internal_size_type max_buffer_size() const
Maximum buffer size in byte.
Definition: hash_map.h:831
bid_container_type bids_
blocks-ids of allocated blocks
Definition: hash_map.h:148
typed_block< subblock_raw_size, value_type > subblock_type
a subblock consists of subblock_size values
Definition: hash_map.h:109
bucket< node_type > bucket_type
buckets
Definition: hash_map.h:127
static instance_pointer get_instance()
Definition: singleton.h:38
unsigned_type internal_size_type
Definition: types.h:66
void _erase_nodes(node_type *first, node_type *last)
Free nodes in range [first, last). If last is NULL all nodes will be freed.
Definition: hash_map.h:911
std::pair< iterator, iterator > equal_range(const key_type &key)
Finds a range containing all values with given key. Non-const access.
Definition: hash_map.h:726
const value_type & const_reference
type for constant value-references
Definition: hash_map.h:76
std::pair< const_iterator, const_iterator > equal_range(const key_type &key) const
Finds a range containing all values with given key. Const access.
Definition: hash_map.h:735
external_size_type n_found_external
Definition: hash_map.h:596
internal_size_type max_buffer_size_
maximum size for internal-memory buffer
Definition: hash_map.h:152
std::vector< bucket_type > buckets_container_type
Definition: hash_map.h:129
internal_size_type i_block_
index of first block&#39;s bid (to be used as index for hash_map&#39;s bids_-array
Definition: util.h:72
bool _geq(const key_type &a, const key_type &b) const
true iff a &gt;= b
Definition: hash_map.h:1449
std::pair< internal_size_type, value_type > max_value() const
Definition: hash_map.h:1282
#define STXXL_BEGIN_NAMESPACE
Definition: namespace.h:16
internal_size_type _bkt_num(const key_type &key) const
Bucket-index for values with given key.
Definition: hash_map.h:923
external_size_type n_external_
number of elements in external memory
Definition: util.h:68
key_compare cmp_
user supplied strict-weak-ordering for keys
Definition: hash_map.h:144
value_type & reference
type for value-references
Definition: hash_map.h:74
void clear()
Empty cache; don&#39;t write back dirty blocks.
Definition: block_cache.h:490
std::pair< KeyType, MappedType > value_type
actually store (key-data)-pairs
Definition: hash_map.h:72
hash_map_iterator< self_type > iterator
Definition: hash_map.h:93
Produces sorted stream from input stream.
Definition: sort_stream.h:1537
float load_factor() const
Average number of (sub)blocks occupied by a bucket.
Definition: hash_map.h:803
internal_size_type bucket_index(const key_type &k) const
Bucket-index for values with given key.
Definition: hash_map.h:798
node_type * _get_node()
Allocate a new buffer-node.
Definition: hash_map.h:887
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.h:241
internal_size_type i_bucket_
Definition: util.h:443
Stream interface for all value-pairs currently stored in the map.
Definition: util.h:470
bool set_deleted(bool d)
change deleted flag on the next node
Definition: util.h:42
Construct an equality predicate from the comparison operator.
Definition: hash_map.h:1523
NodeType * list_
entry point to the chain in internal memory
Definition: util.h:65
Iterator _end() const
iterator pointing to the end of the hash-map (iterator-type as template-parameter) ...
Definition: hash_map.h:866
#define STXXL_VERBOSE_HASH_MAP(m)
Definition: hash_map.h:33
hash_map(InputIterator begin, InputIterator end, internal_size_type mem_to_sort=256 *1024 *1024, internal_size_type n=0, const hasher &hf=hasher(), const key_compare &cmp=key_compare(), internal_size_type buffer_size=128 *1024 *1024, const allocator_type &a=allocator_type())
Construct a new hash-map and insert all values in the range [f,l)
Definition: hash_map.h:210
KeyCompareType key_compare
functor that imposes a ordering on keys (but see _lt())
Definition: hash_map.h:89
float opt_load_factor_
desired load factor after rehashing
Definition: hash_map.h:165
allocator_type::template rebind< node_type >::other node_allocator_type
Definition: hash_map.h:138
node_type * _new_node(const value_type &value, node_type *nxt, bool del)
Allocate a new buffer-node and initialize with given value, node and deleted-flag.
Definition: hash_map.h:900
external_size_type size() const
Number of values currently stored. Note: If the correct number is currently unknown (because *_oblivo...
Definition: hash_map.h:282
internal_size_type bucket_count() const
Number of buckets.
Definition: hash_map.h:790
void flush()
Write all dirty blocks back to disk.
Definition: block_cache.h:472
buckets_container_type buckets_
array of bucket
Definition: hash_map.h:146
bool _lt(const key_type &a, const key_type &b) const
Definition: hash_map.h:1435
iterator find(const key_type &key)
Look up value by key. Non-const access.
Definition: hash_map.h:622
uint64 external_size_type
Definition: types.h:67
Used to scan external memory with prefetching.
Definition: util.h:95
Main implementation of external memory hash map.
Definition: hash_map.h:60
subblock_type * _load_subblock(const bucket_type &bucket, internal_size_type which_subblock) const
Load the given bucket&#39;s i-th subblock.
Definition: hash_map.h:1033
allocator_type get_allocator() const
Get node memory allocator.
Definition: hash_map.h:248
source_type source_
source of current value: external or internal
Definition: iterator.h:67
typed_block< block_raw_size, subblock_type > block_type
a block consists of block_size subblocks
Definition: hash_map.h:111
stxxl::external_size_type external_size_type
Definition: hash_map.h:82
void reset_statistics()
Reset hash-map statistics.
Definition: hash_map.h:601
bid_container_type::iterator bid_iterator_type
iterator for block-bids
Definition: hash_map.h:120
InputStream::value_type value_type
Definition: hash_map.h:1066
HashingStream(InputStream &input, internal_size_type i_bucket, ValueExtractor vextract, self_type *map)
Definition: hash_map.h:1076
std::pair< internal_size_type, value_type > min_value() const
Definition: hash_map.h:1275
External associative container (map). Introduction to map container: see STXXL Map (B+-tree) tutor...
Definition: map.h:82
void clear()
Reset hash-map: erase all values, invalidate all iterators.
Definition: hash_map.h:542
bool _eq(const key_type &a, const key_type &b) const
true iff a == b. note: it is mandatory that equal keys yield equal hash-values =&gt; hashing not neccess...
Definition: hash_map.h:1453
node_type * node_
current (source=internal) or old (src=external) internal node
Definition: iterator.h:69
subblock_type::bid_type subblock_bid_type
block-identifier for subblocks
Definition: hash_map.h:114
value_type const * const_pointer
const pointer to type of keys
Definition: hash_map.h:80
external_size_type count(const key_type &k) const
Number of values with given key.
Definition: hash_map.h:717
MappedType mapped_type
type of the data to be stored
Definition: hash_map.h:70
external_size_type n_found_internal
Definition: hash_map.h:595
void print_statistics(std::ostream &o=std::cout) const
Print statistics: Number of hits/misses, blocks forced from cache or written back.
Definition: block_cache.h:507
node< ValueType > * set_next(node< ValueType > *n)
change the &quot;next&quot; value of next node pointer
Definition: util.h:54
const_iterator find(const key_type &key) const
Look up value by key. Const access.
Definition: hash_map.h:679
internal_size_type max_bucket_count() const
Maximum number of buckets.
Definition: hash_map.h:794
Buffered writing of values. New Blocks are allocated as needed.
Definition: util.h:302
value_type * pointer
pointer to type of keys
Definition: hash_map.h:78
external_size_type n_subblocks_loaded
Definition: hash_map.h:594
ExtIterator find(ExtIterator begin, ExtIterator end, const EqualityComparable &value, int_type nbuffers=0)
External equivalent of std::find, see stxxl::find.
Definition: scan.h:289
std::pair< iterator, bool > insert(const value_type &value)
Insert a new value if no value with the same key is already present; external memory must therefore b...
Definition: hash_map.h:310
key_type key_
key of current value
Definition: iterator.h:74
iterator_map_type iterator_map_
keeps track of all active iterators
Definition: hash_map.h:154
k-Tuple data type
Definition: tuple.h:67
iterator insert_oblivious(const value_type &value)
Insert a value; external memory is not accessed so that another value with the same key may be overwr...
Definition: hash_map.h:374
hash_map< KeyType, MappedType, HashType, KeyCompareType, SubBlockSize, SubBlocksPerBlock > self_type
Definition: hash_map.h:64
hasher hash_
user supplied mother hash-function
Definition: hash_map.h:142
#define STXXL_END_NAMESPACE
Definition: namespace.h:17
stxxl::int64 difference_type
Definition: hash_map.h:84
node< ValueType > * next()
return the next node, without the &quot;next&quot; flag.
Definition: util.h:49
void fix_iterators_2end(internal_size_type i_bucket, const key_type &key)
Update iterators with given key and bucket and make them point to the end of the hash-map (called by ...
Definition: iterator_map.h:185
void erase_oblivious(const key_type &key)
Erase value by key but without looking at external memory.
Definition: hash_map.h:506
bool empty() const
Check if container is empty.
Definition: hash_map.h:296