STXXL  1.4.1
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
hash_map.h
Go to the documentation of this file.
1 /***************************************************************************
2  * include/stxxl/bits/containers/hash_map/hash_map.h
3  *
4  * Part of the STXXL. See http://stxxl.sourceforge.net
5  *
6  * Copyright (C) 2007 Markus Westphal <[email protected]>
7  * Copyright (C) 2014 Timo Bingmann <[email protected]>
8  *
9  * Distributed under the Boost Software License, Version 1.0.
10  * (See accompanying file LICENSE_1_0.txt or copy at
11  * http://www.boost.org/LICENSE_1_0.txt)
12  **************************************************************************/
13 
14 #ifndef STXXL_CONTAINERS_HASH_MAP_HASH_MAP_HEADER
15 #define STXXL_CONTAINERS_HASH_MAP_HASH_MAP_HEADER
16 
17 #include <functional>
18 
19 #include <stxxl/bits/noncopyable.h>
20 #include <stxxl/bits/namespace.h>
25 
30 
32 
33 #define STXXL_VERBOSE_HASH_MAP(m) \
34  STXXL_VERBOSE1("hash_map[" << static_cast<const void*>(this) << "]::" << m)
35 
36 //! External memory hash-map
37 namespace hash_map {
38 
39 /*!
40  * Main implementation of external memory hash map.
41  *
42  * \tparam KeyType the key type
43  * \tparam MappedType the mapped type associated with a key
44  * \tparam HashType a hash functional
45  * \tparam CompareType a less comparison relation for KeyType
46  * \tparam SubBlockSize the raw size of a subblock (caching granularity)
47  * (default: 8192)
48  * \tparam SubBlocksPerBlock the number of subblocks per external block
49  * (default: 256 -> 2MB blocks)
50  * \tparam AllocType allocator for internal-memory buffer
51  */
52 template <class KeyType,
53  class MappedType,
54  class HashType,
55  class KeyCompareType,
56  unsigned SubBlockSize = 4*1024,
57  unsigned SubBlocksPerBlock = 256,
58  class AllocatorType = std::allocator<std::pair<const KeyType, MappedType> >
59  >
60 class hash_map : private noncopyable
61 {
62 protected:
63  typedef hash_map<KeyType, MappedType, HashType, KeyCompareType,
64  SubBlockSize, SubBlocksPerBlock> self_type;
65 
66 public:
67  //! type of the keys being used
68  typedef KeyType key_type;
69  //! type of the data to be stored
70  typedef MappedType mapped_type;
71  //! actually store (key-data)-pairs
72  typedef std::pair<KeyType, MappedType> value_type;
73  //! type for value-references
75  //! type for constant value-references
76  typedef const value_type& const_reference;
77  //! pointer to type of keys
78  typedef value_type* pointer;
79  //! const pointer to type of keys
80  typedef value_type const* const_pointer;
81 
85 
86  //! type of (mother) hash-function
87  typedef HashType hasher;
88  //! functor that imposes a ordering on keys (but see _lt())
89  typedef KeyCompareType key_compare;
90  //! allocator template type
91  typedef AllocatorType allocator_type;
92 
95 
96  //! subblock- and block-size in bytes
97  enum {
98  block_raw_size = SubBlocksPerBlock * SubBlockSize,
99  subblock_raw_size = SubBlockSize
100  };
101 
102  //! Subblock-size as number of elements, block-size as number of subblocks
103  enum {
104  subblocks_per_block = SubBlocksPerBlock,
105  subblock_size = SubBlockSize / sizeof(value_type)
106  };
107 
108  //! a subblock consists of subblock_size values
110  //! a block consists of block_size subblocks
112 
113  //! block-identifier for subblocks
115  //! block-identifier for blocks
116  typedef typename block_type::bid_type bid_type;
117  //! container for block-bids
118  typedef std::vector<bid_type> bid_container_type;
119  //! iterator for block-bids
120  typedef typename bid_container_type::iterator bid_iterator_type;
121 
122  enum source_type { src_internal, src_external, src_unknown };
123 
124  //! nodes for internal-memory buffer
126  //! buckets
128 
129  typedef std::vector<bucket_type> buckets_container_type;
130 
131  //! for tracking active iterators
133 
135 
137 
138  typedef typename allocator_type::template rebind<node_type>::other node_allocator_type;
139 
140 protected:
141  //! user supplied mother hash-function
143  //! user supplied strict-weak-ordering for keys
145  //! array of bucket
146  buckets_container_type buckets_;
147  //! blocks-ids of allocated blocks
148  bid_container_type bids_;
149  //! size of internal-memory buffer in number of entries
151  //! maximum size for internal-memory buffer
153  //! keeps track of all active iterators
154  iterator_map_type iterator_map_;
155 
156  mutable block_cache_type block_cache_;
157  //! used to allocate new nodes for internal buffer
158  node_allocator_type node_allocator_;
159  //! false if the total-number of values is correct (false) or true if
160  //! estimated (true); see *oblivious_-methods
161  mutable bool oblivious_;
162  //! (estimated) number of values
164  //! desired load factor after rehashing
166 
167 public:
168  /*!
169  * Construct a new hash-map
170  * \param n initial number of buckets
171  * \param hf hash-function
172  * \param cmp comparator-object
173  * \param buffer_size size of internal-memory buffer in bytes
174  * \param a allocation-strategory for internal-memory buffer
175  */
177  const hasher& hf = hasher(),
178  const key_compare& cmp = key_compare(),
179  internal_size_type buffer_size = 128*1024*1024,
180  const allocator_type& a = allocator_type())
181  : hash_(hf),
182  cmp_(cmp),
183  buckets_(n),
184  bids_(0),
185  buffer_size_(0),
186  iterator_map_(this),
187  block_cache_(tuning::get_instance()->blockcache_size),
188  node_allocator_(a),
189  oblivious_(false),
190  num_total_(0),
191  opt_load_factor_(0.875)
192  {
193  max_buffer_size_ = buffer_size / sizeof(node_type);
194  }
195 
196  /*!
197  * Construct a new hash-map and insert all values in the range [f,l)
198  *
199  * \param begin beginning of the range
200  * \param end end of the range
201  * \param mem_to_sort internal memory that may be used for bulk-construction (not
202  * to be confused with the buffer-memory)
203  * \param n initial number of buckets
204  * \param hf hash-function
205  * \param cmp comparator-object
206  * \param buffer_size size of internal-memory buffer in bytes
207  * \param a allocation-strategory for internal-memory buffer
208  */
209  template <class InputIterator>
210  hash_map(InputIterator begin, InputIterator end,
211  internal_size_type mem_to_sort = 256*1024*1024,
212  internal_size_type n = 0,
213  const hasher& hf = hasher(),
214  const key_compare& cmp = key_compare(),
215  internal_size_type buffer_size = 128*1024*1024,
216  const allocator_type& a = allocator_type())
217  : hash_(hf),
218  cmp_(cmp),
219  buckets_(n), // insert will determine a good size
220  bids_(0),
221  buffer_size_(0),
222  iterator_map_(this),
223  block_cache_(tuning::get_instance()->blockcache_size),
224  node_allocator_(a),
225  oblivious_(false),
226  num_total_(0),
227  opt_load_factor_(0.875)
228  {
229  max_buffer_size_ = buffer_size / sizeof(node_type);
230  insert(begin, end, mem_to_sort);
231  }
232 
234  {
235  clear();
236  }
237 
238 public:
239  //! Hash-function used by this hash-map
241  { return hash_; }
242 
243  //! Strict-weak-ordering used by this hash-map
245  { return cmp_; }
246 
247  //! Get node memory allocator
249  { return node_allocator_; }
250 
251 protected:
252  /*!
253  * After using *oblivious_-methods only an estimate for the total number of
254  * elements can be given. This method accesses external memory to
255  * calculate the exact number.
256  */
258  { /* const */ //! TODO: make const again
259  if (!oblivious_)
260  return;
261 
262  typedef HashedValuesStream<self_type, reader_type> values_stream_type;
263 
264  // this will start prefetching automatically
265  reader_type reader(bids_.begin(), bids_.end(), block_cache_);
266  values_stream_type values(buckets_.begin(), buckets_.end(),
267  reader, bids_.begin(), *this);
268 
269  num_total_ = 0;
270  while (!values.empty())
271  {
272  ++num_total_;
273  ++values;
274  }
275  oblivious_ = false;
276  }
277 
278 public:
279  //! Number of values currently stored. Note: If the correct number is
280  //! currently unknown (because *_oblivous-methods were used), external
281  //! memory will be scanned.
283  {
284  if (oblivious_)
285  ((self_type*)this)->_make_conscious();
286  return num_total_;
287  }
288 
289  //! The hash-map may store up to this number of values
291  {
293  }
294 
295  //! Check if container is empty.
296  bool empty() const
297  {
298  return size() != 0;
299  }
300 
301  /*!
302  * Insert a new value if no value with the same key is already present;
303  * external memory must therefore be accessed
304  *
305  * \param value what to insert
306  * \return a tuple whose second part is true iff the value was actually
307  * added (no value with the same key present); the first part is an
308  * iterator pointing to the newly inserted or already stored value
309  */
310  std::pair<iterator, bool> insert(const value_type& value)
311  {
312  if (buckets_.size() == 0)
313  _rebuild_buckets(128);
314 
315  internal_size_type i_bucket = _bkt_num(value.first);
316  bucket_type& bucket = buckets_[i_bucket];
317  node_type* node = _find_key_internal(bucket, value.first);
318 
319  // found value in internal memory
320  if (node && _eq(node->value_.first, value.first))
321  {
322  bool old_deleted = node->deleted();
323  if (old_deleted)
324  {
325  node->set_deleted(false);
326  node->value_ = value;
327  ++num_total_;
328  }
329  return std::pair<iterator, bool>(
330  iterator(this, i_bucket, node,
331  0, src_internal, false, value.first), old_deleted);
332  }
333 
334  // search external memory ...
335  else
336  {
338  = _find_key_external(bucket, value.first);
339 
340  external_size_type i_external = result.first;
341  value_type ext_value = result.second;
342 
343  // ... if found, return iterator pointing to external position ...
344  if (i_external < bucket.n_external_ && _eq(ext_value.first, value.first))
345  {
346  return std::pair<iterator, bool>(
347  iterator(this, i_bucket, node,
348  i_external, src_external, true, value.first), false);
349  }
350  // ... otherwise create a new buffer-node to add the value
351  else
352  {
353  ++num_total_;
354  node_type* new_node =
355  node
356  ? node->set_next(_new_node(value, node->next(), false))
357  : (bucket.list_ = _new_node(value, bucket.list_, false));
358 
359  iterator it(this, i_bucket, new_node,
360  0, src_internal, false, value.first);
361 
362  ++buffer_size_;
363  if (buffer_size_ >= max_buffer_size_)
364  _rebuild_buckets(); // will fix it as well
365 
366  return std::pair<iterator, bool>(it, true);
367  }
368  }
369  }
370 
371  //! Insert a value; external memory is not accessed so that another value
372  //! with the same key may be overwritten
373  //! \param value what to insert
374  //! \return iterator pointing to the inserted value
376  {
377  internal_size_type i_bucket = _bkt_num(value.first);
378  bucket_type& bucket = buckets_[i_bucket];
379  node_type* node = _find_key_internal(bucket, value.first);
380 
381  // found value in internal memory
382  if (node && _eq(node->value_.first, value.first))
383  {
384  if (node->deleted())
385  ++num_total_;
386 
387  node->set_deleted(false);
388  node->value_ = value;
389  return iterator(this, i_bucket, node,
390  0, src_internal, false, value.first);
391  }
392  // not found; ignore external memory and add a new node to the
393  // internal-memory buffer
394  else
395  {
396  oblivious_ = true;
397  ++num_total_;
398  node_type* new_node =
399  node
400  ? node->set_next(_new_node(value, node->next(), false))
401  : (bucket.list_ = _new_node(value, bucket.list_, false));
402 
403  // there may be some iterators that reference the newly inserted
404  // value in external memory these need to be fixed (make them point
405  // to new_node)
406  iterator_map_.fix_iterators_2int(i_bucket, value.first, new_node);
407 
408  iterator it(this, i_bucket, new_node,
409  0, src_internal, false, value.first);
410 
411  ++buffer_size_;
412  if (buffer_size_ >= max_buffer_size_)
413  _rebuild_buckets();
414 
415  return it;
416  }
417  }
418 
419  //! Erase value by iterator
420  //! \param it iterator pointing to the value to erase
422  {
423  --num_total_;
424  bucket_type& bucket = buckets_[it.i_bucket_];
425 
426  if (it.source_ == src_internal)
427  {
428  it.node_->set_deleted(true);
429  iterator_map_.fix_iterators_2end(it.i_bucket_, it.key_);
430  }
431  else {
432  // find biggest value < iterator's value
433  node_type* node = _find_key_internal(bucket, it.key_);
434  assert(!node || !_eq(node->value_.first, it.key_));
435 
436  // add delete-node to buffer
437  if (node)
438  node->set_next(_new_node(value_type(it.key_, mapped_type()), node->next(), true));
439  else
440  bucket.list_ = _new_node(value_type(it.key_, mapped_type()), bucket.list_, true);
441 
442  iterator_map_.fix_iterators_2end(it.i_bucket_, it.key_);
443 
444  ++buffer_size_;
445  if (buffer_size_ >= max_buffer_size_)
446  _rebuild_buckets();
447  }
448  }
449 
450  //! Erase value by key; check external memory
451  //! \param key key of value to erase
452  //! \return number of values actually erased (0 or 1)
454  {
455  internal_size_type i_bucket = _bkt_num(key);
456  bucket_type& bucket = buckets_[i_bucket];
457  node_type* node = _find_key_internal(bucket, key);
458 
459  // found in internal memory
460  if (node && _eq(node->value_.first, key))
461  {
462  if (!node->deleted())
463  {
464  node->set_deleted(true);
465  --num_total_;
466  iterator_map_.fix_iterators_2end(i_bucket, key);
467  return 1;
468  }
469  else
470  return 0; // already deleted
471  }
472  // check external memory
473  else
474  {
476  = _find_key_external(bucket, key);
477 
478  external_size_type i_external = result.first;
479  value_type ext_value = result.second;
480 
481  // found in external memory; add delete-node
482  if (i_external < bucket.n_external_ && _eq(ext_value.first, key))
483  {
484  --num_total_;
485 
486  if (node)
487  node->set_next(_new_node(value_type(key, mapped_type()), node->next(), true));
488  else
489  bucket.list_ = _new_node(value_type(key, mapped_type()), bucket.list_, true);
490 
491  iterator_map_.fix_iterators_2end(i_bucket, key);
492 
493  ++buffer_size_;
494  if (buffer_size_ >= max_buffer_size_)
495  _rebuild_buckets();
496 
497  return 1;
498  }
499  // no value with given key
500  else
501  return 0;
502  }
503  }
504 
505  //! Erase value by key but without looking at external memory
506  //! \param key key for value to release
507  void erase_oblivious(const key_type& key)
508  {
509  internal_size_type i_bucket = _bkt_num(key);
510  bucket_type& bucket = buckets_[i_bucket];
511  node_type* node = _find_key_internal(bucket, key);
512 
513  // found value in internal-memory
514  if (node && _eq(node->value_.first, key))
515  {
516  if (!node->deleted())
517  {
518  --num_total_;
519  node->set_deleted(true);
520  iterator_map_.fix_iterators_2end(i_bucket, key);
521  }
522  }
523  // not found; ignore external memory and add delete-node
524  else
525  {
526  oblivious_ = true;
527  --num_total_;
528 
529  if (node)
530  node->set_next(_new_node(value_type(key, mapped_type()), node->next(), true));
531  else
532  bucket.list_ = _new_node(value_type(key, mapped_type()), bucket.list_, true);
533 
534  iterator_map_.fix_iterators_2end(i_bucket, key);
535 
536  ++buffer_size_;
537  if (buffer_size_ >= max_buffer_size_)
538  _rebuild_buckets();
539  }
540  }
541 
542  //! Reset hash-map: erase all values, invalidate all iterators
543  void clear()
544  {
545  STXXL_VERBOSE_HASH_MAP("clear()");
546 
547  iterator_map_.fix_iterators_all2end();
548  block_cache_.flush();
549  block_cache_.clear();
550 
551  // reset buckets and release buffer-memory
552  for (internal_size_type i_bucket = 0;
553  i_bucket < buckets_.size(); i_bucket++)
554  {
555  _erase_nodes(buckets_[i_bucket].list_, NULL);
556  buckets_[i_bucket] = bucket_type();
557  }
558  oblivious_ = false;
559  num_total_ = 0;
560  buffer_size_ = 0;
561 
562  // free external memory
563  block_manager* bm = block_manager::get_instance();
564  bm->delete_blocks(bids_.begin(), bids_.end());
565  bids_.clear();
566  }
567 
568  //! Exchange stored values with another hash-map
569  //! \param obj hash-map to swap values with
570  void swap(self_type& obj)
571  {
572  std::swap(buckets_, obj.buckets_);
573  std::swap(bids_, obj.bids_);
574 
575  std::swap(oblivious_, obj.oblivious_);
576  std::swap(num_total_, obj.num_total_);
577 
578  std::swap(node_allocator_, obj.node_allocator_);
579 
580  std::swap(hash_, obj.hash_);
581  std::swap(cmp_, obj.cmp_);
582 
583  std::swap(buffer_size_, obj.buffer_size_);
584  std::swap(max_buffer_size_, obj.max_buffer_size_);
585 
586  std::swap(opt_load_factor_, obj.opt_load_factor_);
587 
588  std::swap(iterator_map_, obj.iterator_map_);
589 
590  std::swap(block_cache_, obj.block_cache_);
591  }
592 
593 protected:
594  // find statistics
599 
600 public:
601  //! Reset hash-map statistics
603  {
604  block_cache_.reset_statistics();
605  n_subblocks_loaded = n_found_external = n_found_internal = n_not_found = 0;
606  }
607 
608  //! Print short general statistics to output stream
609  void print_statistics(std::ostream& o = std::cout) const
610  {
611  o << "Find-statistics:" << std::endl;
612  o << " Found internal : " << n_found_internal << std::endl;
613  o << " Found external : " << n_found_external << std::endl;
614  o << " Not found : " << n_not_found << std::endl;
615  o << " Subblocks searched : " << n_subblocks_loaded << std::endl;
616 
617  iterator_map_.print_statistics(o);
618  block_cache_.print_statistics(o);
619  }
620 
621  //! Look up value by key. Non-const access.
622  //! \param key key for value to look up
623  iterator find(const key_type& key)
624  {
625  if (buffer_size_ + 1 >= max_buffer_size_) // (*)
626  _rebuild_buckets();
627 
628  internal_size_type i_bucket = _bkt_num(key);
629  bucket_type& bucket = buckets_[i_bucket];
630  node_type* node = _find_key_internal(bucket, key);
631 
632  // found in internal-memory buffer
633  if (node && _eq(node->value_.first, key)) {
634  n_found_internal++;
635  if (node->deleted())
636  return this->_end<iterator>();
637  else
638  return iterator(this, i_bucket, node, 0, src_internal, false, key);
639  }
640  // search external elements
641  else {
643  = _find_key_external(bucket, key);
644 
645  external_size_type i_external = result.first;
646  value_type value = result.second;
647 
648  // found in external memory
649  if (i_external < bucket.n_external_ && _eq(value.first, key)) {
650  n_found_external++;
651 
652  // we ultimately expect the user to de-reference the returned
653  // iterator to change its value (non-const!). to prevent an
654  // additional disk-access, we create a new node in the
655  // internal-memory buffer overwriting the external value.
656  // note: by checking and rebuilding (if neccessary) in (*) we
657  // made sure that the new node will fit into the buffer and no
658  // rebuild is neccessary here.
659  node_type* new_node =
660  node
661  ? node->set_next(_new_node(value, node->next(), false))
662  : (bucket.list_ = _new_node(value, bucket.list_, false));
663 
664  ++buffer_size_;
665 
666  iterator_map_.fix_iterators_2int(i_bucket, value.first, new_node);
667 
668  return iterator(this, i_bucket, new_node, i_external + 1, src_internal, true, key);
669  }
670  // not found in external memory
671  else {
672  n_not_found++;
673  return this->_end<iterator>();
674  }
675  }
676  }
677 
678  //! Look up value by key. Const access.
679  //! \param key key for value to look up
680  const_iterator find(const key_type& key) const
681  {
682  internal_size_type i_bucket = _bkt_num(key);
683  const bucket_type& bucket = buckets_[i_bucket];
684  node_type* node = _find_key_internal(bucket, key);
685 
686  // found in internal-memory buffer
687  if (node && _eq(node->value_.first, key)) {
688  n_found_internal++;
689  if (node->deleted())
690  return this->_end<const_iterator>();
691  else
692  return const_iterator((self_type*)this, i_bucket, node, 0, src_internal, false, key);
693  }
694  // search external elements
695  else {
697  = _find_key_external(bucket, key);
698 
699  external_size_type i_external = result.first;
700  value_type value = result.second;
701 
702  // found in external memory
703  if (i_external < bucket.n_external_ && _eq(value.first, key)) {
704  n_found_external++;
705  return const_iterator((self_type*)this, i_bucket, node, i_external, src_external, true, key);
706  }
707  // not found in external memory
708  else {
709  n_not_found++;
710  return this->_end<const_iterator>();
711  }
712  }
713  }
714 
715  //! Number of values with given key
716  //! \param k key for value to look up
717  //! \return 0 or 1 depending on the presence of a value with the given key
719  {
720  const_iterator cit = find(k);
721  return (cit == end()) ? 0 : 1;
722  }
723 
724  //! Finds a range containing all values with given key. Non-const access
725  //! \param key key to look for#
726  //! \return range may be empty or contains exactly one value
727  std::pair<iterator, iterator> equal_range(const key_type& key)
728  {
729  iterator it = find(key);
730  return std::pair<iterator, iterator>(it, it);
731  }
732 
733  //! Finds a range containing all values with given key. Const access
734  //! \param key key to look for#
735  //! \return range may be empty or contains exactly one value
736  std::pair<const_iterator, const_iterator> equal_range(const key_type& key) const
737  {
738  const_iterator cit = find(key);
739  return std::pair<const_iterator, const_iterator>(cit, cit);
740  }
741 
742  //! Convenience operator to quickly insert or find values. Use with caution
743  //! since using this operator will check external-memory.
744  mapped_type& operator [] (const key_type& key)
745  {
746  if (buffer_size_ + 1 >= max_buffer_size_) // (*)
747  _rebuild_buckets();
748 
749  internal_size_type i_bucket = _bkt_num(key);
750  bucket_type& bucket = buckets_[i_bucket];
751  node_type* node = _find_key_internal(bucket, key);
752 
753  // found in internal-memory buffer
754  if (node && _eq(node->value_.first, key)) {
755  if (node->deleted()) {
756  node->set_deleted(false);
757  node->value_.second = mapped_type();
758  ++num_total_;
759  }
760  return node->value_.second;
761  }
762  // search external elements
763  else {
765  = _find_key_external(bucket, key);
766 
767  external_size_type i_external = result.first;
768  value_type found_value = result.second;
769 
770  value_type buffer_value =
771  (i_external < bucket.n_external_ && _eq(found_value.first, key))
772  ? found_value
773  : value_type(key, mapped_type());
774 
775  // add a new node to the buffer. this new node's value overwrites
776  // the external value if it was found and otherwise is set to (key,
777  // mapped_type())
778  node_type* new_node =
779  node
780  ? node->set_next(_new_node(buffer_value, node->next(), false))
781  : (bucket.list_ = _new_node(buffer_value, bucket.list_, false));
782 
783  ++buffer_size_;
784  // note that we already checked the buffer-size in (*)
785 
786  return new_node->value_.second;
787  }
788  }
789 
790  //! Number of buckets
792  { return buckets_.size(); }
793 
794  //! Maximum number of buckets
796  { return (internal_size_type)(max_size() / subblock_size); }
797 
798  //! Bucket-index for values with given key.
800  { return _bkt_num(k); }
801 
802 public:
803  //! Average number of (sub)blocks occupied by a bucket.
804  float load_factor() const
805  { return (float)num_total_ / ((float)subblock_size * (float)buckets_.size()); }
806 
807  //! Get desired load-factor
808  float opt_load_factor() const { return opt_load_factor_; }
809 
810  //! Set desired load-factor
811  void opt_load_factor(float z)
812  {
813  opt_load_factor_ = z;
814  if (load_factor() > opt_load_factor_)
815  _rebuild_buckets();
816  }
817 
818  //! Rehash with (at least) n buckets
820  {
821  _rebuild_buckets(n);
822  }
823 
824  //! Number of bytes occupied by buffer
826  {
827  // buffer-size internally stored as number of nodes
828  return buffer_size_ * sizeof(node_type);
829  }
830 
831  //! Maximum buffer size in byte
833  {
834  return max_buffer_size_ * sizeof(node_type);
835  }
836 
837  //! Set maximum buffer size
838  //! \param buffer_size new size in byte
840  {
841  max_buffer_size_ = buffer_size / sizeof(node_type);
842  if (buffer_size_ >= max_buffer_size_)
843  _rebuild_buckets();
844  }
845 
846 protected:
847  //! iterator pointing to the beginnning of the hash-map
848  template <class Iterator>
849  Iterator _begin() const
850  {
851  self_type* non_const_this = (self_type*)this;
852 
853  if (buckets_.size() == 0)
854  return _end<Iterator>();
855 
856  // correct key will be set by find_next()
857  Iterator it(non_const_this, 0, buckets_[0].list_,
858  0, src_unknown, true, key_type());
859  it.find_next();
860 
861  return it;
862  }
863 
864  //! iterator pointing to the end of the hash-map (iterator-type as
865  //! template-parameter)
866  template <class Iterator>
867  Iterator _end() const
868  {
869  self_type* non_const_this = (self_type*)this;
870  return Iterator(non_const_this);
871  }
872 
873 public:
874  //! Returns an iterator pointing to the beginning of the hash-map
875  iterator begin() { return _begin<iterator>(); }
876 
877  //! Returns a const_interator pointing to the beginning of the hash-map
878  const_iterator begin() const { return _begin<const_iterator>(); }
879 
880  //! Returns an iterator pointing to the end of the hash-map
881  iterator end() { return _end<iterator>(); }
882 
883  //! Returns a const_iterator pointing to the end of the hash-map
884  const_iterator end() const { return _end<const_iterator>(); }
885 
886 protected:
887  //! Allocate a new buffer-node
888  node_type * _get_node()
889  {
890  return node_allocator_.allocate(1);
891  }
892 
893  //! Free given node
894  void _put_node(node_type* node)
895  {
896  node_allocator_.deallocate(node, 1);
897  }
898 
899  //! Allocate a new buffer-node and initialize with given value, node and
900  //! deleted-flag
901  node_type * _new_node(const value_type& value, node_type* nxt, bool del)
902  {
903  node_type* node = _get_node();
904  node->value_ = value;
905  node->set_next(nxt);
906  node->set_deleted(del);
907  return node;
908  }
909 
910  //! Free nodes in range [first, last). If last is NULL all nodes will be
911  //! freed.
912  void _erase_nodes(node_type* first, node_type* last)
913  {
914  node_type* curr = first;
915  while (curr != last)
916  {
917  node_type* next = curr->next();
918  _put_node(curr);
919  curr = next;
920  }
921  }
922 
923  //! Bucket-index for values with given key
925  {
926  return _bkt_num(key, buckets_.size());
927  }
928 
929  /*!
930  * Bucket-index for values with given key. The total number of buckets has
931  * to be specified as well. The bucket number is determined by \f$
932  * bucket_num = (hash/max_hash)*n_buckets \f$ max_hash is in fact 2^63-1
933  * (internal_size_type=uint64 (or uint32)) but we rather divide by 2^64, so
934  * we can use plain integer arithmetic easily (there should be only a small
935  * difference): this way we must only calculate the upper 64 bits of the
936  * product hash*n_buckets and we're done. See
937  * http://www.cs.uaf.edu/~cs301/notes/Chapter5/node5.html
938  */
940  {
941  //! TODO maybe specialize double arithmetic to integer. the old code
942  //! was faulty -tb.
943  return (internal_size_type)(
944  (double)n * ((double)hash_(key) / (double)std::numeric_limits<internal_size_type>::max())
945  );
946  }
947 
948  /*!
949  * Locate the given key in the internal-memory chained list. If the key is
950  * not present, the node with the biggest key smaller than the given key is
951  * returned. Note that the returned value may be zero: either because the
952  * chained list is empty or because the given key is smaller than all other
953  * keys in the chained list.
954  */
955  node_type*
956  _find_key_internal(const bucket_type& bucket, const key_type& key) const
957  {
958  node_type* old = NULL;
959  for (node_type* curr = bucket.list_;
960  curr && _leq(curr->value_.first, key);
961  curr = curr->next())
962  {
963  old = curr;
964  }
965  return old;
966  }
967 
968  /*!
969  * Search for key in external part of bucket. Return value is (i_external,
970  * value), where i_ext = bucket._num_external if key could not be found.
971  */
973  _find_key_external(const bucket_type& bucket, const key_type& key) const
974  {
975  subblock_type* subblock;
976 
977  // number of subblocks occupied by bucket
978  internal_size_type n_subblocks = (internal_size_type)(
979  bucket.n_external_ / subblock_size
980  );
981  if (bucket.n_external_ % subblock_size != 0)
982  n_subblocks++;
983 
984  for (internal_size_type i_subblock = 0;
985  i_subblock < n_subblocks; i_subblock++)
986  {
987  subblock = _load_subblock(bucket, i_subblock);
988  // number of values in i-th subblock
989  internal_size_type n_values =
990  (i_subblock + 1 < n_subblocks)
991  ? (internal_size_type)subblock_size
992  : (internal_size_type)(
993  bucket.n_external_ - i_subblock * subblock_size
994  );
995 
996  //! TODO: replace with bucket.n_external_ % subblock_size
997 
998  // biggest key in current subblock still too small => next subblock
999  if (_lt((*subblock)[n_values - 1].first, key))
1000  continue;
1001 
1002  // binary search in current subblock
1003  internal_size_type i_lower = 0, i_upper = n_values;
1004  while (i_lower + 1 != i_upper)
1005  {
1006  internal_size_type i_middle = (i_lower + i_upper) / 2;
1007  if (_leq((*subblock)[i_middle].first, key))
1008  i_lower = i_middle;
1009  else
1010  i_upper = i_middle;
1011  }
1012 
1013  value_type value = (*subblock)[i_lower];
1014 
1015  if (_eq(value.first, key))
1017  (i_subblock * subblock_size + i_lower, value);
1018  else
1020  (bucket.n_external_, value_type());
1021  }
1022 
1024  (bucket.n_external_, value_type());
1025  }
1026 
1027  /*!
1028  * Load the given bucket's i-th subblock.
1029  * Since a bucket may be spread over several blocks, we must
1030  * 1. determine in which block the requested subblock is located
1031  * 2. at which position within the obove-mentioned block the questioned subblock is located
1032  */
1033  subblock_type*
1034  _load_subblock(const bucket_type& bucket, internal_size_type which_subblock) const
1035  {
1036  n_subblocks_loaded++;
1037 
1038  // index of the requested subblock counted from the very beginning of
1039  // the bucket's first block
1040  external_size_type i_abs_subblock = bucket.i_subblock_ + which_subblock;
1041 
1042  /* 1. */
1043  bid_type bid = bids_[bucket.i_block_ + (internal_size_type)(i_abs_subblock / subblocks_per_block)];
1044  /* 2. */
1045  internal_size_type i_subblock_within = (internal_size_type)(i_abs_subblock % subblocks_per_block);
1046 
1047  return block_cache_.get_subblock(bid, i_subblock_within);
1048  }
1049 
1051 
1052  //! Functor to extracts the actual value from a HashedValue-struct
1054  {
1055  value_type& operator () (hashed_value_type& hvalue)
1056  { return hvalue.value_; }
1057  };
1058 
1059  /*!
1060  * Will return from its input-stream all values that are to be stored in
1061  * the given bucket. Those values must appear in consecutive order
1062  * beginning with the input-stream's current value.
1063  */
1064  template <class InputStream, class ValueExtractor>
1066  {
1067  typedef typename InputStream::value_type value_type;
1068 
1070  InputStream& input_;
1074  bool empty_;
1075  ValueExtractor vextract_;
1076 
1077  HashingStream(InputStream& input, internal_size_type i_bucket,
1078  ValueExtractor vextract, self_type* map)
1079  : map_(map),
1080  input_(input),
1081  i_bucket_(i_bucket),
1082  bucket_size_(0),
1083  vextract_(vextract)
1084  {
1085  empty_ = find_next();
1086  }
1087 
1088  const value_type& operator * () { return value_; }
1089 
1090  bool empty() const { return empty_; }
1091 
1093  {
1094  ++input_;
1095  empty_ = find_next();
1096  }
1097 
1098  bool find_next()
1099  {
1100  if (input_.empty())
1101  return true;
1102  value_ = *input_;
1103  if (map_->_bkt_num(vextract_(value_).first) != i_bucket_)
1104  return true;
1105 
1106  ++bucket_size_;
1107  return false;
1108  }
1109  };
1110 
1111  /* Rebuild hash-map. The desired number of buckets may be supplied. */
1113  {
1114  STXXL_VERBOSE_HASH_MAP("_rebuild_buckets()");
1115 
1117  typedef HashedValuesStream<self_type, reader_type> values_stream_type;
1118  typedef HashingStream<values_stream_type, HashedValueExtractor> hashing_stream_type;
1119 
1120  const int_type write_buffer_size = config::get_instance()->disks_number() * 4;
1121 
1122  // determine new number of buckets from desired load_factor ...
1123  internal_size_type n_new;
1124  n_new = (internal_size_type)ceil((double)num_total_ / ((double)subblock_size * (double)opt_load_factor()));
1125 
1126  // ... but give the user the chance to request even more buckets
1127  if (n_desired > n_new)
1128  n_new = std::min<internal_size_type>(n_desired, max_bucket_count());
1129 
1130  // allocate new buckets and bids
1131  buckets_container_type old_buckets(n_new);
1132  std::swap(buckets_, old_buckets);
1133 
1134  bid_container_type old_bids;
1135  std::swap(bids_, old_bids);
1136 
1137  // read stored values in consecutive order
1138 
1139  // use new to control point of destruction (see below)
1140  reader_type* reader
1141  = new reader_type(old_bids.begin(), old_bids.end(), block_cache_);
1142 
1143  values_stream_type values_stream(old_buckets.begin(), old_buckets.end(),
1144  *reader, old_bids.begin(), *this);
1145 
1146  writer_type writer(&bids_, write_buffer_size, write_buffer_size / 2);
1147 
1148  // re-distribute values among new buckets.
1149 
1150  // this makes use of the fact that if value1 preceeds value2 before
1151  // resizing, value1 will preceed value2 after resizing as well (uniform
1152  // rehashing)
1153  num_total_ = 0;
1154  for (internal_size_type i_bucket = 0;
1155  i_bucket < buckets_.size(); i_bucket++)
1156  {
1157  buckets_[i_bucket] = bucket_type();
1158  buckets_[i_bucket].i_block_ = writer.i_block();
1159  buckets_[i_bucket].i_subblock_ = writer.i_subblock();
1160 
1161  hashing_stream_type hasher(values_stream, i_bucket, HashedValueExtractor(), this);
1162  external_size_type i_ext = 0;
1163  while (!hasher.empty())
1164  {
1165  const hashed_value_type& hvalue = *hasher;
1166  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, i_ext);
1167 
1168  writer.append(hvalue.value_);
1169  ++hasher;
1170  ++i_ext;
1171  }
1172 
1173  writer.finish_subblock();
1174  buckets_[i_bucket].n_external_ = hasher.bucket_size_;
1175  num_total_ += hasher.bucket_size_;
1176  }
1177  writer.flush();
1178  // reader must be deleted before deleting old_bids because its
1179  // destructor will dereference the bid-iterator
1180  delete reader;
1181  block_cache_.clear();
1182 
1183  // get rid of old blocks and buckets
1185  bm->delete_blocks(old_bids.begin(), old_bids.end());
1186 
1187  for (internal_size_type i_bucket = 0;
1188  i_bucket < old_buckets.size(); i_bucket++)
1189  {
1190  _erase_nodes(old_buckets[i_bucket].list_, NULL);
1191  old_buckets[i_bucket] = bucket_type();
1192  }
1193 
1194  buffer_size_ = 0;
1195  oblivious_ = false;
1196  }
1197 
1198  /*!
1199  * Stream for filtering duplicates. Used to eliminate duplicated values
1200  * when bulk-inserting Note: input has to be sorted, so that duplicates
1201  * will occure in row
1202  */
1203  template <class InputStream>
1205  {
1206  typedef typename InputStream::value_type value_type;
1208  InputStream& in_;
1209 
1210  UniqueValueStream(InputStream& input, self_type& map)
1211  : map_(map), in_(input)
1212  { }
1213 
1214  bool empty() const { return in_.empty(); }
1215 
1216  const value_type& operator * () { return *in_; }
1217 
1219  {
1220  value_type v_old = *in_;
1221  ++in_;
1222  while (!in_.empty() && v_old.first == (*in_).first)
1223  ++in_;
1224  }
1225  };
1226 
1227  template <class InputStream>
1229  {
1230  //! (hash,value)
1231  typedef std::pair<internal_size_type, typename InputStream::value_type> value_type;
1233  InputStream& in_;
1234 
1235  AddHashStream(InputStream& input, self_type& map)
1236  : map_(map), in_(input)
1237  { }
1238 
1239  bool empty() const { return in_.empty(); }
1240 
1241  value_type operator * ()
1242  { return value_type(map_.hash_((*in_).first), *in_); }
1243 
1244  void operator ++ () { ++in_; }
1245  };
1246 
1247  /*!
1248  * Extracts the value-part (ignoring the hashvalue); required by
1249  * HashingStream (see above)
1250  */
1252  {
1253  const value_type& operator () (std::pair<internal_size_type, value_type>& v)
1254  { return v.second; }
1255  };
1256 
1257  /*!
1258  * Comparator object for values as required by stxxl::sort. Sorting is done
1259  * lexicographically by <hash-value, key> Note: the hash-value has already
1260  * been computed.
1261  */
1262  struct Cmp : public std::binary_function<
1263  std::pair<internal_size_type, value_type>,
1264  std::pair<internal_size_type, value_type>, bool
1265  >
1266  {
1268  Cmp(self_type& map) : map_(map) { }
1269 
1270  bool operator () (const std::pair<internal_size_type, value_type>& a,
1271  const std::pair<internal_size_type, value_type>& b) const
1272  {
1273  return (a.first < b.first) ||
1274  ((a.first == b.first) && map_.cmp_(a.second.first, b.second.first));
1275  }
1276  std::pair<internal_size_type, value_type> min_value() const
1277  {
1278  return std::pair<internal_size_type, value_type>(
1280  value_type(map_.cmp_.min_value(), mapped_type())
1281  );
1282  }
1283  std::pair<internal_size_type, value_type> max_value() const
1284  {
1285  return std::pair<internal_size_type, value_type>(
1287  value_type(map_.cmp_.max_value(), mapped_type())
1288  );
1289  }
1290  };
1291 
1292 public:
1293  //! Bulk-insert of values in the range [f, l)
1294  //! \param f beginning of the range
1295  //! \param l end of the range
1296  //! \param mem internal memory that may be used (note: this memory will be used additionally to the buffer). The more the better
1297  template <class InputIterator>
1298  void insert(InputIterator f, InputIterator l, internal_size_type mem)
1299  {
1300  //! values already stored in the hashtable ("old values")
1301  typedef HashedValuesStream<self_type, reader_type> old_values_stream;
1302  //! old values, that are to be stored in a certain (new) bucket
1303  typedef HashingStream<old_values_stream, HashedValueExtractor> old_hashing_stream;
1304 
1305  //! values to insert ("new values")
1307 
1308  //! new values with added hash: (hash, (key, mapped))
1309  typedef AddHashStream<input_stream> new_values_stream;
1310  //! new values sorted by <hash-value, key>
1311  typedef stxxl::stream::sort<new_values_stream, Cmp> new_sorted_values_stream;
1312  //! new values sorted by <hash-value, key> with duplicates eliminated
1313  typedef UniqueValueStream<new_sorted_values_stream> new_unique_values_stream;
1314  //! new values, that are to be stored in a certain bucket
1315  typedef HashingStream<new_unique_values_stream, StripHashFunctor> new_hashing_stream;
1316 
1318 
1319  int_type write_buffer_size = config::get_instance()->disks_number() * 2;
1320 
1321  // calculate new number of buckets
1322  external_size_type num_total_new = num_total_ + (l - f); // estimated number of elements
1323  external_size_type n_buckets_new = (external_size_type)ceil((double)num_total_new / ((double)subblock_size * (double)opt_load_factor()));
1324  if (n_buckets_new > max_bucket_count())
1325  n_buckets_new = max_bucket_count();
1326 
1327  STXXL_VERBOSE_HASH_MAP("insert() items=" << (l - f) << " buckets_new=" << n_buckets_new);
1328 
1329  // prepare new buckets and bids
1330  buckets_container_type old_buckets((internal_size_type)n_buckets_new);
1331  std::swap(buckets_, old_buckets);
1332  // writer will allocate new blocks as necessary
1333  bid_container_type old_bids;
1334  std::swap(bids_, old_bids);
1335 
1336  // already stored values ("old values")
1337  reader_type* reader = new reader_type(old_bids.begin(), old_bids.end(),
1338  block_cache_);
1339  old_values_stream old_values(old_buckets.begin(), old_buckets.end(),
1340  *reader, old_bids.begin(), *this);
1341 
1342  // values to insert ("new values")
1343  input_stream input = stxxl::stream::streamify(f, l);
1344  new_values_stream new_values(input, *this);
1345  new_sorted_values_stream new_sorted_values(new_values, Cmp(*this), mem);
1346  new_unique_values_stream new_unique_values(new_sorted_values, *this);
1347 
1348  writer_type writer(&bids_, write_buffer_size, write_buffer_size / 2);
1349 
1350  num_total_ = 0;
1351  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++)
1352  {
1353  buckets_[i_bucket] = bucket_type();
1354  buckets_[i_bucket].i_block_ = writer.i_block();
1355  buckets_[i_bucket].i_subblock_ = writer.i_subblock();
1356 
1357  old_hashing_stream old_hasher(old_values, i_bucket, HashedValueExtractor(), this);
1358  new_hashing_stream new_hasher(new_unique_values, i_bucket, StripHashFunctor(), this);
1359  internal_size_type bucket_size = 0;
1360 
1361  // more old and new values for the current bucket => choose smallest
1362  while (!old_hasher.empty() && !new_hasher.empty())
1363  {
1364  internal_size_type old_hash = hash_((*old_hasher).value_.first);
1365  internal_size_type new_hash = (*new_hasher).first;
1366  key_type old_key = (*old_hasher).value_.first;
1367  key_type new_key = (*new_hasher).second.first;
1368 
1369  // old value wins
1370  if ((old_hash < new_hash) || (old_hash == new_hash && cmp_(old_key, new_key))) // (_lt((*old_hasher)._value.first, (*new_hasher).second.first))
1371  {
1372  const hashed_value_type& hvalue = *old_hasher;
1373  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, bucket_size);
1374  writer.append(hvalue.value_);
1375  ++old_hasher;
1376  }
1377  // new value smaller or equal => new value wins
1378  else
1379  {
1380  if (_eq(old_key, new_key))
1381  {
1382  const hashed_value_type& hvalue = *old_hasher;
1383  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, bucket_size);
1384  ++old_hasher;
1385  }
1386  writer.append((*new_hasher).second);
1387  ++new_hasher;
1388  }
1389  ++bucket_size;
1390  }
1391  // no more new values for the current bucket
1392  while (!old_hasher.empty())
1393  {
1394  const hashed_value_type& hvalue = *old_hasher;
1395  iterator_map_.fix_iterators_2ext(hvalue.i_bucket_, hvalue.value_.first, i_bucket, bucket_size);
1396  writer.append(hvalue.value_);
1397  ++old_hasher;
1398  ++bucket_size;
1399  }
1400  // no more old values for the current bucket
1401  while (!new_hasher.empty())
1402  {
1403  writer.append((*new_hasher).second);
1404  ++new_hasher;
1405  ++bucket_size;
1406  }
1407 
1408  writer.finish_subblock();
1409  buckets_[i_bucket].n_external_ = bucket_size;
1410  num_total_ += bucket_size;
1411  }
1412  writer.flush();
1413  delete reader;
1414  block_cache_.clear();
1415 
1416  // release old blocks
1418  bm->delete_blocks(old_bids.begin(), old_bids.end());
1419 
1420  // free nodes in old bucket lists
1421  for (internal_size_type i_bucket = 0;
1422  i_bucket < old_buckets.size(); i_bucket++)
1423  {
1424  _erase_nodes(old_buckets[i_bucket].list_, NULL);
1425  old_buckets[i_bucket] = bucket_type();
1426  }
1427 
1428  buffer_size_ = 0;
1429  oblivious_ = false;
1430  }
1431 
1432 protected:
1433  /* 1 iff a < b
1434  The comparison is done lexicographically by (hash-value, key)
1435  */
1436  bool _lt(const key_type& a, const key_type& b) const
1437  {
1438  internal_size_type hash_a = hash_(a);
1439  internal_size_type hash_b = hash_(b);
1440 
1441  return (hash_a < hash_b) ||
1442  ((hash_a == hash_b) && cmp_(a, b));
1443  }
1444 
1445  //! true iff a > b
1446  bool _gt(const key_type& a, const key_type& b) const { return _lt(b, a); }
1447  //! true iff a <= b
1448  bool _leq(const key_type& a, const key_type& b) const { return !_gt(a, b); }
1449  //! true iff a >= b
1450  bool _geq(const key_type& a, const key_type& b) const { return !_lt(a, b); }
1451 
1452  //! true iff a == b. note: it is mandatory that equal keys yield equal
1453  //! hash-values => hashing not neccessary for equality-testing.
1454  bool _eq(const key_type& a, const key_type& b) const
1455  { return !cmp_(a, b) && !cmp_(b, a); }
1456 
1460  friend class iterator_map<self_type>;
1461  friend class block_cache<block_type>;
1462  friend struct HashedValuesStream<self_type, reader_type>;
1463 
1464 #if 1
1466  {
1467  reader_type reader(bids_.begin(), bids_.end(), &block_cache_);
1468 
1469  for (internal_size_type i_block = 0; i_block < bids_.size(); i_block++) {
1470  std::cout << "block " << i_block << ":\n";
1471 
1472  for (internal_size_type i_subblock = 0; i_subblock < subblocks_per_block; i_subblock++) {
1473  std::cout << " subblock " << i_subblock << ":\n ";
1474 
1475  for (external_size_type i_element = 0; i_element < subblocks_per_block; i_element++) {
1476  std::cout << reader.const_value().first << ", ";
1477  ++reader;
1478  }
1479  std::cout << std::endl;
1480  }
1481  }
1482  }
1483 
1485  {
1486  reader_type reader(bids_.begin(), bids_.end(), &block_cache_);
1487 
1488  std::cout << "number of buckets: " << buckets_.size() << std::endl;
1489  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++) {
1490  const bucket_type& bucket = buckets_[i_bucket];
1491  reader.skip_to(bids_.begin() + bucket.i_block_, bucket.i_subblock_);
1492 
1493  std::cout << " bucket " << i_bucket << ": block=" << bucket.i_block_ << ", subblock=" << bucket.i_subblock_ << ", external=" << bucket.n_external_ << std::endl;
1494 
1495  node_type* node = bucket.list_;
1496  std::cout << " internal_list=";
1497  while (node) {
1498  std::cout << node->value_.first << " (del=" << node->deleted() << "), ";
1499  node = node->next();
1500  }
1501  std::cout << std::endl;
1502 
1503  std::cout << " external=";
1504  for (external_size_type i_element = 0; i_element < bucket.n_external_; i_element++) {
1505  std::cout << reader.const_value().first << ", ";
1506  ++reader;
1507  }
1508  std::cout << std::endl;
1509  }
1510  }
1511 
1513  {
1514  std::cout << "number of buckets: " << buckets_.size() << std::endl;
1515  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++) {
1516  const bucket_type& bucket = buckets_[i_bucket];
1517  std::cout << " bucket " << i_bucket << ": block=" << bucket.i_block_ << ", subblock=" << bucket.i_subblock_ << ", external=" << bucket.n_external_ << ", list=" << bucket.list_ << std::endl;
1518  }
1519  }
1520 #endif
1521 
1522 public:
1523  //! Construct an equality predicate from the comparison operator
1524  struct equal_to : public std::binary_function<key_type, key_type, bool>
1525  {
1526  //! reference to hash_map
1528 
1529  //! constructor requires reference to hash_map
1530  equal_to(const self_type& map) : m_map(map) { }
1531 
1532  //! return whether the arguments compare equal (x==y).
1533  bool operator () (const key_type& x, const key_type& y) const
1534  {
1535  return m_map._eq(x, y);
1536  }
1537 
1538  //! C++11 required type
1540  //! C++11 required type
1542  //! C++11 required type
1543  typedef bool result_type;
1544  };
1545 
1546  //! Type of constructed equality predicate
1547  typedef equal_to key_equal;
1548 
1549  //! Constructed equality predicate used by this hash-map
1550  key_equal key_eq() const
1551  {
1552  return equal_to(*this);
1553  }
1554 
1555 public:
1556  //! Even more statistics: Number of buckets, number of values, buffer-size,
1557  //! values per bucket
1558  void print_load_statistics(std::ostream& o = std::cout) const
1559  {
1560  external_size_type sum_external = 0;
1561  external_size_type square_sum_external = 0;
1562  external_size_type max_external = 0;
1563 
1564  for (internal_size_type i_bucket = 0; i_bucket < buckets_.size(); i_bucket++)
1565  {
1566  const bucket_type& b = buckets_[i_bucket];
1567 
1568  sum_external += b.n_external_;
1569  square_sum_external += b.n_external_ * b.n_external_;
1570  if (b.n_external_ > max_external)
1571  max_external = b.n_external_;
1572  }
1573 
1574  double avg_external = (double)sum_external / (double)buckets_.size();
1575  double std_external = sqrt(((double)square_sum_external / (double)buckets_.size()) - (avg_external * avg_external));
1576 
1577  o << "Bucket count : " << buckets_.size() << std::endl;
1578  o << "Values total : " << num_total_ << std::endl;
1579  o << "Values buffered : " << buffer_size_ << std::endl;
1580  o << "Max Buffer-Size : " << max_buffer_size_ << std::endl;
1581  o << "Max external/bucket : " << max_external << std::endl;
1582  o << "Avg external/bucket : " << avg_external << std::endl;
1583  o << "Std external/bucket : " << std_external << std::endl;
1584  o << "Load-factor : " << load_factor() << std::endl;
1585  o << "Blocks allocated : " << bids_.size() << " => " << (bids_.size() * block_type::raw_size) << " bytes" << std::endl;
1586  o << "Bytes per value : " << ((double)(bids_.size() * block_type::raw_size) / (double)num_total_) << std::endl;
1587  }
1588 }; /* end of class hash_map */
1589 
1590 } // namespace hash_map
1591 
1593 
1594 namespace std {
1595 
1596 template <class KeyType, class MappedType, class HashType, class KeyCompareType,
1597  unsigned SubBlockSize, unsigned SubBlocksPerBlock, class AllocType>
1598 void swap(stxxl::hash_map::hash_map<KeyType, MappedType, HashType, KeyCompareType,
1599  SubBlockSize, SubBlocksPerBlock, AllocType>& a,
1600  stxxl::hash_map::hash_map<KeyType, MappedType, HashType, KeyCompareType,
1601  SubBlockSize, SubBlocksPerBlock, AllocType>& b)
1602 {
1603  if (&a != &b)
1604  a.swap(b);
1605 }
1606 
1607 } // namespace std
1608 
1609 #endif // !STXXL_CONTAINERS_HASH_MAP_HASH_MAP_HEADER
UniqueValueStream(InputStream &input, self_type &map)
Definition: hash_map.h:1210
iterator2stream< InputIterator > streamify(InputIterator begin, InputIterator end)
Input iterator range to stream converter.
Definition: stream.h:93
void print_statistics(std::ostream &o=std::cout) const
Print short general statistics to output stream.
Definition: hash_map.h:609
external_size_type n_not_found
Definition: hash_map.h:598
Additional information about a stored value:
Definition: util.h:432
internal_size_type i_bucket_
index of current bucket
Definition: iterator.h:65
KeyType key_type
type of the keys being used
Definition: hash_map.h:68
AddHashStream(InputStream &input, self_type &map)
Definition: hash_map.h:1235
void _put_node(node_type *node)
Free given node.
Definition: hash_map.h:894
void rehash(internal_size_type n=0)
Rehash with (at least) n buckets.
Definition: hash_map.h:819
void erase(const_iterator it)
Erase value by iterator.
Definition: hash_map.h:421
second_type second
Second tuple component.
Definition: tuple.h:98
void fix_iterators_all2end()
Update all iterators and make them point to the end of the hash-map (used by clear()) ...
Definition: iterator_map.h:206
ValueType value_
Definition: util.h:34
void reset_statistics()
Reset all counters to zero.
Definition: block_cache.h:519
iterator_map< self_type > iterator_map_type
for tracking active iterators
Definition: hash_map.h:132
Block manager class.
Definition: block_manager.h:61
void insert(InputIterator f, InputIterator l, internal_size_type mem)
Bulk-insert of values in the range [f, l)
Definition: hash_map.h:1298
internal_size_type buffer_size_
size of internal-memory buffer in number of entries
Definition: hash_map.h:150
void print_statistics(std::ostream &o=std::cout) const
Definition: iterator_map.h:246
long long int int64
Definition: types.h:38
key_type first_argument_type
C++11 required type.
Definition: hash_map.h:1539
std::vector< bid_type > bid_container_type
container for block-bids
Definition: hash_map.h:118
A model of stream that retrieves the data from an input iterator. For convenience use streamify funct...
Definition: stream.h:46
internal_size_type buffer_size() const
Number of bytes occupied by buffer.
Definition: hash_map.h:825
float opt_load_factor() const
Get desired load-factor.
Definition: hash_map.h:808
iterator begin()
Returns an iterator pointing to the beginning of the hash-map.
Definition: hash_map.h:875
Will return from its input-stream all values that are to be stored in the given bucket.
Definition: hash_map.h:1065
Iterator _begin() const
iterator pointing to the beginnning of the hash-map
Definition: hash_map.h:849
void print_load_statistics(std::ostream &o=std::cout) const
Even more statistics: Number of buckets, number of values, buffer-size, values per bucket...
Definition: hash_map.h:1558
bool deleted()
check if the next node is deleted.
Definition: util.h:37
node< value_type > node_type
nodes for internal-memory buffer
Definition: hash_map.h:125
key_type second_argument_type
C++11 required type.
Definition: hash_map.h:1541
equal_to(const self_type &map)
constructor requires reference to hash_map
Definition: hash_map.h:1530
hash_map_const_iterator< self_type > const_iterator
Definition: hash_map.h:94
external_size_type max_size() const
The hash-map may store up to this number of values.
Definition: hash_map.h:290
node_type * _find_key_internal(const bucket_type &bucket, const key_type &key) const
Locate the given key in the internal-memory chained list.
Definition: hash_map.h:956
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.h:234
subblock_type * get_subblock(const bid_type &bid, unsigned_type i_subblock)
Retrieve a subblock from the cache. If not yet cached, only the subblock will be loaded.
Definition: block_cache.h:361
const self_type & m_map
reference to hash_map
Definition: hash_map.h:1527
void max_buffer_size(internal_size_type buffer_size)
Set maximum buffer size.
Definition: hash_map.h:839
key_equal key_eq() const
Constructed equality predicate used by this hash-map.
Definition: hash_map.h:1550
hash_map(internal_size_type n=0, const hasher &hf=hasher(), const key_compare &cmp=key_compare(), internal_size_type buffer_size=128 *1024 *1024, const allocator_type &a=allocator_type())
Construct a new hash-map.
Definition: hash_map.h:176
first_type first
First tuple component.
Definition: tuple.h:96
void fix_iterators_2ext(internal_size_type i_bucket_old, const key_type &key, internal_size_type i_bucket_new, external_size_type i_ext)
Update iterators with given key and bucket and make them point to the specified location in external ...
Definition: iterator_map.h:127
bool _gt(const key_type &a, const key_type &b) const
true iff a &gt; b
Definition: hash_map.h:1446
hasher hash_function() const
Hash-function used by this hash-map.
Definition: hash_map.h:240
uint_pair & operator++()
prefix increment operator (directly manipulates the integer parts)
Definition: uint_types.h:163
node_allocator_type node_allocator_
used to allocate new nodes for internal buffer
Definition: hash_map.h:158
block_type::bid_type bid_type
block-identifier for blocks
Definition: hash_map.h:116
tuple< external_size_type, value_type > _find_key_external(const bucket_type &bucket, const key_type &key) const
Search for key in external part of bucket.
Definition: hash_map.h:973
external_size_type num_total_
(estimated) number of values
Definition: hash_map.h:163
stxxl::internal_size_type internal_size_type
Definition: hash_map.h:83
iterator end()
Returns an iterator pointing to the end of the hash-map.
Definition: hash_map.h:881
void opt_load_factor(float z)
Set desired load-factor.
Definition: hash_map.h:811
HashedValue< self_type > hashed_value_type
Definition: hash_map.h:1050
void fix_iterators_2int(internal_size_type i_bucket, const key_type &key, node_type *node)
Update iterators with given key and bucket and make them point to the specified node in internal memo...
Definition: iterator_map.h:160
buffered_reader< block_cache_type, bid_iterator_type > reader_type
Definition: hash_map.h:136
Comparator object for values as required by stxxl::sort.
Definition: hash_map.h:1262
block_cache_type block_cache_
Definition: hash_map.h:156
internal_size_type i_subblock_
index of first subblock
Definition: util.h:75
bool result_type
C++11 required type.
Definition: hash_map.h:1543
const_iterator begin() const
Returns a const_interator pointing to the beginning of the hash-map.
Definition: hash_map.h:878
void delete_blocks(const BIDIteratorClass &bidbegin, const BIDIteratorClass &bidend)
Deallocates blocks.
Tuning parameters for external memory hash map.
Definition: tuning.h:25
std::pair< internal_size_type, typename InputStream::value_type > value_type
(hash,value)
Definition: hash_map.h:1231
Extracts the value-part (ignoring the hashvalue); required by HashingStream (see above) ...
Definition: hash_map.h:1251
block_cache< block_type > block_cache_type
Definition: hash_map.h:134
Functor to extracts the actual value from a HashedValue-struct.
Definition: hash_map.h:1053
bool _leq(const key_type &a, const key_type &b) const
true iff a &lt;= b
Definition: hash_map.h:1448
Block containing elements of fixed length.
Definition: typed_block.h:237
key_compare key_cmp() const
Strict-weak-ordering used by this hash-map.
Definition: hash_map.h:244
Stream for filtering duplicates.
Definition: hash_map.h:1204
internal_size_type _bkt_num(const key_type &key, internal_size_type n) const
Bucket-index for values with given key.
Definition: hash_map.h:939
const_iterator end() const
Returns a const_iterator pointing to the end of the hash-map.
Definition: hash_map.h:884
bool oblivious_
false if the total-number of values is correct (false) or true if estimated (true); see *oblivious_-m...
Definition: hash_map.h:161
external_size_type erase(const key_type &key)
Erase value by key; check external memory.
Definition: hash_map.h:453
void swap(self_type &obj)
Exchange stored values with another hash-map.
Definition: hash_map.h:570
equal_to key_equal
Type of constructed equality predicate.
Definition: hash_map.h:1547
void _make_conscious()
After using *oblivious_-methods only an estimate for the total number of elements can be given...
Definition: hash_map.h:257
HashType hasher
type of (mother) hash-function
Definition: hash_map.h:87
void _rebuild_buckets(internal_size_type n_desired=0)
Definition: hash_map.h:1112
choose_int_types< my_pointer_size >::int_type int_type
Definition: types.h:63
AllocatorType allocator_type
allocator template type
Definition: hash_map.h:91
internal_size_type max_buffer_size() const
Maximum buffer size in byte.
Definition: hash_map.h:832
bid_container_type bids_
blocks-ids of allocated blocks
Definition: hash_map.h:148
typed_block< subblock_raw_size, value_type > subblock_type
a subblock consists of subblock_size values
Definition: hash_map.h:109
bucket< node_type > bucket_type
buckets
Definition: hash_map.h:127
static instance_pointer get_instance()
Definition: singleton.h:38
unsigned_type internal_size_type
Definition: types.h:66
void _erase_nodes(node_type *first, node_type *last)
Free nodes in range [first, last). If last is NULL all nodes will be freed.
Definition: hash_map.h:912
std::pair< iterator, iterator > equal_range(const key_type &key)
Finds a range containing all values with given key. Non-const access.
Definition: hash_map.h:727
const value_type & const_reference
type for constant value-references
Definition: hash_map.h:76
std::pair< const_iterator, const_iterator > equal_range(const key_type &key) const
Finds a range containing all values with given key. Const access.
Definition: hash_map.h:736
external_size_type n_found_external
Definition: hash_map.h:597
internal_size_type max_buffer_size_
maximum size for internal-memory buffer
Definition: hash_map.h:152
std::vector< bucket_type > buckets_container_type
Definition: hash_map.h:129
internal_size_type i_block_
index of first block&#39;s bid (to be used as index for hash_map&#39;s bids_-array
Definition: util.h:72
bool _geq(const key_type &a, const key_type &b) const
true iff a &gt;= b
Definition: hash_map.h:1450
std::pair< internal_size_type, value_type > max_value() const
Definition: hash_map.h:1283
#define STXXL_BEGIN_NAMESPACE
Definition: namespace.h:16
internal_size_type _bkt_num(const key_type &key) const
Bucket-index for values with given key.
Definition: hash_map.h:924
external_size_type n_external_
number of elements in external memory
Definition: util.h:68
key_compare cmp_
user supplied strict-weak-ordering for keys
Definition: hash_map.h:144
value_type & reference
type for value-references
Definition: hash_map.h:74
void clear()
Empty cache; don&#39;t write back dirty blocks.
Definition: block_cache.h:491
std::pair< KeyType, MappedType > value_type
actually store (key-data)-pairs
Definition: hash_map.h:72
hash_map_iterator< self_type > iterator
Definition: hash_map.h:93
Produces sorted stream from input stream.
Definition: sort_stream.h:1535
float load_factor() const
Average number of (sub)blocks occupied by a bucket.
Definition: hash_map.h:804
internal_size_type bucket_index(const key_type &k) const
Bucket-index for values with given key.
Definition: hash_map.h:799
node_type * _get_node()
Allocate a new buffer-node.
Definition: hash_map.h:888
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.h:241
internal_size_type i_bucket_
Definition: util.h:443
Stream interface for all value-pairs currently stored in the map.
Definition: util.h:470
bool set_deleted(bool d)
change deleted flag on the next node
Definition: util.h:42
Construct an equality predicate from the comparison operator.
Definition: hash_map.h:1524
NodeType * list_
entry point to the chain in internal memory
Definition: util.h:65
Iterator _end() const
iterator pointing to the end of the hash-map (iterator-type as template-parameter) ...
Definition: hash_map.h:867
#define STXXL_VERBOSE_HASH_MAP(m)
Definition: hash_map.h:33
hash_map(InputIterator begin, InputIterator end, internal_size_type mem_to_sort=256 *1024 *1024, internal_size_type n=0, const hasher &hf=hasher(), const key_compare &cmp=key_compare(), internal_size_type buffer_size=128 *1024 *1024, const allocator_type &a=allocator_type())
Construct a new hash-map and insert all values in the range [f,l)
Definition: hash_map.h:210
KeyCompareType key_compare
functor that imposes a ordering on keys (but see _lt())
Definition: hash_map.h:89
float opt_load_factor_
desired load factor after rehashing
Definition: hash_map.h:165
allocator_type::template rebind< node_type >::other node_allocator_type
Definition: hash_map.h:138
node_type * _new_node(const value_type &value, node_type *nxt, bool del)
Allocate a new buffer-node and initialize with given value, node and deleted-flag.
Definition: hash_map.h:901
external_size_type size() const
Number of values currently stored. Note: If the correct number is currently unknown (because *_oblivo...
Definition: hash_map.h:282
internal_size_type bucket_count() const
Number of buckets.
Definition: hash_map.h:791
void flush()
Write all dirty blocks back to disk.
Definition: block_cache.h:473
buckets_container_type buckets_
array of bucket
Definition: hash_map.h:146
bool _lt(const key_type &a, const key_type &b) const
Definition: hash_map.h:1436
iterator find(const key_type &key)
Look up value by key. Non-const access.
Definition: hash_map.h:623
uint64 external_size_type
Definition: types.h:67
Used to scan external memory with prefetching.
Definition: util.h:95
Main implementation of external memory hash map.
Definition: hash_map.h:60
subblock_type * _load_subblock(const bucket_type &bucket, internal_size_type which_subblock) const
Load the given bucket&#39;s i-th subblock.
Definition: hash_map.h:1034
allocator_type get_allocator() const
Get node memory allocator.
Definition: hash_map.h:248
source_type source_
source of current value: external or internal
Definition: iterator.h:67
typed_block< block_raw_size, subblock_type > block_type
a block consists of block_size subblocks
Definition: hash_map.h:111
stxxl::external_size_type external_size_type
Definition: hash_map.h:82
void reset_statistics()
Reset hash-map statistics.
Definition: hash_map.h:602
bid_container_type::iterator bid_iterator_type
iterator for block-bids
Definition: hash_map.h:120
InputStream::value_type value_type
Definition: hash_map.h:1067
HashingStream(InputStream &input, internal_size_type i_bucket, ValueExtractor vextract, self_type *map)
Definition: hash_map.h:1077
std::pair< internal_size_type, value_type > min_value() const
Definition: hash_map.h:1276
External associative container (map). Introduction to map container: see STXXL Map (B+-tree) tutor...
Definition: map.h:82
void clear()
Reset hash-map: erase all values, invalidate all iterators.
Definition: hash_map.h:543
bool _eq(const key_type &a, const key_type &b) const
true iff a == b. note: it is mandatory that equal keys yield equal hash-values =&gt; hashing not neccess...
Definition: hash_map.h:1454
node_type * node_
current (source=internal) or old (src=external) internal node
Definition: iterator.h:69
subblock_type::bid_type subblock_bid_type
block-identifier for subblocks
Definition: hash_map.h:114
value_type const * const_pointer
const pointer to type of keys
Definition: hash_map.h:80
external_size_type count(const key_type &k) const
Number of values with given key.
Definition: hash_map.h:718
MappedType mapped_type
type of the data to be stored
Definition: hash_map.h:70
external_size_type n_found_internal
Definition: hash_map.h:596
void print_statistics(std::ostream &o=std::cout) const
Print statistics: Number of hits/misses, blocks forced from cache or written back.
Definition: block_cache.h:508
node< ValueType > * set_next(node< ValueType > *n)
change the &quot;next&quot; value of next node pointer
Definition: util.h:54
const_iterator find(const key_type &key) const
Look up value by key. Const access.
Definition: hash_map.h:680
internal_size_type max_bucket_count() const
Maximum number of buckets.
Definition: hash_map.h:795
Buffered writing of values. New Blocks are allocated as needed.
Definition: util.h:302
value_type * pointer
pointer to type of keys
Definition: hash_map.h:78
external_size_type n_subblocks_loaded
Definition: hash_map.h:595
ExtIterator find(ExtIterator begin, ExtIterator end, const EqualityComparable &value, int_type nbuffers=0)
External equivalent of std::find, see stxxl::find.
Definition: scan.h:289
std::pair< iterator, bool > insert(const value_type &value)
Insert a new value if no value with the same key is already present; external memory must therefore b...
Definition: hash_map.h:310
key_type key_
key of current value
Definition: iterator.h:74
iterator_map_type iterator_map_
keeps track of all active iterators
Definition: hash_map.h:154
k-Tuple data type
Definition: tuple.h:67
iterator insert_oblivious(const value_type &value)
Insert a value; external memory is not accessed so that another value with the same key may be overwr...
Definition: hash_map.h:375
hash_map< KeyType, MappedType, HashType, KeyCompareType, SubBlockSize, SubBlocksPerBlock > self_type
Definition: hash_map.h:64
hasher hash_
user supplied mother hash-function
Definition: hash_map.h:142
#define STXXL_END_NAMESPACE
Definition: namespace.h:17
stxxl::int64 difference_type
Definition: hash_map.h:84
node< ValueType > * next()
return the next node, without the &quot;next&quot; flag.
Definition: util.h:49
void fix_iterators_2end(internal_size_type i_bucket, const key_type &key)
Update iterators with given key and bucket and make them point to the end of the hash-map (called by ...
Definition: iterator_map.h:185
void erase_oblivious(const key_type &key)
Erase value by key but without looking at external memory.
Definition: hash_map.h:507
bool empty() const
Check if container is empty.
Definition: hash_map.h:296