STXXL  1.4-dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
multiseq_selection.h
Go to the documentation of this file.
1 /***************************************************************************
2  * include/stxxl/bits/parallel/multiseq_selection.h
3  *
4  * Functions to find elements of a certain global rank in multiple sorted
5  * sequences. Also serves for splitting such sequence sets.
6  *
7  * Extracted from MCSTL - http://algo2.iti.uni-karlsruhe.de/singler/mcstl/
8  *
9  * Part of the STXXL. See http://stxxl.sourceforge.net
10  *
11  * Copyright (C) 2007 Johannes Singler <[email protected]>
12  * Copyright (C) 2014 Timo Bingmann <[email protected]>
13  *
14  * Distributed under the Boost Software License, Version 1.0.
15  * (See accompanying file LICENSE_1_0.txt or copy at
16  * http://www.boost.org/LICENSE_1_0.txt)
17  **************************************************************************/
18 
19 #ifndef STXXL_PARALLEL_MULTISEQ_SELECTION_HEADER
20 #define STXXL_PARALLEL_MULTISEQ_SELECTION_HEADER
21 
22 #include <stxxl/bits/namespace.h>
25 
26 #include <vector>
27 #include <queue>
28 #include <cstdlib>
29 #include <algorithm>
30 
32 
33 namespace parallel {
34 
35 //! Compare a pair of types lexcigraphically, ascending.
36 template <typename T1, typename T2, typename Comparator>
38  : public std::binary_function<std::pair<T1, T2>, std::pair<T1, T2>, bool>
39 {
40 protected:
41  Comparator& m_comp;
42 
43 public:
44  lexicographic(Comparator& comp) : m_comp(comp) { }
45 
46  inline bool operator () (const std::pair<T1, T2>& p1,
47  const std::pair<T1, T2>& p2)
48  {
49  if (m_comp(p1.first, p2.first))
50  return true;
51 
52  if (m_comp(p2.first, p1.first))
53  return false;
54 
55  // firsts are equal
56  return p1.second < p2.second;
57  }
58 };
59 
60 //! Compare a pair of types lexcigraphically, descending.
61 template <typename T1, typename T2, typename Comparator>
63  : public std::binary_function<std::pair<T1, T2>, std::pair<T1, T2>, bool>
64 {
65 protected:
66  Comparator& m_comp;
67 
68 public:
69  lexicographic_rev(Comparator& comp) : m_comp(comp) { }
70 
71  inline bool operator () (const std::pair<T1, T2>& p1,
72  const std::pair<T1, T2>& p2)
73  {
74  if (m_comp(p2.first, p1.first))
75  return true;
76 
77  if (m_comp(p1.first, p2.first))
78  return false;
79 
80  // firsts are equal
81  return p2.second < p1.second;
82  }
83 };
84 
85 /*!
86  * Splits several sorted sequences at a certain global rank, resulting in a
87  * splitting point for each sequence. The sequences are passed via a sequence
88  * of random-access iterator pairs, none of the sequences may be empty. If
89  * there are several equal elements across the split, the ones on the left side
90  * will be chosen from sequences with smaller number.
91  *
92  * \param begin_seqs Begin of the sequence of iterator pairs.
93  * \param end_seqs End of the sequence of iterator pairs.
94  * \param rank The global rank to partition at.
95  * \param begin_offsets A random-access sequence begin where the result will be
96  * stored in. Each element of the sequence is an iterator that points to the
97  * first element on the greater part of the respective sequence.
98  * \param comp The ordering functor, defaults to std::less<T>.
99  */
100 template <typename RanSeqs, typename RankType, typename RankIterator,
101  typename Comparator>
103  const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
104  const RankType& rank,
105  RankIterator begin_offsets,
106  Comparator comp = std::less<
107  typename std::iterator_traits<typename std::iterator_traits<RanSeqs>
108  ::value_type::first_type>::value_type
109  >()) //std::less<T>
110 {
111  STXXL_PARALLEL_PCALL(end_seqs - begin_seqs);
112 
113  typedef typename std::iterator_traits<RanSeqs>
114  ::value_type::first_type iterator;
115  typedef typename std::iterator_traits<iterator>
116  ::difference_type diff_type;
117  typedef typename std::iterator_traits<iterator>
118  ::value_type value_type;
119 
120  typedef std::pair<value_type, diff_type> sample_pair;
121  // comparators for sample_pair
124 
125  // number of sequences, number of elements in total (possibly including padding)
126  const diff_type m = std::distance(begin_seqs, end_seqs);
127  diff_type nmax, n;
128  RankType N = 0;
129 
130  for (diff_type i = 0; i < m; ++i)
131  {
132  N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
133  assert(std::distance(begin_seqs[i].first, begin_seqs[i].second) > 0);
134  }
135 
136  if (rank == N)
137  {
138  for (diff_type i = 0; i < m; ++i)
139  begin_offsets[i] = begin_seqs[i].second; // very end
140  return;
141  }
142 
143  assert(m != 0 && N != 0 && rank < N);
144 
145  diff_type* seqlen = new diff_type[m];
146 
147  seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
148  nmax = seqlen[0];
149  for (diff_type i = 1; i < m; ++i)
150  {
151  seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
152  nmax = std::max(nmax, seqlen[i]);
153  }
154 
155  // pad all lists to this length, at least as long as any ns[i], equality
156  // iff nmax = 2^k - 1
157  diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
158 
159  diff_type* a = new diff_type[m], * b = new diff_type[m];
160 
161  for (diff_type i = 0; i < m; ++i)
162  a[i] = 0, b[i] = l;
163 
164  n = l / 2;
165 
166  // invariants:
167  // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
168 
169 #define S(i) (begin_seqs[i].first)
170 
171  // initial partition
172 
173  std::vector<sample_pair> sample;
174 
175  for (diff_type i = 0; i < m; ++i) {
176  if (n < seqlen[i]) {
177  // sequence long enough
178  sample.push_back(sample_pair(S(i)[n], i));
179  }
180  }
181 
182  std::sort(sample.begin(), sample.end(), lcomp);
183 
184  for (diff_type i = 0; i < m; ++i) {
185  // conceptual infinity
186  if (n >= seqlen[i]) {
187  // sequence too short, conceptual infinity
188  sample.push_back(sample_pair(S(i)[0] /*dummy element*/, i));
189  }
190  }
191 
192  diff_type localrank = rank / l;
193 
194  diff_type j;
195  for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
196  a[sample[j].second] += n + 1;
197  for ( ; j < m; ++j)
198  b[sample[j].second] -= n + 1;
199 
200  // further refinement
201 
202  while (n > 0)
203  {
204  n /= 2;
205 
206  const value_type* lmax = NULL; // impossible to avoid the warning?
207  for (diff_type i = 0; i < m; ++i)
208  {
209  if (a[i] > 0)
210  {
211  if (!lmax)
212  lmax = &(S(i)[a[i] - 1]);
213  else
214  {
215  // max, favor rear sequences
216  if (!comp(S(i)[a[i] - 1], *lmax))
217  lmax = &(S(i)[a[i] - 1]);
218  }
219  }
220  }
221 
222  for (diff_type i = 0; i < m; ++i)
223  {
224  diff_type middle = (b[i] + a[i]) / 2;
225  if (lmax && middle < seqlen[i] && comp(S(i)[middle], *lmax))
226  a[i] = std::min(a[i] + n + 1, seqlen[i]);
227  else
228  b[i] -= n + 1;
229  }
230 
231  diff_type leftsize = 0;
232  for (diff_type i = 0; i < m; ++i)
233  leftsize += a[i] / (n + 1);
234 
235  diff_type skew = rank / (n + 1) - leftsize;
236 
237  if (skew > 0)
238  {
239  // move to the left, find smallest
240  std::priority_queue<sample_pair, std::vector<sample_pair>,
242  pq(lrcomp);
243 
244  for (diff_type i = 0; i < m; ++i) {
245  if (b[i] < seqlen[i])
246  pq.push(sample_pair(S(i)[b[i]], i));
247  }
248 
249  for ( ; skew != 0 && !pq.empty(); --skew)
250  {
251  diff_type source = pq.top().second;
252  pq.pop();
253 
254  a[source] = std::min(a[source] + n + 1, seqlen[source]);
255  b[source] += n + 1;
256 
257  if (b[source] < seqlen[source])
258  pq.push(sample_pair(S(source)[b[source]], source));
259  }
260  }
261  else if (skew < 0)
262  {
263  // move to the right, find greatest
264  std::priority_queue<sample_pair, std::vector<sample_pair>,
266  pq(lcomp);
267 
268  for (diff_type i = 0; i < m; ++i) {
269  if (a[i] > 0)
270  pq.push(sample_pair(S(i)[a[i] - 1], i));
271  }
272 
273  for ( ; skew != 0; ++skew)
274  {
275  diff_type source = pq.top().second;
276  pq.pop();
277 
278  a[source] -= n + 1;
279  b[source] -= n + 1;
280 
281  if (a[source] > 0)
282  pq.push(sample_pair(S(source)[a[source] - 1], source));
283  }
284  }
285  }
286 
287  // postconditions: a[i] == b[i] in most cases, except when a[i] has been
288  // clamped because of having reached the boundary
289 
290  // now return the result, calculate the offset, compare the keys on both
291  // edges of the border
292 
293  // maximum of left edge, minimum of right edge
294  value_type* maxleft = NULL, * minright = NULL;
295  for (diff_type i = 0; i < m; ++i)
296  {
297  if (a[i] > 0)
298  {
299  if (!maxleft)
300  {
301  maxleft = &(S(i)[a[i] - 1]);
302  }
303  else
304  {
305  // max, favor rear sequences
306  if (!comp(S(i)[a[i] - 1], *maxleft))
307  maxleft = &(S(i)[a[i] - 1]);
308  }
309  }
310  if (b[i] < seqlen[i])
311  {
312  if (!minright)
313  {
314  minright = &(S(i)[b[i]]);
315  }
316  else
317  {
318  // min, favor fore sequences
319  if (comp(S(i)[b[i]], *minright))
320  minright = &(S(i)[b[i]]);
321  }
322  }
323  }
324 
325  for (diff_type i = 0; i < m; ++i)
326  begin_offsets[i] = S(i) + a[i];
327 
328  delete[] seqlen;
329  delete[] a;
330  delete[] b;
331 }
332 
333 /*!
334  * Selects the element at a certain global rank from several sorted sequences.
335  *
336  * The sequences are passed via a sequence of random-access iterator pairs,
337  * none of the sequences may be empty.
338  *
339  * \param begin_seqs Begin of the sequence of iterator pairs.
340  * \param end_seqs End of the sequence of iterator pairs.
341  * \param rank The global rank to partition at.
342  * \param offset The rank of the selected element in the global subsequence of
343  * elements equal to the selected element. If the selected element is unique,
344  * this number is 0.
345  * \param comp The ordering functor, defaults to std::less. */
346 template <typename ValueType, typename RanSeqs, typename RankType, typename Comparator>
347 ValueType multiseq_selection(const RanSeqs& begin_seqs, const RanSeqs& end_seqs,
348  const RankType& rank,
349  RankType& offset,
350  Comparator comp = std::less<ValueType>())
351 {
352  STXXL_PARALLEL_PCALL(end_seqs - begin_seqs);
353 
354  typedef typename std::iterator_traits<RanSeqs>
355  ::value_type::first_type iterator;
356  typedef typename std::iterator_traits<iterator>
357  ::difference_type diff_type;
358 
359  typedef std::pair<ValueType, diff_type> sample_pair;
360  // comparators for sample_pair
363 
364  // number of sequences, number of elements in total (possibly including padding)
365  const diff_type m = std::distance(begin_seqs, end_seqs);
366  diff_type nmax, n;
367  RankType N = 0;
368 
369  for (diff_type i = 0; i < m; ++i)
370  N += std::distance(begin_seqs[i].first, begin_seqs[i].second);
371 
372  if (m == 0 || N == 0 || rank < 0 || rank >= N)
373  // result undefined when there is no data or rank is outside bounds
374  throw std::exception();
375 
376  diff_type* seqlen = new diff_type[m];
377 
378  seqlen[0] = std::distance(begin_seqs[0].first, begin_seqs[0].second);
379  nmax = seqlen[0];
380  for (diff_type i = 1; i < m; ++i)
381  {
382  seqlen[i] = std::distance(begin_seqs[i].first, begin_seqs[i].second);
383  nmax = std::max(nmax, seqlen[i]);
384  }
385 
386  // pad all lists to this length, at least as long as any ns[i], equliaty iff
387  // nmax = 2^k - 1
388  diff_type l = round_up_to_power_of_two(nmax + 1) - 1;
389 
390  diff_type* a = new diff_type[m], * b = new diff_type[m];
391 
392  for (diff_type i = 0; i < m; ++i)
393  a[i] = 0, b[i] = l;
394 
395  n = l / 2;
396 
397  // invariants:
398  // 0 <= a[i] <= seqlen[i], 0 <= b[i] <= l
399 
400 #define S(i) (begin_seqs[i].first)
401 
402  //initial partition
403 
404  std::vector<sample_pair> sample;
405 
406  for (diff_type i = 0; i < m; ++i) {
407  if (n < seqlen[i])
408  sample.push_back(sample_pair(S(i)[n], i));
409  }
410 
411  std::sort(sample.begin(), sample.end(), lcomp);
412 
413  for (diff_type i = 0; i < m; ++i) {
414  //conceptual infinity
415  if (n >= seqlen[i])
416  sample.push_back(sample_pair(S(i)[0] /*dummy element*/, i));
417  }
418 
419  diff_type localrank = rank / l;
420 
421  diff_type j;
422  for (j = 0; j < localrank && ((n + 1) <= seqlen[sample[j].second]); ++j)
423  a[sample[j].second] += n + 1;
424  for ( ; j < m; ++j)
425  b[sample[j].second] -= n + 1;
426 
427  // further refinement
428 
429  while (n > 0)
430  {
431  n /= 2;
432 
433  const ValueType* lmax = NULL;
434  for (diff_type i = 0; i < m; ++i)
435  {
436  if (a[i] > 0)
437  {
438  if (!lmax)
439  {
440  lmax = &(S(i)[a[i] - 1]);
441  }
442  else
443  {
444  // max
445  if (comp(*lmax, S(i)[a[i] - 1]))
446  lmax = &(S(i)[a[i] - 1]);
447  }
448  }
449  }
450 
451  for (diff_type i = 0; i < m; ++i)
452  {
453  diff_type middle = (b[i] + a[i]) / 2;
454  if (lmax && middle < seqlen[i] && comp(S(i)[middle], *lmax))
455  a[i] = std::min(a[i] + n + 1, seqlen[i]);
456  else
457  b[i] -= n + 1;
458  }
459 
460  diff_type leftsize = 0;
461  for (diff_type i = 0; i < m; ++i)
462  leftsize += a[i] / (n + 1);
463 
464  diff_type skew = rank / (n + 1) - leftsize;
465 
466  if (skew > 0)
467  {
468  // move to the left, find smallest
469  std::priority_queue<sample_pair, std::vector<sample_pair>,
471  pq(lrcomp);
472 
473  for (diff_type i = 0; i < m; ++i) {
474  if (b[i] < seqlen[i])
475  pq.push(sample_pair(S(i)[b[i]], i));
476  }
477 
478  for ( ; skew != 0 && !pq.empty(); --skew)
479  {
480  diff_type source = pq.top().second;
481  pq.pop();
482 
483  a[source] = std::min(a[source] + n + 1, seqlen[source]);
484  b[source] += n + 1;
485 
486  if (b[source] < seqlen[source])
487  pq.push(sample_pair(S(source)[b[source]], source));
488  }
489  }
490  else if (skew < 0)
491  {
492  // move to the right, find greatest
493  std::priority_queue<sample_pair, std::vector<sample_pair>,
495  pq(lcomp);
496 
497  for (diff_type i = 0; i < m; ++i) {
498  if (a[i] > 0)
499  pq.push(sample_pair(S(i)[a[i] - 1], i));
500  }
501 
502  for ( ; skew != 0; ++skew)
503  {
504  diff_type source = pq.top().second;
505  pq.pop();
506 
507  a[source] -= n + 1;
508  b[source] -= n + 1;
509 
510  if (a[source] > 0)
511  pq.push(sample_pair(S(source)[a[source] - 1], source));
512  }
513  }
514  }
515 
516  // postconditions: a[i] == b[i] in most cases, except when a[i] has been
517  // clamped because of having reached the boundary
518 
519  // now return the result, calculate the offset, compare the keys on both
520  // edges of the border
521 
522  // maximum of left edge, minimum of right edge
523  ValueType* maxleft = NULL, * minright = NULL;
524  for (diff_type i = 0; i < m; ++i)
525  {
526  if (a[i] > 0)
527  {
528  if (!maxleft)
529  {
530  maxleft = &(S(i)[a[i] - 1]);
531  }
532  else
533  {
534  // max
535  if (comp(*maxleft, S(i)[a[i] - 1]))
536  maxleft = &(S(i)[a[i] - 1]);
537  }
538  }
539  if (b[i] < seqlen[i])
540  {
541  if (!minright)
542  {
543  minright = &(S(i)[b[i]]);
544  }
545  else
546  {
547  // min
548  if (comp(S(i)[b[i]], *minright))
549  minright = &(S(i)[b[i]]);
550  }
551  }
552  }
553 
554  // minright is the splitter, in any case
555 
556  if (!maxleft || comp(*minright, *maxleft))
557  {
558  // good luck, everything is split unambigiously
559  offset = 0;
560  }
561  else
562  {
563  // we have to calculate an offset
564  offset = 0;
565 
566  for (diff_type i = 0; i < m; ++i)
567  {
568  diff_type lb = std::lower_bound(S(i), S(i) + seqlen[i], *minright, comp) - S(i);
569  offset += a[i] - lb;
570  }
571  }
572 
573  delete[] seqlen;
574  delete[] a;
575  delete[] b;
576 
577  return *minright;
578 }
579 
580 #undef S
581 
582 } // namespace parallel
583 
585 
586 #endif // !STXXL_PARALLEL_MULTISEQ_SELECTION_HEADER
void multiseq_partition(const RanSeqs &begin_seqs, const RanSeqs &end_seqs, const RankType &rank, RankIterator begin_offsets, Comparator comp=std::less< typename std::iterator_traits< typename std::iterator_traits< RanSeqs >::value_type::first_type >::value_type >())
Splits several sorted sequences at a certain global rank, resulting in a splitting point for each seq...
void sort(ExtIterator first, ExtIterator last, StrictWeakOrdering cmp, unsigned_type M)
Sort records comparison-based, see stxxl::sort -- Sorting Comparison-Based.
Definition: sort.h:676
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.h:234
Compare a pair of types lexcigraphically, ascending.
ValueType multiseq_selection(const RanSeqs &begin_seqs, const RanSeqs &end_seqs, const RankType &rank, RankType &offset, Comparator comp=std::less< ValueType >())
Selects the element at a certain global rank from several sorted sequences.
#define STXXL_BEGIN_NAMESPACE
Definition: namespace.h:16
static uint_pair max()
return an uint_pair instance containing the largest value possible
Definition: uint_types.h:241
#define STXXL_PARALLEL_PCALL(n)
#define S(i)
Compare a pair of types lexcigraphically, descending.
static Integral round_up_to_power_of_two(Integral n)
Definition: utils.h:255
#define STXXL_END_NAMESPACE
Definition: namespace.h:17