STXXL  1.4-dev
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
matrix_arithmetic.h
Go to the documentation of this file.
1 /***************************************************************************
2  * include/stxxl/bits/containers/matrix_arithmetic.h
3  *
4  * Part of the STXXL. See http://stxxl.sourceforge.net
5  *
6  * Copyright (C) 2010-2011 Raoul Steffen <[email protected]>
7  *
8  * Distributed under the Boost Software License, Version 1.0.
9  * (See accompanying file LICENSE_1_0.txt or copy at
10  * http://www.boost.org/LICENSE_1_0.txt)
11  **************************************************************************/
12 
13 #ifndef STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER
14 #define STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER
15 
18 
20 
21 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS
22 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 3
23 #endif
24 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE
25 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 2
26 #endif
27 
28 template <typename ValueType>
29 class column_vector;
30 
31 template <typename ValueType>
32 class row_vector;
33 
34 template <typename ValueType, unsigned BlockSideLength>
35 class swappable_block_matrix;
36 
37 //! \addtogroup matrix
38 //! \{
39 
41 {
42  int_type block_multiplication_calls,
44  block_addition_calls,
45  block_additions_saved_through_zero;
46 
48  : block_multiplication_calls(0),
49  block_multiplications_saved_through_zero(0),
50  block_addition_calls(0),
51  block_additions_saved_through_zero(0) { }
52 
54  {
60  return res;
61  }
62 
64  {
70  return res;
71  }
72 };
73 
75  : public singleton<matrix_operation_statistic>, public matrix_operation_statistic_dataset
76 { };
77 
79 {
80  matrix_operation_statistic_data(const matrix_operation_statistic& stat = * matrix_operation_statistic::get_instance())
82 
85 
87  {
88  return *this = matrix_operation_statistic_data(stat);
89  }
90 
91  void set()
92  { operator = (*matrix_operation_statistic::get_instance()); }
93 
96 
99 };
100 
101 std::ostream& operator << (std::ostream& o, const matrix_operation_statistic_data& statsd)
102 {
103  o << "matrix operation statistics" << std::endl;
104  o << "block multiplication calls : "
105  << statsd.block_multiplication_calls << std::endl;
106  o << "block multiplications saved through zero blocks: "
107  << statsd.block_multiplications_saved_through_zero << std::endl;
108  o << "block multiplications performed : "
110  o << "block addition calls : "
111  << statsd.block_addition_calls << std::endl;
112  o << "block additions saved through zero blocks : "
113  << statsd.block_additions_saved_through_zero << std::endl;
114  o << "block additions performed : "
115  << statsd.block_addition_calls - statsd.block_additions_saved_through_zero << std::endl;
116  return o;
117 }
118 
119 //! \}
120 
121 //! matrix low-level operations and tools
122 namespace matrix_local {
123 
124 //! A static_quadtree holds 4^Level elements arranged in a quad tree.
125 //!
126 //! Static quad trees are useful for recursive algorithms with fixed depth
127 //! that partition the in- and output and perform pre- and postcalculations on the partitions.
128 //! The four children of one node are denoted as ul (up left), ur (up right), dl (down left), and dr (down right).
129 template <typename ValueType, unsigned Level>
131 {
132  typedef static_quadtree<ValueType, Level - 1> smaller_static_quadtree;
133 
135 
138  : ul(ul), ur(ur), dl(dl), dr(dr) { }
139 
141 
142  static_quadtree& operator &= (const static_quadtree& right)
143  {
144  ul &= right.ul, ur &= right.ur;
145  dl &= right.dl, dr &= right.dr;
146  return *this;
147  }
148 
150  {
151  ul += right.ul, ur += right.ur;
152  dl += right.dl, dr += right.dr;
153  return *this;
154  }
155 
156  static_quadtree& operator -= (const static_quadtree& right)
157  {
158  ul -= right.ul, ur -= right.ur;
159  dl -= right.dl, dr -= right.dr;
160  return *this;
161  }
162 
163  static_quadtree operator & (const static_quadtree& right) const
164  { return static_quadtree(ul & right.ul, ur & right.ur, dl & right.dl, dr & right.dr); }
165 
166  static_quadtree operator + (const static_quadtree& right) const
167  { return static_quadtree(ul + right.ul, ur + right.ur, dl + right.dl, dr + right.dr); }
168 
169  static_quadtree operator - (const static_quadtree& right) const
170  { return static_quadtree(ul - right.ul, ur - right.ur, dl - right.dl, dr - right.dr); }
171 };
172 
173 template <typename ValueType>
174 struct static_quadtree<ValueType, 0>
175 {
176  ValueType val;
177 
178  static_quadtree(const ValueType& v)
179  : val(v) { }
180 
182 
183  operator const ValueType& () const
184  { return val; }
185 
186  operator ValueType& ()
187  { return val; }
188 
189  static_quadtree& operator &= (const static_quadtree& right)
190  {
191  val &= right.val;
192  return *this;
193  }
194 
196  {
197  val += right.val;
198  return *this;
199  }
200 
201  static_quadtree& operator -= (const static_quadtree& right)
202  {
203  val -= right.val;
204  return *this;
205  }
206 
207  static_quadtree operator ! () const
208  { return static_quadtree(! val); }
209 
210  static_quadtree operator & (const static_quadtree& right) const
211  { return val & right.val; }
212 
213  static_quadtree operator + (const static_quadtree& right) const
214  { return val + right.val; }
215 
216  static_quadtree operator - (const static_quadtree& right) const
217  { return val - right.val; }
218 };
219 
220 template <typename ValueType, unsigned BlockSideLength, unsigned Level, bool AExists, bool BExists>
222 {
223  typedef static_quadtree<bool, Level> zbt; // true <=> is a zero-block
225 
226  typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, AExists, BExists> smaller_feedable_strassen_winograd_ab;
227  typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, AExists, false> smaller_feedable_strassen_winograd_a;
228  typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, false, BExists> smaller_feedable_strassen_winograd_b;
229  typedef feedable_strassen_winograd<ValueType, BlockSideLength, Level - 1, false, false> smaller_feedable_strassen_winograd_n;
230 
235 
236  const size_type n, m, l;
241 
243  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
244  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
245  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
246  : n(n), m(m), l(l),
247  p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
248  p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
249  p3( bs_c, n/2, m/2, l/2),
250  p4( bs_c, n/2, m/2, l/2),
251  p5( bs_c, n/2, m/2, l/2),
252  p6( bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
253  p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
254 
256  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
257  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
258  : n(n), m(m), l(l),
259  p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2),
260  p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2),
261  p3( bs_c, n/2, m/2, l/2),
262  p4( bs_c, n/2, m/2, l/2),
263  p5( bs_c, n/2, m/2, l/2),
264  p6( bs_c, n/2, m/2, l/2),
265  p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
266 
268  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
269  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
270  : n(n), m(m), l(l),
271  p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
272  p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
273  p3(bs_c, n/2, m/2, l/2),
274  p4(bs_c, n/2, m/2, l/2),
275  p5(bs_c, n/2, m/2, l/2),
276  p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
277  p7(bs_c, n/2, m/2, l/2) {}
278 
280  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
281  : n(n), m(m), l(l),
282  p1(bs_c, n / 2, m / 2, l / 2),
283  p2(bs_c, n / 2, m / 2, l / 2),
284  p3(bs_c, n / 2, m / 2, l / 2),
285  p4(bs_c, n / 2, m / 2, l / 2),
286  p5(bs_c, n / 2, m / 2, l / 2),
287  p6(bs_c, n / 2, m / 2, l / 2),
288  p7(bs_c, n / 2, m / 2, l / 2) { }
289 
290  void begin_feeding_a_block(const size_type& block_row, const size_type& block_col, const zbt zb)
291  {
293  s1 = zb.dl & zb.dr,
294  s2 = s1 & zb.ul,
295  s3 = zb.ul & zb.dl,
296  s4 = zb.ur & s2;
297  p1.begin_feeding_a_block(block_row, block_col, zb.ul);
298  p2.begin_feeding_a_block(block_row, block_col, zb.ur);
299  p3.begin_feeding_a_block(block_row, block_col, s1);
300  p4.begin_feeding_a_block(block_row, block_col, s2);
301  p5.begin_feeding_a_block(block_row, block_col, s3);
302  p6.begin_feeding_a_block(block_row, block_col, s4);
303  p7.begin_feeding_a_block(block_row, block_col, zb.dr);
304  }
305 
306  void feed_a_element(const int_type element_num, const vt v)
307  {
309  s1 = v.dl + v.dr,
310  s2 = s1 - v.ul,
311  s3 = v.ul - v.dl,
312  s4 = v.ur - s2;
313  p1.feed_a_element(element_num, v.ul);
314  p2.feed_a_element(element_num, v.ur);
315  p3.feed_a_element(element_num, s1);
316  p4.feed_a_element(element_num, s2);
317  p5.feed_a_element(element_num, s3);
318  p6.feed_a_element(element_num, s4);
319  p7.feed_a_element(element_num, v.dr);
320  }
321 
322  void end_feeding_a_block(const size_type& block_row, const size_type& block_col, const zbt zb)
323  {
325  s1 = zb.dl & zb.dr,
326  s2 = s1 & zb.ul,
327  s3 = zb.ul & zb.dl,
328  s4 = zb.ur & s2;
329  p1.end_feeding_a_block(block_row, block_col, zb.ul);
330  p2.end_feeding_a_block(block_row, block_col, zb.ur);
331  p3.end_feeding_a_block(block_row, block_col, s1);
332  p4.end_feeding_a_block(block_row, block_col, s2);
333  p5.end_feeding_a_block(block_row, block_col, s3);
334  p6.end_feeding_a_block(block_row, block_col, s4);
335  p7.end_feeding_a_block(block_row, block_col, zb.dr);
336  }
337 
338  void begin_feeding_b_block(const size_type& block_row, const size_type& block_col, const zbt zb)
339  {
341  t1 = zb.ur & zb.ul,
342  t2 = zb.dr & t1,
343  t3 = zb.dr & zb.ur,
344  t4 = zb.dl & t2;
345  p1.begin_feeding_b_block(block_row, block_col, zb.ul);
346  p2.begin_feeding_b_block(block_row, block_col, zb.dl);
347  p3.begin_feeding_b_block(block_row, block_col, t1);
348  p4.begin_feeding_b_block(block_row, block_col, t2);
349  p5.begin_feeding_b_block(block_row, block_col, t3);
350  p6.begin_feeding_b_block(block_row, block_col, zb.dr);
351  p7.begin_feeding_b_block(block_row, block_col, t4);
352  }
353 
354  void feed_b_element(const int_type element_num, const vt v)
355  {
357  t1 = v.ur - v.ul,
358  t2 = v.dr - t1,
359  t3 = v.dr - v.ur,
360  t4 = v.dl - t2;
361  p1.feed_b_element(element_num, v.ul);
362  p2.feed_b_element(element_num, v.dl);
363  p3.feed_b_element(element_num, t1);
364  p4.feed_b_element(element_num, t2);
365  p5.feed_b_element(element_num, t3);
366  p6.feed_b_element(element_num, v.dr);
367  p7.feed_b_element(element_num, t4);
368  }
369 
370  void end_feeding_b_block(const size_type& block_row, const size_type& block_col, const zbt zb)
371  {
373  t1 = zb.ur & zb.ul,
374  t2 = zb.dr & t1,
375  t3 = zb.dr & zb.ur,
376  t4 = zb.dl & t2;
377  p1.end_feeding_b_block(block_row, block_col, zb.ul);
378  p2.end_feeding_b_block(block_row, block_col, zb.dl);
379  p3.end_feeding_b_block(block_row, block_col, t1);
380  p4.end_feeding_b_block(block_row, block_col, t2);
381  p5.end_feeding_b_block(block_row, block_col, t3);
382  p6.end_feeding_b_block(block_row, block_col, zb.dr);
383  p7.end_feeding_b_block(block_row, block_col, t4);
384  }
385 
386  void multiply()
387  {
388  p1.multiply();
389  p2.multiply();
390  p3.multiply();
391  p4.multiply();
392  p5.multiply();
393  p6.multiply();
394  p7.multiply();
395  }
396 
397  zbt begin_reading_block(const size_type& block_row, const size_type& block_col)
398  {
399  zbt r;
400  r.ur = r.ul = p1.begin_reading_block(block_row, block_col);
401  r.ul &= p2.begin_reading_block(block_row, block_col);
402  r.ur &= p4.begin_reading_block(block_row, block_col);
403  r.dr = r.dl = p5.begin_reading_block(block_row, block_col);
404  r.dl &= r.ur;
405  r.dl &= p7.begin_reading_block(block_row, block_col);
406  r.ur &= p3.begin_reading_block(block_row, block_col);
407  r.dr &= r.ur;
408  r.ur &= p6.begin_reading_block(block_row, block_col);
409  return r;
410  }
411 
412  vt read_element(int_type element_num)
413  {
414  vt r;
415  r.ur = r.ul = p1.read_element(element_num);
416  r.ul += p2.read_element(element_num);
417  r.ur += p4.read_element(element_num);
418  r.dr = r.dl = p5.read_element(element_num);
419  r.dl += r.ur;
420  r.dl += p7.read_element(element_num);
421  r.ur += p3.read_element(element_num);
422  r.dr += r.ur;
423  r.ur += p6.read_element(element_num);
424  return r;
425  }
426 
427  zbt end_reading_block(const size_type& block_row, const size_type& block_col)
428  {
429  zbt r;
430  r.ur = r.ul = p1.end_reading_block(block_row, block_col);
431  r.ul &= p2.end_reading_block(block_row, block_col);
432  r.ur &= p4.end_reading_block(block_row, block_col);
433  r.dr = r.dl = p5.end_reading_block(block_row, block_col);
434  r.dl &= r.ur;
435  r.dl &= p7.end_reading_block(block_row, block_col);
436  r.ur &= p3.end_reading_block(block_row, block_col);
437  r.dr &= r.ur;
438  r.ur &= p6.end_reading_block(block_row, block_col);
439  return r;
440  }
441 };
442 
443 template <typename ValueType, unsigned BlockSideLength, bool AExists, bool BExists>
444 struct feedable_strassen_winograd<ValueType, BlockSideLength, 0, AExists, BExists>
445 {
446  typedef static_quadtree<bool, 0> zbt; // true <=> is a zero-block
448 
453 
455  const size_type n, m, l;
457 
459  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
460  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
461  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
462  : a(existing_a, n, l, a_from_row, a_from_col),
463  b(existing_b, n, l, b_from_row, b_from_col),
464  c(bs_c, n, m),
465  n(n), m(m), l(l),
466  iblock(0) { }
467 
469  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
470  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
471  : a(existing_a, n, l, a_from_row, a_from_col),
472  b(bs_c, n, l),
473  c(bs_c, n, m),
474  n(n), m(m), l(l),
475  iblock(0) { }
476 
478  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
479  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
480  : a(bs_c, n, l),
481  b(existing_b, n, l, b_from_row, b_from_col),
482  c(bs_c, n, m),
483  n(n), m(m), l(l),
484  iblock(0) { }
485 
487  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
488  : a(bs_c, n, l),
489  b(bs_c, n, l),
490  c(bs_c, n, m),
491  n(n), m(m), l(l),
492  iblock(0) { }
493 
494  void begin_feeding_a_block(const size_type& block_row, const size_type& block_col, const zbt)
495  {
496  if (! AExists)
497  iblock = &a.bs.acquire(a(block_row, block_col), true);
498  }
499 
500  void feed_a_element(const int_type element_num, const vt v)
501  {
502  if (! AExists)
503  (*iblock)[element_num] = v;
504  }
505 
506  void end_feeding_a_block(const size_type& block_row, const size_type& block_col, const zbt zb)
507  {
508  if (! AExists)
509  {
510  a.bs.release(a(block_row, block_col), ! zb);
511  iblock = 0;
512  }
513  }
514 
515  void begin_feeding_b_block(const size_type& block_row, const size_type& block_col, const zbt)
516  {
517  if (! BExists)
518  iblock = &b.bs.acquire(b(block_row, block_col), true);
519  }
520 
521  void feed_b_element(const int_type element_num, const vt v)
522  {
523  if (! BExists)
524  (*iblock)[element_num] = v;
525  }
526 
527  void end_feeding_b_block(const size_type& block_row, const size_type& block_col, const zbt zb)
528  {
529  if (! BExists)
530  {
531  b.bs.release(b(block_row, block_col), ! zb);
532  iblock = 0;
533  }
534  }
535 
536  void multiply()
538 
539  zbt begin_reading_block(const size_type& block_row, const size_type& block_col)
540  {
541  bool zb = ! c.bs.is_initialized(c(block_row, block_col));
542  iblock = &c.bs.acquire(c(block_row, block_col));
543  return zb;
544  }
545 
546  vt read_element(const int_type element_num)
547  { return (*iblock)[element_num]; }
548 
549  zbt end_reading_block(const size_type& block_row, const size_type& block_col)
550  {
551  c.bs.release(c(block_row, block_col), false);
552  iblock = 0;
553  return ! c.bs.is_initialized(c(block_row, block_col));
554  }
555 };
556 
557 template <typename ValueType, unsigned BlockSideLength, unsigned Level>
559 {
560  typedef static_quadtree<bool, Level> zbt; // true <=> is a zero-block
562 
563  typedef matrix_to_quadtree<ValueType, BlockSideLength, Level - 1> smaller_matrix_to_quadtree;
564 
569 
571 
573  : ul(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, 0),
574  ur(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, matrix.get_width()/2),
575  dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, 0),
576  dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
577  { assert(! (matrix.get_height() % 2 | matrix.get_width() % 2)); }
578 
580  const size_type height, const size_type width, const size_type from_row, const size_type from_col)
581  : ul(matrix, height/2, width/2, from_row, from_col),
582  ur(matrix, height/2, width/2, from_row, from_col + width/2),
583  dl(matrix, height/2, width/2, from_row + height/2, from_col),
584  dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
585  { assert(! (height % 2 | width % 2)); }
586 
587  void begin_feeding_block(const size_type& block_row, const size_type& block_col, const zbt zb)
588  {
589  ul.begin_feeding_block(block_row, block_col, zb.ul);
590  ur.begin_feeding_block(block_row, block_col, zb.ur);
591  dl.begin_feeding_block(block_row, block_col, zb.dl);
592  dr.begin_feeding_block(block_row, block_col, zb.dr);
593  }
594 
595  void feed_element(const int_type element_num, const vt v)
596  {
597  ul.feed_element(element_num, v.ul);
598  ur.feed_element(element_num, v.ur);
599  dl.feed_element(element_num, v.dl);
600  dr.feed_element(element_num, v.dr);
601  }
602 
603  void feed_and_add_element(const int_type element_num, const vt v)
604  {
605  ul.feed_and_add_element(element_num, v.ul);
606  ur.feed_and_add_element(element_num, v.ur);
607  dl.feed_and_add_element(element_num, v.dl);
608  dr.feed_and_add_element(element_num, v.dr);
609  }
610 
611  void end_feeding_block(const size_type& block_row, const size_type& block_col, const zbt zb)
612  {
613  ul.end_feeding_block(block_row, block_col, zb.ul);
614  ur.end_feeding_block(block_row, block_col, zb.ur);
615  dl.end_feeding_block(block_row, block_col, zb.dl);
616  dr.end_feeding_block(block_row, block_col, zb.dr);
617  }
618 
619  zbt begin_reading_block(const size_type& block_row, const size_type& block_col)
620  {
621  zbt zb;
622  zb.ul = ul.begin_reading_block(block_row, block_col);
623  zb.ur = ur.begin_reading_block(block_row, block_col);
624  zb.dl = dl.begin_reading_block(block_row, block_col);
625  zb.dr = dr.begin_reading_block(block_row, block_col);
626  return zb;
627  }
628 
629  vt read_element(const int_type element_num)
630  {
631  vt v;
632  v.ul = ul.read_element(element_num);
633  v.ur = ur.read_element(element_num);
634  v.dl = dl.read_element(element_num);
635  v.dr = dr.read_element(element_num);
636  return v;
637  }
638 
639  zbt end_reading_block(const size_type& block_row, const size_type& block_col)
640  {
641  zbt zb;
642  zb.ul = ul.end_reading_block(block_row, block_col);
643  zb.ur = ur.end_reading_block(block_row, block_col);
644  zb.dl = dl.end_reading_block(block_row, block_col);
645  zb.dr = dr.end_reading_block(block_row, block_col);
646  return zb;
647  }
648 
650  { return ul.get_height_in_blocks(); }
651 
653  { return ul.get_width_in_blocks(); }
654 };
655 
656 template <typename ValueType, unsigned BlockSideLength>
657 struct matrix_to_quadtree<ValueType, BlockSideLength, 0>
658 {
659  typedef static_quadtree<bool, 0> zbt; // true <=> is a zero-block
661 
666 
669 
671  : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0),
672  iblock(0) { }
673 
675  const size_type height, const size_type width, const size_type from_row, const size_type from_col)
676  : m(matrix, height, width, from_row, from_col),
677  iblock(0) { }
678 
679  void begin_feeding_block(const size_type& block_row, const size_type& block_col, const zbt)
680  { iblock = &m.bs.acquire(m(block_row, block_col)); }
681 
682  void feed_element(const int_type element_num, const vt v)
683  { (*iblock)[element_num] = v; }
684 
685  void feed_and_add_element(const int_type element_num, const vt v)
686  { (*iblock)[element_num] += v; }
687 
688  void end_feeding_block(const size_type& block_row, const size_type& block_col, const zbt zb)
689  {
690  m.bs.release(m(block_row, block_col), ! zb);
691  iblock = 0;
692  }
693 
694  zbt begin_reading_block(const size_type& block_row, const size_type& block_col)
695  {
696  zbt zb = ! m.bs.is_initialized(m(block_row, block_col));
697  iblock = &m.bs.acquire(m(block_row, block_col));
698  return zb;
699  }
700 
701  vt read_element(const int_type element_num)
702  { return (*iblock)[element_num]; }
703 
704  zbt end_reading_block(const size_type& block_row, const size_type& block_col)
705  {
706  m.bs.release(m(block_row, block_col), false);
707  iblock = 0;
708  return ! m.bs.is_initialized(m(block_row, block_col));
709  }
710 
712  { return m.get_height(); }
713 
715  { return m.get_width(); }
716 };
717 
718 template <typename ValueType, unsigned BlockSideLength, unsigned Level, bool AExists, bool BExists>
720 {
721  typedef static_quadtree<bool, Level> zbt; // true <=> is a zero-block
723 
724  typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, AExists, BExists> smaller_feedable_strassen_winograd_ab;
725  typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, AExists, false> smaller_feedable_strassen_winograd_a;
726  typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, false, BExists> smaller_feedable_strassen_winograd_b;
727  typedef feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, Level - 1, false, false> smaller_feedable_strassen_winograd_n;
728 
734 
735  const size_type n, m, l;
740 
742  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
743  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
744  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
745  : n(n), m(m), l(l),
746  p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
747  p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
748  p3( bs_c, n/2, m/2, l/2),
749  p4( bs_c, n/2, m/2, l/2),
750  p5( bs_c, n/2, m/2, l/2),
751  p6( bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
752  p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
753 
755  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
756  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
757  : n(n), m(m), l(l),
758  p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2),
759  p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2),
760  p3( bs_c, n/2, m/2, l/2),
761  p4( bs_c, n/2, m/2, l/2),
762  p5( bs_c, n/2, m/2, l/2),
763  p6( bs_c, n/2, m/2, l/2),
764  p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
765 
767  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
768  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
769  : n(n), m(m), l(l),
770  p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
771  p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
772  p3(bs_c, n/2, m/2, l/2),
773  p4(bs_c, n/2, m/2, l/2),
774  p5(bs_c, n/2, m/2, l/2),
775  p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
776  p7(bs_c, n/2, m/2, l/2) {}
777 
779  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
780  : n(n), m(m), l(l),
781  p1(bs_c, n / 2, m / 2, l / 2),
782  p2(bs_c, n / 2, m / 2, l / 2),
783  p3(bs_c, n / 2, m / 2, l / 2),
784  p4(bs_c, n / 2, m / 2, l / 2),
785  p5(bs_c, n / 2, m / 2, l / 2),
786  p6(bs_c, n / 2, m / 2, l / 2),
787  p7(bs_c, n / 2, m / 2, l / 2) { }
788 
789  inline void feed_a(const size_type& row, const size_type& col, const swappable_block_matrix_type& bl)
790  {
791  // partition bl
792  typename Ops::swappable_block_matrix_quarterer qbl(bl);
793  // preadditions
795  s1(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
796  s2(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
797  s3(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
798  s4(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed());
799  Ops::strassen_winograd_preaddition_a(qbl.ul, qbl.ur, qbl.dl, qbl.dr, s1, s2, s3, s4);
800  // feed recursive
801  p1.feed_a(row, col, qbl.ul);
802  p2.feed_a(row, col, qbl.ur);
803  p3.feed_a(row, col, s1);
804  p4.feed_a(row, col, s2);
805  p5.feed_a(row, col, s3);
806  p6.feed_a(row, col, s4);
807  p7.feed_a(row, col, qbl.dr);
808  }
809 
810  inline void feed_b(const size_type& row, const size_type& col, const swappable_block_matrix_type& bl)
811  {
812  // partition bl
813  typename Ops::swappable_block_matrix_quarterer qbl(bl);
814  // preadditions
816  t1(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
817  t2(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
818  t3(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed()),
819  t4(bl.bs, qbl.ul.get_height(), qbl.ul.get_width(), qbl.ul.is_transposed());
820  Ops::strassen_winograd_preaddition_b(qbl.ul, qbl.ur, qbl.dl, qbl.dr, t1, t2, t3, t4);
821  // feed recursive
822  p1.feed_b(row, col, qbl.ul);
823  p2.feed_b(row, col, qbl.dl);
824  p3.feed_b(row, col, t1);
825  p4.feed_b(row, col, t2);
826  p5.feed_b(row, col, t3);
827  p6.feed_b(row, col, qbl.dr);
828  p7.feed_b(row, col, t4);
829  }
830 
831  inline void multiply()
832  {
833  p1.multiply();
834  p2.multiply();
835  p3.multiply();
836  p4.multiply();
837  p5.multiply();
838  p6.multiply();
839  p7.multiply();
840  }
841 
842  inline void read_and_add(const size_type& row, const size_type& col, const swappable_block_matrix_type& bl)
843  {
844  // partition bl
845  typename Ops::swappable_block_matrix_quarterer qbl(bl);
846  // postadditions
848  p2.read_and_add(row, col, qbl.ul);
849  p1.read_and_add(row, col, px);
850  Ops::element_op(qbl.ul, px, typename Ops::addition());
851  p4.read_and_add(row, col, px);
852  Ops::element_op(qbl.ur, px, typename Ops::addition());
853  p5.read_and_add(row, col, px);
854  Ops::element_op_twice_nontransposed(qbl.dl, qbl.dr, px, typename Ops::addition());
855  px.set_zero();
856  p7.read_and_add(row, col, qbl.dl);
857  p3.read_and_add(row, col, px);
858  Ops::element_op_twice_nontransposed(qbl.dr, qbl.ur, px, typename Ops::addition());
859  p6.read_and_add(row, col, qbl.ur);
860  }
861 
863  { return smaller_feedable_strassen_winograd_ab::get_num_temp_grains() + (4 ^ Level) * 2; }
864 };
865 
866 template <typename ValueType, unsigned BlockSideLength, bool AExists, bool BExists>
867 struct feedable_strassen_winograd_block_grained<ValueType, BlockSideLength, 0, AExists, BExists>
868 {
874 
876 
878 
880  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
881  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
882  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
883  : a(existing_a, n, l, a_from_row, a_from_col),
884  b(existing_b, n, l, b_from_row, b_from_col),
885  c(bs_c, n, m) { }
886 
888  const swappable_block_matrix_type& existing_a, const size_type a_from_row, const size_type a_from_col,
889  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
890  : a(existing_a, n, l, a_from_row, a_from_col),
891  b(bs_c, n, l),
892  c(bs_c, n, m) { }
893 
895  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l,
896  const swappable_block_matrix_type& existing_b, const size_type b_from_row, const size_type b_from_col)
897  : a(bs_c, n, l),
898  b(existing_b, n, l, b_from_row, b_from_col),
899  c(bs_c, n, m) { }
900 
902  block_scheduler_type& bs_c, const size_type n, const size_type m, const size_type l)
903  : a(bs_c, n, l),
904  b(bs_c, n, l),
905  c(bs_c, n, m) { }
906 
907  inline void feed_a(const size_type& row, const size_type& col, const swappable_block_matrix_type& bl)
908  {
909  if (! AExists)
910  {
911  // copy bl to a from (row, col) (assuming a from (row, col) == 0)
912  swappable_block_matrix_type at(a, bl.get_height(), bl.get_width(), row, col);
913  Ops::element_op(at, bl, typename Ops::addition());
914  }
915  }
916 
917  inline void feed_b(const size_type& row, const size_type& col, const swappable_block_matrix_type& bl)
918  {
919  if (! BExists)
920  {
921  // copy bl(0,0) to b(row, col) (assuming b from (row, col) == 0)
922  swappable_block_matrix_type bt(b, bl.get_height(), bl.get_width(), row, col);
923  Ops::element_op(bt, bl, typename Ops::addition());
924  }
925  }
926 
927  inline void multiply()
928  {
931  if (! AExists)
932  a.set_zero();
933  if (! BExists)
934  b.set_zero();
935  }
936 
937 
938  inline void read_and_add(const size_type& row, const size_type& col, swappable_block_matrix_type& bl)
939  {
940  // add c from (row, col) to bl
941  swappable_block_matrix_type ct(c, bl.get_height(), bl.get_width(), row, col);
942  Ops::element_op(bl, ct, typename Ops::addition());
943  ct.set_zero();
944  }
945 
947  { return 0; }
948 };
949 
950 template <typename ValueType, unsigned BlockSideLength, unsigned Level, unsigned Granularity>
952 {
955 
956  typedef matrix_to_quadtree_block_grained<ValueType, BlockSideLength, Level - 1, Granularity> smaller_matrix_to_quadtree_block_grained;
957 
959 
961  : ul(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, 0),
962  ur(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, matrix.get_width()/2),
963  dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, 0),
964  dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
965  { assert(! (matrix.get_height() % 2 | matrix.get_width() % 2)); }
966 
968  const size_type height, const size_type width, const size_type from_row, const size_type from_col)
969  : ul(matrix, height/2, width/2, from_row, from_col),
970  ur(matrix, height/2, width/2, from_row, from_col + width/2),
971  dl(matrix, height/2, width/2, from_row + height/2, from_col),
972  dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
973  { assert(! (height % 2 | width % 2)); }
974 
975  inline swappable_block_matrix_type operator () (const size_type& row, const size_type& col)
976  {
977  return swappable_block_matrix_type(ul(row, col), ur(row, col), dl(row, col), dr(row, col));
978  }
979 
980  inline const size_type get_height()
981  { return ul.get_height(); }
982 
983  inline const size_type get_width()
984  { return ul.get_width(); }
985 };
986 
987 template <typename ValueType, unsigned BlockSideLength, unsigned Granularity>
988 struct matrix_to_quadtree_block_grained<ValueType, BlockSideLength, 0, Granularity>
989 {
992 
994 
996  : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0)
997  { assert(! (matrix.get_height() % Granularity | matrix.get_width() % Granularity)); }
998 
1000  const size_type height, const size_type width, const size_type from_row, const size_type from_col)
1001  : m(matrix, height, width, from_row, from_col)
1002  { assert(! (matrix.get_height() % Granularity | matrix.get_width() % Granularity)); }
1003 
1004  inline swappable_block_matrix_type operator () (const size_type& row, const size_type& col)
1005  {
1006  return swappable_block_matrix_type(m, Granularity, Granularity, row * Granularity, col * Granularity);
1007  }
1008 
1009  inline const size_type get_height()
1010  { return m.get_height() / Granularity; }
1011 
1012  inline const size_type get_width()
1013  { return m.get_width() / Granularity; }
1014 };
1015 
1016 template <typename ValueType, unsigned BlockSideLength>
1018 {
1019  // tuning-parameter: Only matrices larger than this (in blocks) are processed by Strassen-Winograd.
1020  // you have to adapt choose_level_for_feedable_sw, too
1022 
1031 
1032  // +-+-+-+ addition +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1033 
1034  struct addition
1035  {
1036  /* op(c,a,b) means c = a <op> b e.g. assign sum
1037  * op(c,a) means c <op>= a e.g. add up
1038  * op(a) means <op>a e.g. sign
1039  *
1040  * it should hold:
1041  * op(c,0,0) equivalent c = 0
1042  * op(c=0,a) equivalent c = op(a)
1043  * op(c,0) equivalent {}
1044  */
1045 
1046  inline ValueType& operator () (ValueType& c, const ValueType& a, const ValueType& b) { return c = a + b; }
1047  inline ValueType& operator () (ValueType& c, const ValueType& a) { return c += a; }
1048  inline ValueType operator () (const ValueType& a) { return +a; }
1049  };
1050 
1052  {
1053  inline ValueType& operator () (ValueType& c, const ValueType& a, const ValueType& b) { return c = a - b; }
1054  inline ValueType& operator () (ValueType& c, const ValueType& a) { return c -= a; }
1055  inline ValueType operator () (const ValueType& a) { return -a; }
1056  };
1057 
1059  {
1060  inline scalar_multiplication(const ValueType scalar = 1) : s(scalar) { }
1061  inline ValueType& operator () (ValueType& c, const ValueType& a) { return c = a * s; }
1062  inline ValueType operator () (const ValueType& a) { return a * s; }
1063  inline operator const ValueType& () { return s; }
1064  const ValueType s;
1065  };
1066 
1067  // element_op<Op>(C,A,B) calculates C = A <Op> B
1068  template <class Op>
1071  const swappable_block_matrix_type& A,
1072  const swappable_block_matrix_type& B, Op op = Op())
1073  {
1074  for (size_type row = 0; row < C.get_height(); ++row)
1075  for (size_type col = 0; col < C.get_width(); ++col)
1076  element_op_swappable_block(
1077  C(row, col), C.is_transposed(), C.bs,
1078  A(row, col), A.is_transposed(), A.bs,
1079  B(row, col), B.is_transposed(), B.bs, op);
1080  return C;
1081  }
1082 
1083  // element_op<Op>(C,A) calculates C <Op>= A
1084  template <class Op>
1085  static swappable_block_matrix_type&
1087  const swappable_block_matrix_type& A, Op op = Op())
1088  {
1089  for (size_type row = 0; row < C.get_height(); ++row)
1090  for (size_type col = 0; col < C.get_width(); ++col)
1091  element_op_swappable_block(
1092  C(row, col), C.is_transposed(), C.bs,
1093  A(row, col), A.is_transposed(), A.bs, op);
1094  return C;
1095  }
1096 
1097  // element_op<Op>(C) calculates C = <Op>C
1098  template <class Op>
1099  static swappable_block_matrix_type&
1100  element_op(swappable_block_matrix_type& C, Op op = Op())
1101  {
1102  for (size_type row = 0; row < C.get_height(); ++row)
1103  for (size_type col = 0; col < C.get_width(); ++col)
1104  element_op_swappable_block(
1105  C(row, col), C.bs, op);
1106  return C;
1107  }
1108 
1109  // calculates c = a <Op> b
1110  template <class Op>
1111  static void
1113  const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type& bs_c,
1114  const swappable_block_identifier_type a, bool a_is_transposed, block_scheduler_type& bs_a,
1115  const swappable_block_identifier_type b, bool b_is_transposed, block_scheduler_type& bs_b, Op op = Op())
1116  {
1117  if (! bs_c.is_simulating())
1118  ++matrix_operation_statistic::get_instance()->block_addition_calls;
1119  // check if zero-block (== ! initialized)
1120  if (! bs_a.is_initialized(a) && ! bs_b.is_initialized(b))
1121  {
1122  // => a and b are zero -> set c zero
1123  bs_c.deinitialize(c);
1124  if (! bs_c.is_simulating())
1125  ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
1126  return;
1127  }
1128  a_is_transposed = a_is_transposed != c_is_transposed;
1129  b_is_transposed = b_is_transposed != c_is_transposed;
1130  if (! bs_a.is_initialized(a))
1131  {
1132  // a is zero -> copy b
1133  internal_block_type& ic = bs_c.acquire(c, true),
1134  & ib = bs_b.acquire(b);
1135  if (! bs_c.is_simulating())
1136  {
1137  if (b_is_transposed)
1139  else
1141  }
1142  bs_b.release(b, false);
1143  bs_c.release(c, true);
1144  }
1145  else if (! bs_b.is_initialized(b))
1146  {
1147  // b is zero -> copy a
1148  internal_block_type& ic = bs_c.acquire(c, true),
1149  & ia = bs_a.acquire(a);
1150  if (! bs_c.is_simulating())
1151  {
1152  if (a_is_transposed)
1154  else
1156  }
1157  bs_a.release(a, false);
1158  bs_c.release(c, true);
1159  }
1160  else
1161  {
1162  internal_block_type& ic = bs_c.acquire(c, true),
1163  & ia = bs_a.acquire(a),
1164  & ib = bs_b.acquire(b);
1165  if (! bs_c.is_simulating())
1166  {
1167  if (a_is_transposed)
1168  {
1169  if (b_is_transposed)
1171  else
1173  }
1174  else
1175  {
1176  if (b_is_transposed)
1178  else
1180  }
1181  }
1182  bs_a.release(a, false);
1183  bs_b.release(b, false);
1184  bs_c.release(c, true);
1185  }
1186  }
1187 
1188  // calculates c <op>= a
1189  template <class Op>
1190  static void
1192  const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type& bs_c,
1193  const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type& bs_a, Op op = Op())
1194  {
1195  if (! bs_c.is_simulating())
1196  ++matrix_operation_statistic::get_instance()->block_addition_calls;
1197  // check if zero-block (== ! initialized)
1198  if (! bs_a.is_initialized(a))
1199  {
1200  // => b is zero => nothing to do
1201  if (! bs_c.is_simulating())
1202  ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
1203  return;
1204  }
1205  const bool c_is_zero = ! bs_c.is_initialized(c);
1206  // acquire
1207  internal_block_type& ic = bs_c.acquire(c, c_is_zero),
1208  & ia = bs_a.acquire(a);
1209  // add
1210  if (! bs_c.is_simulating())
1211  {
1212  if (c_is_zero) {
1213  if (c_is_transposed == a_is_transposed)
1215  else
1217  }
1218  else {
1219  if (c_is_transposed == a_is_transposed)
1221  else
1223  }
1224  }
1225  // release
1226  bs_c.release(c, true);
1227  bs_a.release(a, false);
1228  }
1229 
1230  // calculates c = <op>c
1231  template <class Op>
1232  static void
1234  const swappable_block_identifier_type c, block_scheduler_type& bs_c, Op op = Op())
1235  {
1236  if (! bs_c.is_simulating())
1237  ++matrix_operation_statistic::get_instance()->block_addition_calls;
1238  // check if zero-block (== ! initialized)
1239  if (! bs_c.is_initialized(c))
1240  {
1241  // => c is zero => nothing to do
1242  if (! bs_c.is_simulating())
1243  ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
1244  return;
1245  }
1246  // acquire
1247  internal_block_type& ic = bs_c.acquire(c);
1248  // add
1249  if (! bs_c.is_simulating())
1251  // release
1252  bs_c.release(c, true);
1253  }
1254 
1255  // additions for strassen-winograd
1256 
1257  inline static void
1266  {
1267  for (size_type row = 0; row < a11.get_height(); ++row)
1268  for (size_type col = 0; col < a11.get_width(); ++col)
1269  {
1270  op_swappable_block_nontransposed(s3, a11, subtraction(), a21, row, col);
1271  op_swappable_block_nontransposed(s1, a21, addition(), a22, row, col);
1272  op_swappable_block_nontransposed(s2, s1, subtraction(), a11, row, col);
1273  op_swappable_block_nontransposed(s4, a12, subtraction(), s2, row, col);
1274  }
1275  }
1276 
1277  inline static void
1286  {
1287  for (size_type row = 0; row < b11.get_height(); ++row)
1288  for (size_type col = 0; col < b11.get_width(); ++col)
1289  {
1290  op_swappable_block_nontransposed(t3, b22, subtraction(), b12, row, col);
1291  op_swappable_block_nontransposed(t1, b12, subtraction(), b11, row, col);
1292  op_swappable_block_nontransposed(t2, b22, subtraction(), t1, row, col);
1293  op_swappable_block_nontransposed(t4, b21, subtraction(), t2, row, col);
1294  }
1295  }
1296 
1297  inline static void
1299  swappable_block_matrix_type& c12, // = p6
1300  swappable_block_matrix_type& c21, // = p7
1301  swappable_block_matrix_type& c22, // = p4
1305  {
1306  for (size_type row = 0; row < c11.get_height(); ++row)
1307  for (size_type col = 0; col < c11.get_width(); ++col)
1308  {
1309  op_swappable_block_nontransposed(c11, addition(), p1, row, col); // (u1)
1310  op_swappable_block_nontransposed( p1, addition(), c22, row, col); // (u2)
1311  op_swappable_block_nontransposed( p5, addition(), p1, row, col); // (u3)
1312  op_swappable_block_nontransposed(c21, addition(), p5, row, col); // (u4)
1313  op_swappable_block_nontransposed(c22, p5, addition(), p3, row, col); // (u5)
1314  op_swappable_block_nontransposed( p1, addition(), p3, row, col); // (u6)
1315  op_swappable_block_nontransposed(c12, addition(), p1, row, col); // (u7)
1316  }
1317  }
1318 
1319  // calculates c1 += a; c2 += a
1320  template <class Op>
1321  inline static void
1324  const swappable_block_matrix_type& a, Op op = Op())
1325  {
1326  for (size_type row = 0; row < a.get_height(); ++row)
1327  for (size_type col = 0; col < a.get_width(); ++col)
1328  {
1329  element_op_swappable_block(
1330  c1(row, col), false, c1.bs,
1331  a(row, col), false, a.bs, op);
1332  element_op_swappable_block(
1333  c2(row, col), false, c2.bs,
1334  a(row, col), false, a.bs, op);
1335  }
1336  }
1337 
1338  template <class Op>
1339  inline static void
1342  size_type& row, size_type& col)
1343  {
1344  element_op_swappable_block(
1345  c(row, col), false, c.bs,
1346  a(row, col), false, a.bs,
1347  b(row, col), false, b.bs, op);
1348  }
1349 
1350  template <class Op>
1351  inline static void
1353  size_type& row, size_type& col)
1354  {
1355  element_op_swappable_block(
1356  c(row, col), false, c.bs,
1357  a(row, col), false, a.bs, op);
1358  }
1359 
1360  // +-+ end addition +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1361 
1362  // +-+-+-+ matrix multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1363 
1364  /* n, m and l denote the three dimensions of a matrix multiplication, according to the following ascii-art diagram:
1365  *
1366  * +--m--+
1367  * +----l-----+ | | +--m--+
1368  * | | | | | |
1369  * n A | • l B | = n C |
1370  * | | | | | |
1371  * +----------+ | | +-----+
1372  * +-----+
1373  *
1374  * The index-variables are called i, j, k for dimension
1375  * n, m, l .
1376  */
1377 
1378  // requires height and width divisible by 2
1380  {
1382  downleft, downright,
1383  & ul, & ur, & dl, & dr;
1384 
1386  : upleft (whole, whole.get_height()/2, whole.get_width()/2, 0, 0),
1387  upright (whole, whole.get_height()/2, whole.get_width()/2, 0, whole.get_width()/2),
1388  downleft (whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, 0),
1389  downright(whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
1390  ul(upleft), ur(upright), dl(downleft), dr(downright)
1391  { assert(! (whole.get_height() % 2 | whole.get_width() % 2)); }
1392  };
1393 
1395  {
1397  downleft, downright,
1398  & ul, & ur, & dl, & dr;
1399 
1401  : upleft (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), 0, 0),
1402  upright (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), 0, div_ceil(whole.get_width(),2)),
1403  downleft (whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), div_ceil(whole.get_height(),2), 0),
1404  downright(whole, div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2), div_ceil(whole.get_height(),2), div_ceil(whole.get_width(),2)),
1405  ul(upleft), ur(upright), dl(downleft), dr(downright) {}
1406  };
1407 
1409  {
1411  downleft, downright,
1412  & ul, & ur, & dl, & dr;
1413 
1415  : upleft (whole, whole.get_height()/2, whole.get_width()/2, 0, 0),
1416  upright (whole, whole.get_height()/2, whole.get_width() - whole.get_width()/2, 0, whole.get_width()/2),
1417  downleft (whole, whole.get_height() - whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, 0),
1418  downright(whole, whole.get_height() - whole.get_height()/2, whole.get_width() - whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
1419  ul(upleft), ur(upright), dl(downleft), dr(downright) {}
1420  };
1421 
1422  //! calculates C = A * B + C
1423  // requires fitting dimensions
1424  static swappable_block_matrix_type&
1426  const swappable_block_matrix_type& B,
1428  {
1429  int_type num_levels = ilog2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height())));
1431  {
1435  round_up_to_power_of_two(A.get_width(), num_levels), 0, 0),
1436  padded_b(B, round_up_to_power_of_two(B.get_height(), num_levels),
1437  round_up_to_power_of_two(B.get_width(), num_levels), 0, 0),
1438  padded_c(C, round_up_to_power_of_two(C.get_height(), num_levels),
1439  round_up_to_power_of_two(C.get_width(), num_levels), 0, 0);
1440  switch (num_levels)
1441  {
1442  #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 5 && 5 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1443  case 5:
1444  use_feedable_sw_block_grained<5>(padded_a, padded_a, padded_c);
1445  break;
1446  #endif
1447  #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 4 && 4 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1448  case 4:
1449  use_feedable_sw_block_grained<4>(padded_a, padded_a, padded_c);
1450  break;
1451  #endif
1452  #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 3 && 3 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1453  case 3:
1454  use_feedable_sw_block_grained<3>(padded_a, padded_a, padded_c);
1455  break;
1456  #endif
1457  #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 2 && 2 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1458  case 2:
1459  use_feedable_sw_block_grained<2>(padded_a, padded_a, padded_c);
1460  break;
1461  #endif
1462  default: // only here in case of wrong bounds
1463  strassen_winograd_multiply_and_add_interleaved(A, B, C);
1464  break;
1465  }
1466  }
1467  else
1468  // base case
1469  strassen_winograd_multiply_and_add_interleaved(A, B, C);
1470  return C;
1471  }
1472 
1473  // input matrices have to be padded
1474  template <unsigned Level>
1476  const swappable_block_matrix_type& B,
1478  {
1479  const unsigned granularity = 1;
1480 
1482  fsw(A, 0, 0, C.bs, C.get_height(), C.get_width(), A.get_width(), B, 0, 0);
1483  // preadditions for A
1484  {
1486  mtq_a(A);
1487  for (size_type row = 0; row < mtq_a.get_height(); ++row)
1488  for (size_type col = 0; col < mtq_a.get_width(); ++col)
1489  fsw.feed_a(row, col, mtq_a(row, col));
1490  }
1491  // preadditions for B
1492  {
1494  mtq_b(B);
1495  for (size_type row = 0; row < mtq_b.get_height(); ++row)
1496  for (size_type col = 0; col < mtq_b.get_width(); ++col)
1497  fsw.feed_b(row, col, mtq_b(row, col));
1498  }
1499  // recursive multiplications
1500  fsw.multiply();
1501  // postadditions
1502  {
1504  mtq_c(C);
1505  for (size_type row = 0; row < mtq_c.get_height(); ++row)
1506  for (size_type col = 0; col < mtq_c.get_width(); ++col)
1507  fsw.read_and_add(row, col, mtq_c(row, col));
1508  }
1509  }
1510 
1511  //! calculates C = A * B + C
1512  // requires fitting dimensions
1513  static swappable_block_matrix_type&
1515  const swappable_block_matrix_type& B,
1517  {
1519 
1521  round_up_to_power_of_two(A.get_width(), p), 0, 0),
1522  padded_b(B, round_up_to_power_of_two(B.get_height(), p),
1523  round_up_to_power_of_two(B.get_width(), p), 0, 0),
1524  padded_c(C, round_up_to_power_of_two(C.get_height(), p),
1525  round_up_to_power_of_two(C.get_width(), p), 0, 0);
1526  choose_level_for_feedable_sw(padded_a, padded_b, padded_c);
1527  return C;
1528  }
1529 
1530  // input matrices have to be padded
1532  const swappable_block_matrix_type& B,
1534  {
1535  switch (ilog2_ceil(std::min(A.get_width(), std::min(C.get_width(), C.get_height()))))
1536  {
1537  default:
1538  /*
1539  use_feedable_sw<4>(A, B, C);
1540  break;
1541  case 3:
1542  use_feedable_sw<3>(A, B, C);
1543  break;
1544  case 2:*/
1545  use_feedable_sw<2>(A, B, C);
1546  break;
1547  case 1:
1548  /*use_feedable_sw<1>(A, B, C);
1549  break;*/
1550  case 0:
1551  // base case
1552  recursive_multiply_and_add(A, B, C);
1553  break;
1554  }
1555  }
1556 
1557  // input matrices have to be padded
1558  template <unsigned Level>
1560  const swappable_block_matrix_type& B,
1562  {
1564  fsw(A, 0, 0, C.bs, C.get_height(), C.get_width(), A.get_width(), B, 0, 0);
1565  // preadditions for A
1567  mtq_a(A);
1568  for (size_type block_row = 0; block_row < mtq_a.get_height_in_blocks(); ++block_row)
1569  for (size_type block_col = 0; block_col < mtq_a.get_width_in_blocks(); ++block_col)
1570  {
1571  fsw.begin_feeding_a_block(block_row, block_col,
1572  mtq_a.begin_reading_block(block_row, block_col));
1573  #if STXXL_PARALLEL
1574  #pragma omp parallel for
1575  #endif
1576  for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block)
1577  for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block)
1578  fsw.feed_a_element(element_row_in_block * BlockSideLength + element_col_in_block,
1579  mtq_a.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
1580  fsw.end_feeding_a_block(block_row, block_col,
1581  mtq_a.end_reading_block(block_row, block_col));
1582  }
1583  // preadditions for B
1585  mtq_b(B);
1586  for (size_type block_row = 0; block_row < mtq_b.get_height_in_blocks(); ++block_row)
1587  for (size_type block_col = 0; block_col < mtq_b.get_width_in_blocks(); ++block_col)
1588  {
1589  fsw.begin_feeding_b_block(block_row, block_col,
1590  mtq_b.begin_reading_block(block_row, block_col));
1591  #if STXXL_PARALLEL
1592  #pragma omp parallel for
1593  #endif
1594  for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block)
1595  for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block)
1596  fsw.feed_b_element(element_row_in_block * BlockSideLength + element_col_in_block,
1597  mtq_b.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
1598  fsw.end_feeding_b_block(block_row, block_col,
1599  mtq_b.end_reading_block(block_row, block_col));
1600  }
1601  // recursive multiplications
1602  fsw.multiply();
1603  // postadditions
1605  mtq_c(C);
1606  for (size_type block_row = 0; block_row < mtq_c.get_height_in_blocks(); ++block_row)
1607  for (size_type block_col = 0; block_col < mtq_c.get_width_in_blocks(); ++block_col)
1608  {
1609  mtq_c.begin_feeding_block(block_row, block_col,
1610  fsw.begin_reading_block(block_row, block_col));
1611  #if STXXL_PARALLEL
1612  #pragma omp parallel for
1613  #endif
1614  for (int_type element_row_in_block = 0; element_row_in_block < int_type(BlockSideLength); ++element_row_in_block)
1615  for (int_type element_col_in_block = 0; element_col_in_block < int_type(BlockSideLength); ++element_col_in_block)
1616  mtq_c.feed_and_add_element(element_row_in_block * BlockSideLength + element_col_in_block,
1617  fsw.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
1618  mtq_c.end_feeding_block(block_row, block_col,
1619  fsw.end_reading_block(block_row, block_col));
1620  }
1621  }
1622 
1623  //! calculates C = A * B
1624  // assumes fitting dimensions
1625  static swappable_block_matrix_type&
1627  const swappable_block_matrix_type& B,
1629  {
1630  // base case
1631  if (C.get_height() <= strassen_winograd_base_case_size
1632  || C.get_width() <= strassen_winograd_base_case_size
1633  || A.get_width() <= strassen_winograd_base_case_size)
1634  {
1635  C.set_zero();
1636  return recursive_multiply_and_add(A, B, C);
1637  }
1638 
1639  // partition matrix
1640  swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C);
1641  // preadditions
1642  swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1643  s2(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1644  s3(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1645  s4(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1646  t1(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1647  t2(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1648  t3(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1649  t4(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
1650  strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
1651  strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
1652  // recursive multiplications
1654  // p2 stored in qc.ul
1655  p3(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed()),
1656  // p4 stored in qc.dr
1657  p5(C.bs, qc.ul.get_height(), qc.ul.get_width(), qc.ul.is_transposed());
1658  // p6 stored in qc.ur
1659  // p7 stored in qc.dl
1660  strassen_winograd_multiply(qa.ul, qb.ul, p1);
1661  strassen_winograd_multiply(qa.ur, qb.dl, qc.ul);
1662  strassen_winograd_multiply( s1, t1, p3);
1663  strassen_winograd_multiply( s2, t2, qc.dr);
1664  strassen_winograd_multiply( s3, t3, p5);
1665  strassen_winograd_multiply( s4, qb.dr, qc.ur);
1666  strassen_winograd_multiply(qa.dr, t4, qc.dl);
1667  // postadditions
1668  strassen_winograd_postaddition(qc.ul, qc.ur, qc.dl, qc.dr, p1, p3, p5);
1669  return C;
1670  }
1671 
1672  //! calculates C = A * B + C
1673  // assumes fitting dimensions
1674  static swappable_block_matrix_type&
1676  const swappable_block_matrix_type& B,
1678  {
1679  // base case
1680  if (C.get_height() <= strassen_winograd_base_case_size
1681  || C.get_width() <= strassen_winograd_base_case_size
1682  || A.get_width() <= strassen_winograd_base_case_size)
1683  return recursive_multiply_and_add(A, B, C);
1684 
1685  // partition matrix
1686  swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C);
1687  // preadditions
1688  swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1689  s2(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1690  s3(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1691  s4(C.bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1692  t1(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1693  t2(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1694  t3(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1695  t4(C.bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
1696  strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
1697  strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
1698  // recursive multiplications and postadditions
1700  strassen_winograd_multiply_and_add_interleaved(qa.ur, qb.dl, qc.ul); // p2
1701  strassen_winograd_multiply_and_add_interleaved(qa.ul, qb.ul, px); // p1
1702  element_op<addition>(qc.ul, px);
1703  strassen_winograd_multiply_and_add_interleaved(s2, t2, px); // p4
1704  s2.set_zero();
1705  t2.set_zero();
1706  element_op<addition>(qc.ur, px);
1707  strassen_winograd_multiply_and_add_interleaved(s3, t3, px); // p5
1708  s3.set_zero();
1709  t3.set_zero();
1710  element_op_twice_nontransposed<addition>(qc.dl, qc.dr, px);
1711  px.set_zero();
1712  strassen_winograd_multiply_and_add_interleaved(qa.dr, t4, qc.dl); // p7
1713  t4.set_zero();
1714  strassen_winograd_multiply_and_add_interleaved(s1, t1, px); // p3
1715  s1.set_zero();
1716  t1.set_zero();
1717  element_op_twice_nontransposed<addition>(qc.dr, qc.ur, px);
1718  px.set_zero();
1719  strassen_winograd_multiply_and_add_interleaved(s4, qb.dr, qc.ur); // p6
1720  return C;
1721  }
1722 
1723  //! calculates C = A * B + C
1724  // assumes fitting dimensions
1725  static swappable_block_matrix_type&
1727  const swappable_block_matrix_type& B,
1729  {
1730  // base case
1731  if (C.get_height() <= strassen_winograd_base_case_size
1732  || C.get_width() <= strassen_winograd_base_case_size
1733  || A.get_width() <= strassen_winograd_base_case_size)
1734  return recursive_multiply_and_add(A, B, C);
1735 
1736  // partition matrix
1737  swappable_block_matrix_padding_quarterer qa(A), qb(B), qc(C);
1738  // preadditions
1739  swappable_block_matrix_type s1(C.bs, qa.ul.get_height(), qa.ul.get_width()),
1740  s2(C.bs, qa.ul.get_height(), qa.ul.get_width()),
1741  s3(C.bs, qa.ul.get_height(), qa.ul.get_width()),
1742  s4(C.bs, qa.ul.get_height(), qa.ul.get_width()),
1743  t1(C.bs, qb.ul.get_height(), qb.ul.get_width()),
1744  t2(C.bs, qb.ul.get_height(), qb.ul.get_width()),
1745  t3(C.bs, qb.ul.get_height(), qb.ul.get_width()),
1746  t4(C.bs, qb.ul.get_height(), qb.ul.get_width());
1747  element_op<subtraction>(s3, qa.ul, qa.dl);
1748  element_op<addition>(s1, qa.dl, qa.dr);
1749  element_op<subtraction>(s2, s1, qa.ul);
1750  element_op<subtraction>(s4, qa.ur, s2);
1751  element_op<subtraction>(t3, qb.dr, qb.ur);
1752  element_op<subtraction>(t1, qb.ur, qb.ul);
1753  element_op<subtraction>(t2, qb.dr, t1);
1754  element_op<subtraction>(t4, qb.dl, t2);
1755  // recursive multiplications and postadditions
1757  strassen_winograd_multiply_and_add(qa.ur, qb.dl, qc.ul); // p2
1758  strassen_winograd_multiply_and_add(qa.ul, qb.ul, px); // p1
1759  element_op<addition>(qc.ul, px);
1760  strassen_winograd_multiply_and_add(s2, t2, px); // p4
1761  element_op<addition>(qc.ur, px);
1762  strassen_winograd_multiply_and_add(s3, t3, px); // p5
1763  element_op<addition>(qc.dl, px);
1764  element_op<addition>(qc.dr, px);
1765  px.set_zero();
1766  strassen_winograd_multiply_and_add(qa.dr, t4, qc.dl); // p7
1767  strassen_winograd_multiply_and_add(s1, t1, px); // p3
1768  element_op<addition>(qc.dr, px);
1769  element_op<addition>(qc.ur, px);
1770  strassen_winograd_multiply_and_add(s4, qb.dr, qc.ur); // p6
1771  return C;
1772  }
1773 
1774  //! calculates C = A * B + C
1775  // assumes fitting dimensions
1776  static swappable_block_matrix_type&
1778  const swappable_block_matrix_type& B,
1780  {
1781  // catch empty intervals
1782  if (C.get_height() * C.get_width() * A.get_width() == 0)
1783  return C;
1784  // base case
1785  if ((C.get_height() == 1) + (C.get_width() == 1) + (A.get_width() == 1) >= 2)
1786  return naive_multiply_and_add(A, B, C);
1787 
1788  // partition matrix
1790  // recursive multiplication
1791  // The order of recursive calls is optimized to enhance locality. C has priority because it has to be read and written.
1792  recursive_multiply_and_add(qa.ul, qb.ul, qc.ul);
1793  recursive_multiply_and_add(qa.ur, qb.dl, qc.ul);
1794  recursive_multiply_and_add(qa.ur, qb.dr, qc.ur);
1795  recursive_multiply_and_add(qa.ul, qb.ur, qc.ur);
1796  recursive_multiply_and_add(qa.dl, qb.ur, qc.dr);
1797  recursive_multiply_and_add(qa.dr, qb.dr, qc.dr);
1798  recursive_multiply_and_add(qa.dr, qb.dl, qc.dl);
1799  recursive_multiply_and_add(qa.dl, qb.ul, qc.dl);
1800 
1801  return C;
1802  }
1803 
1804  //! calculates C = A * B + C
1805  // requires fitting dimensions
1806  static swappable_block_matrix_type&
1808  const swappable_block_matrix_type& B,
1810  {
1811  const size_type& n = C.get_height(),
1812  & m = C.get_width(),
1813  & l = A.get_width();
1814  for (size_type i = 0; i < n; ++i)
1815  for (size_type j = 0; j < m; ++j)
1816  for (size_type k = 0; k < l; ++k)
1817  multiply_and_add_swappable_block(A(i, k), A.is_transposed(), A.bs,
1818  B(k, j), B.is_transposed(), B.bs,
1819  C(i, j), C.is_transposed(), C.bs);
1820  return C;
1821  }
1822 
1824  const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type& bs_a,
1825  const swappable_block_identifier_type b, const bool b_is_transposed, block_scheduler_type& bs_b,
1826  const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type& bs_c)
1827  {
1828  if (! bs_c.is_simulating())
1829  ++matrix_operation_statistic::get_instance()->block_multiplication_calls;
1830  // check if zero-block (== ! initialized)
1831  if (! bs_a.is_initialized(a) || ! bs_b.is_initialized(b))
1832  {
1833  // => one factor is zero => product is zero
1834  if (! bs_c.is_simulating())
1835  ++matrix_operation_statistic::get_instance()->block_multiplications_saved_through_zero;
1836  return;
1837  }
1838  // acquire
1839  ValueType* ap = bs_a.acquire(a).begin(),
1840  * bp = bs_b.acquire(b).begin(),
1841  * cp = bs_c.acquire(c).begin();
1842  // multiply
1843  if (! bs_c.is_simulating())
1845  (ap, a_is_transposed, bp, b_is_transposed, cp, c_is_transposed);
1846  // release
1847  bs_a.release(a, false);
1848  bs_b.release(b, false);
1849  bs_c.release(c, true);
1850  }
1851 
1852  // +-+ end matrix multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1853 
1854  // +-+-+-+ matrix-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1855 
1856  //! calculates z = A * x
1857  static column_vector_type&
1860  const vector_size_type offset_x = 0, const vector_size_type offset_z = 0)
1861  {
1862  // catch empty intervals
1863  if (A.get_height() * A.get_width() == 0)
1864  return z;
1865  // base case
1866  if (A.get_height() == 1 || A.get_width() == 1)
1867  return naive_matrix_col_vector_multiply_and_add(A, x, z, offset_x, offset_z);
1868 
1869  // partition matrix
1871  // recursive multiplication
1872  // The order of recursive calls is optimized to enhance locality.
1873  recursive_matrix_col_vector_multiply_and_add(qa.ul, x, z, offset_x, offset_z );
1874  recursive_matrix_col_vector_multiply_and_add(qa.ur, x, z, offset_x + qa.ul.get_width(), offset_z );
1875  recursive_matrix_col_vector_multiply_and_add(qa.dr, x, z, offset_x + qa.ul.get_width(), offset_z + qa.ul.get_height());
1876  recursive_matrix_col_vector_multiply_and_add(qa.dl, x, z, offset_x, offset_z + qa.ul.get_height());
1877 
1878  return z;
1879  }
1880 
1881  static column_vector_type&
1884  const vector_size_type offset_x = 0, const vector_size_type offset_z = 0)
1885  {
1886  for (size_type row = 0; row < A.get_height(); ++row)
1887  for (size_type col = 0; col < A.get_width(); ++col)
1888  matrix_col_vector_multiply_and_add_swappable_block(A(row, col), A.is_transposed(), A.bs,
1889  x, z, (offset_x + col) * BlockSideLength, (offset_z + row) * BlockSideLength);
1890  return z;
1891  }
1892 
1894  const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type& bs_a,
1896  const vector_size_type offset_x = 0, const vector_size_type offset_z = 0)
1897  {
1898  // check if zero-block (== ! initialized)
1899  if (! bs_a.is_initialized(a))
1900  {
1901  // => matrix is zero => product is zero
1902  return;
1903  }
1904  // acquire
1905  internal_block_type& ia = bs_a.acquire(a);
1906  // multiply
1907  if (! bs_a.is_simulating())
1908  {
1909  int_type row_limit = std::min(BlockSideLength, unsigned(z.size() - offset_z)),
1910  col_limit = std::min(BlockSideLength, unsigned(x.size() - offset_x));
1911  if (a_is_transposed)
1912  for (int_type col = 0; col < col_limit; ++col)
1913  for (int_type row = 0; row < row_limit; ++row)
1914  z[offset_z + row] += x[offset_x + col] * ia[row + col * BlockSideLength];
1915  else
1916  for (int_type row = 0; row < row_limit; ++row)
1917  for (int_type col = 0; col < col_limit; ++col)
1918  z[offset_z + row] += x[offset_x + col] * ia[row * BlockSideLength + col];
1919  }
1920  // release
1921  bs_a.release(a, false);
1922  }
1923 
1924  //! calculates z = y * A
1925  static row_vector_type&
1928  const vector_size_type offset_y = 0, const vector_size_type offset_z = 0)
1929  {
1930  // catch empty intervals
1931  if (A.get_height() * A.get_width() == 0)
1932  return z;
1933  // base case
1934  if (A.get_height() == 1 || A.get_width() == 1)
1935  return naive_matrix_row_vector_multiply_and_add(y, A, z, offset_y, offset_z);
1936 
1937  // partition matrix
1939  // recursive multiplication
1940  // The order of recursive calls is optimized to enhance locality.
1941  recursive_matrix_row_vector_multiply_and_add(y, qa.ul, z, offset_y, offset_z );
1942  recursive_matrix_row_vector_multiply_and_add(y, qa.dl, z, offset_y + qa.ul.get_height(), offset_z );
1943  recursive_matrix_row_vector_multiply_and_add(y, qa.dr, z, offset_y + qa.ul.get_height(), offset_z + qa.ul.get_width());
1944  recursive_matrix_row_vector_multiply_and_add(y, qa.ur, z, offset_y, offset_z + qa.ul.get_width());
1945 
1946  return z;
1947  }
1948 
1949  static row_vector_type&
1951  row_vector_type& z,
1952  const vector_size_type offset_y = 0, const vector_size_type offset_z = 0)
1953  {
1954  for (size_type row = 0; row < A.get_height(); ++row)
1955  for (size_type col = 0; col < A.get_width(); ++col)
1956  matrix_row_vector_multiply_and_add_swappable_block(y, A(row, col), A.is_transposed(), A.bs,
1957  z, (offset_y + row) * BlockSideLength, (offset_z + col) * BlockSideLength);
1958  return z;
1959  }
1960 
1962  const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type& bs_a,
1963  row_vector_type& z,
1964  const vector_size_type offset_y = 0, const vector_size_type offset_z = 0)
1965  {
1966  // check if zero-block (== ! initialized)
1967  if (! bs_a.is_initialized(a))
1968  {
1969  // => matrix is zero => product is zero
1970  return;
1971  }
1972  // acquire
1973  internal_block_type& ia = bs_a.acquire(a);
1974  // multiply
1975  if (! bs_a.is_simulating())
1976  {
1977  int_type row_limit = std::min(BlockSideLength, unsigned(y.size() - offset_y)),
1978  col_limit = std::min(BlockSideLength, unsigned(z.size() - offset_z));
1979  if (a_is_transposed)
1980  for (int_type col = 0; col < col_limit; ++col)
1981  for (int_type row = 0; row < row_limit; ++row)
1982  z[offset_z + col] += ia[row + col * BlockSideLength] * y[offset_y + row];
1983  else
1984  for (int_type row = 0; row < row_limit; ++row)
1985  for (int_type col = 0; col < col_limit; ++col)
1986  z[offset_z + col] += ia[row * BlockSideLength + col] * y[offset_y + row];
1987  }
1988  // release
1989  bs_a.release(a, false);
1990  }
1991 
1992  // +-+ end matrix-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1993 
1994  // +-+-+-+ vector-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1995 
1997  const row_vector_type& r, vector_size_type offset_l = 0, vector_size_type offset_r = 0)
1998  {
1999  // catch empty intervals
2000  if (A.get_height() * A.get_width() == 0)
2001  return;
2002  // base case
2003  if (A.get_height() == 1 || A.get_width() == 1)
2004  {
2005  naive_matrix_from_vectors(A, l, r, offset_l, offset_r);
2006  return;
2007  }
2008 
2009  // partition matrix
2011  // recursive creation
2012  // The order of recursive calls is optimized to enhance locality.
2013  recursive_matrix_from_vectors(qa.ul, l, r, offset_l, offset_r );
2014  recursive_matrix_from_vectors(qa.ur, l, r, offset_l, offset_r + qa.ul.get_width());
2015  recursive_matrix_from_vectors(qa.dr, l, r, offset_l + qa.ul.get_height(), offset_r + qa.ul.get_width());
2016  recursive_matrix_from_vectors(qa.dl, l, r, offset_l + qa.ul.get_height(), offset_r );
2017  }
2018 
2020  const row_vector_type& r, vector_size_type offset_l = 0, vector_size_type offset_r = 0)
2021  {
2022  for (size_type row = 0; row < A.get_height(); ++row)
2023  for (size_type col = 0; col < A.get_width(); ++col)
2024  matrix_from_vectors_swappable_block(A(row, col), A.is_transposed(), A.bs,
2025  l, r, (offset_l + row) * BlockSideLength, (offset_r + col) * BlockSideLength);
2026  }
2027 
2029  const bool a_is_transposed, block_scheduler_type& bs_a,
2030  const column_vector_type& l, const row_vector_type& r,
2031  vector_size_type offset_l, vector_size_type offset_r)
2032  {
2033  // acquire
2034  internal_block_type& ia = bs_a.acquire(a, true);
2035  // multiply
2036  if (! bs_a.is_simulating())
2037  {
2038  int_type row_limit = std::min(BlockSideLength, unsigned(l.size() - offset_l)),
2039  col_limit = std::min(BlockSideLength, unsigned(r.size() - offset_r));
2040  if (a_is_transposed)
2041  for (int_type col = 0; col < col_limit; ++col)
2042  for (int_type row = 0; row < row_limit; ++row)
2043  ia[row + col * BlockSideLength] = l[row + offset_l] * r[col + offset_r];
2044  else
2045  for (int_type row = 0; row < row_limit; ++row)
2046  for (int_type col = 0; col < col_limit; ++col)
2047  ia[row * BlockSideLength + col] = l[row + offset_l] * r[col + offset_r];
2048  }
2049  // release
2050  bs_a.release(a, true);
2051  }
2052 
2053  // +-+ end vector-vector multiplication +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
2054 };
2055 
2056 // Adjust choose_level_for_feedable_sw, too!
2057 template <typename ValueType, unsigned BlockSideLength>
2058 const int_type matrix_operations<ValueType, BlockSideLength>::strassen_winograd_base_case_size = 3;
2059 
2060 } // namespace matrix_local
2061 
2063 
2064 #endif // !STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER
void begin_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt)
swappable_block_matrix_type::size_type size_type
static void strassen_winograd_preaddition_a(swappable_block_matrix_type &a11, swappable_block_matrix_type &a12, swappable_block_matrix_type &a21, swappable_block_matrix_type &a22, swappable_block_matrix_type &s1, swappable_block_matrix_type &s2, swappable_block_matrix_type &s3, swappable_block_matrix_type &s4)
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
compat::remove_const< Integral >::type div_ceil(Integral n, Integral2 d)
Definition: utils.h:199
friend std::ostream & operator<<(std::ostream &os, const uint_pair &a)
make a uint_pair outputtable via iostreams, using unsigned long long.
Definition: uint_types.h:228
bool is_initialized(const swappable_block_identifier_type sbid) const
check if the swappable_block is initialized.
static swappable_block_matrix_type & strassen_winograd_multiply(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, AExists, false > smaller_feedable_strassen_winograd_a
matrix_to_quadtree_block_grained< ValueType, BlockSideLength, Level-1, Granularity > smaller_matrix_to_quadtree_block_grained
block_scheduler_type & bs
Definition: matrix.h:242
static swappable_block_matrix_type & element_op(swappable_block_matrix_type &C, const swappable_block_matrix_type &A, Op op=Op())
static void element_op_twice_nontransposed(swappable_block_matrix_type &c1, swappable_block_matrix_type &c2, const swappable_block_matrix_type &a, Op op=Op())
external column-vector container for matrix multiplication
Definition: matrix.h:49
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, false, BExists > smaller_feedable_strassen_winograd_b
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
void deinitialize(const swappable_block_identifier_type sbid)
Drop all data in the given block, freeing in- and external memory.
static swappable_block_matrix_type & naive_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
smaller_feedable_strassen_winograd_a p7
matrix_to_quadtree(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
static_quadtree(smaller_static_quadtree ul, smaller_static_quadtree ur, smaller_static_quadtree dl, smaller_static_quadtree dr)
static swappable_block_matrix_type &return C
static void recursive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l=0, vector_size_type offset_r=0)
void feed_b(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
SwappableBlockType::internal_block_type internal_block_type
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
void feed_a(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
void begin_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
void begin_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
void read_and_add(const size_type &row, const size_type &col, swappable_block_matrix_type &bl)
#define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE
void begin_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
External container for a (sub)matrix. Not intended for direct use.
Definition: matrix.h:232
multiplies matrices A and B, adds result to C, for arbitrary entries param pointer to blocks of A...
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static uint_pair min()
return an uint_pair instance containing the smallest value possible
Definition: uint_types.h:234
static_quadtree< ValueType, Level-1 > smaller_static_quadtree
column_vector_type::size_type vector_size_type
size_type size() const
return the size of the vector.
Definition: vector.h:1027
swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type
smaller_matrix_to_quadtree_block_grained ur
void feed_b_element(const int_type element_num, const vt v)
const size_type & get_height() const
Definition: matrix.h:391
bool is_simulating() const
Returns if simulation mode is on, i.e. if a prediction sequence is being recorded.
column_vector< ValueType > column_vector_type
void feed_a(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, false, false > smaller_feedable_strassen_winograd_n
static void element_op_swappable_block(const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c, const swappable_block_identifier_type a, bool a_is_transposed, block_scheduler_type &bs_a, const swappable_block_identifier_type b, bool b_is_transposed, block_scheduler_type &bs_b, Op op=Op())
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static void element_op_swappable_block(const swappable_block_identifier_type c, block_scheduler_type &bs_c, Op op=Op())
c = a [op] b; for arbitrary entries
static void op_swappable_block_nontransposed(swappable_block_matrix_type &c, Op op, swappable_block_matrix_type &a, size_type &row, size_type &col)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
#define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS
void end_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static void element_op_swappable_block(const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c, const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, Op op=Op())
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
void begin_feeding_block(const size_type &block_row, const size_type &block_col, const zbt)
A static_quadtree holds 4^Level elements arranged in a quad tree.
const bool & is_transposed() const
if the elements inside the blocks are in transposed order i.e. column-major
Definition: matrix.h:398
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
block_scheduler_type::swappable_block_identifier_type swappable_block_identifier_type
Definition: matrix.h:238
swappable_block_matrix_type::size_type size_type
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, AExists, BExists > smaller_feedable_strassen_winograd_ab
swappable_block_matrix_type::size_type size_type
static const int_type strassen_winograd_base_case_size
void begin_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt)
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
uint_pair & operator+=(const uint_pair &b)
addition operator (uses 64-bit arithmetic)
Definition: uint_types.h:183
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static void multiply_and_add_swappable_block(const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const swappable_block_identifier_type b, const bool b_is_transposed, block_scheduler_type &bs_b, const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c)
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
static void use_feedable_sw_block_grained(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
void end_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static_quadtree< ValueType, Level > vt
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
External matrix container. Introduction to matrix container: see STXXL Matrix tutorial. Design and Internals of matrix container: see Matrix.
Definition: matrix.h:44
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix)
static void strassen_winograd_preaddition_b(swappable_block_matrix_type &b11, swappable_block_matrix_type &b12, swappable_block_matrix_type &b21, swappable_block_matrix_type &b22, swappable_block_matrix_type &t1, swappable_block_matrix_type &t2, swappable_block_matrix_type &t3, swappable_block_matrix_type &t4)
choose_int_types< my_pointer_size >::int_type int_type
Definition: types.h:63
void end_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
smaller_feedable_strassen_winograd_n p5
static_quadtree< ValueType, Level > vt
static swappable_block_matrix_type & strassen_winograd_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
smaller_feedable_strassen_winograd_b p6
static void strassen_winograd_postaddition(swappable_block_matrix_type &c11, swappable_block_matrix_type &c12, swappable_block_matrix_type &c21, swappable_block_matrix_type &c22, swappable_block_matrix_type &p1, swappable_block_matrix_type &p3, swappable_block_matrix_type &p5)
void end_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static void naive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l=0, vector_size_type offset_r=0)
block_scheduler_type::internal_block_type internal_block_type
const size_type & get_width() const
Definition: matrix.h:394
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, AExists, false > smaller_feedable_strassen_winograd_a
static_quadtree< bool, Level > zbt
#define STXXL_BEGIN_NAMESPACE
Definition: namespace.h:16
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, AExists, BExists > smaller_feedable_strassen_winograd_ab
void end_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
static void op_swappable_block_nontransposed(swappable_block_matrix_type &c, swappable_block_matrix_type &a, Op op, swappable_block_matrix_type &b, size_type &row, size_type &col)
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
void feed_b(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
smaller_feedable_strassen_winograd_ab p2
void feed_element(const int_type element_num, const vt v)
static row_vector_type & naive_matrix_row_vector_multiply_and_add(const row_vector_type &y, const swappable_block_matrix_type &A, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
vector_type::size_type size_type
Definition: matrix.h:53
static swappable_block_matrix_type & recursive_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
static swappable_block_matrix_type & multi_level_strassen_winograd_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
void feed_a_element(const int_type element_num, const vt v)
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, false, BExists > smaller_feedable_strassen_winograd_b
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
matrix_to_quadtree(const swappable_block_matrix_type &matrix)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
vt read_element(const int_type element_num)
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, false, false > smaller_feedable_strassen_winograd_n
matrix_to_quadtree(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
static void matrix_row_vector_multiply_and_add_swappable_block(const row_vector_type &y, const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
static swappable_block_matrix_type & element_op(swappable_block_matrix_type &C, const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, Op op=Op())
matrix_operations< ValueType, BlockSideLength > Ops
void read_and_add(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
static row_vector_type & recursive_matrix_row_vector_multiply_and_add(const row_vector_type &y, const swappable_block_matrix_type &A, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
calculates z = y * A
static swappable_block_matrix_type & multi_level_strassen_winograd_multiply_and_add_block_grained(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
choose_int_types< my_pointer_size >::unsigned_type unsigned_type
Definition: types.h:64
unsigned int ilog2_ceil(const IntegerType &i)
calculate the log2 ceiling of an integer type (by repeated bit shifts)
Definition: utils.h:188
static column_vector_type & naive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type &A, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
static void choose_level_for_feedable_sw(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
static void use_feedable_sw(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
external row-vector container for matrix multiplication
Definition: matrix.h:120
void release(const swappable_block_identifier_type sbid, const bool dirty)
Release the given block. Has to be in pairs with acquire. Pairs may be nested and interleaved...
Schedules swapping of blocks and provides blocks for temporary storage.
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
block_scheduler_type::internal_block_type internal_block_type
void feed_and_add_element(const int_type element_num, const vt v)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static void matrix_from_vectors_swappable_block(swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l, vector_size_type offset_r)
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static column_vector_type & recursive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type &A, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
calculates z = A * x
void end_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static swappable_block_matrix_type & strassen_winograd_multiply_and_add_interleaved(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
static void matrix_col_vector_multiply_and_add_swappable_block(const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
static Integral round_up_to_power_of_two(Integral n)
Definition: utils.h:255
matrix_to_quadtree< ValueType, BlockSideLength, Level-1 > smaller_matrix_to_quadtree
swappable_block_matrix_type::size_type size_type
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
#define STXXL_END_NAMESPACE
Definition: namespace.h:17
internal_block_type & acquire(const swappable_block_identifier_type sbid, const bool uninitialized=false)
Acquire the given block. Has to be in pairs with release. Pairs may be nested and interleaved...