13 #ifndef STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER
14 #define STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER
21 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS
22 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 3
24 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE
25 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 2
28 template <
typename ValueType>
31 template <
typename ValueType>
34 template <
typename ValueType,
unsigned BlockS
ideLength>
35 class swappable_block_matrix;
45 block_additions_saved_through_zero;
48 : block_multiplication_calls(0),
49 block_multiplications_saved_through_zero(0),
50 block_addition_calls(0),
51 block_additions_saved_through_zero(0) { }
92 { operator = (*matrix_operation_statistic::get_instance()); }
103 o <<
"matrix operation statistics" << std::endl;
104 o <<
"block multiplication calls : "
106 o <<
"block multiplications saved through zero blocks: "
108 o <<
"block multiplications performed : "
110 o <<
"block addition calls : "
112 o <<
"block additions saved through zero blocks : "
114 o <<
"block additions performed : "
122 namespace matrix_local {
129 template <
typename ValueType,
unsigned Level>
138 : ul(ul), ur(ur), dl(dl), dr(dr) { }
144 ul &= right.
ul, ur &= right.
ur;
145 dl &= right.
dl, dr &= right.
dr;
151 ul += right.
ul, ur += right.
ur;
152 dl += right.
dl, dr += right.
dr;
158 ul -= right.
ul, ur -= right.
ur;
159 dl -= right.
dl, dr -= right.
dr;
173 template <
typename ValueType>
183 operator const ValueType& ()
const
186 operator ValueType& ()
211 {
return val & right.val; }
214 {
return val + right.val; }
217 {
return val - right.val; }
220 template <
typename ValueType,
unsigned BlockS
ideLength,
unsigned Level,
bool AExists,
bool BExists>
247 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
248 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
249 p3( bs_c, n/2, m/2, l/2),
250 p4( bs_c, n/2, m/2, l/2),
251 p5( bs_c, n/2, m/2, l/2),
252 p6( bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
253 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
259 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2),
260 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2),
261 p3( bs_c, n/2, m/2, l/2),
262 p4( bs_c, n/2, m/2, l/2),
263 p5( bs_c, n/2, m/2, l/2),
264 p6( bs_c, n/2, m/2, l/2),
265 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
271 p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
272 p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
273 p3(bs_c, n/2, m/2, l/2),
274 p4(bs_c, n/2, m/2, l/2),
275 p5(bs_c, n/2, m/2, l/2),
276 p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
277 p7(bs_c, n/2, m/2, l/2) {}
282 p1(bs_c, n / 2, m / 2, l / 2),
283 p2(bs_c, n / 2, m / 2, l / 2),
284 p3(bs_c, n / 2, m / 2, l / 2),
285 p4(bs_c, n / 2, m / 2, l / 2),
286 p5(bs_c, n / 2, m / 2, l / 2),
287 p6(bs_c, n / 2, m / 2, l / 2),
288 p7(bs_c, n / 2, m / 2, l / 2) { }
297 p1.begin_feeding_a_block(block_row, block_col, zb.
ul);
298 p2.begin_feeding_a_block(block_row, block_col, zb.
ur);
299 p3.begin_feeding_a_block(block_row, block_col, s1);
300 p4.begin_feeding_a_block(block_row, block_col, s2);
301 p5.begin_feeding_a_block(block_row, block_col, s3);
302 p6.begin_feeding_a_block(block_row, block_col, s4);
303 p7.begin_feeding_a_block(block_row, block_col, zb.
dr);
313 p1.feed_a_element(element_num, v.
ul);
314 p2.feed_a_element(element_num, v.
ur);
315 p3.feed_a_element(element_num, s1);
316 p4.feed_a_element(element_num, s2);
317 p5.feed_a_element(element_num, s3);
318 p6.feed_a_element(element_num, s4);
319 p7.feed_a_element(element_num, v.
dr);
329 p1.end_feeding_a_block(block_row, block_col, zb.
ul);
330 p2.end_feeding_a_block(block_row, block_col, zb.
ur);
331 p3.end_feeding_a_block(block_row, block_col, s1);
332 p4.end_feeding_a_block(block_row, block_col, s2);
333 p5.end_feeding_a_block(block_row, block_col, s3);
334 p6.end_feeding_a_block(block_row, block_col, s4);
335 p7.end_feeding_a_block(block_row, block_col, zb.
dr);
345 p1.begin_feeding_b_block(block_row, block_col, zb.
ul);
346 p2.begin_feeding_b_block(block_row, block_col, zb.
dl);
347 p3.begin_feeding_b_block(block_row, block_col, t1);
348 p4.begin_feeding_b_block(block_row, block_col, t2);
349 p5.begin_feeding_b_block(block_row, block_col, t3);
350 p6.begin_feeding_b_block(block_row, block_col, zb.
dr);
351 p7.begin_feeding_b_block(block_row, block_col, t4);
361 p1.feed_b_element(element_num, v.
ul);
362 p2.feed_b_element(element_num, v.
dl);
363 p3.feed_b_element(element_num, t1);
364 p4.feed_b_element(element_num, t2);
365 p5.feed_b_element(element_num, t3);
366 p6.feed_b_element(element_num, v.
dr);
367 p7.feed_b_element(element_num, t4);
377 p1.end_feeding_b_block(block_row, block_col, zb.
ul);
378 p2.end_feeding_b_block(block_row, block_col, zb.
dl);
379 p3.end_feeding_b_block(block_row, block_col, t1);
380 p4.end_feeding_b_block(block_row, block_col, t2);
381 p5.end_feeding_b_block(block_row, block_col, t3);
382 p6.end_feeding_b_block(block_row, block_col, zb.
dr);
383 p7.end_feeding_b_block(block_row, block_col, t4);
400 r.
ur = r.
ul = p1.begin_reading_block(block_row, block_col);
401 r.
ul &= p2.begin_reading_block(block_row, block_col);
402 r.
ur &= p4.begin_reading_block(block_row, block_col);
403 r.
dr = r.
dl = p5.begin_reading_block(block_row, block_col);
405 r.
dl &= p7.begin_reading_block(block_row, block_col);
406 r.
ur &= p3.begin_reading_block(block_row, block_col);
408 r.
ur &= p6.begin_reading_block(block_row, block_col);
415 r.
ur = r.
ul = p1.read_element(element_num);
416 r.
ul += p2.read_element(element_num);
417 r.
ur += p4.read_element(element_num);
418 r.
dr = r.
dl = p5.read_element(element_num);
420 r.
dl += p7.read_element(element_num);
421 r.
ur += p3.read_element(element_num);
423 r.
ur += p6.read_element(element_num);
430 r.
ur = r.
ul = p1.end_reading_block(block_row, block_col);
431 r.
ul &= p2.end_reading_block(block_row, block_col);
432 r.
ur &= p4.end_reading_block(block_row, block_col);
433 r.
dr = r.
dl = p5.end_reading_block(block_row, block_col);
435 r.
dl &= p7.end_reading_block(block_row, block_col);
436 r.
ur &= p3.end_reading_block(block_row, block_col);
438 r.
ur &= p6.end_reading_block(block_row, block_col);
443 template <
typename ValueType,
unsigned BlockS
ideLength,
bool AExists,
bool BExists>
462 : a(existing_a, n, l, a_from_row, a_from_col),
463 b(existing_b, n, l, b_from_row, b_from_col),
471 : a(existing_a, n, l, a_from_row, a_from_col),
481 b(existing_b, n, l, b_from_row, b_from_col),
497 iblock = &a.bs.acquire(a(block_row, block_col),
true);
503 (*iblock)[element_num] = v;
510 a.bs.release(a(block_row, block_col), ! zb);
518 iblock = &b.bs.acquire(b(block_row, block_col),
true);
524 (*iblock)[element_num] = v;
531 b.bs.release(b(block_row, block_col), ! zb);
541 bool zb = ! c.bs.is_initialized(c(block_row, block_col));
542 iblock = &c.bs.acquire(c(block_row, block_col));
547 {
return (*iblock)[element_num]; }
551 c.bs.release(c(block_row, block_col),
false);
553 return ! c.bs.is_initialized(c(block_row, block_col));
557 template <
typename ValueType,
unsigned BlockS
ideLength,
unsigned Level>
573 : ul(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, 0),
574 ur(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, matrix.get_width()/2),
575 dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, 0),
576 dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
581 : ul(matrix, height/2, width/2, from_row, from_col),
582 ur(matrix, height/2, width/2, from_row, from_col + width/2),
583 dl(matrix, height/2, width/2, from_row + height/2, from_col),
584 dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
585 { assert(! (height % 2 | width % 2)); }
589 ul.begin_feeding_block(block_row, block_col, zb.
ul);
590 ur.begin_feeding_block(block_row, block_col, zb.
ur);
591 dl.begin_feeding_block(block_row, block_col, zb.
dl);
592 dr.begin_feeding_block(block_row, block_col, zb.
dr);
597 ul.feed_element(element_num, v.
ul);
598 ur.feed_element(element_num, v.
ur);
599 dl.feed_element(element_num, v.
dl);
600 dr.feed_element(element_num, v.
dr);
605 ul.feed_and_add_element(element_num, v.
ul);
606 ur.feed_and_add_element(element_num, v.
ur);
607 dl.feed_and_add_element(element_num, v.
dl);
608 dr.feed_and_add_element(element_num, v.
dr);
613 ul.end_feeding_block(block_row, block_col, zb.
ul);
614 ur.end_feeding_block(block_row, block_col, zb.
ur);
615 dl.end_feeding_block(block_row, block_col, zb.
dl);
616 dr.end_feeding_block(block_row, block_col, zb.
dr);
622 zb.
ul = ul.begin_reading_block(block_row, block_col);
623 zb.
ur = ur.begin_reading_block(block_row, block_col);
624 zb.
dl = dl.begin_reading_block(block_row, block_col);
625 zb.
dr = dr.begin_reading_block(block_row, block_col);
632 v.
ul = ul.read_element(element_num);
633 v.
ur = ur.read_element(element_num);
634 v.
dl = dl.read_element(element_num);
635 v.
dr = dr.read_element(element_num);
642 zb.
ul = ul.end_reading_block(block_row, block_col);
643 zb.
ur = ur.end_reading_block(block_row, block_col);
644 zb.
dl = dl.end_reading_block(block_row, block_col);
645 zb.
dr = dr.end_reading_block(block_row, block_col);
650 {
return ul.get_height_in_blocks(); }
653 {
return ul.get_width_in_blocks(); }
656 template <
typename ValueType,
unsigned BlockS
ideLength>
671 : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0),
676 : m(matrix, height, width, from_row, from_col),
680 { iblock = &m.bs.acquire(m(block_row, block_col)); }
683 { (*iblock)[element_num] = v; }
686 { (*iblock)[element_num] += v; }
690 m.bs.release(m(block_row, block_col), ! zb);
696 zbt zb = ! m.bs.is_initialized(m(block_row, block_col));
697 iblock = &m.bs.acquire(m(block_row, block_col));
702 {
return (*iblock)[element_num]; }
706 m.bs.release(m(block_row, block_col),
false);
708 return ! m.bs.is_initialized(m(block_row, block_col));
712 {
return m.get_height(); }
715 {
return m.get_width(); }
718 template <
typename ValueType,
unsigned BlockS
ideLength,
unsigned Level,
bool AExists,
bool BExists>
746 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
747 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
748 p3( bs_c, n/2, m/2, l/2),
749 p4( bs_c, n/2, m/2, l/2),
750 p5( bs_c, n/2, m/2, l/2),
751 p6( bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
752 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
758 p1(existing_a, a_from_row, a_from_col, bs_c, n/2, m/2, l/2),
759 p2(existing_a, a_from_row, a_from_col + l/2, bs_c, n/2, m/2, l/2),
760 p3( bs_c, n/2, m/2, l/2),
761 p4( bs_c, n/2, m/2, l/2),
762 p5( bs_c, n/2, m/2, l/2),
763 p6( bs_c, n/2, m/2, l/2),
764 p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
770 p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row, b_from_col),
771 p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
772 p3(bs_c, n/2, m/2, l/2),
773 p4(bs_c, n/2, m/2, l/2),
774 p5(bs_c, n/2, m/2, l/2),
775 p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
776 p7(bs_c, n/2, m/2, l/2) {}
781 p1(bs_c, n / 2, m / 2, l / 2),
782 p2(bs_c, n / 2, m / 2, l / 2),
783 p3(bs_c, n / 2, m / 2, l / 2),
784 p4(bs_c, n / 2, m / 2, l / 2),
785 p5(bs_c, n / 2, m / 2, l / 2),
786 p6(bs_c, n / 2, m / 2, l / 2),
787 p7(bs_c, n / 2, m / 2, l / 2) { }
799 Ops::strassen_winograd_preaddition_a(qbl.
ul, qbl.
ur, qbl.
dl, qbl.
dr, s1, s2, s3, s4);
801 p1.feed_a(row, col, qbl.
ul);
802 p2.feed_a(row, col, qbl.
ur);
803 p3.feed_a(row, col, s1);
804 p4.feed_a(row, col, s2);
805 p5.feed_a(row, col, s3);
806 p6.feed_a(row, col, s4);
807 p7.feed_a(row, col, qbl.
dr);
820 Ops::strassen_winograd_preaddition_b(qbl.
ul, qbl.
ur, qbl.
dl, qbl.
dr, t1, t2, t3, t4);
822 p1.feed_b(row, col, qbl.
ul);
823 p2.feed_b(row, col, qbl.
dl);
824 p3.feed_b(row, col, t1);
825 p4.feed_b(row, col, t2);
826 p5.feed_b(row, col, t3);
827 p6.feed_b(row, col, qbl.
dr);
828 p7.feed_b(row, col, t4);
848 p2.read_and_add(row, col, qbl.
ul);
849 p1.read_and_add(row, col, px);
851 p4.read_and_add(row, col, px);
853 p5.read_and_add(row, col, px);
854 Ops::element_op_twice_nontransposed(qbl.
dl, qbl.
dr, px,
typename Ops::addition());
856 p7.read_and_add(row, col, qbl.
dl);
857 p3.read_and_add(row, col, px);
858 Ops::element_op_twice_nontransposed(qbl.
dr, qbl.
ur, px,
typename Ops::addition());
859 p6.read_and_add(row, col, qbl.
ur);
863 {
return smaller_feedable_strassen_winograd_ab::get_num_temp_grains() + (4 ^ Level) * 2; }
866 template <
typename ValueType,
unsigned BlockS
ideLength,
bool AExists,
bool BExists>
883 : a(existing_a, n, l, a_from_row, a_from_col),
884 b(existing_b, n, l, b_from_row, b_from_col),
890 : a(existing_a, n, l, a_from_row, a_from_col),
898 b(existing_b, n, l, b_from_row, b_from_col),
950 template <
typename ValueType,
unsigned BlockS
ideLength,
unsigned Level,
unsigned Granularity>
961 : ul(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, 0),
962 ur(matrix, matrix.get_height()/2, matrix.get_width()/2, 0, matrix.get_width()/2),
963 dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, 0),
964 dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
969 : ul(matrix, height/2, width/2, from_row, from_col),
970 ur(matrix, height/2, width/2, from_row, from_col + width/2),
971 dl(matrix, height/2, width/2, from_row + height/2, from_col),
972 dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
973 { assert(! (height % 2 | width % 2)); }
981 {
return ul.get_height(); }
984 {
return ul.get_width(); }
987 template <
typename ValueType,
unsigned BlockS
ideLength,
unsigned Granularity>
996 : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0)
1001 : m(matrix, height, width, from_row, from_col)
1010 {
return m.get_height() / Granularity; }
1013 {
return m.get_width() / Granularity; }
1016 template <
typename ValueType,
unsigned BlockS
ideLength>
1046 inline ValueType& operator () (ValueType& c,
const ValueType& a,
const ValueType& b) {
return c = a + b; }
1047 inline ValueType& operator () (ValueType& c,
const ValueType& a) {
return c += a; }
1048 inline ValueType operator () (
const ValueType& a) {
return +a; }
1053 inline ValueType& operator () (ValueType& c,
const ValueType& a,
const ValueType& b) {
return c = a - b; }
1054 inline ValueType& operator () (ValueType& c,
const ValueType& a) {
return c -= a; }
1055 inline ValueType operator () (
const ValueType& a) {
return -a; }
1061 inline ValueType& operator () (ValueType& c,
const ValueType& a) {
return c = a * s; }
1062 inline ValueType operator () (
const ValueType& a) {
return a * s; }
1063 inline operator const ValueType& () {
return s; }
1076 element_op_swappable_block(
1085 static swappable_block_matrix_type&
1091 element_op_swappable_block(
1099 static swappable_block_matrix_type&
1100 element_op(swappable_block_matrix_type& C, Op op = Op())
1102 for (
size_type row = 0; row < C.get_height(); ++row)
1103 for (
size_type col = 0; col < C.get_width(); ++col)
1104 element_op_swappable_block(
1105 C(row, col), C.bs, op);
1118 ++matrix_operation_statistic::get_instance()->block_addition_calls;
1125 ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
1128 a_is_transposed = a_is_transposed != c_is_transposed;
1129 b_is_transposed = b_is_transposed != c_is_transposed;
1137 if (b_is_transposed)
1152 if (a_is_transposed)
1167 if (a_is_transposed)
1169 if (b_is_transposed)
1176 if (b_is_transposed)
1196 ++matrix_operation_statistic::get_instance()->block_addition_calls;
1202 ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
1213 if (c_is_transposed == a_is_transposed)
1219 if (c_is_transposed == a_is_transposed)
1237 ++matrix_operation_statistic::get_instance()->block_addition_calls;
1243 ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
1270 op_swappable_block_nontransposed(s3, a11,
subtraction(), a21, row, col);
1271 op_swappable_block_nontransposed(s1, a21,
addition(), a22, row, col);
1272 op_swappable_block_nontransposed(s2, s1,
subtraction(), a11, row, col);
1273 op_swappable_block_nontransposed(s4, a12,
subtraction(), s2, row, col);
1290 op_swappable_block_nontransposed(t3, b22,
subtraction(), b12, row, col);
1291 op_swappable_block_nontransposed(t1, b12,
subtraction(), b11, row, col);
1292 op_swappable_block_nontransposed(t2, b22,
subtraction(), t1, row, col);
1293 op_swappable_block_nontransposed(t4, b21,
subtraction(), t2, row, col);
1309 op_swappable_block_nontransposed(c11,
addition(), p1, row, col);
1310 op_swappable_block_nontransposed( p1,
addition(), c22, row, col);
1311 op_swappable_block_nontransposed( p5,
addition(), p1, row, col);
1312 op_swappable_block_nontransposed(c21,
addition(), p5, row, col);
1313 op_swappable_block_nontransposed(c22, p5,
addition(), p3, row, col);
1314 op_swappable_block_nontransposed( p1,
addition(), p3, row, col);
1315 op_swappable_block_nontransposed(c12,
addition(), p1, row, col);
1329 element_op_swappable_block(
1330 c1(row, col),
false, c1.
bs,
1331 a(row, col),
false, a.
bs, op);
1332 element_op_swappable_block(
1333 c2(row, col),
false, c2.
bs,
1334 a(row, col),
false, a.
bs, op);
1344 element_op_swappable_block(
1345 c(row, col),
false, c.
bs,
1346 a(row, col),
false, a.
bs,
1347 b(row, col),
false, b.
bs, op);
1355 element_op_swappable_block(
1356 c(row, col),
false, c.
bs,
1357 a(row, col),
false, a.
bs, op);
1382 downleft, downright,
1383 & ul, &
ur, & dl, & dr;
1386 : upleft (whole, whole.get_height()/2, whole.get_width()/2, 0, 0),
1387 upright (whole, whole.get_height()/2, whole.get_width()/2, 0, whole.get_width()/2),
1388 downleft (whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, 0),
1389 downright(whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
1390 ul(upleft), ur(upright), dl(downleft), dr(downright)
1397 downleft, downright,
1398 & ul, &
ur, & dl, & dr;
1401 : upleft (whole,
div_ceil(whole.get_height(),2),
div_ceil(whole.get_width(),2), 0, 0),
1403 downleft (whole,
div_ceil(whole.get_height(),2),
div_ceil(whole.get_width(),2),
div_ceil(whole.get_height(),2), 0),
1405 ul(upleft), ur(upright), dl(downleft), dr(downright) {}
1411 downleft, downright,
1412 & ul, &
ur, & dl, & dr;
1415 : upleft (whole, whole.get_height()/2, whole.get_width()/2, 0, 0),
1416 upright (whole, whole.get_height()/2, whole.get_width() - whole.get_width()/2, 0, whole.get_width()/2),
1417 downleft (whole, whole.get_height() - whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, 0),
1418 downright(whole, whole.get_height() - whole.get_height()/2, whole.get_width() - whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
1419 ul(upleft), ur(upright), dl(downleft), dr(downright) {}
1424 static swappable_block_matrix_type&
1442 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 5 && 5 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1444 use_feedable_sw_block_grained<5>(padded_a, padded_a, padded_c);
1447 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 4 && 4 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1449 use_feedable_sw_block_grained<4>(padded_a, padded_a, padded_c);
1452 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 3 && 3 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1454 use_feedable_sw_block_grained<3>(padded_a, padded_a, padded_c);
1457 #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 2 && 2 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE)
1459 use_feedable_sw_block_grained<2>(padded_a, padded_a, padded_c);
1463 strassen_winograd_multiply_and_add_interleaved(A, B, C);
1469 strassen_winograd_multiply_and_add_interleaved(A, B, C);
1474 template <
unsigned Level>
1479 const unsigned granularity = 1;
1489 fsw.feed_a(row, col, mtq_a(row, col));
1497 fsw.feed_b(row, col, mtq_b(row, col));
1507 fsw.read_and_add(row, col, mtq_c(row, col));
1513 static swappable_block_matrix_type&
1526 choose_level_for_feedable_sw(padded_a, padded_b, padded_c);
1545 use_feedable_sw<2>(A, B, C);
1552 recursive_multiply_and_add(A, B, C);
1558 template <
unsigned Level>
1568 for (
size_type block_row = 0; block_row < mtq_a.get_height_in_blocks(); ++block_row)
1569 for (
size_type block_col = 0; block_col < mtq_a.get_width_in_blocks(); ++block_col)
1571 fsw.begin_feeding_a_block(block_row, block_col,
1574 #pragma omp parallel for
1576 for (
int_type element_row_in_block = 0; element_row_in_block <
int_type(BlockSideLength); ++element_row_in_block)
1577 for (int_type element_col_in_block = 0; element_col_in_block <
int_type(BlockSideLength); ++element_col_in_block)
1578 fsw.feed_a_element(element_row_in_block * BlockSideLength + element_col_in_block,
1579 mtq_a.
read_element(element_row_in_block * BlockSideLength + element_col_in_block));
1580 fsw.end_feeding_a_block(block_row, block_col,
1589 fsw.begin_feeding_b_block(block_row, block_col,
1592 #pragma omp parallel for
1594 for (
int_type element_row_in_block = 0; element_row_in_block <
int_type(BlockSideLength); ++element_row_in_block)
1595 for (int_type element_col_in_block = 0; element_col_in_block <
int_type(BlockSideLength); ++element_col_in_block)
1596 fsw.feed_b_element(element_row_in_block * BlockSideLength + element_col_in_block,
1597 mtq_b.
read_element(element_row_in_block * BlockSideLength + element_col_in_block));
1598 fsw.end_feeding_b_block(block_row, block_col,
1610 fsw.begin_reading_block(block_row, block_col));
1612 #pragma omp parallel for
1614 for (
int_type element_row_in_block = 0; element_row_in_block <
int_type(BlockSideLength); ++element_row_in_block)
1615 for (int_type element_col_in_block = 0; element_col_in_block <
int_type(BlockSideLength); ++element_col_in_block)
1617 fsw.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
1619 fsw.end_reading_block(block_row, block_col));
1625 static swappable_block_matrix_type&
1631 if (C.
get_height() <= strassen_winograd_base_case_size
1632 || C.
get_width() <= strassen_winograd_base_case_size
1633 || A.
get_width() <= strassen_winograd_base_case_size)
1636 return recursive_multiply_and_add(A, B, C);
1643 s2(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1644 s3(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1645 s4(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1646 t1(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1647 t2(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1648 t3(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1649 t4(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
1650 strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
1651 strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
1660 strassen_winograd_multiply(qa.ul, qb.ul, p1);
1661 strassen_winograd_multiply(qa.ur, qb.dl, qc.
ul);
1662 strassen_winograd_multiply( s1, t1, p3);
1663 strassen_winograd_multiply( s2, t2, qc.
dr);
1664 strassen_winograd_multiply( s3, t3, p5);
1665 strassen_winograd_multiply( s4, qb.dr, qc.
ur);
1666 strassen_winograd_multiply(qa.dr, t4, qc.
dl);
1668 strassen_winograd_postaddition(qc.
ul, qc.
ur, qc.
dl, qc.
dr, p1, p3, p5);
1674 static swappable_block_matrix_type&
1680 if (C.
get_height() <= strassen_winograd_base_case_size
1681 || C.
get_width() <= strassen_winograd_base_case_size
1682 || A.
get_width() <= strassen_winograd_base_case_size)
1683 return recursive_multiply_and_add(A, B, C);
1689 s2(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1690 s3(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1691 s4(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
1692 t1(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1693 t2(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1694 t3(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
1695 t4(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
1696 strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
1697 strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
1700 strassen_winograd_multiply_and_add_interleaved(qa.ur, qb.dl, qc.
ul);
1701 strassen_winograd_multiply_and_add_interleaved(qa.ul, qb.ul, px);
1702 element_op<addition>(qc.
ul, px);
1703 strassen_winograd_multiply_and_add_interleaved(s2, t2, px);
1706 element_op<addition>(qc.
ur, px);
1707 strassen_winograd_multiply_and_add_interleaved(s3, t3, px);
1710 element_op_twice_nontransposed<addition>(qc.
dl, qc.
dr, px);
1712 strassen_winograd_multiply_and_add_interleaved(qa.dr, t4, qc.
dl);
1714 strassen_winograd_multiply_and_add_interleaved(s1, t1, px);
1717 element_op_twice_nontransposed<addition>(qc.
dr, qc.
ur, px);
1719 strassen_winograd_multiply_and_add_interleaved(s4, qb.dr, qc.
ur);
1725 static swappable_block_matrix_type&
1731 if (C.
get_height() <= strassen_winograd_base_case_size
1732 || C.
get_width() <= strassen_winograd_base_case_size
1733 || A.
get_width() <= strassen_winograd_base_case_size)
1734 return recursive_multiply_and_add(A, B, C);
1740 s2(C.
bs, qa.ul.get_height(), qa.ul.get_width()),
1741 s3(C.
bs, qa.ul.get_height(), qa.ul.get_width()),
1742 s4(C.
bs, qa.ul.get_height(), qa.ul.get_width()),
1743 t1(C.
bs, qb.ul.get_height(), qb.ul.get_width()),
1744 t2(C.
bs, qb.ul.get_height(), qb.ul.get_width()),
1745 t3(C.
bs, qb.ul.get_height(), qb.ul.get_width()),
1746 t4(C.
bs, qb.ul.get_height(), qb.ul.get_width());
1747 element_op<subtraction>(s3, qa.ul, qa.dl);
1748 element_op<addition>(s1, qa.dl, qa.dr);
1749 element_op<subtraction>(s2, s1, qa.ul);
1750 element_op<subtraction>(s4, qa.ur, s2);
1751 element_op<subtraction>(t3, qb.dr, qb.ur);
1752 element_op<subtraction>(t1, qb.ur, qb.ul);
1753 element_op<subtraction>(t2, qb.dr, t1);
1754 element_op<subtraction>(t4, qb.dl, t2);
1757 strassen_winograd_multiply_and_add(qa.ur, qb.dl, qc.
ul);
1758 strassen_winograd_multiply_and_add(qa.ul, qb.ul, px);
1759 element_op<addition>(qc.
ul, px);
1760 strassen_winograd_multiply_and_add(s2, t2, px);
1761 element_op<addition>(qc.
ur, px);
1762 strassen_winograd_multiply_and_add(s3, t3, px);
1763 element_op<addition>(qc.
dl, px);
1764 element_op<addition>(qc.
dr, px);
1766 strassen_winograd_multiply_and_add(qa.dr, t4, qc.
dl);
1767 strassen_winograd_multiply_and_add(s1, t1, px);
1768 element_op<addition>(qc.
dr, px);
1769 element_op<addition>(qc.
ur, px);
1770 strassen_winograd_multiply_and_add(s4, qb.dr, qc.
ur);
1776 static swappable_block_matrix_type&
1786 return naive_multiply_and_add(A, B, C);
1792 recursive_multiply_and_add(qa.ul, qb.ul, qc.
ul);
1793 recursive_multiply_and_add(qa.ur, qb.dl, qc.
ul);
1794 recursive_multiply_and_add(qa.ur, qb.dr, qc.
ur);
1795 recursive_multiply_and_add(qa.ul, qb.ur, qc.
ur);
1796 recursive_multiply_and_add(qa.dl, qb.ur, qc.
dr);
1797 recursive_multiply_and_add(qa.dr, qb.dr, qc.
dr);
1798 recursive_multiply_and_add(qa.dr, qb.dl, qc.
dl);
1799 recursive_multiply_and_add(qa.dl, qb.ul, qc.
dl);
1806 static swappable_block_matrix_type&
1829 ++matrix_operation_statistic::get_instance()->block_multiplication_calls;
1835 ++matrix_operation_statistic::get_instance()->block_multiplications_saved_through_zero;
1839 ValueType* ap = bs_a.
acquire(a).begin(),
1840 * bp = bs_b.
acquire(b).begin(),
1841 * cp = bs_c.
acquire(c).begin();
1845 (ap, a_is_transposed, bp, b_is_transposed, cp, c_is_transposed);
1857 static column_vector_type&
1867 return naive_matrix_col_vector_multiply_and_add(A, x, z, offset_x, offset_z);
1873 recursive_matrix_col_vector_multiply_and_add(qa.
ul, x, z, offset_x, offset_z );
1874 recursive_matrix_col_vector_multiply_and_add(qa.
ur, x, z, offset_x + qa.
ul.
get_width(), offset_z );
1876 recursive_matrix_col_vector_multiply_and_add(qa.
dl, x, z, offset_x, offset_z + qa.
ul.
get_height());
1881 static column_vector_type&
1888 matrix_col_vector_multiply_and_add_swappable_block(A(row, col), A.
is_transposed(), A.
bs,
1889 x, z, (offset_x + col) * BlockSideLength, (offset_z + row) * BlockSideLength);
1910 col_limit =
std::min(BlockSideLength,
unsigned(x.
size() - offset_x));
1911 if (a_is_transposed)
1912 for (
int_type col = 0; col < col_limit; ++col)
1913 for (
int_type row = 0; row < row_limit; ++row)
1914 z[offset_z + row] += x[offset_x + col] * ia[row + col * BlockSideLength];
1916 for (
int_type row = 0; row < row_limit; ++row)
1917 for (
int_type col = 0; col < col_limit; ++col)
1918 z[offset_z + row] += x[offset_x + col] * ia[row * BlockSideLength + col];
1925 static row_vector_type&
1935 return naive_matrix_row_vector_multiply_and_add(y, A, z, offset_y, offset_z);
1941 recursive_matrix_row_vector_multiply_and_add(y, qa.
ul, z, offset_y, offset_z );
1942 recursive_matrix_row_vector_multiply_and_add(y, qa.
dl, z, offset_y + qa.
ul.
get_height(), offset_z );
1944 recursive_matrix_row_vector_multiply_and_add(y, qa.
ur, z, offset_y, offset_z + qa.
ul.
get_width());
1949 static row_vector_type&
1956 matrix_row_vector_multiply_and_add_swappable_block(y, A(row, col), A.
is_transposed(), A.
bs,
1957 z, (offset_y + row) * BlockSideLength, (offset_z + col) * BlockSideLength);
1978 col_limit =
std::min(BlockSideLength,
unsigned(z.
size() - offset_z));
1979 if (a_is_transposed)
1980 for (
int_type col = 0; col < col_limit; ++col)
1981 for (
int_type row = 0; row < row_limit; ++row)
1982 z[offset_z + col] += ia[row + col * BlockSideLength] * y[offset_y + row];
1984 for (
int_type row = 0; row < row_limit; ++row)
1985 for (
int_type col = 0; col < col_limit; ++col)
1986 z[offset_z + col] += ia[row * BlockSideLength + col] * y[offset_y + row];
2005 naive_matrix_from_vectors(A, l, r, offset_l, offset_r);
2013 recursive_matrix_from_vectors(qa.
ul, l, r, offset_l, offset_r );
2014 recursive_matrix_from_vectors(qa.
ur, l, r, offset_l, offset_r + qa.
ul.
get_width());
2016 recursive_matrix_from_vectors(qa.
dl, l, r, offset_l + qa.
ul.
get_height(), offset_r );
2024 matrix_from_vectors_swappable_block(A(row, col), A.
is_transposed(), A.
bs,
2025 l, r, (offset_l + row) * BlockSideLength, (offset_r + col) * BlockSideLength);
2039 col_limit =
std::min(BlockSideLength,
unsigned(r.
size() - offset_r));
2040 if (a_is_transposed)
2041 for (
int_type col = 0; col < col_limit; ++col)
2042 for (
int_type row = 0; row < row_limit; ++row)
2043 ia[row + col * BlockSideLength] = l[row + offset_l] * r[col + offset_r];
2045 for (
int_type row = 0; row < row_limit; ++row)
2046 for (
int_type col = 0; col < col_limit; ++col)
2047 ia[row * BlockSideLength + col] = l[row + offset_l] * r[col + offset_r];
2057 template <
typename ValueType,
unsigned BlockS
ideLength>
2058 const int_type matrix_operations<ValueType, BlockSideLength>::strassen_winograd_base_case_size = 3;
2064 #endif // !STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER
void begin_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt)
swappable_block_matrix_type & ur
swappable_block_matrix_type::size_type size_type
static void strassen_winograd_preaddition_a(swappable_block_matrix_type &a11, swappable_block_matrix_type &a12, swappable_block_matrix_type &a21, swappable_block_matrix_type &a22, swappable_block_matrix_type &s1, swappable_block_matrix_type &s2, swappable_block_matrix_type &s3, swappable_block_matrix_type &s4)
swappable_block_matrix_type::size_type size_type
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
swappable_block_matrix_type & ur
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
int_type block_multiplication_calls
compat::remove_const< Integral >::type div_ceil(Integral n, Integral2 d)
swappable_block_matrix_type m
friend std::ostream & operator<<(std::ostream &os, const uint_pair &a)
make a uint_pair outputtable via iostreams, using unsigned long long.
void feed_element(const int_type element_num, const vt v)
matrix_operation_statistic_dataset()
bool is_initialized(const swappable_block_identifier_type sbid) const
check if the swappable_block is initialized.
static swappable_block_matrix_type & strassen_winograd_multiply(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, AExists, false > smaller_feedable_strassen_winograd_a
matrix_to_quadtree_block_grained< ValueType, BlockSideLength, Level-1, Granularity > smaller_matrix_to_quadtree_block_grained
block_scheduler_type & bs
static swappable_block_matrix_type & element_op(swappable_block_matrix_type &C, const swappable_block_matrix_type &A, Op op=Op())
static void element_op_twice_nontransposed(swappable_block_matrix_type &c1, swappable_block_matrix_type &c2, const swappable_block_matrix_type &a, Op op=Op())
external column-vector container for matrix multiplication
swappable_block_matrix_type & ur
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, false, BExists > smaller_feedable_strassen_winograd_b
static unsigned_type get_num_temp_grains()
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
void deinitialize(const swappable_block_identifier_type sbid)
Drop all data in the given block, freeing in- and external memory.
static swappable_block_matrix_type & naive_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
smaller_feedable_strassen_winograd_a p7
matrix_to_quadtree(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
static_quadtree(smaller_static_quadtree ul, smaller_static_quadtree ur, smaller_static_quadtree dl, smaller_static_quadtree dr)
static swappable_block_matrix_type &return C
static void recursive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l=0, vector_size_type offset_r=0)
swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type
void feed_b(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
SwappableBlockType::internal_block_type internal_block_type
swappable_block_matrix_type c
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
const size_type & get_width_in_blocks()
swappable_block_matrix_type & dr
void feed_and_add_element(const int_type element_num, const vt v)
void feed_a(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
void begin_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix)
void begin_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
void read_and_add(const size_type &row, const size_type &col, swappable_block_matrix_type &bl)
#define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE
void begin_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
External container for a (sub)matrix. Not intended for direct use.
multiplies matrices A and B, adds result to C, for arbitrary entries param pointer to blocks of A...
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
swappable_block_matrix_type & dl
static uint_pair min()
return an uint_pair instance containing the smallest value possible
static_quadtree< ValueType, Level-1 > smaller_static_quadtree
column_vector_type::size_type vector_size_type
swappable_block_matrix_type & ul
static unsigned_type get_num_temp_grains()
matrix_to_quadtree(const swappable_block_matrix_type &matrix)
size_type size() const
return the size of the vector.
swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type
smaller_matrix_to_quadtree_block_grained ur
void feed_b_element(const int_type element_num, const vt v)
void feed_b_element(const int_type element_num, const vt v)
const size_type & get_height() const
bool is_simulating() const
Returns if simulation mode is on, i.e. if a prediction sequence is being recorded.
smaller_feedable_strassen_winograd_n p5
column_vector< ValueType > column_vector_type
smaller_feedable_strassen_winograd_b p6
void feed_a(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, false, false > smaller_feedable_strassen_winograd_n
static void element_op_swappable_block(const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c, const swappable_block_identifier_type a, bool a_is_transposed, block_scheduler_type &bs_a, const swappable_block_identifier_type b, bool b_is_transposed, block_scheduler_type &bs_b, Op op=Op())
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static void element_op_swappable_block(const swappable_block_identifier_type c, block_scheduler_type &bs_c, Op op=Op())
c = a [op] b; for arbitrary entries
static void op_swappable_block_nontransposed(swappable_block_matrix_type &c, Op op, swappable_block_matrix_type &a, size_type &row, size_type &col)
smaller_matrix_to_quadtree ur
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static_quadtree< bool, Level > zbt
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
#define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS
static_quadtree< ValueType, 0 > vt
void end_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static void element_op_swappable_block(const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c, const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, Op op=Op())
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
void begin_feeding_block(const size_type &block_row, const size_type &block_col, const zbt)
A static_quadtree holds 4^Level elements arranged in a quad tree.
const bool & is_transposed() const
if the elements inside the blocks are in transposed order i.e. column-major
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
block_scheduler_type::swappable_block_identifier_type swappable_block_identifier_type
swappable_block_matrix_type::size_type size_type
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, AExists, BExists > smaller_feedable_strassen_winograd_ab
swappable_block_matrix_type::size_type size_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static const int_type strassen_winograd_base_case_size
void begin_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt)
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static_quadtree< bool, Level > zbt
uint_pair & operator+=(const uint_pair &b)
addition operator (uses 64-bit arithmetic)
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static void multiply_and_add_swappable_block(const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const swappable_block_identifier_type b, const bool b_is_transposed, block_scheduler_type &bs_b, const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c)
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
static void use_feedable_sw_block_grained(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
smaller_static_quadtree dl
static_quadtree< bool, 0 > zbt
void end_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static_quadtree< ValueType, Level > vt
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
c [op]= a; for arbitrary entries
External matrix container. Introduction to matrix container: see STXXL Matrix tutorial. Design and Internals of matrix container: see Matrix.
static_quadtree< ValueType, 0 > vt
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
int_type block_multiplications_saved_through_zero
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix)
static void strassen_winograd_preaddition_b(swappable_block_matrix_type &b11, swappable_block_matrix_type &b12, swappable_block_matrix_type &b21, swappable_block_matrix_type &b22, swappable_block_matrix_type &t1, swappable_block_matrix_type &t2, swappable_block_matrix_type &t3, swappable_block_matrix_type &t4)
choose_int_types< my_pointer_size >::int_type int_type
internal_block_type * iblock
int_type block_addition_calls
void end_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
c =[op] a; for arbitrary entries
smaller_feedable_strassen_winograd_n p5
static_quadtree< ValueType, Level > vt
const size_type get_width()
vt read_element(int_type element_num)
block_scheduler_type::internal_block_type internal_block_type
static_quadtree< bool, 0 > zbt
swappable_block_matrix_type::size_type size_type
static swappable_block_matrix_type & strassen_winograd_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
smaller_feedable_strassen_winograd_b p6
static void strassen_winograd_postaddition(swappable_block_matrix_type &c11, swappable_block_matrix_type &c12, swappable_block_matrix_type &c21, swappable_block_matrix_type &c22, swappable_block_matrix_type &p1, swappable_block_matrix_type &p3, swappable_block_matrix_type &p5)
void end_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
swappable_block_matrix_approximative_quarterer(const swappable_block_matrix_type &whole)
static void naive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l=0, vector_size_type offset_r=0)
block_scheduler_type::internal_block_type internal_block_type
const size_type & get_height_in_blocks()
const size_type & get_width() const
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, AExists, false > smaller_feedable_strassen_winograd_a
static_quadtree< bool, Level > zbt
static_quadtree< ValueType, Level > vt
#define STXXL_BEGIN_NAMESPACE
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, AExists, BExists > smaller_feedable_strassen_winograd_ab
void end_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
smaller_feedable_strassen_winograd_a p7
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
static void op_swappable_block_nontransposed(swappable_block_matrix_type &c, swappable_block_matrix_type &a, Op op, swappable_block_matrix_type &b, size_type &row, size_type &col)
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
swappable_block_matrix_type c
scalar_multiplication(const ValueType scalar=1)
swappable_block_matrix_type & dr
void feed_b(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
smaller_feedable_strassen_winograd_ab p2
void feed_element(const int_type element_num, const vt v)
static row_vector_type & naive_matrix_row_vector_multiply_and_add(const row_vector_type &y, const swappable_block_matrix_type &A, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
vector_type::size_type size_type
swappable_block_matrix_type::size_type size_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
smaller_static_quadtree ul
static swappable_block_matrix_type & recursive_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
static swappable_block_matrix_type & multi_level_strassen_winograd_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
swappable_block_matrix_type & dl
void feed_a_element(const int_type element_num, const vt v)
row_vector< ValueType > row_vector_type
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, false, BExists > smaller_feedable_strassen_winograd_b
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
matrix_to_quadtree(const swappable_block_matrix_type &matrix)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
static_quadtree< swappable_block_identifier_type, 0 > bt
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
vt read_element(const int_type element_num)
vt read_element(const int_type element_num)
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, false, false > smaller_feedable_strassen_winograd_n
swappable_block_matrix_type::size_type size_type
const size_type & get_height_in_blocks()
matrix_to_quadtree(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
const size_type & get_width_in_blocks()
static void matrix_row_vector_multiply_and_add_swappable_block(const row_vector_type &y, const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
static swappable_block_matrix_type & element_op(swappable_block_matrix_type &C, const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, Op op=Op())
matrix_operations< ValueType, BlockSideLength > Ops
void read_and_add(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
swappable_block_matrix_type & ul
const size_type get_width()
int_type block_additions_saved_through_zero
static row_vector_type & recursive_matrix_row_vector_multiply_and_add(const row_vector_type &y, const swappable_block_matrix_type &A, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
calculates z = y * A
static swappable_block_matrix_type & multi_level_strassen_winograd_multiply_and_add_block_grained(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
const size_type get_height()
smaller_static_quadtree ur
matrix_operations< ValueType, BlockSideLength > Ops
choose_int_types< my_pointer_size >::unsigned_type unsigned_type
const size_type get_height()
unsigned int ilog2_ceil(const IntegerType &i)
calculate the log2 ceiling of an integer type (by repeated bit shifts)
static column_vector_type & naive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type &A, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
smaller_feedable_strassen_winograd_ab p2
static void choose_level_for_feedable_sw(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
static void use_feedable_sw(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
external row-vector container for matrix multiplication
void release(const swappable_block_identifier_type sbid, const bool dirty)
Release the given block. Has to be in pairs with acquire. Pairs may be nested and interleaved...
Schedules swapping of blocks and provides blocks for temporary storage.
swappable_block_matrix_type::size_type size_type
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
swappable_block_matrix_quarterer(const swappable_block_matrix_type &whole)
swappable_block_matrix_type & ul
static_quadtree(const ValueType &v)
smaller_static_quadtree dr
internal_block_type * iblock
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix_type & dr
void feed_and_add_element(const int_type element_num, const vt v)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
swappable_block_matrix_type m
static void matrix_from_vectors_swappable_block(swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l, vector_size_type offset_r)
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static column_vector_type & recursive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type &A, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
calculates z = A * x
void end_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static swappable_block_matrix_type & strassen_winograd_multiply_and_add_interleaved(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C
vt read_element(const int_type element_num)
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
Integral round_up_to_power_of_two(Integral n, unsigned_type power)
round n up to next larger multiple of 2^power. example: (48,4) = 64, (48,3) = 48. ...
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
static void matrix_col_vector_multiply_and_add_swappable_block(const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
matrix_to_quadtree< ValueType, BlockSideLength, Level-1 > smaller_matrix_to_quadtree
swappable_block_matrix_type & dl
void feed_a_element(const int_type element_num, const vt v)
swappable_block_matrix_type::size_type size_type
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
swappable_block_matrix_padding_quarterer(const swappable_block_matrix_type &whole)
#define STXXL_END_NAMESPACE
internal_block_type & acquire(const swappable_block_identifier_type sbid, const bool uninitialized=false)
Acquire the given block. Has to be in pairs with release. Pairs may be nested and interleaved...