13 #ifndef STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER 
   14 #define STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER 
   21 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 
   22 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS 3 
   24 #ifndef STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 
   25 #define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE 2 
   28 template <
typename ValueType>
 
   31 template <
typename ValueType>
 
   34 template <
typename ValueType, 
unsigned BlockS
ideLength>
 
   35 class swappable_block_matrix;
 
   45         block_additions_saved_through_zero;
 
   48         : block_multiplication_calls(0),
 
   49           block_multiplications_saved_through_zero(0),
 
   50           block_addition_calls(0),
 
   51           block_additions_saved_through_zero(0) { }
 
   92     { operator = (*matrix_operation_statistic::get_instance()); }
 
  103     o << 
"matrix operation statistics" << std::endl;
 
  104     o << 
"block multiplication calls                     : " 
  106     o << 
"block multiplications saved through zero blocks: " 
  108     o << 
"block multiplications performed                : " 
  110     o << 
"block addition calls                           : " 
  112     o << 
"block additions saved through zero blocks      : " 
  114     o << 
"block additions performed                      : " 
  122 namespace matrix_local {
 
  129 template <
typename ValueType, 
unsigned Level>
 
  138         : ul(ul), ur(ur), dl(dl), dr(dr) { }
 
  144         ul &= right.
ul, ur &= right.
ur;
 
  145         dl &= right.
dl, dr &= right.
dr;
 
  151         ul += right.
ul, ur += right.
ur;
 
  152         dl += right.
dl, dr += right.
dr;
 
  158         ul -= right.
ul, ur -= right.
ur;
 
  159         dl -= right.
dl, dr -= right.
dr;
 
  173 template <
typename ValueType>
 
  183     operator const ValueType& () 
const 
  186     operator ValueType& ()
 
  211     { 
return val & right.val; }
 
  214     { 
return val + right.val; }
 
  217     { 
return val - right.val; }
 
  220 template <
typename ValueType, 
unsigned BlockS
ideLength, 
unsigned Level, 
bool AExists, 
bool BExists>
 
  247           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
 
  248           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
 
  249           p3(                                                bs_c, n/2, m/2, l/2),
 
  250           p4(                                                bs_c, n/2, m/2, l/2),
 
  251           p5(                                                bs_c, n/2, m/2, l/2),
 
  252           p6(                                                bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
 
  253           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
 
  259           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2),
 
  260           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2),
 
  261           p3(                                                bs_c, n/2, m/2, l/2),
 
  262           p4(                                                bs_c, n/2, m/2, l/2),
 
  263           p5(                                                bs_c, n/2, m/2, l/2),
 
  264           p6(                                                bs_c, n/2, m/2, l/2),
 
  265           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
 
  271           p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
 
  272           p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
 
  273           p3(bs_c, n/2, m/2, l/2),
 
  274           p4(bs_c, n/2, m/2, l/2),
 
  275           p5(bs_c, n/2, m/2, l/2),
 
  276           p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
 
  277           p7(bs_c, n/2, m/2, l/2) {}
 
  282           p1(bs_c, n / 2, m / 2, l / 2),
 
  283           p2(bs_c, n / 2, m / 2, l / 2),
 
  284           p3(bs_c, n / 2, m / 2, l / 2),
 
  285           p4(bs_c, n / 2, m / 2, l / 2),
 
  286           p5(bs_c, n / 2, m / 2, l / 2),
 
  287           p6(bs_c, n / 2, m / 2, l / 2),
 
  288           p7(bs_c, n / 2, m / 2, l / 2) { }
 
  297         p1.begin_feeding_a_block(block_row, block_col, zb.
ul);
 
  298         p2.begin_feeding_a_block(block_row, block_col, zb.
ur);
 
  299         p3.begin_feeding_a_block(block_row, block_col, s1);
 
  300         p4.begin_feeding_a_block(block_row, block_col, s2);
 
  301         p5.begin_feeding_a_block(block_row, block_col, s3);
 
  302         p6.begin_feeding_a_block(block_row, block_col, s4);
 
  303         p7.begin_feeding_a_block(block_row, block_col, zb.
dr);
 
  313         p1.feed_a_element(element_num, v.
ul);
 
  314         p2.feed_a_element(element_num, v.
ur);
 
  315         p3.feed_a_element(element_num, s1);
 
  316         p4.feed_a_element(element_num, s2);
 
  317         p5.feed_a_element(element_num, s3);
 
  318         p6.feed_a_element(element_num, s4);
 
  319         p7.feed_a_element(element_num, v.
dr);
 
  329         p1.end_feeding_a_block(block_row, block_col, zb.
ul);
 
  330         p2.end_feeding_a_block(block_row, block_col, zb.
ur);
 
  331         p3.end_feeding_a_block(block_row, block_col, s1);
 
  332         p4.end_feeding_a_block(block_row, block_col, s2);
 
  333         p5.end_feeding_a_block(block_row, block_col, s3);
 
  334         p6.end_feeding_a_block(block_row, block_col, s4);
 
  335         p7.end_feeding_a_block(block_row, block_col, zb.
dr);
 
  345         p1.begin_feeding_b_block(block_row, block_col, zb.
ul);
 
  346         p2.begin_feeding_b_block(block_row, block_col, zb.
dl);
 
  347         p3.begin_feeding_b_block(block_row, block_col, t1);
 
  348         p4.begin_feeding_b_block(block_row, block_col, t2);
 
  349         p5.begin_feeding_b_block(block_row, block_col, t3);
 
  350         p6.begin_feeding_b_block(block_row, block_col, zb.
dr);
 
  351         p7.begin_feeding_b_block(block_row, block_col, t4);
 
  361         p1.feed_b_element(element_num, v.
ul);
 
  362         p2.feed_b_element(element_num, v.
dl);
 
  363         p3.feed_b_element(element_num, t1);
 
  364         p4.feed_b_element(element_num, t2);
 
  365         p5.feed_b_element(element_num, t3);
 
  366         p6.feed_b_element(element_num, v.
dr);
 
  367         p7.feed_b_element(element_num, t4);
 
  377         p1.end_feeding_b_block(block_row, block_col, zb.
ul);
 
  378         p2.end_feeding_b_block(block_row, block_col, zb.
dl);
 
  379         p3.end_feeding_b_block(block_row, block_col, t1);
 
  380         p4.end_feeding_b_block(block_row, block_col, t2);
 
  381         p5.end_feeding_b_block(block_row, block_col, t3);
 
  382         p6.end_feeding_b_block(block_row, block_col, zb.
dr);
 
  383         p7.end_feeding_b_block(block_row, block_col, t4);
 
  400         r.
ur = r.
ul = p1.begin_reading_block(block_row, block_col);
 
  401         r.
ul &= p2.begin_reading_block(block_row, block_col);
 
  402         r.
ur &= p4.begin_reading_block(block_row, block_col);
 
  403         r.
dr = r.
dl = p5.begin_reading_block(block_row, block_col);
 
  405         r.
dl &= p7.begin_reading_block(block_row, block_col);
 
  406         r.
ur &= p3.begin_reading_block(block_row, block_col);
 
  408         r.
ur &= p6.begin_reading_block(block_row, block_col);
 
  415         r.
ur = r.
ul = p1.read_element(element_num);
 
  416         r.
ul += p2.read_element(element_num);
 
  417         r.
ur += p4.read_element(element_num);
 
  418         r.
dr = r.
dl = p5.read_element(element_num);
 
  420         r.
dl += p7.read_element(element_num);
 
  421         r.
ur += p3.read_element(element_num);
 
  423         r.
ur += p6.read_element(element_num);
 
  430         r.
ur = r.
ul = p1.end_reading_block(block_row, block_col);
 
  431         r.
ul &= p2.end_reading_block(block_row, block_col);
 
  432         r.
ur &= p4.end_reading_block(block_row, block_col);
 
  433         r.
dr = r.
dl = p5.end_reading_block(block_row, block_col);
 
  435         r.
dl &= p7.end_reading_block(block_row, block_col);
 
  436         r.
ur &= p3.end_reading_block(block_row, block_col);
 
  438         r.
ur &= p6.end_reading_block(block_row, block_col);
 
  443 template <
typename ValueType, 
unsigned BlockS
ideLength, 
bool AExists, 
bool BExists>
 
  462         : a(existing_a, n, l, a_from_row, a_from_col),
 
  463           b(existing_b, n, l, b_from_row, b_from_col),
 
  471         : a(existing_a, n, l, a_from_row, a_from_col),
 
  481           b(existing_b, n, l, b_from_row, b_from_col),
 
  497             iblock = &a.bs.acquire(a(block_row, block_col), 
true);
 
  503             (*iblock)[element_num] = v;
 
  510             a.bs.release(a(block_row, block_col), ! zb);
 
  518             iblock = &b.bs.acquire(b(block_row, block_col), 
true);
 
  524             (*iblock)[element_num] = v;
 
  531             b.bs.release(b(block_row, block_col), ! zb);
 
  541         bool zb = ! c.bs.is_initialized(c(block_row, block_col));
 
  542         iblock = &c.bs.acquire(c(block_row, block_col));
 
  547     { 
return (*iblock)[element_num]; }
 
  551         c.bs.release(c(block_row, block_col), 
false);
 
  553         return ! c.bs.is_initialized(c(block_row, block_col));
 
  557 template <
typename ValueType, 
unsigned BlockS
ideLength, 
unsigned Level>
 
  573         : ul(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0,                    0),
 
  574           ur(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0, matrix.get_width()/2),
 
  575           dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2,                    0),
 
  576           dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
 
  581         : ul(matrix, height/2, width/2, from_row,            from_col),
 
  582           ur(matrix, height/2, width/2, from_row,            from_col + width/2),
 
  583           dl(matrix, height/2, width/2, from_row + height/2, from_col),
 
  584           dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
 
  585     { assert(! (height % 2 | width % 2)); }
 
  589         ul.begin_feeding_block(block_row, block_col, zb.
ul);
 
  590         ur.begin_feeding_block(block_row, block_col, zb.
ur);
 
  591         dl.begin_feeding_block(block_row, block_col, zb.
dl);
 
  592         dr.begin_feeding_block(block_row, block_col, zb.
dr);
 
  597         ul.feed_element(element_num, v.
ul);
 
  598         ur.feed_element(element_num, v.
ur);
 
  599         dl.feed_element(element_num, v.
dl);
 
  600         dr.feed_element(element_num, v.
dr);
 
  605         ul.feed_and_add_element(element_num, v.
ul);
 
  606         ur.feed_and_add_element(element_num, v.
ur);
 
  607         dl.feed_and_add_element(element_num, v.
dl);
 
  608         dr.feed_and_add_element(element_num, v.
dr);
 
  613         ul.end_feeding_block(block_row, block_col, zb.
ul);
 
  614         ur.end_feeding_block(block_row, block_col, zb.
ur);
 
  615         dl.end_feeding_block(block_row, block_col, zb.
dl);
 
  616         dr.end_feeding_block(block_row, block_col, zb.
dr);
 
  622         zb.
ul = ul.begin_reading_block(block_row, block_col);
 
  623         zb.
ur = ur.begin_reading_block(block_row, block_col);
 
  624         zb.
dl = dl.begin_reading_block(block_row, block_col);
 
  625         zb.
dr = dr.begin_reading_block(block_row, block_col);
 
  632         v.
ul = ul.read_element(element_num);
 
  633         v.
ur = ur.read_element(element_num);
 
  634         v.
dl = dl.read_element(element_num);
 
  635         v.
dr = dr.read_element(element_num);
 
  642         zb.
ul = ul.end_reading_block(block_row, block_col);
 
  643         zb.
ur = ur.end_reading_block(block_row, block_col);
 
  644         zb.
dl = dl.end_reading_block(block_row, block_col);
 
  645         zb.
dr = dr.end_reading_block(block_row, block_col);
 
  650     { 
return ul.get_height_in_blocks(); }
 
  653     { 
return ul.get_width_in_blocks(); }
 
  656 template <
typename ValueType, 
unsigned BlockS
ideLength>
 
  671         : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0),
 
  676         : m(matrix, height, width, from_row, from_col),
 
  680     { iblock = &m.bs.acquire(m(block_row, block_col)); }
 
  683     { (*iblock)[element_num] = v; }
 
  686     { (*iblock)[element_num] += v; }
 
  690         m.bs.release(m(block_row, block_col), ! zb);
 
  696         zbt zb = ! m.bs.is_initialized(m(block_row, block_col));
 
  697         iblock = &m.bs.acquire(m(block_row, block_col));
 
  702     { 
return (*iblock)[element_num]; }
 
  706         m.bs.release(m(block_row, block_col), 
false);
 
  708         return ! m.bs.is_initialized(m(block_row, block_col));
 
  712     { 
return m.get_height(); }
 
  715     { 
return m.get_width(); }
 
  718 template <
typename ValueType, 
unsigned BlockS
ideLength, 
unsigned Level, 
bool AExists, 
bool BExists>
 
  746           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
 
  747           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
 
  748           p3(                                                bs_c, n/2, m/2, l/2),
 
  749           p4(                                                bs_c, n/2, m/2, l/2),
 
  750           p5(                                                bs_c, n/2, m/2, l/2),
 
  751           p6(                                                bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
 
  752           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
 
  758           p1(existing_a, a_from_row,       a_from_col,       bs_c, n/2, m/2, l/2),
 
  759           p2(existing_a, a_from_row,       a_from_col + l/2, bs_c, n/2, m/2, l/2),
 
  760           p3(                                                bs_c, n/2, m/2, l/2),
 
  761           p4(                                                bs_c, n/2, m/2, l/2),
 
  762           p5(                                                bs_c, n/2, m/2, l/2),
 
  763           p6(                                                bs_c, n/2, m/2, l/2),
 
  764           p7(existing_a, a_from_row + n/2, a_from_col + l/2, bs_c, n/2, m/2, l/2) {}
 
  770           p1(bs_c, n/2, m/2, l/2, existing_b, b_from_row,       b_from_col),
 
  771           p2(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col),
 
  772           p3(bs_c, n/2, m/2, l/2),
 
  773           p4(bs_c, n/2, m/2, l/2),
 
  774           p5(bs_c, n/2, m/2, l/2),
 
  775           p6(bs_c, n/2, m/2, l/2, existing_b, b_from_row + l/2, b_from_col + m/2),
 
  776           p7(bs_c, n/2, m/2, l/2) {}
 
  781           p1(bs_c, n / 2, m / 2, l / 2),
 
  782           p2(bs_c, n / 2, m / 2, l / 2),
 
  783           p3(bs_c, n / 2, m / 2, l / 2),
 
  784           p4(bs_c, n / 2, m / 2, l / 2),
 
  785           p5(bs_c, n / 2, m / 2, l / 2),
 
  786           p6(bs_c, n / 2, m / 2, l / 2),
 
  787           p7(bs_c, n / 2, m / 2, l / 2) { }
 
  799         Ops::strassen_winograd_preaddition_a(qbl.
ul, qbl.
ur, qbl.
dl, qbl.
dr, s1, s2, s3, s4);
 
  801         p1.feed_a(row, col, qbl.
ul);
 
  802         p2.feed_a(row, col, qbl.
ur);
 
  803         p3.feed_a(row, col, s1);
 
  804         p4.feed_a(row, col, s2);
 
  805         p5.feed_a(row, col, s3);
 
  806         p6.feed_a(row, col, s4);
 
  807         p7.feed_a(row, col, qbl.
dr);
 
  820         Ops::strassen_winograd_preaddition_b(qbl.
ul, qbl.
ur, qbl.
dl, qbl.
dr, t1, t2, t3, t4);
 
  822         p1.feed_b(row, col, qbl.
ul);
 
  823         p2.feed_b(row, col, qbl.
dl);
 
  824         p3.feed_b(row, col, t1);
 
  825         p4.feed_b(row, col, t2);
 
  826         p5.feed_b(row, col, t3);
 
  827         p6.feed_b(row, col, qbl.
dr);
 
  828         p7.feed_b(row, col, t4);
 
  848         p2.read_and_add(row, col, qbl.
ul);
 
  849         p1.read_and_add(row, col, px);
 
  851         p4.read_and_add(row, col, px);
 
  853         p5.read_and_add(row, col, px);
 
  854         Ops::element_op_twice_nontransposed(qbl.
dl, qbl.
dr, px, 
typename Ops::addition());
 
  856         p7.read_and_add(row, col, qbl.
dl);
 
  857         p3.read_and_add(row, col, px);
 
  858         Ops::element_op_twice_nontransposed(qbl.
dr, qbl.
ur, px, 
typename Ops::addition());
 
  859         p6.read_and_add(row, col, qbl.
ur);
 
  863     { 
return smaller_feedable_strassen_winograd_ab::get_num_temp_grains() + (4 ^ Level) * 2; }
 
  866 template <
typename ValueType, 
unsigned BlockS
ideLength, 
bool AExists, 
bool BExists>
 
  883         : a(existing_a, n, l, a_from_row, a_from_col),
 
  884           b(existing_b, n, l, b_from_row, b_from_col),
 
  890         : a(existing_a, n, l, a_from_row, a_from_col),
 
  898           b(existing_b, n, l, b_from_row, b_from_col),
 
  950 template <
typename ValueType, 
unsigned BlockS
ideLength, 
unsigned Level, 
unsigned Granularity>
 
  961         : ul(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0,                    0),
 
  962           ur(matrix, matrix.get_height()/2, matrix.get_width()/2,                     0, matrix.get_width()/2),
 
  963           dl(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2,                    0),
 
  964           dr(matrix, matrix.get_height()/2, matrix.get_width()/2, matrix.get_height()/2, matrix.get_width()/2)
 
  969         : ul(matrix, height/2, width/2, from_row,            from_col),
 
  970           ur(matrix, height/2, width/2, from_row,            from_col + width/2),
 
  971           dl(matrix, height/2, width/2, from_row + height/2, from_col),
 
  972           dr(matrix, height/2, width/2, from_row + height/2, from_col + width/2)
 
  973     { assert(! (height % 2 | width % 2)); }
 
  981     { 
return ul.get_height(); }
 
  984     { 
return ul.get_width(); }
 
  987 template <
typename ValueType, 
unsigned BlockS
ideLength, 
unsigned Granularity>
 
  996         : m(matrix, matrix.get_height(), matrix.get_width(), 0, 0)
 
 1001         : m(matrix, height, width, from_row, from_col)
 
 1010     { 
return m.get_height() / Granularity; }
 
 1013     { 
return m.get_width() / Granularity; }
 
 1016 template <
typename ValueType, 
unsigned BlockS
ideLength>
 
 1046         inline ValueType& operator () (ValueType& c, 
const ValueType& a, 
const ValueType& b) { 
return c = a + b; }
 
 1047         inline ValueType& operator () (ValueType& c, 
const ValueType& a) { 
return c += a; }
 
 1048         inline ValueType operator () (
const ValueType& a) { 
return +a; }
 
 1053         inline ValueType& operator () (ValueType& c, 
const ValueType& a, 
const ValueType& b) { 
return c = a - b; }
 
 1054         inline ValueType& operator () (ValueType& c, 
const ValueType& a) { 
return c -= a; }
 
 1055         inline ValueType operator () (
const ValueType& a) { 
return -a; }
 
 1061         inline ValueType& operator () (ValueType& c, 
const ValueType& a) { 
return c = a * s; }
 
 1062         inline ValueType operator () (
const ValueType& a) { 
return a * s; }
 
 1063         inline operator const ValueType& () { 
return s; }
 
 1076                 element_op_swappable_block(
 
 1085     static swappable_block_matrix_type&
 
 1091                 element_op_swappable_block(
 
 1099     static swappable_block_matrix_type&
 
 1100     element_op(swappable_block_matrix_type& C, Op op = Op())
 
 1102         for (
size_type row = 0; row < C.get_height(); ++row)
 
 1103             for (
size_type col = 0; col < C.get_width(); ++col)
 
 1104                 element_op_swappable_block(
 
 1105                     C(row, col), C.bs, op);
 
 1118             ++matrix_operation_statistic::get_instance()->block_addition_calls;
 
 1125                 ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
 
 1128         a_is_transposed = a_is_transposed != c_is_transposed;
 
 1129         b_is_transposed = b_is_transposed != c_is_transposed;
 
 1137                 if (b_is_transposed)
 
 1152                 if (a_is_transposed)
 
 1167                 if (a_is_transposed)
 
 1169                     if (b_is_transposed)
 
 1176                     if (b_is_transposed)
 
 1196             ++matrix_operation_statistic::get_instance()->block_addition_calls;
 
 1202                 ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
 
 1213                 if (c_is_transposed == a_is_transposed)
 
 1219                 if (c_is_transposed == a_is_transposed)
 
 1237             ++matrix_operation_statistic::get_instance()->block_addition_calls;
 
 1243                 ++matrix_operation_statistic::get_instance()->block_additions_saved_through_zero;
 
 1270                 op_swappable_block_nontransposed(s3, a11, 
subtraction(), a21, row, col);
 
 1271                 op_swappable_block_nontransposed(s1, a21,    
addition(), a22, row, col);
 
 1272                 op_swappable_block_nontransposed(s2,  s1, 
subtraction(), a11, row, col);
 
 1273                 op_swappable_block_nontransposed(s4, a12, 
subtraction(),  s2, row, col);
 
 1290                 op_swappable_block_nontransposed(t3, b22, 
subtraction(), b12, row, col);
 
 1291                 op_swappable_block_nontransposed(t1, b12, 
subtraction(), b11, row, col);
 
 1292                 op_swappable_block_nontransposed(t2, b22, 
subtraction(),  t1, row, col);
 
 1293                 op_swappable_block_nontransposed(t4, b21, 
subtraction(),  t2, row, col);
 
 1309                 op_swappable_block_nontransposed(c11,     
addition(),  p1, row, col); 
 
 1310                 op_swappable_block_nontransposed( p1,     
addition(), c22, row, col); 
 
 1311                 op_swappable_block_nontransposed( p5,     
addition(),  p1, row, col); 
 
 1312                 op_swappable_block_nontransposed(c21,     
addition(),  p5, row, col); 
 
 1313                 op_swappable_block_nontransposed(c22, p5, 
addition(),  p3, row, col); 
 
 1314                 op_swappable_block_nontransposed( p1,     
addition(),  p3, row, col); 
 
 1315                 op_swappable_block_nontransposed(c12,     
addition(),  p1, row, col); 
 
 1329                 element_op_swappable_block(
 
 1330                     c1(row, col), 
false, c1.
bs,
 
 1331                     a(row, col), 
false, a.
bs, op);
 
 1332                 element_op_swappable_block(
 
 1333                     c2(row, col), 
false, c2.
bs,
 
 1334                     a(row, col), 
false, a.
bs, op);
 
 1344         element_op_swappable_block(
 
 1345             c(row, col), 
false, c.
bs,
 
 1346             a(row, col), 
false, a.
bs,
 
 1347             b(row, col), 
false, b.
bs, op);
 
 1355         element_op_swappable_block(
 
 1356             c(row, col), 
false, c.
bs,
 
 1357             a(row, col), 
false, a.
bs, op);
 
 1382                                     downleft, downright,
 
 1383                 & ul, & 
ur, & dl, & dr;
 
 1386             : upleft   (whole, whole.get_height()/2, whole.get_width()/2,                    0,                   0),
 
 1387               upright  (whole, whole.get_height()/2, whole.get_width()/2,                    0, whole.get_width()/2),
 
 1388               downleft (whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2,                   0),
 
 1389               downright(whole, whole.get_height()/2, whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
 
 1390               ul(upleft), ur(upright), dl(downleft), dr(downright)
 
 1397                                     downleft, downright,
 
 1398                 & ul, & 
ur, & dl, & dr;
 
 1401             : upleft   (whole, 
div_ceil(whole.get_height(),2), 
div_ceil(whole.get_width(),2),                              0,                             0),
 
 1403               downleft (whole, 
div_ceil(whole.get_height(),2), 
div_ceil(whole.get_width(),2), 
div_ceil(whole.get_height(),2),                             0),
 
 1405               ul(upleft), ur(upright), dl(downleft), dr(downright) {}
 
 1411             downleft, downright,
 
 1412         & ul, & 
ur, & dl, & dr;
 
 1415             : upleft   (whole,                      whole.get_height()/2,                     whole.get_width()/2,                    0,                   0),
 
 1416               upright  (whole,                      whole.get_height()/2, whole.get_width() - whole.get_width()/2,                    0, whole.get_width()/2),
 
 1417               downleft (whole, whole.get_height() - whole.get_height()/2,                     whole.get_width()/2, whole.get_height()/2,                   0),
 
 1418               downright(whole, whole.get_height() - whole.get_height()/2, whole.get_width() - whole.get_width()/2, whole.get_height()/2, whole.get_width()/2),
 
 1419               ul(upleft), ur(upright), dl(downleft), dr(downright) {}
 
 1424     static swappable_block_matrix_type&
 
 1442             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 5 && 5 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 
 1444                 use_feedable_sw_block_grained<5>(padded_a, padded_a, padded_c);
 
 1447             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 4 && 4 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 
 1449                 use_feedable_sw_block_grained<4>(padded_a, padded_a, padded_c);
 
 1452             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 3 && 3 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 
 1454                 use_feedable_sw_block_grained<3>(padded_a, padded_a, padded_c);
 
 1457             #if (STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS >= 2 && 2 > STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE) 
 1459                 use_feedable_sw_block_grained<2>(padded_a, padded_a, padded_c);
 
 1463                 strassen_winograd_multiply_and_add_interleaved(A, B, C);
 
 1469             strassen_winograd_multiply_and_add_interleaved(A, B, C);
 
 1474     template <
unsigned Level>
 
 1479         const unsigned granularity = 1;
 
 1489                     fsw.feed_a(row, col, mtq_a(row, col));
 
 1497                     fsw.feed_b(row, col, mtq_b(row, col));
 
 1507                     fsw.read_and_add(row, col, mtq_c(row, col));
 
 1513     static swappable_block_matrix_type&
 
 1526         choose_level_for_feedable_sw(padded_a, padded_b, padded_c);
 
 1545             use_feedable_sw<2>(A, B, C);
 
 1552             recursive_multiply_and_add(A, B, C);
 
 1558     template <
unsigned Level>
 
 1568         for (
size_type block_row = 0; block_row < mtq_a.get_height_in_blocks(); ++block_row)
 
 1569             for (
size_type block_col = 0; block_col < mtq_a.get_width_in_blocks(); ++block_col)
 
 1571                 fsw.begin_feeding_a_block(block_row, block_col,
 
 1574                 #pragma omp parallel for 
 1576                 for (
int_type element_row_in_block = 0; element_row_in_block < 
int_type(BlockSideLength); ++element_row_in_block)
 
 1577                     for (int_type element_col_in_block = 0; element_col_in_block < 
int_type(BlockSideLength); ++element_col_in_block)
 
 1578                         fsw.feed_a_element(element_row_in_block * BlockSideLength + element_col_in_block,
 
 1579                                            mtq_a.
read_element(element_row_in_block * BlockSideLength + element_col_in_block));
 
 1580                 fsw.end_feeding_a_block(block_row, block_col,
 
 1589                 fsw.begin_feeding_b_block(block_row, block_col,
 
 1592                 #pragma omp parallel for 
 1594                 for (
int_type element_row_in_block = 0; element_row_in_block < 
int_type(BlockSideLength); ++element_row_in_block)
 
 1595                     for (int_type element_col_in_block = 0; element_col_in_block < 
int_type(BlockSideLength); ++element_col_in_block)
 
 1596                         fsw.feed_b_element(element_row_in_block * BlockSideLength + element_col_in_block,
 
 1597                                            mtq_b.
read_element(element_row_in_block * BlockSideLength + element_col_in_block));
 
 1598                 fsw.end_feeding_b_block(block_row, block_col,
 
 1610                                           fsw.begin_reading_block(block_row, block_col));
 
 1612                 #pragma omp parallel for 
 1614                 for (
int_type element_row_in_block = 0; element_row_in_block < 
int_type(BlockSideLength); ++element_row_in_block)
 
 1615                     for (int_type element_col_in_block = 0; element_col_in_block < 
int_type(BlockSideLength); ++element_col_in_block)
 
 1617                                                    fsw.read_element(element_row_in_block * BlockSideLength + element_col_in_block));
 
 1619                                         fsw.end_reading_block(block_row, block_col));
 
 1625     static swappable_block_matrix_type&
 
 1631         if (C.
get_height() <= strassen_winograd_base_case_size
 
 1632             || C.
get_width() <= strassen_winograd_base_case_size
 
 1633             || A.
get_width() <= strassen_winograd_base_case_size)
 
 1636             return recursive_multiply_and_add(A, B, C);
 
 1643                                     s2(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
 
 1644                                     s3(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
 
 1645                                     s4(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
 
 1646                                     t1(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
 
 1647                                     t2(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
 
 1648                                     t3(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
 
 1649                                     t4(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
 
 1650         strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
 
 1651         strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
 
 1660         strassen_winograd_multiply(qa.ul, qb.ul,    p1);
 
 1661         strassen_winograd_multiply(qa.ur, qb.dl, qc.
ul);
 
 1662         strassen_winograd_multiply(   s1,    t1,    p3);
 
 1663         strassen_winograd_multiply(   s2,    t2, qc.
dr);
 
 1664         strassen_winograd_multiply(   s3,    t3,    p5);
 
 1665         strassen_winograd_multiply(   s4, qb.dr, qc.
ur);
 
 1666         strassen_winograd_multiply(qa.dr,    t4, qc.
dl);
 
 1668         strassen_winograd_postaddition(qc.
ul, qc.
ur, qc.
dl, qc.
dr, p1, p3, p5);
 
 1674     static swappable_block_matrix_type&
 
 1680         if (C.
get_height() <= strassen_winograd_base_case_size
 
 1681             || C.
get_width() <= strassen_winograd_base_case_size
 
 1682             || A.
get_width() <= strassen_winograd_base_case_size)
 
 1683             return recursive_multiply_and_add(A, B, C);
 
 1689                                     s2(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
 
 1690                                     s3(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
 
 1691                                     s4(C.
bs, qa.ul.get_height(), qa.ul.get_width(), qa.ul.is_transposed()),
 
 1692                                     t1(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
 
 1693                                     t2(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
 
 1694                                     t3(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed()),
 
 1695                                     t4(C.
bs, qb.ul.get_height(), qb.ul.get_width(), qb.ul.is_transposed());
 
 1696         strassen_winograd_preaddition_a(qa.ul, qa.ur, qa.dl, qa.dr, s1, s2, s3, s4);
 
 1697         strassen_winograd_preaddition_b(qb.ul, qb.ur, qb.dl, qb.dr, t1, t2, t3, t4);
 
 1700         strassen_winograd_multiply_and_add_interleaved(qa.ur, qb.dl, qc.
ul); 
 
 1701         strassen_winograd_multiply_and_add_interleaved(qa.ul, qb.ul, px); 
 
 1702         element_op<addition>(qc.
ul, px);
 
 1703         strassen_winograd_multiply_and_add_interleaved(s2, t2, px); 
 
 1706         element_op<addition>(qc.
ur, px);
 
 1707         strassen_winograd_multiply_and_add_interleaved(s3, t3, px); 
 
 1710         element_op_twice_nontransposed<addition>(qc.
dl, qc.
dr, px);
 
 1712         strassen_winograd_multiply_and_add_interleaved(qa.dr, t4, qc.
dl); 
 
 1714         strassen_winograd_multiply_and_add_interleaved(s1, t1, px); 
 
 1717         element_op_twice_nontransposed<addition>(qc.
dr, qc.
ur, px);
 
 1719         strassen_winograd_multiply_and_add_interleaved(s4, qb.dr, qc.
ur); 
 
 1725     static swappable_block_matrix_type&
 
 1731         if (C.
get_height() <= strassen_winograd_base_case_size
 
 1732             || C.
get_width() <= strassen_winograd_base_case_size
 
 1733             || A.
get_width() <= strassen_winograd_base_case_size)
 
 1734             return recursive_multiply_and_add(A, B, C);
 
 1740         s2(C.
bs, qa.ul.get_height(), qa.ul.get_width()),
 
 1741         s3(C.
bs, qa.ul.get_height(), qa.ul.get_width()),
 
 1742         s4(C.
bs, qa.ul.get_height(), qa.ul.get_width()),
 
 1743         t1(C.
bs, qb.ul.get_height(), qb.ul.get_width()),
 
 1744         t2(C.
bs, qb.ul.get_height(), qb.ul.get_width()),
 
 1745         t3(C.
bs, qb.ul.get_height(), qb.ul.get_width()),
 
 1746         t4(C.
bs, qb.ul.get_height(), qb.ul.get_width());
 
 1747         element_op<subtraction>(s3, qa.ul, qa.dl);
 
 1748         element_op<addition>(s1, qa.dl, qa.dr);
 
 1749         element_op<subtraction>(s2, s1, qa.ul);
 
 1750         element_op<subtraction>(s4, qa.ur, s2);
 
 1751         element_op<subtraction>(t3, qb.dr, qb.ur);
 
 1752         element_op<subtraction>(t1, qb.ur, qb.ul);
 
 1753         element_op<subtraction>(t2, qb.dr, t1);
 
 1754         element_op<subtraction>(t4, qb.dl, t2);
 
 1757         strassen_winograd_multiply_and_add(qa.ur, qb.dl, qc.
ul); 
 
 1758         strassen_winograd_multiply_and_add(qa.ul, qb.ul, px); 
 
 1759         element_op<addition>(qc.
ul, px);
 
 1760         strassen_winograd_multiply_and_add(s2, t2, px); 
 
 1761         element_op<addition>(qc.
ur, px);
 
 1762         strassen_winograd_multiply_and_add(s3, t3, px); 
 
 1763         element_op<addition>(qc.
dl, px);
 
 1764         element_op<addition>(qc.
dr, px);
 
 1766         strassen_winograd_multiply_and_add(qa.dr, t4, qc.
dl); 
 
 1767         strassen_winograd_multiply_and_add(s1, t1, px); 
 
 1768         element_op<addition>(qc.
dr, px);
 
 1769         element_op<addition>(qc.
ur, px);
 
 1770         strassen_winograd_multiply_and_add(s4, qb.dr, qc.
ur); 
 
 1776     static swappable_block_matrix_type&
 
 1786             return naive_multiply_and_add(A, B, C);
 
 1792         recursive_multiply_and_add(qa.ul, qb.ul, qc.
ul);
 
 1793         recursive_multiply_and_add(qa.ur, qb.dl, qc.
ul);
 
 1794         recursive_multiply_and_add(qa.ur, qb.dr, qc.
ur);
 
 1795         recursive_multiply_and_add(qa.ul, qb.ur, qc.
ur);
 
 1796         recursive_multiply_and_add(qa.dl, qb.ur, qc.
dr);
 
 1797         recursive_multiply_and_add(qa.dr, qb.dr, qc.
dr);
 
 1798         recursive_multiply_and_add(qa.dr, qb.dl, qc.
dl);
 
 1799         recursive_multiply_and_add(qa.dl, qb.ul, qc.
dl);
 
 1806     static swappable_block_matrix_type&
 
 1829             ++matrix_operation_statistic::get_instance()->block_multiplication_calls;
 
 1835                 ++matrix_operation_statistic::get_instance()->block_multiplications_saved_through_zero;
 
 1839         ValueType* ap = bs_a.
acquire(a).begin(),
 
 1840         * bp = bs_b.
acquire(b).begin(),
 
 1841         * cp = bs_c.
acquire(c).begin();
 
 1845                 (ap, a_is_transposed, bp, b_is_transposed, cp, c_is_transposed);
 
 1857     static column_vector_type&
 
 1867             return naive_matrix_col_vector_multiply_and_add(A, x, z, offset_x, offset_z);
 
 1873         recursive_matrix_col_vector_multiply_and_add(qa.
ul, x, z, offset_x,                     offset_z                     );
 
 1874         recursive_matrix_col_vector_multiply_and_add(qa.
ur, x, z, offset_x + qa.
ul.
get_width(), offset_z                     );
 
 1876         recursive_matrix_col_vector_multiply_and_add(qa.
dl, x, z, offset_x,                     offset_z + qa.
ul.
get_height());
 
 1881     static column_vector_type&
 
 1888                 matrix_col_vector_multiply_and_add_swappable_block(A(row, col), A.
is_transposed(), A.
bs,
 
 1889                                                                    x, z, (offset_x + col) * BlockSideLength, (offset_z + row) * BlockSideLength);
 
 1910                 col_limit = 
std::min(BlockSideLength, 
unsigned(x.
size() - offset_x));
 
 1911             if (a_is_transposed)
 
 1912                 for (
int_type col = 0; col < col_limit; ++col)
 
 1913                     for (
int_type row = 0; row < row_limit; ++row)
 
 1914                         z[offset_z + row] += x[offset_x + col] * ia[row + col * BlockSideLength];
 
 1916                 for (
int_type row = 0; row < row_limit; ++row)
 
 1917                     for (
int_type col = 0; col < col_limit; ++col)
 
 1918                         z[offset_z + row] += x[offset_x + col] * ia[row * BlockSideLength + col];
 
 1925     static row_vector_type&
 
 1935             return naive_matrix_row_vector_multiply_and_add(y, A, z, offset_y, offset_z);
 
 1941         recursive_matrix_row_vector_multiply_and_add(y, qa.
ul, z, offset_y,                      offset_z                    );
 
 1942         recursive_matrix_row_vector_multiply_and_add(y, qa.
dl, z, offset_y + qa.
ul.
get_height(), offset_z                    );
 
 1944         recursive_matrix_row_vector_multiply_and_add(y, qa.
ur, z, offset_y,                      offset_z + qa.
ul.
get_width());
 
 1949     static row_vector_type&
 
 1956                 matrix_row_vector_multiply_and_add_swappable_block(y, A(row, col), A.
is_transposed(), A.
bs,
 
 1957                                                                    z, (offset_y + row) * BlockSideLength, (offset_z + col) * BlockSideLength);
 
 1978                 col_limit = 
std::min(BlockSideLength, 
unsigned(z.
size() - offset_z));
 
 1979             if (a_is_transposed)
 
 1980                 for (
int_type col = 0; col < col_limit; ++col)
 
 1981                     for (
int_type row = 0; row < row_limit; ++row)
 
 1982                         z[offset_z + col] += ia[row + col * BlockSideLength] * y[offset_y + row];
 
 1984                 for (
int_type row = 0; row < row_limit; ++row)
 
 1985                     for (
int_type col = 0; col < col_limit; ++col)
 
 1986                         z[offset_z + col] += ia[row * BlockSideLength + col] * y[offset_y + row];
 
 2005             naive_matrix_from_vectors(A, l, r, offset_l, offset_r);
 
 2013         recursive_matrix_from_vectors(qa.
ul, l, r, offset_l,                      offset_r                    );
 
 2014         recursive_matrix_from_vectors(qa.
ur, l, r, offset_l,                      offset_r + qa.
ul.
get_width());
 
 2016         recursive_matrix_from_vectors(qa.
dl, l, r, offset_l + qa.
ul.
get_height(), offset_r                    );
 
 2024                 matrix_from_vectors_swappable_block(A(row, col), A.
is_transposed(), A.
bs,
 
 2025                                                     l, r, (offset_l + row) * BlockSideLength, (offset_r + col) * BlockSideLength);
 
 2039                 col_limit = 
std::min(BlockSideLength, 
unsigned(r.
size() - offset_r));
 
 2040             if (a_is_transposed)
 
 2041                 for (
int_type col = 0; col < col_limit; ++col)
 
 2042                     for (
int_type row = 0; row < row_limit; ++row)
 
 2043                         ia[row + col * BlockSideLength] = l[row + offset_l] * r[col + offset_r];
 
 2045                 for (
int_type row = 0; row < row_limit; ++row)
 
 2046                     for (
int_type col = 0; col < col_limit; ++col)
 
 2047                         ia[row * BlockSideLength + col] = l[row + offset_l] * r[col + offset_r];
 
 2057 template <
typename ValueType, 
unsigned BlockS
ideLength>
 
 2058 const int_type matrix_operations<ValueType, BlockSideLength>::strassen_winograd_base_case_size = 3;
 
 2064 #endif // !STXXL_CONTAINERS_MATRIX_ARITHMETIC_HEADER 
void begin_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt)
swappable_block_matrix_type & ur
swappable_block_matrix_type::size_type size_type
static void strassen_winograd_preaddition_a(swappable_block_matrix_type &a11, swappable_block_matrix_type &a12, swappable_block_matrix_type &a21, swappable_block_matrix_type &a22, swappable_block_matrix_type &s1, swappable_block_matrix_type &s2, swappable_block_matrix_type &s3, swappable_block_matrix_type &s4)
swappable_block_matrix_type::size_type size_type
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
swappable_block_matrix_type & ur
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
int_type block_multiplication_calls
compat::remove_const< Integral >::type div_ceil(Integral n, Integral2 d)
swappable_block_matrix_type m
friend std::ostream & operator<<(std::ostream &os, const uint_pair &a)
make a uint_pair outputtable via iostreams, using unsigned long long. 
void feed_element(const int_type element_num, const vt v)
matrix_operation_statistic_dataset()
bool is_initialized(const swappable_block_identifier_type sbid) const 
check if the swappable_block is initialized. 
static swappable_block_matrix_type & strassen_winograd_multiply(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B 
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, AExists, false > smaller_feedable_strassen_winograd_a
matrix_to_quadtree_block_grained< ValueType, BlockSideLength, Level-1, Granularity > smaller_matrix_to_quadtree_block_grained
block_scheduler_type & bs
static swappable_block_matrix_type & element_op(swappable_block_matrix_type &C, const swappable_block_matrix_type &A, Op op=Op())
static void element_op_twice_nontransposed(swappable_block_matrix_type &c1, swappable_block_matrix_type &c2, const swappable_block_matrix_type &a, Op op=Op())
external column-vector container for matrix multiplication 
swappable_block_matrix_type & ur
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, false, BExists > smaller_feedable_strassen_winograd_b
static unsigned_type get_num_temp_grains()
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
void deinitialize(const swappable_block_identifier_type sbid)
Drop all data in the given block, freeing in- and external memory. 
static swappable_block_matrix_type & naive_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C 
smaller_feedable_strassen_winograd_a p7
matrix_to_quadtree(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
static_quadtree(smaller_static_quadtree ul, smaller_static_quadtree ur, smaller_static_quadtree dl, smaller_static_quadtree dr)
static swappable_block_matrix_type &return C
static void recursive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l=0, vector_size_type offset_r=0)
swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type
void feed_b(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
SwappableBlockType::internal_block_type internal_block_type
swappable_block_matrix_type c
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
const size_type & get_width_in_blocks()
swappable_block_matrix_type & dr
void feed_and_add_element(const int_type element_num, const vt v)
void feed_a(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
void begin_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix)
void begin_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
void read_and_add(const size_type &row, const size_type &col, swappable_block_matrix_type &bl)
#define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_BASE_CASE
void begin_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
External container for a (sub)matrix. Not intended for direct use. 
multiplies matrices A and B, adds result to C, for arbitrary entries param pointer to blocks of A...
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
swappable_block_matrix_type & dl
static uint_pair min()
return an uint_pair instance containing the smallest value possible 
static_quadtree< ValueType, Level-1 > smaller_static_quadtree
column_vector_type::size_type vector_size_type
swappable_block_matrix_type & ul
static unsigned_type get_num_temp_grains()
matrix_to_quadtree(const swappable_block_matrix_type &matrix)
size_type size() const 
return the size of the vector. 
swappable_block_matrix_type::swappable_block_identifier_type swappable_block_identifier_type
smaller_matrix_to_quadtree_block_grained ur
void feed_b_element(const int_type element_num, const vt v)
void feed_b_element(const int_type element_num, const vt v)
const size_type & get_height() const 
bool is_simulating() const 
Returns if simulation mode is on, i.e. if a prediction sequence is being recorded. 
smaller_feedable_strassen_winograd_n p5
column_vector< ValueType > column_vector_type
smaller_feedable_strassen_winograd_b p6
void feed_a(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, false, false > smaller_feedable_strassen_winograd_n
static void element_op_swappable_block(const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c, const swappable_block_identifier_type a, bool a_is_transposed, block_scheduler_type &bs_a, const swappable_block_identifier_type b, bool b_is_transposed, block_scheduler_type &bs_b, Op op=Op())
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static void element_op_swappable_block(const swappable_block_identifier_type c, block_scheduler_type &bs_c, Op op=Op())
c = a [op] b; for arbitrary entries 
static void op_swappable_block_nontransposed(swappable_block_matrix_type &c, Op op, swappable_block_matrix_type &a, size_type &row, size_type &col)
smaller_matrix_to_quadtree ur
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static_quadtree< bool, Level > zbt
feedable_strassen_winograd_block_grained(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
#define STXXL_MATRIX_MULTI_LEVEL_STRASSEN_WINOGRAD_MAX_NUM_LEVELS
static_quadtree< ValueType, 0 > vt
void end_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static void element_op_swappable_block(const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c, const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, Op op=Op())
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
void begin_feeding_block(const size_type &block_row, const size_type &block_col, const zbt)
A static_quadtree holds 4^Level elements arranged in a quad tree. 
const bool & is_transposed() const 
if the elements inside the blocks are in transposed order i.e. column-major 
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
block_scheduler_type::swappable_block_identifier_type swappable_block_identifier_type
swappable_block_matrix_type::size_type size_type
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, AExists, BExists > smaller_feedable_strassen_winograd_ab
swappable_block_matrix_type::size_type size_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static const int_type strassen_winograd_base_case_size
void begin_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt)
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static_quadtree< bool, Level > zbt
uint_pair & operator+=(const uint_pair &b)
addition operator (uses 64-bit arithmetic) 
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
static void multiply_and_add_swappable_block(const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const swappable_block_identifier_type b, const bool b_is_transposed, block_scheduler_type &bs_b, const swappable_block_identifier_type c, const bool c_is_transposed, block_scheduler_type &bs_c)
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
static void use_feedable_sw_block_grained(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
smaller_static_quadtree dl
static_quadtree< bool, 0 > zbt
void end_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static_quadtree< ValueType, Level > vt
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
c [op]= a; for arbitrary entries 
External matrix container.   Introduction  to matrix container: see STXXL Matrix tutorial.   Design and Internals  of matrix container: see Matrix. 
static_quadtree< ValueType, 0 > vt
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
int_type block_multiplications_saved_through_zero
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix)
static void strassen_winograd_preaddition_b(swappable_block_matrix_type &b11, swappable_block_matrix_type &b12, swappable_block_matrix_type &b21, swappable_block_matrix_type &b22, swappable_block_matrix_type &t1, swappable_block_matrix_type &t2, swappable_block_matrix_type &t3, swappable_block_matrix_type &t4)
choose_int_types< my_pointer_size >::int_type int_type
internal_block_type * iblock
int_type block_addition_calls
void end_feeding_a_block(const size_type &block_row, const size_type &block_col, const zbt zb)
c =[op] a; for arbitrary entries 
smaller_feedable_strassen_winograd_n p5
static_quadtree< ValueType, Level > vt
const size_type get_width()
vt read_element(int_type element_num)
block_scheduler_type::internal_block_type internal_block_type
static_quadtree< bool, 0 > zbt
swappable_block_matrix_type::size_type size_type
static swappable_block_matrix_type & strassen_winograd_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C 
smaller_feedable_strassen_winograd_b p6
static void strassen_winograd_postaddition(swappable_block_matrix_type &c11, swappable_block_matrix_type &c12, swappable_block_matrix_type &c21, swappable_block_matrix_type &c22, swappable_block_matrix_type &p1, swappable_block_matrix_type &p3, swappable_block_matrix_type &p5)
void end_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
swappable_block_matrix_approximative_quarterer(const swappable_block_matrix_type &whole)
static void naive_matrix_from_vectors(swappable_block_matrix_type A, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l=0, vector_size_type offset_r=0)
block_scheduler_type::internal_block_type internal_block_type
const size_type & get_height_in_blocks()
const size_type & get_width() const 
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, AExists, false > smaller_feedable_strassen_winograd_a
static_quadtree< bool, Level > zbt
static_quadtree< ValueType, Level > vt
#define STXXL_BEGIN_NAMESPACE
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, AExists, BExists > smaller_feedable_strassen_winograd_ab
void end_feeding_block(const size_type &block_row, const size_type &block_col, const zbt zb)
smaller_feedable_strassen_winograd_a p7
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
static void op_swappable_block_nontransposed(swappable_block_matrix_type &c, swappable_block_matrix_type &a, Op op, swappable_block_matrix_type &b, size_type &row, size_type &col)
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
swappable_block_matrix_type c
scalar_multiplication(const ValueType scalar=1)
swappable_block_matrix_type & dr
void feed_b(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
smaller_feedable_strassen_winograd_ab p2
void feed_element(const int_type element_num, const vt v)
static row_vector_type & naive_matrix_row_vector_multiply_and_add(const row_vector_type &y, const swappable_block_matrix_type &A, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
vector_type::size_type size_type
swappable_block_matrix_type::size_type size_type
swappable_block_matrix< ValueType, BlockSideLength > swappable_block_matrix_type
smaller_static_quadtree ul
static swappable_block_matrix_type & recursive_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C 
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
static swappable_block_matrix_type & multi_level_strassen_winograd_multiply_and_add(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C 
swappable_block_matrix_type & dl
void feed_a_element(const int_type element_num, const vt v)
row_vector< ValueType > row_vector_type
feedable_strassen_winograd< ValueType, BlockSideLength, Level-1, false, BExists > smaller_feedable_strassen_winograd_b
zbt begin_reading_block(const size_type &block_row, const size_type &block_col)
matrix_to_quadtree(const swappable_block_matrix_type &matrix)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
static_quadtree< swappable_block_identifier_type, 0 > bt
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
vt read_element(const int_type element_num)
vt read_element(const int_type element_num)
feedable_strassen_winograd_block_grained< ValueType, BlockSideLength, Level-1, false, false > smaller_feedable_strassen_winograd_n
swappable_block_matrix_type::size_type size_type
const size_type & get_height_in_blocks()
matrix_to_quadtree(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
const size_type & get_width_in_blocks()
static void matrix_row_vector_multiply_and_add_swappable_block(const row_vector_type &y, const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
static swappable_block_matrix_type & element_op(swappable_block_matrix_type &C, const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, Op op=Op())
matrix_operations< ValueType, BlockSideLength > Ops
void read_and_add(const size_type &row, const size_type &col, const swappable_block_matrix_type &bl)
swappable_block_matrix_type & ul
const size_type get_width()
int_type block_additions_saved_through_zero
static row_vector_type & recursive_matrix_row_vector_multiply_and_add(const row_vector_type &y, const swappable_block_matrix_type &A, row_vector_type &z, const vector_size_type offset_y=0, const vector_size_type offset_z=0)
calculates z = y * A 
static swappable_block_matrix_type & multi_level_strassen_winograd_multiply_and_add_block_grained(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C 
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
const size_type get_height()
smaller_static_quadtree ur
matrix_operations< ValueType, BlockSideLength > Ops
choose_int_types< my_pointer_size >::unsigned_type unsigned_type
const size_type get_height()
unsigned int ilog2_ceil(const IntegerType &i)
calculate the log2 ceiling of an integer type (by repeated bit shifts) 
static column_vector_type & naive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type &A, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
smaller_feedable_strassen_winograd_ab p2
static void choose_level_for_feedable_sw(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
static void use_feedable_sw(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
external row-vector container for matrix multiplication 
void release(const swappable_block_identifier_type sbid, const bool dirty)
Release the given block. Has to be in pairs with acquire. Pairs may be nested and interleaved...
Schedules swapping of blocks and provides blocks for temporary storage. 
swappable_block_matrix_type::size_type size_type
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
swappable_block_matrix_quarterer(const swappable_block_matrix_type &whole)
swappable_block_matrix_type & ul
static_quadtree(const ValueType &v)
smaller_static_quadtree dr
internal_block_type * iblock
swappable_block_matrix_type::block_scheduler_type block_scheduler_type
block_scheduler_type::internal_block_type internal_block_type
swappable_block_matrix_type & dr
void feed_and_add_element(const int_type element_num, const vt v)
feedable_strassen_winograd_block_grained(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
swappable_block_matrix_type m
static void matrix_from_vectors_swappable_block(swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const column_vector_type &l, const row_vector_type &r, vector_size_type offset_l, vector_size_type offset_r)
feedable_strassen_winograd(block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l, const swappable_block_matrix_type &existing_b, const size_type b_from_row, const size_type b_from_col)
static column_vector_type & recursive_matrix_col_vector_multiply_and_add(const swappable_block_matrix_type &A, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
calculates z = A * x 
void end_feeding_b_block(const size_type &block_row, const size_type &block_col, const zbt zb)
static swappable_block_matrix_type & strassen_winograd_multiply_and_add_interleaved(const swappable_block_matrix_type &A, const swappable_block_matrix_type &B, swappable_block_matrix_type &C)
calculates C = A * B + C 
vt read_element(const int_type element_num)
matrix_to_quadtree_block_grained(const swappable_block_matrix_type &matrix, const size_type height, const size_type width, const size_type from_row, const size_type from_col)
feedable_strassen_winograd(const swappable_block_matrix_type &existing_a, const size_type a_from_row, const size_type a_from_col, block_scheduler_type &bs_c, const size_type n, const size_type m, const size_type l)
static void matrix_col_vector_multiply_and_add_swappable_block(const swappable_block_identifier_type a, const bool a_is_transposed, block_scheduler_type &bs_a, const column_vector_type &x, column_vector_type &z, const vector_size_type offset_x=0, const vector_size_type offset_z=0)
static Integral round_up_to_power_of_two(Integral n)
matrix_to_quadtree< ValueType, BlockSideLength, Level-1 > smaller_matrix_to_quadtree
swappable_block_matrix_type & dl
void feed_a_element(const int_type element_num, const vt v)
swappable_block_matrix_type::size_type size_type
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
zbt end_reading_block(const size_type &block_row, const size_type &block_col)
swappable_block_matrix_padding_quarterer(const swappable_block_matrix_type &whole)
#define STXXL_END_NAMESPACE
internal_block_type & acquire(const swappable_block_identifier_type sbid, const bool uninitialized=false)
Acquire the given block. Has to be in pairs with release. Pairs may be nested and interleaved...