001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    
018    package org.apache.commons.math.linear;
019    
020    import java.io.Serializable;
021    
022    import org.apache.commons.math.Field;
023    import org.apache.commons.math.FieldElement;
024    import org.apache.commons.math.exception.NoDataException;
025    import org.apache.commons.math.exception.DimensionMismatchException;
026    import org.apache.commons.math.exception.util.LocalizedFormats;
027    import org.apache.commons.math.util.FastMath;
028    import org.apache.commons.math.util.MathUtils;
029    
030    /**
031     * Cache-friendly implementation of FieldMatrix using a flat arrays to store
032     * square blocks of the matrix.
033     * <p>
034     * This implementation is specially designed to be cache-friendly. Square blocks are
035     * stored as small arrays and allow efficient traversal of data both in row major direction
036     * and columns major direction, one block at a time. This greatly increases performances
037     * for algorithms that use crossed directions loops like multiplication or transposition.
038     * </p>
039     * <p>
040     * The size of square blocks is a static parameter. It may be tuned according to the cache
041     * size of the target computer processor. As a rule of thumbs, it should be the largest
042     * value that allows three blocks to be simultaneously cached (this is necessary for example
043     * for matrix multiplication). The default value is to use 36x36 blocks.
044     * </p>
045     * <p>
046     * The regular blocks represent {@link #BLOCK_SIZE} x {@link #BLOCK_SIZE} squares. Blocks
047     * at right hand side and bottom side which may be smaller to fit matrix dimensions. The square
048     * blocks are flattened in row major order in single dimension arrays which are therefore
049     * {@link #BLOCK_SIZE}<sup>2</sup> elements long for regular blocks. The blocks are themselves
050     * organized in row major order.
051     * </p>
052     * <p>
053     * As an example, for a block size of 36x36, a 100x60 matrix would be stored in 6 blocks.
054     * Block 0 would be a Field[1296] array holding the upper left 36x36 square, block 1 would be
055     * a Field[1296] array holding the upper center 36x36 square, block 2 would be a Field[1008]
056     * array holding the upper right 36x28 rectangle, block 3 would be a Field[864] array holding
057     * the lower left 24x36 rectangle, block 4 would be a Field[864] array holding the lower center
058     * 24x36 rectangle and block 5 would be a Field[672] array holding the lower right 24x28
059     * rectangle.
060     * </p>
061     * <p>
062     * The layout complexity overhead versus simple mapping of matrices to java
063     * arrays is negligible for small matrices (about 1%). The gain from cache efficiency leads
064     * to up to 3-fold improvements for matrices of moderate to large size.
065     * </p>
066     * @param <T> the type of the field elements
067     * @version $Id: BlockFieldMatrix.java 1132432 2011-06-05 14:59:29Z luc $
068     * @since 2.0
069     */
070    public class BlockFieldMatrix<T extends FieldElement<T>> extends AbstractFieldMatrix<T> implements Serializable {
071        /** Block size. */
072        public static final int BLOCK_SIZE = 36;
073        /** Serializable version identifier. */
074        private static final long serialVersionUID = -4602336630143123183L;
075        /** Blocks of matrix entries. */
076        private final T blocks[][];
077        /** Number of rows of the matrix. */
078        private final int rows;
079        /** Number of columns of the matrix. */
080        private final int columns;
081        /** Number of block rows of the matrix. */
082        private final int blockRows;
083        /** Number of block columns of the matrix. */
084        private final int blockColumns;
085    
086        /**
087         * Create a new matrix with the supplied row and column dimensions.
088         *
089         * @param field Field to which the elements belong.
090         * @param rows Number of rows in the new matrix.
091         * @param columns Number of columns in the new matrix.
092         * @throws org.apache.commons.math.exception.NotStrictlyPositiveException
093         * if row or column dimension is not positive.
094         */
095        public BlockFieldMatrix(final Field<T> field, final int rows, final int columns) {
096            super(field, rows, columns);
097            this.rows    = rows;
098            this.columns = columns;
099    
100            // number of blocks
101            blockRows    = (rows    + BLOCK_SIZE - 1) / BLOCK_SIZE;
102            blockColumns = (columns + BLOCK_SIZE - 1) / BLOCK_SIZE;
103    
104            // allocate storage blocks, taking care of smaller ones at right and bottom
105            blocks = createBlocksLayout(field, rows, columns);
106        }
107    
108        /**
109         * Create a new dense matrix copying entries from raw layout data.
110         * <p>The input array <em>must</em> already be in raw layout.</p>
111         * <p>Calling this constructor is equivalent to call:
112         * <pre>matrix = new BlockFieldMatrix<T>(getField(), rawData.length, rawData[0].length,
113         *                                   toBlocksLayout(rawData), false);</pre>
114         * </p>
115         * @param rawData Data for the new matrix, in raw layout.
116         *
117         * @exception DimensionMismatchException if the {@code blockData} shape is
118         * inconsistent with block layout.
119         * @see #BlockFieldMatrix(int, int, FieldElement[][], boolean)
120         */
121        public BlockFieldMatrix(final T[][] rawData) {
122            this(rawData.length, rawData[0].length, toBlocksLayout(rawData), false);
123        }
124    
125        /**
126         * Create a new dense matrix copying entries from block layout data.
127         * <p>The input array <em>must</em> already be in blocks layout.</p>
128         * @param rows  the number of rows in the new matrix
129         * @param columns  the number of columns in the new matrix
130         * @param blockData data for new matrix
131         * @param copyArray if true, the input array will be copied, otherwise
132         * it will be referenced
133         *
134         * @exception DimensionMismatchException if the {@code blockData} shape is
135         * inconsistent with block layout.
136         * @see #createBlocksLayout(Field, int, int)
137         * @see #toBlocksLayout(FieldElement[][])
138         * @see #BlockFieldMatrix(FieldElement[][])
139         */
140        public BlockFieldMatrix(final int rows, final int columns,
141                                final T[][] blockData, final boolean copyArray) {
142            super(extractField(blockData), rows, columns);
143            this.rows    = rows;
144            this.columns = columns;
145    
146            // number of blocks
147            blockRows    = (rows    + BLOCK_SIZE - 1) / BLOCK_SIZE;
148            blockColumns = (columns + BLOCK_SIZE - 1) / BLOCK_SIZE;
149    
150            if (copyArray) {
151                // allocate storage blocks, taking care of smaller ones at right and bottom
152                blocks = buildArray(getField(), blockRows * blockColumns, -1);
153            } else {
154                // reference existing array
155                blocks = blockData;
156            }
157    
158            int index = 0;
159            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
160                final int iHeight = blockHeight(iBlock);
161                for (int jBlock = 0; jBlock < blockColumns; ++jBlock, ++index) {
162                    if (blockData[index].length != iHeight * blockWidth(jBlock)) {
163                        throw new DimensionMismatchException(blockData[index].length,
164                                                             iHeight * blockWidth(jBlock));
165                    }
166                    if (copyArray) {
167                        blocks[index] = blockData[index].clone();
168                    }
169                }
170            }
171        }
172    
173        /**
174         * Convert a data array from raw layout to blocks layout.
175         * <p>
176         * Raw layout is the straightforward layout where element at row i and
177         * column j is in array element <code>rawData[i][j]</code>. Blocks layout
178         * is the layout used in {@link BlockFieldMatrix} instances, where the matrix
179         * is split in square blocks (except at right and bottom side where blocks may
180         * be rectangular to fit matrix size) and each block is stored in a flattened
181         * one-dimensional array.
182         * </p>
183         * <p>
184         * This method creates an array in blocks layout from an input array in raw layout.
185         * It can be used to provide the array argument of the {@link
186         * #BlockFieldMatrix(int, int, FieldElement[][], boolean)}
187         * constructor.
188         * </p>
189         * @param <T> Type of the field elements.
190         * @param rawData Data array in raw layout.
191         * @return a new data array containing the same entries but in blocks layout
192         * @throws DimensionMismatchException if {@code rawData} is not rectangular
193         *  (not all rows have the same length).
194         * @see #createBlocksLayout(Field, int, int)
195         * @see #BlockFieldMatrix(int, int, FieldElement[][], boolean)
196         */
197        public static <T extends FieldElement<T>> T[][] toBlocksLayout(final T[][] rawData) {
198    
199            final int rows         = rawData.length;
200            final int columns      = rawData[0].length;
201            final int blockRows    = (rows    + BLOCK_SIZE - 1) / BLOCK_SIZE;
202            final int blockColumns = (columns + BLOCK_SIZE - 1) / BLOCK_SIZE;
203    
204            // safety checks
205            for (int i = 0; i < rawData.length; ++i) {
206                final int length = rawData[i].length;
207                if (length != columns) {
208                    throw new DimensionMismatchException(columns, length);
209                }
210            }
211    
212            // convert array
213            final Field<T> field = extractField(rawData);
214            final T[][] blocks = buildArray(field, blockRows * blockColumns, -1);
215            int blockIndex = 0;
216            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
217                final int pStart  = iBlock * BLOCK_SIZE;
218                final int pEnd    = FastMath.min(pStart + BLOCK_SIZE, rows);
219                final int iHeight = pEnd - pStart;
220                for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
221                    final int qStart = jBlock * BLOCK_SIZE;
222                    final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
223                    final int jWidth = qEnd - qStart;
224    
225                    // allocate new block
226                    final T[] block = buildArray(field, iHeight * jWidth);
227                    blocks[blockIndex] = block;
228    
229                    // copy data
230                    int index = 0;
231                    for (int p = pStart; p < pEnd; ++p) {
232                        System.arraycopy(rawData[p], qStart, block, index, jWidth);
233                        index += jWidth;
234                    }
235    
236                    ++blockIndex;
237                }
238            }
239    
240            return blocks;
241        }
242    
243        /**
244         * Create a data array in blocks layout.
245         * <p>
246         * This method can be used to create the array argument of the {@link
247         * #BlockFieldMatrix(int, int, FieldElement[][], boolean)}
248         * constructor.
249         * </p>
250         * @param <T> Type of the field elements.
251         * @param field Field to which the elements belong.
252         * @param rows Number of rows in the new matrix.
253         * @param columns Number of columns in the new matrix.
254         * @return a new data array in blocks layout.
255         * @see #toBlocksLayout(FieldElement[][])
256         * @see #BlockFieldMatrix(int, int, FieldElement[][], boolean)
257         */
258        public static <T extends FieldElement<T>> T[][] createBlocksLayout(final Field<T> field,
259                                                                           final int rows, final int columns) {
260            final int blockRows    = (rows    + BLOCK_SIZE - 1) / BLOCK_SIZE;
261            final int blockColumns = (columns + BLOCK_SIZE - 1) / BLOCK_SIZE;
262    
263            final T[][] blocks = buildArray(field, blockRows * blockColumns, -1);
264            int blockIndex = 0;
265            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
266                final int pStart  = iBlock * BLOCK_SIZE;
267                final int pEnd    = FastMath.min(pStart + BLOCK_SIZE, rows);
268                final int iHeight = pEnd - pStart;
269                for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
270                    final int qStart = jBlock * BLOCK_SIZE;
271                    final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
272                    final int jWidth = qEnd - qStart;
273                    blocks[blockIndex] = buildArray(field, iHeight * jWidth);
274                    ++blockIndex;
275                }
276            }
277    
278            return blocks;
279        }
280    
281        /** {@inheritDoc} */
282        @Override
283        public FieldMatrix<T> createMatrix(final int rowDimension, final int columnDimension) {
284            return new BlockFieldMatrix<T>(getField(), rowDimension, columnDimension);
285        }
286    
287        /** {@inheritDoc} */
288        @Override
289        public FieldMatrix<T> copy() {
290    
291            // create an empty matrix
292            BlockFieldMatrix<T> copied = new BlockFieldMatrix<T>(getField(), rows, columns);
293    
294            // copy the blocks
295            for (int i = 0; i < blocks.length; ++i) {
296                System.arraycopy(blocks[i], 0, copied.blocks[i], 0, blocks[i].length);
297            }
298    
299            return copied;
300        }
301    
302        /** {@inheritDoc} */
303        @Override
304        public FieldMatrix<T> add(final FieldMatrix<T> m) {
305            try {
306                return add((BlockFieldMatrix<T>) m);
307            } catch (ClassCastException cce) {
308    
309                // safety check
310                checkAdditionCompatible(m);
311    
312                final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, columns);
313    
314                // perform addition block-wise, to ensure good cache behavior
315                int blockIndex = 0;
316                for (int iBlock = 0; iBlock < out.blockRows; ++iBlock) {
317                    for (int jBlock = 0; jBlock < out.blockColumns; ++jBlock) {
318    
319                        // perform addition on the current block
320                        final T[] outBlock = out.blocks[blockIndex];
321                        final T[] tBlock   = blocks[blockIndex];
322                        final int      pStart   = iBlock * BLOCK_SIZE;
323                        final int      pEnd     = FastMath.min(pStart + BLOCK_SIZE, rows);
324                        final int      qStart   = jBlock * BLOCK_SIZE;
325                        final int      qEnd     = FastMath.min(qStart + BLOCK_SIZE, columns);
326                        int k = 0;
327                        for (int p = pStart; p < pEnd; ++p) {
328                            for (int q = qStart; q < qEnd; ++q) {
329                                outBlock[k] = tBlock[k].add(m.getEntry(p, q));
330                                ++k;
331                            }
332                        }
333    
334                        // go to next block
335                        ++blockIndex;
336    
337                    }
338                }
339    
340                return out;
341            }
342        }
343    
344        /**
345         * Compute the sum of this and <code>m</code>.
346         *
347         * @param m    matrix to be added
348         * @return     this + m
349         * @throws  IllegalArgumentException if m is not the same size as this
350         */
351        public BlockFieldMatrix<T> add(final BlockFieldMatrix<T> m) {
352    
353            // safety check
354            checkAdditionCompatible(m);
355    
356            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, columns);
357    
358            // perform addition block-wise, to ensure good cache behavior
359            for (int blockIndex = 0; blockIndex < out.blocks.length; ++blockIndex) {
360                final T[] outBlock = out.blocks[blockIndex];
361                final T[] tBlock   = blocks[blockIndex];
362                final T[] mBlock   = m.blocks[blockIndex];
363                for (int k = 0; k < outBlock.length; ++k) {
364                    outBlock[k] = tBlock[k].add(mBlock[k]);
365                }
366            }
367    
368            return out;
369        }
370    
371        /** {@inheritDoc} */
372        @Override
373        public FieldMatrix<T> subtract(final FieldMatrix<T> m) {
374            try {
375                return subtract((BlockFieldMatrix<T>) m);
376            } catch (ClassCastException cce) {
377    
378                // safety check
379                checkSubtractionCompatible(m);
380    
381                final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, columns);
382    
383                // perform subtraction block-wise, to ensure good cache behavior
384                int blockIndex = 0;
385                for (int iBlock = 0; iBlock < out.blockRows; ++iBlock) {
386                    for (int jBlock = 0; jBlock < out.blockColumns; ++jBlock) {
387    
388                        // perform subtraction on the current block
389                        final T[] outBlock = out.blocks[blockIndex];
390                        final T[] tBlock   = blocks[blockIndex];
391                        final int      pStart   = iBlock * BLOCK_SIZE;
392                        final int      pEnd     = FastMath.min(pStart + BLOCK_SIZE, rows);
393                        final int      qStart   = jBlock * BLOCK_SIZE;
394                        final int      qEnd     = FastMath.min(qStart + BLOCK_SIZE, columns);
395                        int k = 0;
396                        for (int p = pStart; p < pEnd; ++p) {
397                            for (int q = qStart; q < qEnd; ++q) {
398                                outBlock[k] = tBlock[k].subtract(m.getEntry(p, q));
399                                ++k;
400                            }
401                        }
402    
403                        // go to next block
404                        ++blockIndex;
405    
406                    }
407                }
408    
409                return out;
410            }
411        }
412    
413        /**
414         * Compute this minus <code>m</code>.
415         *
416         * @param m    matrix to be subtracted
417         * @return     this - m
418         * @throws  IllegalArgumentException if m is not the same size as this
419         */
420        public BlockFieldMatrix<T> subtract(final BlockFieldMatrix<T> m) {
421            // safety check
422            checkSubtractionCompatible(m);
423    
424            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, columns);
425    
426            // perform subtraction block-wise, to ensure good cache behavior
427            for (int blockIndex = 0; blockIndex < out.blocks.length; ++blockIndex) {
428                final T[] outBlock = out.blocks[blockIndex];
429                final T[] tBlock   = blocks[blockIndex];
430                final T[] mBlock   = m.blocks[blockIndex];
431                for (int k = 0; k < outBlock.length; ++k) {
432                    outBlock[k] = tBlock[k].subtract(mBlock[k]);
433                }
434            }
435    
436            return out;
437        }
438    
439        /** {@inheritDoc} */
440        @Override
441        public FieldMatrix<T> scalarAdd(final T d) {
442            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, columns);
443    
444            // perform subtraction block-wise, to ensure good cache behavior
445            for (int blockIndex = 0; blockIndex < out.blocks.length; ++blockIndex) {
446                final T[] outBlock = out.blocks[blockIndex];
447                final T[] tBlock   = blocks[blockIndex];
448                for (int k = 0; k < outBlock.length; ++k) {
449                    outBlock[k] = tBlock[k].add(d);
450                }
451            }
452    
453            return out;
454        }
455    
456        /** {@inheritDoc} */
457        @Override
458        public FieldMatrix<T> scalarMultiply(final T d) {
459    
460            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, columns);
461    
462            // perform subtraction block-wise, to ensure good cache behavior
463            for (int blockIndex = 0; blockIndex < out.blocks.length; ++blockIndex) {
464                final T[] outBlock = out.blocks[blockIndex];
465                final T[] tBlock   = blocks[blockIndex];
466                for (int k = 0; k < outBlock.length; ++k) {
467                    outBlock[k] = tBlock[k].multiply(d);
468                }
469            }
470    
471            return out;
472        }
473    
474        /** {@inheritDoc} */
475        @Override
476        public FieldMatrix<T> multiply(final FieldMatrix<T> m) {
477            try {
478                return multiply((BlockFieldMatrix<T>) m);
479            } catch (ClassCastException cce) {
480    
481                // safety check
482                checkMultiplicationCompatible(m);
483    
484                final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, m.getColumnDimension());
485                final T zero = getField().getZero();
486    
487                // perform multiplication block-wise, to ensure good cache behavior
488                int blockIndex = 0;
489                for (int iBlock = 0; iBlock < out.blockRows; ++iBlock) {
490    
491                    final int pStart = iBlock * BLOCK_SIZE;
492                    final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
493    
494                    for (int jBlock = 0; jBlock < out.blockColumns; ++jBlock) {
495    
496                        final int qStart = jBlock * BLOCK_SIZE;
497                        final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, m.getColumnDimension());
498    
499                        // select current block
500                        final T[] outBlock = out.blocks[blockIndex];
501    
502                        // perform multiplication on current block
503                        for (int kBlock = 0; kBlock < blockColumns; ++kBlock) {
504                            final int kWidth      = blockWidth(kBlock);
505                            final T[] tBlock = blocks[iBlock * blockColumns + kBlock];
506                            final int rStart      = kBlock * BLOCK_SIZE;
507                            int k = 0;
508                            for (int p = pStart; p < pEnd; ++p) {
509                                final int lStart = (p - pStart) * kWidth;
510                                final int lEnd   = lStart + kWidth;
511                                for (int q = qStart; q < qEnd; ++q) {
512                                    T sum = zero;
513                                    int r = rStart;
514                                    for (int l = lStart; l < lEnd; ++l) {
515                                        sum = sum.add(tBlock[l].multiply(m.getEntry(r, q)));
516                                        ++r;
517                                    }
518                                    outBlock[k] = outBlock[k].add(sum);
519                                    ++k;
520                                }
521                            }
522                        }
523    
524                        // go to next block
525                        ++blockIndex;
526    
527                    }
528                }
529    
530                return out;
531            }
532        }
533    
534        /**
535         * Returns the result of postmultiplying this by m.
536         *
537         * @param m    matrix to postmultiply by
538         * @return     this * m
539         * @throws     IllegalArgumentException
540         *             if columnDimension(this) != rowDimension(m)
541         */
542        public BlockFieldMatrix<T> multiply(BlockFieldMatrix<T> m) {
543    
544            // safety check
545            checkMultiplicationCompatible(m);
546    
547            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, m.columns);
548            final T zero = getField().getZero();
549    
550            // perform multiplication block-wise, to ensure good cache behavior
551            int blockIndex = 0;
552            for (int iBlock = 0; iBlock < out.blockRows; ++iBlock) {
553    
554                final int pStart = iBlock * BLOCK_SIZE;
555                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
556    
557                for (int jBlock = 0; jBlock < out.blockColumns; ++jBlock) {
558                    final int jWidth = out.blockWidth(jBlock);
559                    final int jWidth2 = jWidth  + jWidth;
560                    final int jWidth3 = jWidth2 + jWidth;
561                    final int jWidth4 = jWidth3 + jWidth;
562    
563                    // select current block
564                    final T[] outBlock = out.blocks[blockIndex];
565    
566                    // perform multiplication on current block
567                    for (int kBlock = 0; kBlock < blockColumns; ++kBlock) {
568                        final int kWidth = blockWidth(kBlock);
569                        final T[] tBlock = blocks[iBlock * blockColumns + kBlock];
570                        final T[] mBlock = m.blocks[kBlock * m.blockColumns + jBlock];
571                        int k = 0;
572                        for (int p = pStart; p < pEnd; ++p) {
573                            final int lStart = (p - pStart) * kWidth;
574                            final int lEnd   = lStart + kWidth;
575                            for (int nStart = 0; nStart < jWidth; ++nStart) {
576                                T sum = zero;
577                                int l = lStart;
578                                int n = nStart;
579                                while (l < lEnd - 3) {
580                                    sum = sum.
581                                          add(tBlock[l].multiply(mBlock[n])).
582                                          add(tBlock[l + 1].multiply(mBlock[n + jWidth])).
583                                          add(tBlock[l + 2].multiply(mBlock[n + jWidth2])).
584                                          add(tBlock[l + 3].multiply(mBlock[n + jWidth3]));
585                                    l += 4;
586                                    n += jWidth4;
587                                }
588                                while (l < lEnd) {
589                                    sum = sum.add(tBlock[l++].multiply(mBlock[n]));
590                                    n += jWidth;
591                                }
592                                outBlock[k] = outBlock[k].add(sum);
593                                ++k;
594                            }
595                        }
596                    }
597    
598                    // go to next block
599                    ++blockIndex;
600                }
601            }
602    
603            return out;
604        }
605    
606        /** {@inheritDoc} */
607        @Override
608        public T[][] getData() {
609    
610            final T[][] data = buildArray(getField(), getRowDimension(), getColumnDimension());
611            final int lastColumns = columns - (blockColumns - 1) * BLOCK_SIZE;
612    
613            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
614                final int pStart = iBlock * BLOCK_SIZE;
615                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
616                int regularPos   = 0;
617                int lastPos      = 0;
618                for (int p = pStart; p < pEnd; ++p) {
619                    final T[] dataP = data[p];
620                    int blockIndex = iBlock * blockColumns;
621                    int dataPos    = 0;
622                    for (int jBlock = 0; jBlock < blockColumns - 1; ++jBlock) {
623                        System.arraycopy(blocks[blockIndex++], regularPos, dataP, dataPos, BLOCK_SIZE);
624                        dataPos += BLOCK_SIZE;
625                    }
626                    System.arraycopy(blocks[blockIndex], lastPos, dataP, dataPos, lastColumns);
627                    regularPos += BLOCK_SIZE;
628                    lastPos    += lastColumns;
629                }
630            }
631    
632            return data;
633        }
634    
635        /** {@inheritDoc} */
636        @Override
637        public FieldMatrix<T> getSubMatrix(final int startRow, final int endRow,
638                                           final int startColumn, final int endColumn) {
639            // safety checks
640            checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
641    
642            // create the output matrix
643            final BlockFieldMatrix<T> out =
644                new BlockFieldMatrix<T>(getField(), endRow - startRow + 1, endColumn - startColumn + 1);
645    
646            // compute blocks shifts
647            final int blockStartRow    = startRow    / BLOCK_SIZE;
648            final int rowsShift        = startRow    % BLOCK_SIZE;
649            final int blockStartColumn = startColumn / BLOCK_SIZE;
650            final int columnsShift     = startColumn % BLOCK_SIZE;
651    
652            // perform extraction block-wise, to ensure good cache behavior
653            int pBlock = blockStartRow;
654            for (int iBlock = 0; iBlock < out.blockRows; ++iBlock) {
655                final int iHeight = out.blockHeight(iBlock);
656                int qBlock = blockStartColumn;
657                for (int jBlock = 0; jBlock < out.blockColumns; ++jBlock) {
658                    final int jWidth = out.blockWidth(jBlock);
659    
660                    // handle one block of the output matrix
661                    final int      outIndex = iBlock * out.blockColumns + jBlock;
662                    final T[] outBlock = out.blocks[outIndex];
663                    final int      index    = pBlock * blockColumns + qBlock;
664                    final int      width    = blockWidth(qBlock);
665    
666                    final int heightExcess = iHeight + rowsShift - BLOCK_SIZE;
667                    final int widthExcess  = jWidth + columnsShift - BLOCK_SIZE;
668                    if (heightExcess > 0) {
669                        // the submatrix block spans on two blocks rows from the original matrix
670                        if (widthExcess > 0) {
671                            // the submatrix block spans on two blocks columns from the original matrix
672                            final int width2 = blockWidth(qBlock + 1);
673                            copyBlockPart(blocks[index], width,
674                                          rowsShift, BLOCK_SIZE,
675                                          columnsShift, BLOCK_SIZE,
676                                          outBlock, jWidth, 0, 0);
677                            copyBlockPart(blocks[index + 1], width2,
678                                          rowsShift, BLOCK_SIZE,
679                                          0, widthExcess,
680                                          outBlock, jWidth, 0, jWidth - widthExcess);
681                            copyBlockPart(blocks[index + blockColumns], width,
682                                          0, heightExcess,
683                                          columnsShift, BLOCK_SIZE,
684                                          outBlock, jWidth, iHeight - heightExcess, 0);
685                            copyBlockPart(blocks[index + blockColumns + 1], width2,
686                                          0, heightExcess,
687                                          0, widthExcess,
688                                          outBlock, jWidth, iHeight - heightExcess, jWidth - widthExcess);
689                        } else {
690                            // the submatrix block spans on one block column from the original matrix
691                            copyBlockPart(blocks[index], width,
692                                          rowsShift, BLOCK_SIZE,
693                                          columnsShift, jWidth + columnsShift,
694                                          outBlock, jWidth, 0, 0);
695                            copyBlockPart(blocks[index + blockColumns], width,
696                                          0, heightExcess,
697                                          columnsShift, jWidth + columnsShift,
698                                          outBlock, jWidth, iHeight - heightExcess, 0);
699                        }
700                    } else {
701                        // the submatrix block spans on one block row from the original matrix
702                        if (widthExcess > 0) {
703                            // the submatrix block spans on two blocks columns from the original matrix
704                            final int width2 = blockWidth(qBlock + 1);
705                            copyBlockPart(blocks[index], width,
706                                          rowsShift, iHeight + rowsShift,
707                                          columnsShift, BLOCK_SIZE,
708                                          outBlock, jWidth, 0, 0);
709                            copyBlockPart(blocks[index + 1], width2,
710                                          rowsShift, iHeight + rowsShift,
711                                          0, widthExcess,
712                                          outBlock, jWidth, 0, jWidth - widthExcess);
713                        } else {
714                            // the submatrix block spans on one block column from the original matrix
715                            copyBlockPart(blocks[index], width,
716                                          rowsShift, iHeight + rowsShift,
717                                          columnsShift, jWidth + columnsShift,
718                                          outBlock, jWidth, 0, 0);
719                        }
720                   }
721                    ++qBlock;
722                }
723                ++pBlock;
724            }
725    
726            return out;
727        }
728    
729        /**
730         * Copy a part of a block into another one
731         * <p>This method can be called only when the specified part fits in both
732         * blocks, no verification is done here.</p>
733         * @param srcBlock source block
734         * @param srcWidth source block width ({@link #BLOCK_SIZE} or smaller)
735         * @param srcStartRow start row in the source block
736         * @param srcEndRow end row (exclusive) in the source block
737         * @param srcStartColumn start column in the source block
738         * @param srcEndColumn end column (exclusive) in the source block
739         * @param dstBlock destination block
740         * @param dstWidth destination block width ({@link #BLOCK_SIZE} or smaller)
741         * @param dstStartRow start row in the destination block
742         * @param dstStartColumn start column in the destination block
743         */
744        private void copyBlockPart(final T[] srcBlock, final int srcWidth,
745                                   final int srcStartRow, final int srcEndRow,
746                                   final int srcStartColumn, final int srcEndColumn,
747                                   final T[] dstBlock, final int dstWidth,
748                                   final int dstStartRow, final int dstStartColumn) {
749            final int length = srcEndColumn - srcStartColumn;
750            int srcPos = srcStartRow * srcWidth + srcStartColumn;
751            int dstPos = dstStartRow * dstWidth + dstStartColumn;
752            for (int srcRow = srcStartRow; srcRow < srcEndRow; ++srcRow) {
753                System.arraycopy(srcBlock, srcPos, dstBlock, dstPos, length);
754                srcPos += srcWidth;
755                dstPos += dstWidth;
756            }
757        }
758    
759        /** {@inheritDoc} */
760        @Override
761        public void setSubMatrix(final T[][] subMatrix, final int row, final int column) {
762            // safety checks
763            MathUtils.checkNotNull(subMatrix);
764            final int refLength = subMatrix[0].length;
765            if (refLength == 0) {
766                throw new NoDataException(LocalizedFormats.AT_LEAST_ONE_COLUMN);
767            }
768            final int endRow    = row + subMatrix.length - 1;
769            final int endColumn = column + refLength - 1;
770            checkSubMatrixIndex(row, endRow, column, endColumn);
771            for (final T[] subRow : subMatrix) {
772                if (subRow.length != refLength) {
773                    throw new DimensionMismatchException(refLength, subRow.length);
774                }
775            }
776    
777            // compute blocks bounds
778            final int blockStartRow    = row / BLOCK_SIZE;
779            final int blockEndRow      = (endRow + BLOCK_SIZE) / BLOCK_SIZE;
780            final int blockStartColumn = column / BLOCK_SIZE;
781            final int blockEndColumn   = (endColumn + BLOCK_SIZE) / BLOCK_SIZE;
782    
783            // perform copy block-wise, to ensure good cache behavior
784            for (int iBlock = blockStartRow; iBlock < blockEndRow; ++iBlock) {
785                final int iHeight  = blockHeight(iBlock);
786                final int firstRow = iBlock * BLOCK_SIZE;
787                final int iStart   = FastMath.max(row,    firstRow);
788                final int iEnd     = FastMath.min(endRow + 1, firstRow + iHeight);
789    
790                for (int jBlock = blockStartColumn; jBlock < blockEndColumn; ++jBlock) {
791                    final int jWidth      = blockWidth(jBlock);
792                    final int firstColumn = jBlock * BLOCK_SIZE;
793                    final int jStart      = FastMath.max(column,    firstColumn);
794                    final int jEnd        = FastMath.min(endColumn + 1, firstColumn + jWidth);
795                    final int jLength     = jEnd - jStart;
796    
797                    // handle one block, row by row
798                    final T[] block = blocks[iBlock * blockColumns + jBlock];
799                    for (int i = iStart; i < iEnd; ++i) {
800                        System.arraycopy(subMatrix[i - row], jStart - column,
801                                         block, (i - firstRow) * jWidth + (jStart - firstColumn),
802                                         jLength);
803                    }
804    
805                }
806            }
807        }
808    
809        /** {@inheritDoc} */
810        @Override
811        public FieldMatrix<T> getRowMatrix(final int row) {
812            checkRowIndex(row);
813            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), 1, columns);
814    
815            // perform copy block-wise, to ensure good cache behavior
816            final int iBlock  = row / BLOCK_SIZE;
817            final int iRow    = row - iBlock * BLOCK_SIZE;
818            int outBlockIndex = 0;
819            int outIndex      = 0;
820            T[] outBlock = out.blocks[outBlockIndex];
821            for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
822                final int jWidth     = blockWidth(jBlock);
823                final T[] block = blocks[iBlock * blockColumns + jBlock];
824                final int available  = outBlock.length - outIndex;
825                if (jWidth > available) {
826                    System.arraycopy(block, iRow * jWidth, outBlock, outIndex, available);
827                    outBlock = out.blocks[++outBlockIndex];
828                    System.arraycopy(block, iRow * jWidth, outBlock, 0, jWidth - available);
829                    outIndex = jWidth - available;
830                } else {
831                    System.arraycopy(block, iRow * jWidth, outBlock, outIndex, jWidth);
832                    outIndex += jWidth;
833                }
834            }
835    
836            return out;
837        }
838    
839        /** {@inheritDoc} */
840        @Override
841        public void setRowMatrix(final int row, final FieldMatrix<T> matrix) {
842            try {
843                setRowMatrix(row, (BlockFieldMatrix<T>) matrix);
844            } catch (ClassCastException cce) {
845                super.setRowMatrix(row, matrix);
846            }
847        }
848    
849        /**
850         * Sets the entries in row number <code>row</code>
851         * as a row matrix.  Row indices start at 0.
852         *
853         * @param row the row to be set
854         * @param matrix row matrix (must have one row and the same number of columns
855         * as the instance)
856         * @throws org.apache.commons.math.exception.OutOfRangeException if the
857         * specified row index is invalid.
858         * @throws MatrixDimensionMismatchException if the matrix dimensions do
859         * not match one instance row.
860         */
861        public void setRowMatrix(final int row, final BlockFieldMatrix<T> matrix) {
862            checkRowIndex(row);
863            final int nCols = getColumnDimension();
864            if ((matrix.getRowDimension() != 1) ||
865                (matrix.getColumnDimension() != nCols)) {
866                throw new MatrixDimensionMismatchException(matrix.getRowDimension(),
867                                                           matrix.getColumnDimension(),
868                                                           1, nCols);
869            }
870    
871            // perform copy block-wise, to ensure good cache behavior
872            final int iBlock = row / BLOCK_SIZE;
873            final int iRow   = row - iBlock * BLOCK_SIZE;
874            int mBlockIndex  = 0;
875            int mIndex       = 0;
876            T[] mBlock  = matrix.blocks[mBlockIndex];
877            for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
878                final int jWidth     = blockWidth(jBlock);
879                final T[] block = blocks[iBlock * blockColumns + jBlock];
880                final int available  = mBlock.length - mIndex;
881                if (jWidth > available) {
882                    System.arraycopy(mBlock, mIndex, block, iRow * jWidth, available);
883                    mBlock = matrix.blocks[++mBlockIndex];
884                    System.arraycopy(mBlock, 0, block, iRow * jWidth, jWidth - available);
885                    mIndex = jWidth - available;
886                } else {
887                    System.arraycopy(mBlock, mIndex, block, iRow * jWidth, jWidth);
888                    mIndex += jWidth;
889               }
890            }
891        }
892    
893        /** {@inheritDoc} */
894        @Override
895        public FieldMatrix<T> getColumnMatrix(final int column) {
896            checkColumnIndex(column);
897            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), rows, 1);
898    
899            // perform copy block-wise, to ensure good cache behavior
900            final int jBlock  = column / BLOCK_SIZE;
901            final int jColumn = column - jBlock * BLOCK_SIZE;
902            final int jWidth  = blockWidth(jBlock);
903            int outBlockIndex = 0;
904            int outIndex      = 0;
905            T[] outBlock = out.blocks[outBlockIndex];
906            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
907                final int iHeight = blockHeight(iBlock);
908                final T[] block = blocks[iBlock * blockColumns + jBlock];
909                for (int i = 0; i < iHeight; ++i) {
910                    if (outIndex >= outBlock.length) {
911                        outBlock = out.blocks[++outBlockIndex];
912                        outIndex = 0;
913                    }
914                    outBlock[outIndex++] = block[i * jWidth + jColumn];
915                }
916            }
917    
918            return out;
919        }
920    
921        /** {@inheritDoc} */
922        @Override
923        public void setColumnMatrix(final int column, final FieldMatrix<T> matrix) {
924            try {
925                setColumnMatrix(column, (BlockFieldMatrix<T>) matrix);
926            } catch (ClassCastException cce) {
927                super.setColumnMatrix(column, matrix);
928            }
929        }
930    
931        /**
932         * Sets the entries in column number {@code column}
933         * as a column matrix.  Column indices start at 0.
934         *
935         * @param column Column to be set.
936         * @param matrix Column matrix (must have one column and the same number of rows
937         * as the instance).
938         * @throws org.apache.commons.math.exception.OutOfRangeException if
939         * the specified column index is invalid.
940         * @throws MatrixDimensionMismatchException if the matrix dimensions do
941         * not match one instance column.
942         */
943        void setColumnMatrix(final int column, final BlockFieldMatrix<T> matrix) {
944            checkColumnIndex(column);
945            final int nRows = getRowDimension();
946            if ((matrix.getRowDimension() != nRows) ||
947                (matrix.getColumnDimension() != 1)) {
948                throw new MatrixDimensionMismatchException(matrix.getRowDimension(),
949                                                           matrix.getColumnDimension(),
950                                                           nRows, 1);
951            }
952    
953            // perform copy block-wise, to ensure good cache behavior
954            final int jBlock  = column / BLOCK_SIZE;
955            final int jColumn = column - jBlock * BLOCK_SIZE;
956            final int jWidth  = blockWidth(jBlock);
957            int mBlockIndex = 0;
958            int mIndex      = 0;
959            T[] mBlock = matrix.blocks[mBlockIndex];
960            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
961                final int iHeight = blockHeight(iBlock);
962                final T[] block = blocks[iBlock * blockColumns + jBlock];
963                for (int i = 0; i < iHeight; ++i) {
964                    if (mIndex >= mBlock.length) {
965                        mBlock = matrix.blocks[++mBlockIndex];
966                        mIndex = 0;
967                    }
968                    block[i * jWidth + jColumn] = mBlock[mIndex++];
969                }
970            }
971        }
972    
973        /** {@inheritDoc} */
974        @Override
975        public FieldVector<T> getRowVector(final int row) {
976            checkRowIndex(row);
977            final T[] outData = buildArray(getField(), columns);
978    
979            // perform copy block-wise, to ensure good cache behavior
980            final int iBlock  = row / BLOCK_SIZE;
981            final int iRow    = row - iBlock * BLOCK_SIZE;
982            int outIndex      = 0;
983            for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
984                final int jWidth     = blockWidth(jBlock);
985                final T[] block = blocks[iBlock * blockColumns + jBlock];
986                System.arraycopy(block, iRow * jWidth, outData, outIndex, jWidth);
987                outIndex += jWidth;
988            }
989    
990            return new ArrayFieldVector<T>(getField(), outData, false);
991        }
992    
993        /** {@inheritDoc} */
994        @Override
995        public void setRowVector(final int row, final FieldVector<T> vector) {
996            try {
997                setRow(row, ((ArrayFieldVector<T>) vector).getDataRef());
998            } catch (ClassCastException cce) {
999                super.setRowVector(row, vector);
1000            }
1001        }
1002    
1003        /** {@inheritDoc} */
1004        @Override
1005        public FieldVector<T> getColumnVector(final int column) {
1006            checkColumnIndex(column);
1007            final T[] outData = buildArray(getField(), rows);
1008    
1009            // perform copy block-wise, to ensure good cache behavior
1010            final int jBlock  = column / BLOCK_SIZE;
1011            final int jColumn = column - jBlock * BLOCK_SIZE;
1012            final int jWidth  = blockWidth(jBlock);
1013            int outIndex      = 0;
1014            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1015                final int iHeight = blockHeight(iBlock);
1016                final T[] block = blocks[iBlock * blockColumns + jBlock];
1017                for (int i = 0; i < iHeight; ++i) {
1018                    outData[outIndex++] = block[i * jWidth + jColumn];
1019                }
1020            }
1021    
1022            return new ArrayFieldVector<T>(getField(), outData, false);
1023        }
1024    
1025        /** {@inheritDoc} */
1026        @Override
1027        public void setColumnVector(final int column, final FieldVector<T> vector) {
1028            try {
1029                setColumn(column, ((ArrayFieldVector<T>) vector).getDataRef());
1030            } catch (ClassCastException cce) {
1031                super.setColumnVector(column, vector);
1032            }
1033        }
1034    
1035        /** {@inheritDoc} */
1036        @Override
1037        public T[] getRow(final int row) {
1038            checkRowIndex(row);
1039            final T[] out = buildArray(getField(), columns);
1040    
1041            // perform copy block-wise, to ensure good cache behavior
1042            final int iBlock  = row / BLOCK_SIZE;
1043            final int iRow    = row - iBlock * BLOCK_SIZE;
1044            int outIndex      = 0;
1045            for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1046                final int jWidth     = blockWidth(jBlock);
1047                final T[] block = blocks[iBlock * blockColumns + jBlock];
1048                System.arraycopy(block, iRow * jWidth, out, outIndex, jWidth);
1049                outIndex += jWidth;
1050            }
1051    
1052            return out;
1053        }
1054    
1055        /** {@inheritDoc} */
1056        @Override
1057        public void setRow(final int row, final T[] array) {
1058            checkRowIndex(row);
1059            final int nCols = getColumnDimension();
1060            if (array.length != nCols) {
1061                throw new MatrixDimensionMismatchException(1, array.length, 1, nCols);
1062            }
1063    
1064            // perform copy block-wise, to ensure good cache behavior
1065            final int iBlock  = row / BLOCK_SIZE;
1066            final int iRow    = row - iBlock * BLOCK_SIZE;
1067            int outIndex      = 0;
1068            for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1069                final int jWidth     = blockWidth(jBlock);
1070                final T[] block = blocks[iBlock * blockColumns + jBlock];
1071                System.arraycopy(array, outIndex, block, iRow * jWidth, jWidth);
1072                outIndex += jWidth;
1073            }
1074        }
1075    
1076        /** {@inheritDoc} */
1077        @Override
1078        public T[] getColumn(final int column) {
1079            checkColumnIndex(column);
1080            final T[] out = buildArray(getField(), rows);
1081    
1082            // perform copy block-wise, to ensure good cache behavior
1083            final int jBlock  = column / BLOCK_SIZE;
1084            final int jColumn = column - jBlock * BLOCK_SIZE;
1085            final int jWidth  = blockWidth(jBlock);
1086            int outIndex      = 0;
1087            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1088                final int iHeight = blockHeight(iBlock);
1089                final T[] block = blocks[iBlock * blockColumns + jBlock];
1090                for (int i = 0; i < iHeight; ++i) {
1091                    out[outIndex++] = block[i * jWidth + jColumn];
1092                }
1093            }
1094    
1095            return out;
1096        }
1097    
1098        /** {@inheritDoc} */
1099        @Override
1100        public void setColumn(final int column, final T[] array) {
1101            checkColumnIndex(column);
1102            final int nRows = getRowDimension();
1103            if (array.length != nRows) {
1104                throw new MatrixDimensionMismatchException(array.length, 1, nRows, 1);
1105            }
1106    
1107            // perform copy block-wise, to ensure good cache behavior
1108            final int jBlock  = column / BLOCK_SIZE;
1109            final int jColumn = column - jBlock * BLOCK_SIZE;
1110            final int jWidth  = blockWidth(jBlock);
1111            int outIndex      = 0;
1112            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1113                final int iHeight = blockHeight(iBlock);
1114                final T[] block = blocks[iBlock * blockColumns + jBlock];
1115                for (int i = 0; i < iHeight; ++i) {
1116                    block[i * jWidth + jColumn] = array[outIndex++];
1117                }
1118            }
1119        }
1120    
1121        /** {@inheritDoc} */
1122        @Override
1123        public T getEntry(final int row, final int column) {
1124            checkRowIndex(row);
1125            checkColumnIndex(column);
1126    
1127            final int iBlock = row    / BLOCK_SIZE;
1128            final int jBlock = column / BLOCK_SIZE;
1129            final int k      = (row    - iBlock * BLOCK_SIZE) * blockWidth(jBlock) +
1130                (column - jBlock * BLOCK_SIZE);
1131    
1132            return blocks[iBlock * blockColumns + jBlock][k];
1133        }
1134    
1135        /** {@inheritDoc} */
1136        @Override
1137        public void setEntry(final int row, final int column, final T value) {
1138            checkRowIndex(row);
1139            checkColumnIndex(column);
1140    
1141            final int iBlock = row    / BLOCK_SIZE;
1142            final int jBlock = column / BLOCK_SIZE;
1143            final int k      = (row    - iBlock * BLOCK_SIZE) * blockWidth(jBlock) +
1144                (column - jBlock * BLOCK_SIZE);
1145    
1146            blocks[iBlock * blockColumns + jBlock][k] = value;
1147        }
1148    
1149        /** {@inheritDoc} */
1150        @Override
1151        public void addToEntry(final int row, final int column, final T increment) {
1152            checkRowIndex(row);
1153            checkColumnIndex(column);
1154    
1155            final int iBlock = row    / BLOCK_SIZE;
1156            final int jBlock = column / BLOCK_SIZE;
1157            final int k      = (row    - iBlock * BLOCK_SIZE) * blockWidth(jBlock) +
1158                (column - jBlock * BLOCK_SIZE);
1159            final T[] blockIJ = blocks[iBlock * blockColumns + jBlock];
1160    
1161            blockIJ[k] = blockIJ[k].add(increment);
1162        }
1163    
1164        /** {@inheritDoc} */
1165        @Override
1166        public void multiplyEntry(final int row, final int column, final T factor) {
1167            checkRowIndex(row);
1168            checkColumnIndex(column);
1169    
1170            final int iBlock = row    / BLOCK_SIZE;
1171            final int jBlock = column / BLOCK_SIZE;
1172            final int k      = (row    - iBlock * BLOCK_SIZE) * blockWidth(jBlock) +
1173                (column - jBlock * BLOCK_SIZE);
1174            final T[] blockIJ = blocks[iBlock * blockColumns + jBlock];
1175    
1176            blockIJ[k] = blockIJ[k].multiply(factor);
1177        }
1178    
1179        /** {@inheritDoc} */
1180        @Override
1181        public FieldMatrix<T> transpose() {
1182            final int nRows = getRowDimension();
1183            final int nCols = getColumnDimension();
1184            final BlockFieldMatrix<T> out = new BlockFieldMatrix<T>(getField(), nCols, nRows);
1185    
1186            // perform transpose block-wise, to ensure good cache behavior
1187            int blockIndex = 0;
1188            for (int iBlock = 0; iBlock < blockColumns; ++iBlock) {
1189                for (int jBlock = 0; jBlock < blockRows; ++jBlock) {
1190    
1191                    // transpose current block
1192                    final T[] outBlock = out.blocks[blockIndex];
1193                    final T[] tBlock   = blocks[jBlock * blockColumns + iBlock];
1194                    final int      pStart   = iBlock * BLOCK_SIZE;
1195                    final int      pEnd     = FastMath.min(pStart + BLOCK_SIZE, columns);
1196                    final int      qStart   = jBlock * BLOCK_SIZE;
1197                    final int      qEnd     = FastMath.min(qStart + BLOCK_SIZE, rows);
1198                    int k = 0;
1199                    for (int p = pStart; p < pEnd; ++p) {
1200                        final int lInc = pEnd - pStart;
1201                        int l = p - pStart;
1202                        for (int q = qStart; q < qEnd; ++q) {
1203                            outBlock[k] = tBlock[l];
1204                            ++k;
1205                            l+= lInc;
1206                        }
1207                    }
1208    
1209                    // go to next block
1210                    ++blockIndex;
1211    
1212                }
1213            }
1214    
1215            return out;
1216        }
1217    
1218        /** {@inheritDoc} */
1219        @Override
1220        public int getRowDimension() {
1221            return rows;
1222        }
1223    
1224        /** {@inheritDoc} */
1225        @Override
1226        public int getColumnDimension() {
1227            return columns;
1228        }
1229    
1230        /** {@inheritDoc} */
1231        @Override
1232        public T[] operate(final T[] v) {
1233            if (v.length != columns) {
1234                throw new DimensionMismatchException(v.length, columns);
1235            }
1236            final T[] out = buildArray(getField(), rows);
1237            final T zero = getField().getZero();
1238    
1239            // perform multiplication block-wise, to ensure good cache behavior
1240            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1241                final int pStart = iBlock * BLOCK_SIZE;
1242                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
1243                for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1244                    final T[] block  = blocks[iBlock * blockColumns + jBlock];
1245                    final int      qStart = jBlock * BLOCK_SIZE;
1246                    final int      qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
1247                    int k = 0;
1248                    for (int p = pStart; p < pEnd; ++p) {
1249                        T sum = zero;
1250                        int q = qStart;
1251                        while (q < qEnd - 3) {
1252                            sum = sum.
1253                                  add(block[k].multiply(v[q])).
1254                                  add(block[k + 1].multiply(v[q + 1])).
1255                                  add(block[k + 2].multiply(v[q + 2])).
1256                                  add(block[k + 3].multiply(v[q + 3]));
1257                            k += 4;
1258                            q += 4;
1259                        }
1260                        while (q < qEnd) {
1261                            sum = sum.add(block[k++].multiply(v[q++]));
1262                        }
1263                        out[p] = out[p].add(sum);
1264                    }
1265                }
1266            }
1267    
1268            return out;
1269        }
1270    
1271        /** {@inheritDoc} */
1272        @Override
1273        public T[] preMultiply(final T[] v) {
1274    
1275            if (v.length != rows) {
1276                throw new DimensionMismatchException(v.length, rows);
1277            }
1278            final T[] out = buildArray(getField(), columns);
1279            final T zero = getField().getZero();
1280    
1281            // perform multiplication block-wise, to ensure good cache behavior
1282            for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1283                final int jWidth  = blockWidth(jBlock);
1284                final int jWidth2 = jWidth  + jWidth;
1285                final int jWidth3 = jWidth2 + jWidth;
1286                final int jWidth4 = jWidth3 + jWidth;
1287                final int qStart = jBlock * BLOCK_SIZE;
1288                final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
1289                for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1290                    final T[] block  = blocks[iBlock * blockColumns + jBlock];
1291                    final int      pStart = iBlock * BLOCK_SIZE;
1292                    final int      pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
1293                    for (int q = qStart; q < qEnd; ++q) {
1294                        int k = q - qStart;
1295                        T sum = zero;
1296                        int p = pStart;
1297                        while (p < pEnd - 3) {
1298                            sum = sum.
1299                                  add(block[k].multiply(v[p])).
1300                                  add(block[k + jWidth].multiply(v[p + 1])).
1301                                  add(block[k + jWidth2].multiply(v[p + 2])).
1302                                  add(block[k + jWidth3].multiply(v[p + 3]));
1303                            k += jWidth4;
1304                            p += 4;
1305                        }
1306                        while (p < pEnd) {
1307                            sum = sum.add(block[k].multiply(v[p++]));
1308                            k += jWidth;
1309                        }
1310                        out[q] = out[q].add(sum);
1311                    }
1312                }
1313            }
1314    
1315            return out;
1316        }
1317    
1318        /** {@inheritDoc} */
1319        @Override
1320        public T walkInRowOrder(final FieldMatrixChangingVisitor<T> visitor) {
1321            visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
1322            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1323                final int pStart = iBlock * BLOCK_SIZE;
1324                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
1325                for (int p = pStart; p < pEnd; ++p) {
1326                    for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1327                        final int jWidth = blockWidth(jBlock);
1328                        final int qStart = jBlock * BLOCK_SIZE;
1329                        final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
1330                        final T[] block = blocks[iBlock * blockColumns + jBlock];
1331                        int k = (p - pStart) * jWidth;
1332                        for (int q = qStart; q < qEnd; ++q) {
1333                            block[k] = visitor.visit(p, q, block[k]);
1334                            ++k;
1335                        }
1336                    }
1337                 }
1338            }
1339            return visitor.end();
1340        }
1341    
1342        /** {@inheritDoc} */
1343        @Override
1344        public T walkInRowOrder(final FieldMatrixPreservingVisitor<T> visitor) {
1345            visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
1346            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1347                final int pStart = iBlock * BLOCK_SIZE;
1348                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
1349                for (int p = pStart; p < pEnd; ++p) {
1350                    for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1351                        final int jWidth = blockWidth(jBlock);
1352                        final int qStart = jBlock * BLOCK_SIZE;
1353                        final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
1354                        final T[] block = blocks[iBlock * blockColumns + jBlock];
1355                        int k = (p - pStart) * jWidth;
1356                        for (int q = qStart; q < qEnd; ++q) {
1357                            visitor.visit(p, q, block[k]);
1358                            ++k;
1359                        }
1360                    }
1361                 }
1362            }
1363            return visitor.end();
1364        }
1365    
1366        /** {@inheritDoc} */
1367        @Override
1368        public T walkInRowOrder(final FieldMatrixChangingVisitor<T> visitor,
1369                                final int startRow, final int endRow,
1370                                final int startColumn, final int endColumn) {
1371            checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
1372            visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
1373            for (int iBlock = startRow / BLOCK_SIZE; iBlock < 1 + endRow / BLOCK_SIZE; ++iBlock) {
1374                final int p0     = iBlock * BLOCK_SIZE;
1375                final int pStart = FastMath.max(startRow, p0);
1376                final int pEnd   = FastMath.min((iBlock + 1) * BLOCK_SIZE, 1 + endRow);
1377                for (int p = pStart; p < pEnd; ++p) {
1378                    for (int jBlock = startColumn / BLOCK_SIZE; jBlock < 1 + endColumn / BLOCK_SIZE; ++jBlock) {
1379                        final int jWidth = blockWidth(jBlock);
1380                        final int q0     = jBlock * BLOCK_SIZE;
1381                        final int qStart = FastMath.max(startColumn, q0);
1382                        final int qEnd   = FastMath.min((jBlock + 1) * BLOCK_SIZE, 1 + endColumn);
1383                        final T[] block = blocks[iBlock * blockColumns + jBlock];
1384                        int k = (p - p0) * jWidth + qStart - q0;
1385                        for (int q = qStart; q < qEnd; ++q) {
1386                            block[k] = visitor.visit(p, q, block[k]);
1387                            ++k;
1388                        }
1389                    }
1390                 }
1391            }
1392            return visitor.end();
1393        }
1394    
1395        /** {@inheritDoc} */
1396        @Override
1397        public T walkInRowOrder(final FieldMatrixPreservingVisitor<T> visitor,
1398                                final int startRow, final int endRow,
1399                                final int startColumn, final int endColumn) {
1400            checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
1401            visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
1402            for (int iBlock = startRow / BLOCK_SIZE; iBlock < 1 + endRow / BLOCK_SIZE; ++iBlock) {
1403                final int p0     = iBlock * BLOCK_SIZE;
1404                final int pStart = FastMath.max(startRow, p0);
1405                final int pEnd   = FastMath.min((iBlock + 1) * BLOCK_SIZE, 1 + endRow);
1406                for (int p = pStart; p < pEnd; ++p) {
1407                    for (int jBlock = startColumn / BLOCK_SIZE; jBlock < 1 + endColumn / BLOCK_SIZE; ++jBlock) {
1408                        final int jWidth = blockWidth(jBlock);
1409                        final int q0     = jBlock * BLOCK_SIZE;
1410                        final int qStart = FastMath.max(startColumn, q0);
1411                        final int qEnd   = FastMath.min((jBlock + 1) * BLOCK_SIZE, 1 + endColumn);
1412                        final T[] block = blocks[iBlock * blockColumns + jBlock];
1413                        int k = (p - p0) * jWidth + qStart - q0;
1414                        for (int q = qStart; q < qEnd; ++q) {
1415                            visitor.visit(p, q, block[k]);
1416                            ++k;
1417                        }
1418                    }
1419                 }
1420            }
1421            return visitor.end();
1422        }
1423    
1424        /** {@inheritDoc} */
1425        @Override
1426        public T walkInOptimizedOrder(final FieldMatrixChangingVisitor<T> visitor) {
1427            visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
1428            int blockIndex = 0;
1429            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1430                final int pStart = iBlock * BLOCK_SIZE;
1431                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
1432                for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1433                    final int qStart = jBlock * BLOCK_SIZE;
1434                    final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
1435                    final T[] block = blocks[blockIndex];
1436                    int k = 0;
1437                    for (int p = pStart; p < pEnd; ++p) {
1438                        for (int q = qStart; q < qEnd; ++q) {
1439                            block[k] = visitor.visit(p, q, block[k]);
1440                            ++k;
1441                        }
1442                    }
1443                    ++blockIndex;
1444                }
1445            }
1446            return visitor.end();
1447        }
1448    
1449        /** {@inheritDoc} */
1450        @Override
1451        public T walkInOptimizedOrder(final FieldMatrixPreservingVisitor<T> visitor) {
1452            visitor.start(rows, columns, 0, rows - 1, 0, columns - 1);
1453            int blockIndex = 0;
1454            for (int iBlock = 0; iBlock < blockRows; ++iBlock) {
1455                final int pStart = iBlock * BLOCK_SIZE;
1456                final int pEnd   = FastMath.min(pStart + BLOCK_SIZE, rows);
1457                for (int jBlock = 0; jBlock < blockColumns; ++jBlock) {
1458                    final int qStart = jBlock * BLOCK_SIZE;
1459                    final int qEnd   = FastMath.min(qStart + BLOCK_SIZE, columns);
1460                    final T[] block = blocks[blockIndex];
1461                    int k = 0;
1462                    for (int p = pStart; p < pEnd; ++p) {
1463                        for (int q = qStart; q < qEnd; ++q) {
1464                            visitor.visit(p, q, block[k]);
1465                            ++k;
1466                        }
1467                    }
1468                    ++blockIndex;
1469                }
1470            }
1471            return visitor.end();
1472        }
1473    
1474        /** {@inheritDoc} */
1475        @Override
1476        public T walkInOptimizedOrder(final FieldMatrixChangingVisitor<T> visitor,
1477                                           final int startRow, final int endRow,
1478                                           final int startColumn, final int endColumn) {
1479            checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
1480            visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
1481            for (int iBlock = startRow / BLOCK_SIZE; iBlock < 1 + endRow / BLOCK_SIZE; ++iBlock) {
1482                final int p0     = iBlock * BLOCK_SIZE;
1483                final int pStart = FastMath.max(startRow, p0);
1484                final int pEnd   = FastMath.min((iBlock + 1) * BLOCK_SIZE, 1 + endRow);
1485                for (int jBlock = startColumn / BLOCK_SIZE; jBlock < 1 + endColumn / BLOCK_SIZE; ++jBlock) {
1486                    final int jWidth = blockWidth(jBlock);
1487                    final int q0     = jBlock * BLOCK_SIZE;
1488                    final int qStart = FastMath.max(startColumn, q0);
1489                    final int qEnd   = FastMath.min((jBlock + 1) * BLOCK_SIZE, 1 + endColumn);
1490                    final T[] block = blocks[iBlock * blockColumns + jBlock];
1491                    for (int p = pStart; p < pEnd; ++p) {
1492                        int k = (p - p0) * jWidth + qStart - q0;
1493                        for (int q = qStart; q < qEnd; ++q) {
1494                            block[k] = visitor.visit(p, q, block[k]);
1495                            ++k;
1496                        }
1497                    }
1498                }
1499            }
1500            return visitor.end();
1501        }
1502    
1503        /** {@inheritDoc} */
1504        @Override
1505        public T walkInOptimizedOrder(final FieldMatrixPreservingVisitor<T> visitor,
1506                                           final int startRow, final int endRow,
1507                                           final int startColumn, final int endColumn) {
1508            checkSubMatrixIndex(startRow, endRow, startColumn, endColumn);
1509            visitor.start(rows, columns, startRow, endRow, startColumn, endColumn);
1510            for (int iBlock = startRow / BLOCK_SIZE; iBlock < 1 + endRow / BLOCK_SIZE; ++iBlock) {
1511                final int p0     = iBlock * BLOCK_SIZE;
1512                final int pStart = FastMath.max(startRow, p0);
1513                final int pEnd   = FastMath.min((iBlock + 1) * BLOCK_SIZE, 1 + endRow);
1514                for (int jBlock = startColumn / BLOCK_SIZE; jBlock < 1 + endColumn / BLOCK_SIZE; ++jBlock) {
1515                    final int jWidth = blockWidth(jBlock);
1516                    final int q0     = jBlock * BLOCK_SIZE;
1517                    final int qStart = FastMath.max(startColumn, q0);
1518                    final int qEnd   = FastMath.min((jBlock + 1) * BLOCK_SIZE, 1 + endColumn);
1519                    final T[] block = blocks[iBlock * blockColumns + jBlock];
1520                    for (int p = pStart; p < pEnd; ++p) {
1521                        int k = (p - p0) * jWidth + qStart - q0;
1522                        for (int q = qStart; q < qEnd; ++q) {
1523                            visitor.visit(p, q, block[k]);
1524                            ++k;
1525                        }
1526                    }
1527                }
1528            }
1529            return visitor.end();
1530        }
1531    
1532        /**
1533         * Get the height of a block.
1534         * @param blockRow row index (in block sense) of the block
1535         * @return height (number of rows) of the block
1536         */
1537        private int blockHeight(final int blockRow) {
1538            return (blockRow == blockRows - 1) ? rows - blockRow * BLOCK_SIZE : BLOCK_SIZE;
1539        }
1540    
1541        /**
1542         * Get the width of a block.
1543         * @param blockColumn column index (in block sense) of the block
1544         * @return width (number of columns) of the block
1545         */
1546        private int blockWidth(final int blockColumn) {
1547            return (blockColumn == blockColumns - 1) ? columns - blockColumn * BLOCK_SIZE : BLOCK_SIZE;
1548        }
1549    }