Commit eadaeefd authored by Lucas Serrano's avatar Lucas Serrano
Browse files

Support for non squared matrices

parent dc9332c6
......@@ -17,9 +17,10 @@ constexpr int nearest_power_of_two(int min_value, int current_value) {
}
template <typename T, int MatrixSize, int MaxVecSize>
template <typename T, int NRows, int NCols, int MaxVecSize>
class BaseMatrix;
/*
template <typename T, int MatrixSize, int MaxVecSize>
class IdentityMatrix;
......@@ -40,13 +41,13 @@ template <typename T, int MatrixSize, int MaxVecSize>
Matrix<T, MatrixSize, MaxVecSize> transpose(TransposedMatrix<T, MatrixSize, MaxVecSize> &m) {
return static_cast<Matrix<T, MatrixSize, MaxVecSize>>(m);
}
template <typename T, int MatrixSize, int MaxVecSize>
std::ostream & operator<<(std::ostream &os, BaseMatrix<T, MatrixSize, MaxVecSize> &mat) {
*/
template <typename T, int NRows, int NCols, int MaxVecSize>
std::ostream & operator<<(std::ostream &os, BaseMatrix<T, NRows, NCols, MaxVecSize> &mat) {
// Matrix printing function
for (int i=0; i<MatrixSize; i++) {
for (int j=0; j<MatrixSize; j++) {
os << mat.array[i*BaseMatrix<T, MatrixSize, MaxVecSize>::VecSize + j] << ", ";
for (int i=0; i<NRows; i++) {
for (int j=0; j<NCols; j++) {
os << mat.array[i*BaseMatrix<T, NRows, NCols, MaxVecSize>::VecSize + j] << ", ";
}
os << std::endl;
}
......@@ -56,7 +57,7 @@ std::ostream & operator<<(std::ostream &os, BaseMatrix<T, MatrixSize, MaxVecSize
template <typename M>
static inline void matrix_add(M &a, M &b, M &c) {
using pack_t = typename M::pack_t;
for (int i=0; i<M::MatSize; i++) {
for (int i=0; i<M::NRows; i++) {
int index = M::VecSize*i;
pack_t row_a(&a.array[index]), row_b(&b.array[index]);
bs::aligned_store(row_a + row_b, &c.array[index]);
......@@ -66,15 +67,15 @@ static inline void matrix_add(M &a, M &b, M &c) {
template <typename M>
static inline void matrix_sub(M &a, M &b, M &c) {
using pack_t = typename M::pack_t;
for (int i=0; i<M::MatSize; i++) {
for (int i=0; i<M::NRows; i++) {
int index = M::VecSize*i;
pack_t row_a(&a.array[index]), row_b(&b.array[index]);
bs::aligned_store(row_a - row_b, &c.array[index]);
}
}
template <typename M>
static inline void matrix_mul_m_m(M &a, M &b, M &c) {
template <typename T, int l, int m, int n, int MVS>
static inline void matrix_mul_m_m(BaseMatrix<T, l, m, MVS> &a, BaseMatrix<T, m, n, MVS> &b, BaseMatrix<T, l, n, MVS> &c) {
/*
* Computes the following matrix product C = A*B
* where A, B and C are BaseMatrix objects.
......@@ -82,7 +83,9 @@ static inline void matrix_mul_m_m(M &a, M &b, M &c) {
* of C in one vector.
* This use only vector operations so the compiler can optimize easily.
*/
using pack_t = bs::pack<typename M::ElementType, M::VecSize>;
static constexpr int LeftVecSize = BaseMatrix<T, l, m, MVS>::VecSize;
static constexpr int RightVecSize = BaseMatrix<T, m, n, MVS>::VecSize;
using pack_t = bs::pack<T, RightVecSize>;
/* Algorithm:
* For each row of C matrix:
* * Load one element of A and broadcast it into vector.
......@@ -91,41 +94,41 @@ static inline void matrix_mul_m_m(M &a, M &b, M &c) {
* * Repeat for each element of A in the row and sum these results.
* * Store the sommation as corresponding row of C.
*/
for (int i=0; i<M::MatSize; i++) {
for (int i=0; i<l; i++) {
pack_t res = bs::Zero<pack_t>();
int mult = M::VecSize*i;
for (int j=0; j<M::MatSize; j++) {
pack_t factor {a.array[mult + j]};
pack_t row(&b.array[M::VecSize*j]);
for (int j=0; j<m; j++) {
pack_t factor {a.array[LeftVecSize*i + j]};
pack_t row(&b.array[RightVecSize*j]);
res += factor * row;
}
bs::aligned_store(res, &c.array[mult]);
bs::aligned_store(res, &c.array[RightVecSize*i]);
}
}
template <typename M>
static inline void matrix_mul_mt_m(M &a, M &b, M &c) {
template <typename T, int l, int m, int n, int MVS>
static inline void matrix_mul_mt_m(BaseMatrix<T, m, l, MVS> &a, BaseMatrix<T, m, n, MVS> &b, BaseMatrix<T, l, n, MVS> &c) {
/*
* Computes the following matrix product C = t(A)*B
* where A, B and C are BaseMatrix objects.
* This code is nearly identical to matrix_mul_m_m but we
* run through A in column instead of row
*/
using pack_t = bs::pack<typename M::ElementType, M::VecSize>;
for (int i=0; i<M::MatSize; i++) {
static constexpr int LeftVecSize = BaseMatrix<T, m, l, MVS>::VecSize;
static constexpr int RightVecSize = BaseMatrix<T, m, n, MVS>::VecSize;
using pack_t = bs::pack<T, RightVecSize>;
for (int i=0; i<l; i++) {
pack_t res = bs::Zero<pack_t>();
int mult = M::VecSize*i;
for (int j=0; j<M::MatSize; j++) {
pack_t factor {a.array[M::VecSize*j + i]};
pack_t row(&b.array[M::VecSize*j]);
for (int j=0; j<m; j++) {
pack_t factor {a.array[LeftVecSize*j + i]};
pack_t row(&b.array[RightVecSize*j]);
res += factor * row;
}
bs::aligned_store(res, &c.array[mult]);
bs::aligned_store(res, &c.array[RightVecSize*i]);
}
}
template <typename M>
static inline void matrix_mul_m_mt(M &a, M &b, M &c) {
template <typename T, int l, int m, int n, int MVS>
static inline void matrix_mul_m_mt(BaseMatrix<T, l, m, MVS> &a, BaseMatrix<T, n, m, MVS> &b, BaseMatrix<T, l, n, MVS> &c) {
/*
* Computes the following matrix product C = A*t(B)
* where A, B and C are BaseMatrix objects.
......@@ -134,7 +137,9 @@ static inline void matrix_mul_m_mt(M &a, M &b, M &c) {
* to load columns of t(B) into vector register we can load row of B instead.
* Assuming that B is store in row order, row of B are correctly aligned for loading.
*/
using pack_t = bs::pack<typename M::ElementType, M::VecSize>;
static constexpr int VecSize = BaseMatrix<T, l, m, MVS>::VecSize; //VecSize are the same for left and right matrices
static constexpr int ResVecSize = BaseMatrix<T, l, n, MVS>::VecSize;
using pack_t = bs::pack<T, VecSize>;
/* Algorithm:
* For each C matrix element:
* * Load corresponding row of A into vector.
......@@ -142,163 +147,152 @@ static inline void matrix_mul_m_mt(M &a, M &b, M &c) {
* * Computes dot product between the two vector.
* * Store the resulting scalar of C.
*/
for (int i=0; i<M::MatSize; i++) {
pack_t row(&a.array[M::VecSize*i]);
for (int j=0; j<M::MatSize; j++) {
pack_t row_transpose(&b.array[M::VecSize*j]);
for (int i=0; i<l; i++) {
pack_t row(&a.array[VecSize*i]);
for (int j=0; j<n; j++) {
pack_t row_transpose(&b.array[VecSize*j]);
// Dot is a slow operation but it will get better with newer hardware.
c.array[M::VecSize*i + j] = bs::dot(row, row_transpose);
c.array[ResVecSize*i + j] = bs::dot(row, row_transpose);
}
}
}
template <typename M>
bs::pack<typename M::ElementType, M::VecSize> column_dot_product(M &a, M &b) {
using pack_t = bs::pack<typename M::ElementType, M::VecSize>;
pack_t res = bs::Zero<pack_t>();
for (int i=0; i < M::MatSize; i++) {
pack_t col_a(&a.array[M::VecSize*i]), col_b(&b.array[M::VecSize*i]);
res += col_a * col_b;
}
return res;
}
template <typename M, typename pack_t>
void column_scalar_product(pack_t &scalar, M &a, M &b) {
for (int i=0; i < M::MatSize; i++) {
pack_t row(&a.array[M::VecSize*i]);
bs::aligned_store(row*scalar, &b.array[M::VecSize*i]);
}
}
template <int Size>
static constexpr int blend_for_size(int i, int c) {
/* Utility function for proctect_for_division */
return i < Size ? i : i+c;
}
template <typename pack_t, int Size>
pack_t& protect_for_division(pack_t &vector) {
vector = bs::shuffle<bs::pattern<blend_for_size<Size>>>(vector, bs::One<pack_t>());
return vector;
}
template <typename M>
void matrix_inv(M &a, M &inv) {
/* Computes inverse of `a` by using Cholesky decomposition.
* M must be symmetric, positive-definite.
* a = t(r)*r, where r is upper triangular.
* a^-1 = (r^-1)*t(r^-1) = inv
*/
M r;
using pack_t = typename M::pack_t;
// Cholesky decomposition
for (int i=0; i<M::MatSize; i++) {
pack_t sum_row = bs::Zero<pack_t>();
template <typename T, int NumberRows, int NumberCols, int MaximumVectorSize>
class Inverse {
using M = BaseMatrix<T, NumberRows, NumberCols, MaximumVectorSize>;
public:
static void inverse(M &a, M &inv) {
/* Computes inverse of `a` by using Cholesky decomposition.
* M must be symmetric, positive-definite.
* a = t(r)*r, where r is upper triangular.
* a^-1 = (r^-1)*t(r^-1) = inv
*/
M r;
using pack_t = typename M::pack_t;
// Cholesky decomposition
for (int i=0; i<M::NRows; i++) {
pack_t sum_row = bs::Zero<pack_t>();
for (int j=0; j<i; j++) {
pack_t cur_row(&r.array[j*M::VecSize]);
pack_t factor {r.array[j*M::VecSize+i]};
sum_row += cur_row*factor;
}
for (int j=0; j<i; j++) {
pack_t cur_row(&r.array[j*M::VecSize]);
pack_t factor {r.array[j*M::VecSize+i]};
sum_row += cur_row*factor;
pack_t res_row(&a.array[i*M::VecSize]);
res_row -= sum_row;
pack_t diviser {1/std::sqrt(res_row[i])};
res_row *= diviser;
bs::aligned_store(res_row, &r.array[i*M::VecSize]);
}
pack_t res_row(&a.array[i*M::VecSize]);
res_row -= sum_row;
pack_t diviser {1/std::sqrt(res_row[i])};
res_row *= diviser;
bs::aligned_store(res_row, &r.array[i*M::VecSize]);
}
M inv_r;
static M Id(1);
// Inversion of R by backward substitution
for (int i=(M::NRows-1); i>=0; i--) {
pack_t sum_row = bs::Zero<pack_t>();
M inv_r;
static M Id(1);
// Inversion of R by backward substitution
for (int i=(M::MatSize-1); i>=0; i--) {
pack_t sum_row = bs::Zero<pack_t>();
for (int j=(M::NRows-1); j > i; j--) {
pack_t cur_row(&inv_r.array[j*M::VecSize]);
pack_t factor {r.array[i*M::VecSize+j]};
sum_row += cur_row*factor;
}
for (int j=(M::MatSize-1); j > i; j--) {
pack_t cur_row(&inv_r.array[j*M::VecSize]);
pack_t factor {r.array[i*M::VecSize+j]};
sum_row += cur_row*factor;
pack_t res_row(&Id.array[i*M::VecSize]);
res_row -= sum_row;
pack_t diviser {1/r.array[i*M::VecSize+i]};
res_row *= diviser;
bs::aligned_store(res_row, &inv_r.array[i*M::VecSize]);
}
pack_t res_row(&Id.array[i*M::VecSize]);
res_row -= sum_row;
pack_t diviser {1/r.array[i*M::VecSize+i]};
res_row *= diviser;
bs::aligned_store(res_row, &inv_r.array[i*M::VecSize]);
}
// Compute inverse of initial matrix
/* TODO: Since inv_r is a triangular matrix there is room
* for optimisation on this product
*/
matrix_mul_mt_m(inv_r, inv_r, inv);
// Compute inverse of initial matrix
/* TODO: Since inv_r is a triangular matrix there is room
* for optimisation on this product
*/
matrix_mul_mt_m(inv_r, inv_r, inv);
}
};
}
template <typename T, int MaximumVectorSize>
class Inverse<T, 2, 2, MaximumVectorSize> {
using M = BaseMatrix<T, 2, 2, MaximumVectorSize>;
public:
static void matrix_inv(M &a, M &inv) {
T inv_det = 1/(a.array[0]*a.array[M::VecSize + 1]- a.array[1]*a.array[1]);
inv.array[0] = inv_det*a.array[M::VecSize + 1];
inv.array[1] = inv.array[M::VecSize] = -inv_det*a.array[1];
inv.array[M::VecSize + 1] = inv_det*a.array[0];
}
};
template <typename T, int MatrixSize, int MaximumVectorSize>
template <typename T, int NumberRows, int NumberCols, int MaximumVectorSize>
class BaseMatrix {
/* Class for matrix object
* Matrix object are linearized 2D array with padded line to fit vector size
*/
static_assert(MatrixSize <= MaximumVectorSize, "Matrix size must be less than maximum vector size");
static_assert(NumberCols <= MaximumVectorSize, "Matrix width must be less than maximum vector size");
static_assert((MaximumVectorSize >= 4) & !(MaximumVectorSize & (MaximumVectorSize - 1)), "Maximum vector size must be a power of two.");
protected:
static constexpr int MatSize = MatrixSize;
static constexpr int MaxVecSize = MaximumVectorSize;
// We use the smallest vector size possible
static constexpr int VecSize = nearest_power_of_two(MatrixSize, MaximumVectorSize);
static constexpr int VecSize = nearest_power_of_two(NumberCols, MaximumVectorSize);
using matrix_t = BaseMatrix<T, MatrixSize, MaxVecSize>;
using matrix_t = BaseMatrix<T, NumberRows, NumberCols, MaximumVectorSize>;
using pack_t = bs::pack<T, VecSize>;
using ElementType = T;
public:
static constexpr int NRows = NumberRows;
static constexpr int NCols = NumberCols;
BaseMatrix() = default;
BaseMatrix(T const a[]) {
if (MatSize == VecSize) { // In this case there is no padding, we can copy directly the array
std::memcpy(this->array, a, sizeof(T)*MatSize*VecSize);
if (NCols == VecSize) { // In this case there is no padding, we can copy directly the array
std::memcpy(this->array, a, sizeof(T)*NRows*VecSize);
}
else {
for (int i=0; i<MatSize; i++) {
std::memcpy(&this->array[i*VecSize], &a[i*MatSize], sizeof(T)*MatSize);
for (int i=0; i<NRows; i++) {
// In this case we copy data row by row
std::memcpy(&this->array[i*VecSize], &a[i*NCols], sizeof(T)*NCols);
}
}
}
BaseMatrix(T const a) {
for (int i=0; i<MatrixSize; i++) {
for (int i=0; i<NRows; i++) {
array[VecSize*i+i] = a;
}
}
matrix_t operator=(T const a[]) {
if (MatSize == VecSize) {
std::memcpy(this->array, a, sizeof(T)*MatSize*VecSize);
if (NCols == VecSize) { // In this case there is no padding, we can copy directly the array
std::memcpy(this->array, a, sizeof(T)*NRows*VecSize);
}
else {
for (int i=0; i<MatSize; i++) {
std::memcpy(&this->array[i*VecSize], &a[i*MatSize], sizeof(T)*MatSize);
for (int i=0; i<NRows; i++) {
// In this case we copy data row by row
std::memcpy(&this->array[i*VecSize], &a[i*NCols], sizeof(T)*NCols);
}
}
return *this;
}
matrix_t operator=(matrix_t &other) {
std::memcpy(this->array, other.array, sizeof(T)*MatSize*VecSize);
std::memcpy(this->array, other.array, sizeof(T)*NRows*VecSize);
return *this;
}
ElementType* dump_array() {
ElementType* returned_array = new ElementType[MatSize*MatSize];
if (MatSize == VecSize) { // In this case there is no padding, we can copy directly the array
std::memcpy(returned_array, this->array, sizeof(T)*MatSize*VecSize);
ElementType* returned_array = new ElementType[NRows*NCols];
if (NCols == VecSize) { // In this case there is no padding, we can copy directly the array
std::memcpy(returned_array, this->array, sizeof(T)*NRows*VecSize);
}
else {
for (int i=0; i<MatSize; i++) {
std::memcpy(&returned_array[i*MatSize], &this->array[i*VecSize], sizeof(T)*MatSize);
for (int i=0; i<NRows; i++) {
std::memcpy(&returned_array[i*NCols], &this->array[i*VecSize], sizeof(T)*NCols);
}
}
return returned_array;
......@@ -308,28 +302,13 @@ class BaseMatrix {
private:
friend void matrix_add<matrix_t>(matrix_t &a, matrix_t &b, matrix_t &c);
friend void matrix_sub<matrix_t>(matrix_t &a, matrix_t &b, matrix_t &c);
friend void matrix_mul_m_m<matrix_t>(matrix_t &a, matrix_t &b, matrix_t &c);
friend void matrix_mul_mt_m<matrix_t>(matrix_t &a, matrix_t &b, matrix_t &c);
friend void matrix_mul_m_mt<matrix_t>(matrix_t &a, matrix_t &b, matrix_t &c);
friend void matrix_inv<matrix_t>(matrix_t &a, matrix_t &b);
friend bs::pack<ElementType, VecSize> column_dot_product<matrix_t>(matrix_t &a, matrix_t &b);
friend void column_scalar_product<matrix_t, pack_t>(pack_t &scalar, matrix_t &b, matrix_t &c);
template <typename M, int N> friend void matrix_inv(M &a, M &b);
friend std::ostream & operator<<<T, MatrixSize, MaxVecSize>(std::ostream &os, matrix_t &mat);
template <typename U, int l, int m, int n, int MVS> friend void matrix_mul_m_m(BaseMatrix<U, l, m, MVS> &a, BaseMatrix<U, m, n, MVS> &b, BaseMatrix<U, l, n, MVS> &c);
template <typename U, int l, int m, int n, int MVS> friend void matrix_mul_mt_m(BaseMatrix<U, m, l, MVS> &a, BaseMatrix<U, m, n, MVS> &b, BaseMatrix<U, l, n, MVS> &c);
template <typename U, int l, int m, int n, int MVS> friend void matrix_mul_m_mt(BaseMatrix<U, l, m, MVS> &a, BaseMatrix<U, n, m, MVS> &b, BaseMatrix<U, l, n, MVS> &c);
friend class Inverse<T, NRows, NCols, MaxVecSize>;
friend std::ostream & operator<<<T, NRows, NCols, MaxVecSize>(std::ostream &os, matrix_t &mat);
protected:
alignas(sizeof(T)*VecSize) float array[MatSize*VecSize] = {0};
alignas(sizeof(T)*VecSize) float array[NRows*VecSize] = {0};
};
/*
template<typename T, int MatrixSize, int MaxVecSize>
class IdentityMatrix: public BaseMatrix<T, MatrixSize, MaxVecSize> {
using BaseMatrix<T, MatrixSize, MaxVecSize>::VecSize;
public:
IdentityMatrix() {
for (int i=0; i<MatrixSize; i++) {
this->array[VecSize*i+i] = 1;
}
}
};*/
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment