aboutsummaryrefslogtreecommitdiffstats
path: root/buch/papers/multiplikation/code/MM
diff options
context:
space:
mode:
authorAndreas Müller <andreas.mueller@ost.ch>2021-08-03 07:37:42 +0200
committerGitHub <noreply@github.com>2021-08-03 07:37:42 +0200
commitf31aca6129f3c84f1ed4f59378fd31cbdc58ec3b (patch)
tree97c32dbdcbcc888a9030d149f5a765f006fcd631 /buch/papers/multiplikation/code/MM
parent1. Version Kapitel Rotation und Spiegelung (diff)
parentMerge pull request #60 from Kuehnee/master (diff)
downloadSeminarMatrizen-f31aca6129f3c84f1ed4f59378fd31cbdc58ec3b.tar.gz
SeminarMatrizen-f31aca6129f3c84f1ed4f59378fd31cbdc58ec3b.zip
Merge branch 'master' into master
Diffstat (limited to '')
-rwxr-xr-xbuch/papers/multiplikation/code/MMbin0 -> 26848 bytes
-rwxr-xr-xbuch/papers/multiplikation/code/MM.c465
-rw-r--r--buch/papers/multiplikation/code/MM.py311
3 files changed, 776 insertions, 0 deletions
diff --git a/buch/papers/multiplikation/code/MM b/buch/papers/multiplikation/code/MM
new file mode 100755
index 0000000..f07985f
--- /dev/null
+++ b/buch/papers/multiplikation/code/MM
Binary files differ
diff --git a/buch/papers/multiplikation/code/MM.c b/buch/papers/multiplikation/code/MM.c
new file mode 100755
index 0000000..04c4dab
--- /dev/null
+++ b/buch/papers/multiplikation/code/MM.c
@@ -0,0 +1,465 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+#include <omp.h>
+#include "c_matrix.h"
+#include <gsl/gsl_cblas.h>
+#include <string.h>
+
+void MM(int *A, int *B, int *C, int n);
+void openMP_MM(int *A, int *B, int *C, int n);
+void winograd(int *A, int *B, int *C, int n);
+int winograd_inner(int *a, int *b, int n);
+void run_algo(void (*algo)(), char alog_name[], int print);
+void run_algo_cblas(int print);
+void MM_dc(int *A, int *B, int *C, int n);
+void strassen(int *A, int *B, int *C, int n);
+void printMatrix(int *C, int n);
+void printMatrix_double(double *C, int n);
+void split(int *in, int *out, int n, int col, int row);
+void join(int *in, int *out, int n, int col, int row);
+void add(int *A, int *B, int *C, int n);
+void sub(int *A, int *B, int *C, int n);
+void multiply(int *A, int *B, int *C, int n);
+
+int main() {
+ // omp_set_dynamic(0);
+ // omp_set_num_threads(4);
+// run_algo(openMP_MM, "openMP_MM",0);
+ run_algo(MM_dc, "MM_dc",0);
+ run_algo(strassen, "strassen",0);
+
+ run_algo(MM, "MM", 0);
+ // run_algo(winograd, "winograd", 0);
+ run_algo_cblas(0);
+
+ return 0;
+}
+
+void MM(int *A, int *B, int *C, int n) {
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ int sum = 0;
+ for (int k = 0; k < n; ++k) {
+ sum += (*((A + i * n) + k)) * (*((B + k * n) + j));
+ }
+ *((C + i * n) + j) = sum;
+ }
+ }
+}
+
+int winograd_inner(int *a, int *b, int n){
+ int ab = 0;
+ if(n%2==0)
+ {
+ int xi = 0;
+ int etha = 0;
+ for(int i = 0; i<n/2;++i)
+ {
+ xi += a[2*i]*a[2*i+1];
+ etha += b[2*i]*b[2*i+1];
+ ab += (a[2*i]+b[2*i+1])*(a[2*i+1]+b[2*i]);
+ }
+ ab = ab-etha-xi;
+ }
+ return ab;
+ }
+
+ void winograd(int *A, int *B, int *C, int n) {
+
+ int xi_array[n];
+ int etha_array[n];
+ int xi = 0;
+ int etha = 0;
+ int ab = 0;
+
+ for (int i = 0; i < n; ++i) {
+ xi = 0;
+ etha = 0;
+ for(int k = 0;k<n/2;++k)
+ {
+ xi += (*((A + i * n) + 2*k))*(*((A + i * n) + (2*k+1)));
+ etha += (*((B + 2*k * n) + i))*(*((B + (2*k+1) * n) + i));
+ }
+ xi_array[i] = xi;
+ etha_array[i] = etha;
+ }
+
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ ab = 0;
+ for(int k = 0;k<n/2;++k)
+ {
+ ab += ((*((A + i * n) + 2*k))+(*((B + (2*k+1) * n) + j)))*((*((A + i * n) + (2*k+1)))+(*((B + 2*k * n) + j)));
+ }
+ *((C + i * n) + j) = ab-etha_array[j]-xi_array[i];
+ }
+ }
+
+
+
+
+ // for (int i = 0; i < n; ++i) {
+ // int *a = (int*) malloc(n * sizeof(int));
+ // for(int k = 0; k<n; ++k)
+ // {
+ // a[k] = (*((A + i * n) + k));
+ // }
+ //
+ // for (int j = 0; j < n; ++j) {
+ // int *b = (int*) malloc(n * sizeof(int));
+ // for(int k = 0; k<n; ++k)
+ // {
+ // b[k] =(*((B + k * n) + j));
+ // }
+ // *((C + i * n) + j) = winograd_inner(a,b,n);
+ // }
+ // }
+ }
+
+
+void openMP_MM(int *A, int *B, int *C, int n) {
+
+ #pragma omp parallel for
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ int sum = 0;
+ for (int k = 0; k < n; ++k) {
+ sum += (*((A + i * n) + k)) * (*((B + k * n) + j));
+ }
+ *((C + i * n) + j) = sum;
+ }
+ }
+}
+
+void MM_dc(int *A, int *B, int *C, int n) {
+ if (n <= 2) {
+ MM((int*) A, (int*) B, (int*) C, n);
+ } else {
+ int *A11 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *A12 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *A21 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *A22 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B11 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B12 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B21 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B22 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ split((int*) A, (int*) A11, n / 2, 0, 0);
+ split((int*) A, (int*) A12, n / 2, 0, n / 2);
+ split((int*) A, (int*) A21, n / 2, n / 2, 0);
+ split((int*) A, (int*) A22, n / 2, n / 2, n / 2);
+ split((int*) B, (int*) B11, n / 2, 0, 0);
+ split((int*) B, (int*) B12, n / 2, 0, n / 2);
+ split((int*) B, (int*) B21, n / 2, n / 2, 0);
+ split((int*) B, (int*) B22, n / 2, n / 2, n / 2);
+
+ int *tmp1 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp2 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp3 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp4 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp5 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp6 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp7 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *tmp8 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ MM_dc((int*) A11, (int*) B11, (int*) tmp1, n / 2);
+ MM_dc((int*) A12, (int*) B21, (int*) tmp2, n / 2);
+ MM_dc((int*) A11, (int*) B12, (int*) tmp3, n / 2);
+ MM_dc((int*) A12, (int*) B22, (int*) tmp4, n / 2);
+ MM_dc((int*) A21, (int*) B11, (int*) tmp5, n / 2);
+ MM_dc((int*) A22, (int*) B21, (int*) tmp6, n / 2);
+ MM_dc((int*) A21, (int*) B12, (int*) tmp7, n / 2);
+ MM_dc((int*) A22, (int*) B22, (int*) tmp8, n / 2);
+
+ free(A11);
+ free(A12);
+ free(A21);
+ free(A22);
+ free(B11);
+ free(B12);
+ free(B21);
+ free(B22);
+
+ int *C11 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *C12 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *C21 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *C22 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ add((int*) tmp1, (int*) tmp2, (int*) C11, n / 2);
+ add((int*) tmp3, (int*) tmp4, (int*) C12, n / 2);
+ add((int*) tmp5, (int*) tmp6, (int*) C21, n / 2);
+ add((int*) tmp7, (int*) tmp8, (int*) C22, n / 2);
+
+ free(tmp1);
+ free(tmp2);
+ free(tmp3);
+ free(tmp4);
+ free(tmp5);
+ free(tmp6);
+ free(tmp7);
+ free(tmp8);
+
+ join((int*) C11, (int*) C, n / 2, 0, 0);
+ join((int*) C12, (int*) C, n / 2, 0, n / 2);
+ join((int*) C21, (int*) C, n / 2, n / 2, 0);
+ join((int*) C22, (int*) C, n / 2, n / 2, n / 2);
+
+ free(C11);
+ free(C12);
+ free(C21);
+ free(C22);
+
+ }
+}
+
+void strassen(int *A, int *B, int *C, int n) {
+ if (n <= 2) {
+
+ int P, Q, R, S, T, U, V;
+ P = ((*((A + 0 * n) + 0)) + (*((A + 1 * n) + 1)))
+ * ((*((B + 0 * n) + 0)) + (*((B + 1 * n) + 1)));
+ Q = ((*((A + 1 * n) + 0)) + (*((A + 1 * n) + 1)))
+ * ((*((B + 0 * n) + 0)));
+ R = ((*((A + 0 * n) + 0)))
+ * ((*((B + 0 * n) + 1)) - (*((B + 1 * n) + 1)));
+ S = ((*((A + 1 * n) + 1)))
+ * ((*((B + 1 * n) + 0)) - (*((B + 0 * n) + 0)));
+ T = ((*((A + 0 * n) + 0)) + (*((A + 0 * n) + 1)))
+ * ((*((B + 1 * n) + 1)));
+ U = ((*((A + 1 * n) + 0)) - (*((A + 0 * n) + 0)))
+ * ((*((B + 0 * n) + 0)) + (*((B + 0 * n) + 1)));
+ V = ((*((A + 0 * n) + 1)) - (*((A + 1 * n) + 1)))
+ * ((*((B + 1 * n) + 0)) + (*((B + 1 * n) + 1)));
+ (*((C + 0 * n) + 0)) = P + S - T + V;
+ (*((C + 0 * n) + 1)) = R + T;
+ (*((C + 1 * n) + 0)) = Q + S;
+ (*((C + 1 * n) + 1)) = P + R - Q + U;
+
+ } else {
+ int *A11 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *A12 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *A21 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *A22 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B11 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B12 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B21 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *B22 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ split((int*) A, (int*) A11, n / 2, 0, 0);
+ split((int*) A, (int*) A12, n / 2, 0, n / 2);
+ split((int*) A, (int*) A21, n / 2, n / 2, 0);
+ split((int*) A, (int*) A22, n / 2, n / 2, n / 2);
+ split((int*) B, (int*) B11, n / 2, 0, 0);
+ split((int*) B, (int*) B12, n / 2, 0, n / 2);
+ split((int*) B, (int*) B21, n / 2, n / 2, 0);
+ split((int*) B, (int*) B22, n / 2, n / 2, n / 2);
+
+ int *P = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *Q = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *R = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *S = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *T = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *U = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *V = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ int *addA = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *addB = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ add((int*) A11, (int*) A22, (int*) addA, n / 2);
+ add((int*) B11, (int*) B22, (int*) addB, n / 2);
+ strassen((int*) addA, (int*) addB, (int*) P, n / 2);
+
+ add((int*) A21, (int*) A22, (int*) addA, n / 2);
+ strassen((int*) addA, (int*) B11, (int*) Q, n / 2);
+
+ sub((int*) B12, (int*) B22, (int*) addB, n / 2);
+ strassen((int*) A11, (int*) addB, (int*) R, n / 2);
+
+ sub((int*) B21, (int*) B11, (int*) addB, n / 2);
+ strassen((int*) A22, (int*) addB, (int*) S, n / 2);
+
+ add((int*) A11, (int*) A12, (int*) addA, n / 2);
+ strassen((int*) addA, (int*) B22, (int*) T, n / 2);
+
+ sub((int*) A21, (int*) A11, (int*) addA, n / 2);
+ add((int*) B11, (int*) B12, (int*) addB, n / 2);
+ strassen((int*) addA, (int*) addB, (int*) U, n / 2);
+
+ sub((int*) A12, (int*) A22, (int*) addA, n / 2);
+ add((int*) B21, (int*) B22, (int*) addB, n / 2);
+ strassen((int*) addA, (int*) addB, (int*) V, n / 2);
+
+ free(A11);
+ free(A12);
+ free(A21);
+ free(A22);
+ free(B11);
+ free(B12);
+ free(B21);
+ free(B22);
+ free(addA);
+ free(addB);
+
+ int *C11 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *C12 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *C21 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *C22 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ int *resAdd1 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+ int *resAdd2 = (int*) malloc(n / 2 * n / 2 * sizeof(int));
+
+ add((int*) R, (int*) T, (int*) C12, n / 2);
+ add((int*) Q, (int*) S, (int*) C21, n / 2);
+
+ add((int*) P, (int*) S, (int*) resAdd1, n / 2);
+ add((int*) resAdd1, (int*) V, (int*) resAdd2, n / 2);
+ sub((int*) resAdd2, (int*) T, (int*) C11, n / 2);
+
+ add((int*) P, (int*) R, (int*) resAdd1, n / 2);
+ add((int*) resAdd1, (int*) U, (int*) resAdd2, n / 2);
+ sub((int*) resAdd2, (int*) Q, (int*) C22, n / 2);
+
+ free(P);
+ free(Q);
+ free(R);
+ free(S);
+ free(T);
+ free(U);
+ free(V);
+ free(resAdd1);
+ free(resAdd2);
+
+ join((int*) C11, (int*) C, n / 2, 0, 0);
+ join((int*) C12, (int*) C, n / 2, 0, n / 2);
+ join((int*) C21, (int*) C, n / 2, n / 2, 0);
+ join((int*) C22, (int*) C, n / 2, n / 2, n / 2);
+
+ free(C11);
+ free(C12);
+ free(C21);
+ free(C22);
+ }
+}
+
+void add(int *A, int *B, int *C, int n) {
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < n; j++) {
+ *((C + i * n) + j) = *((A + i * n) + j) + *((B + i * n) + j);
+ }
+ }
+}
+
+void sub(int *A, int *B, int *C, int n) {
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < n; j++) {
+ *((C + i * n) + j) = *((A + i * n) + j) - *((B + i * n) + j);
+ }
+ }
+}
+
+void multiply(int *A, int *B, int *C, int n) {
+ int mul;
+
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ mul = (*((A + i * n) + j)) * (*((B + i * n) + j));
+ *((C + i * n) + j) = mul;
+ }
+ }
+}
+
+void split(int *in, int *out, int n, int col, int row) {
+ for (int i1 = 0, i2 = col; i1 < n; i1++, i2++)
+ for (int j1 = 0, j2 = row; j1 < n; j1++, j2++) {
+ *((out + i1 * n) + j1) = *((in + i2 * n * 2) + j2);
+
+ }
+}
+
+void join(int *in, int *out, int n, int col, int row) {
+ for (int i1 = 0, i2 = col; i1 < n; i1++, i2++)
+ for (int j1 = 0, j2 = row; j1 < n; j1++, j2++)
+ *((out + i2 * n * 2) + j2) = *((in + i1 * n) + j1);
+}
+
+void printMatrix(int *C, int n) {
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ printf("%d ", *((C + i * n) + j));
+ }
+ printf("\n");
+ }
+}
+
+void printMatrix_double(double *C, int n) {
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ printf("%.0f ", *((C + i * n) + j));
+ }
+ printf("\n");
+ }
+}
+
+void run_algo(void (*algo)(), char alog_name[], int print)
+{
+ FILE *fptr;
+
+ char fileName[40] = "meas/";
+ strcat(fileName, alog_name);
+ strcat(fileName, ".txt");
+ fptr = fopen(fileName, "w");
+
+
+ for(int i=0; i<n_arrays; ++i)
+ {
+ for(int j = 0; j<1; ++j)
+ {
+ int *C = (int*) malloc(n[i] * n[i] * sizeof(int));
+ double dtime = omp_get_wtime();
+ algo(Ap[i], Bp[i], (int*) C, n[i]);
+ dtime = omp_get_wtime() - dtime;
+ // printf("The %s program took %f seconds to execute \n", alog_name, dtime);
+ fprintf(fptr, "%f,%d\n", dtime, n[i]);
+
+ if(print==1)
+ {
+ printMatrix((int*)C, n[i]);
+ }
+ free(C);
+ }
+ }
+ fclose(fptr);
+
+}
+
+void run_algo_cblas(int print)
+{
+
+ FILE *fptr;
+
+ fptr = fopen("meas/blas.txt", "w");
+ for(int i=0; i<n_arrays; ++i)
+ {
+ for(int j = 0; j<1; ++j)
+ {
+ double *dC = (double*) malloc(n[i] * n[i] * sizeof(double));
+ double dtime = omp_get_wtime();
+ cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, n[i], n[i], n[i], 1.0, dAp[i], n[i],
+ dBp[i], n[i], 0.0, dC, n[i]);
+ dtime = omp_get_wtime() - dtime;
+ // printf("The cblas program took %f seconds to execute \n", dtime);
+ fprintf(fptr, "%f,%d\n",dtime, n[i]);
+
+ if(print==1)
+ {
+ printMatrix_double( (double*)dC, n[i]);
+ }
+
+ free(dC);
+ }
+ }
+ fclose(fptr);
+
+}
diff --git a/buch/papers/multiplikation/code/MM.py b/buch/papers/multiplikation/code/MM.py
new file mode 100644
index 0000000..626b82d
--- /dev/null
+++ b/buch/papers/multiplikation/code/MM.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Mar 19 07:31:29 2021
+
+@author: nunigan
+"""
+import numpy as np
+import time
+import matplotlib.pyplot as plt
+from scipy.optimize import curve_fit
+import tikzplotlib
+def MM(A, B):
+ n = np.shape(A)[0]
+ C = np.zeros((n, n))
+ for i in range(n):
+ for j in range(n):
+ C[i, j] = 0
+ for k in range(n):
+ C[i, j] += A[i, k]*B[k, j]
+ return C
+
+
+def MM_dc(A, B):
+ n = np.shape(A)[0]
+ if(n <= 2):
+ C = np.zeros((n, n))
+ C[0, 0] = A[0, 0]*B[0, 0]+A[0, 1]*B[1, 0]
+ C[0, 1] = A[0, 0]*B[0, 1]+A[0, 1]*B[1, 1]
+ C[1, 0] = A[1, 0]*B[0, 0]+A[1, 1]*B[1, 0]
+ C[1, 1] = A[1, 0]*B[0, 1]+A[1, 1]*B[1, 1]
+ return C
+ else:
+ A11, A12, A21, A22 = A[:n//2, :n//2], A[:n//2, n//2:], A[n//2:, :n//2], A[n//2:, n//2:]
+ B11, B12, B21, B22 = B[:n//2, :n//2], B[:n//2, n//2:], B[n//2:, :n//2], B[n//2:, n//2:]
+ C11 = MM_dc(A11, B11) + MM_dc(A12, B21)
+ C12 = MM_dc(A11, B12) + MM_dc(A12, B22)
+ C21 = MM_dc(A21, B11) + MM_dc(A22, B21)
+ C22 = MM_dc(A21, B12) + MM_dc(A22, B22)
+ C = np.vstack((np.hstack((C11, C12)), np.hstack((C21, C22))))
+ return C
+
+
+def strassen(A, B):
+ n = np.shape(A)[0]
+ if(n <= 2):
+ C = np.zeros((n, n))
+ P = (A[0, 0]+A[1, 1])*(B[0, 0]+B[1, 1])
+ Q = (A[1, 0]+A[1, 1])*B[0, 0]
+ R = A[0, 0]*(B[0, 1]-B[1, 1])
+ S = A[1, 1]*(B[1, 0]-B[0, 0])
+ T = (A[0, 0]+A[0, 1])*B[1, 1]
+ U = (A[1, 0]-A[0, 0])*(B[0, 0]+B[0, 1])
+ V = (A[0, 1]-A[1, 1])*(B[1, 0]+B[1, 1])
+ C[0, 0] = P+S-T+V
+ C[0, 1] = R+T
+ C[1, 0] = Q+S
+ C[1, 1] = P+R-Q+U
+ return C
+ else:
+ m = n//2
+ A11, A12, A21, A22 = A[:m, :m], A[:m, m:], A[m:, :m], A[m:, m:]
+ B11, B12, B21, B22 = B[:m, :m], B[:m, m:], B[m:, :m], B[m:, m:]
+ P = strassen((A11+A22),(B11+B22))
+ Q = strassen((A21+A22),B11)
+ R = strassen(A11,(B12-B22))
+ S = strassen(A22,(B21-B11))
+ T = strassen((A11+A12),B22)
+ U = strassen((A21-A11),(B11+B12))
+ V = strassen((A12-A22),(B21+B22))
+
+ C11 = P+S-T+V
+ C12 = R+T
+ C21 = Q+S
+ C22 = P+R-Q+U
+
+ C = np.vstack((np.hstack((C11, C12)), np.hstack((C21, C22))))
+ return C
+
+def winograd_inner(a, b):
+ n = np.shape(a)[0]
+ if n%2 == 0:
+ xi = np.sum(a[::2]*a[1::2])
+ etha = np.sum(b[::2]*b[1::2])
+ # print("xi = {}, etha = {}".format(xi, etha))
+ ab = np.sum((a[::2]+b[1::2])*(a[1::2]+b[::2]))-xi-etha
+ else:
+ xi = np.sum(a[0:-1:2]*a[1::2])
+ etha = np.sum(b[0:-1:2]*b[1::2])
+ ab = np.sum((a[0:-1:2]+b[1::2])*(a[1::2]+b[0:-1:2]))-xi-etha+a[-1]*b[-1]
+ return ab
+
+def winograd(A, B):
+ m,n = np.shape(A)
+ n2,p = np.shape(B)
+ C = np.zeros((m,p))
+ for i in range(np.shape(A)[0]):
+ for j in range(np.shape(B)[1]):
+ C[i,j] = winograd_inner(A[i,:], B[:,j])
+ return C
+
+def winograd2(A, B):
+ m,n = np.shape(A)
+ n2,p = np.shape(B)
+ C = np.zeros((m,p))
+ xi = np.zeros((m))
+ eta = np.zeros((p))
+ ab = 0
+ for i in range(m):
+ for j in range(n//2):
+ xi[i] += A[i,2*j]*A[i,2*j+1]
+
+ for i in range(p):
+ for j in range(n//2):
+ eta[i] += B[2*j,i]*B[2*j+1,i]
+
+ if n%2==0:
+ for i in range(m):
+ for j in range(p):
+ ab = 0
+ for k in range(n//2):
+ ab += (A[i,2*k]+B[2*k+1,j])*(A[i,2*k+1]+B[2*k,j])
+ C[i,j] = ab-eta[j]-xi[i]
+ else:
+ for i in range(m):
+ for j in range(p):
+ ab = 0
+ for k in range(n//2):
+ ab += (A[i,2*k]+B[2*k+1,j])*(A[i,2*k+1]+B[2*k,j])
+ C[i,j] = ab-eta[j]-xi[i]+A[i,-1]*B[-1,j]
+
+ return C
+
+def test_perfomance(n):
+ t_mm = []
+ t_mm_dc = []
+ t_mm_strassen = []
+ t_wino = []
+ t_np = []
+
+ for i in n:
+ A = np.random.randn(i, i)
+ B = np.random.randn(i, i)
+ # A = np.random.randint(-100, 100,(i, i))
+ # B = np.random.randint(-100, 100,(i, i))
+
+ start = time.time()
+ C3 = strassen(A, B)
+ t_mm_strassen.append(time.time() - start)
+
+ start = time.time()
+ C1 = MM(A, B)
+ t_mm.append(time.time() - start)
+
+ start = time.time()
+ C2 = MM_dc(A, B)
+ t_mm_dc.append(time.time() - start)
+
+ start = time.time()
+ C4 = winograd2(A, B)
+ t_wino.append(time.time() - start)
+
+ start = time.time()
+ C = A@B
+ t_np.append(time.time() - start)
+
+ plt.figure(figsize=(13,8))
+ plt.rcParams['font.family'] = 'STIXGeneral'
+ plt.rc('axes', labelsize=23)
+ plt.rc('xtick', labelsize=23)
+ plt.rc('ytick', labelsize=23)
+ plt.plot(n, t_mm, label='Standard', lw=5)
+ plt.plot(n, t_mm_dc, label='Divide and conquer', lw=5)
+ plt.plot(n, t_mm_strassen, label='Strassen', lw=5)
+ plt.plot(n, t_wino, label='Winograd', lw=5)
+ plt.plot(n, t_np, label='NumPy A@B', lw=5)
+ plt.legend()
+ plt.xlabel("n")
+ plt.ylabel("time (s)")
+ plt.grid(True)
+ plt.tight_layout()
+ # plt.yscale('log')
+ plt.legend(fontsize=19)
+ plt.savefig('meas_' + str(max(n))+ '.pdf')
+ arr = np.array([n, t_mm, t_mm_dc, t_mm_strassen, t_wino, t_np])
+ np.savetxt('meas_' + str(max(n))+ '.txt',arr)
+ return arr
+
+
+def plot(num):
+ arr = np.loadtxt('meas_{}.txt'.format(num))
+ n, t_mm, t_mm_dc, t_mm_strassen, t_wino, t_np = arr
+ plt.figure(figsize=(13,8))
+ plt.rcParams['font.family'] = 'STIXGeneral'
+ plt.rc('axes', labelsize=23)
+ plt.rc('xtick', labelsize=23)
+ plt.rc('ytick', labelsize=23)
+ plt.plot(n, t_mm, label='3 For Loops', lw=5)
+ plt.plot(n, t_mm_dc, label='Divide and Conquer', lw=5)
+ plt.plot(n, t_mm_strassen, label='Strassen', lw=5)
+ # plt.plot(n, t_wino, label='Winograd', lw=5)
+ plt.plot(n, t_np, label='NumPy A@B', lw=5)
+ plt.legend()
+ plt.xlabel("n")
+ plt.ylabel("time (s)")
+ plt.grid(True)
+ plt.tight_layout()
+ # plt.yscale('log')
+ plt.legend(fontsize=19)
+ plt.savefig('meas_' + str(num)+ '.pdf')
+ return arr
+
+def plot_c_res(ave, num):
+ MM = np.loadtxt("meas/MM.txt", delimiter=',')
+ # winograd = np.loadtxt("meas/winograd.txt", delimiter=',')
+ blas = np.loadtxt("meas/blas.txt", delimiter=',')
+ MM_dc = np.loadtxt("meas/MM_dc.txt", delimiter=',')
+ strassen = np.loadtxt("meas/strassen.txt", delimiter=',')
+
+ MM_t = MM[:,0]
+ MM_n = MM[:,1]
+ MM_t = np.mean(MM_t.reshape(-1,ave),axis=1)
+ MM_n = np.mean(MM_n.reshape(-1,ave),axis=1)
+
+ MM_dc_t = MM_dc[:,0]
+ MM_dc_n = MM_dc[:,1]
+ MM_dc_t = np.mean(MM_dc_t.reshape(-1,ave),axis=1)
+ MM_dc_n = np.mean(MM_dc_n.reshape(-1,ave),axis=1)
+
+ strassen_t = strassen[:,0]
+ strassen_n = strassen[:,1]
+ strassen_t = np.mean(strassen_t.reshape(-1,ave),axis=1)
+ strassen_n = np.mean(strassen_n.reshape(-1,ave),axis=1)
+
+ # winograd_t = winograd[:,0]
+ # winograd_n = winograd[:,1]
+ # winograd_t = np.mean(winograd_t.reshape(-1,ave),axis=1)
+ # winograd_n = np.mean(winograd_n.reshape(-1,ave),axis=1)
+
+ blas_t = blas[:,0]
+ blas_n = blas[:,1]
+ blas_t = np.mean(blas_t.reshape(-1,ave),axis=1)
+ blas_n = np.mean(blas_n.reshape(-1,ave),axis=1)
+
+ def func(x, a,b):
+ return b*x**a
+
+ # popt, pcov = curve_fit(func, blas_n, blas_t)
+ # popt1, pcov2 = curve_fit(func, blas_n, winograd_t)
+ # popt2, pcov2 = curve_fit(func, blas_n, MM_t)
+
+ plt.figure(figsize=(13,8))
+ plt.rcParams['font.family'] = 'STIXGeneral'
+ plt.rc('axes', labelsize=23)
+ plt.rc('xtick', labelsize=23)
+ plt.rc('ytick', labelsize=23)
+ plt.plot(MM_n, MM_t, label='3 For Loops', lw=5)
+ # plt.plot(winograd_n, winograd_t, label='Winograd MM', lw=5)
+ plt.plot(blas_n, blas_t, label='Blas', lw=5)
+ plt.plot(strassen_n, strassen_t, label='Strassen', lw=5)
+ plt.plot(MM_dc_n, MM_dc_t, label='Divide and Conquer', lw=5)
+ plt.xlabel("n")
+ plt.ylabel("time (s)")
+ plt.grid(True)
+ plt.tight_layout()
+ plt.legend(fontsize=19)
+ plt.savefig('c_meas_' + str(num)+ '.pdf')
+
+ # plt.plot(blas_n, func(blas_n, *popt), 'r-', label='fit blas: a=%5.5f, b=%5.10f' % tuple(popt))
+ # plt.plot(blas_n, func(blas_n, *popt1), 'r-', label='fit winograd: a=%5.5f, b=%5.10f' % tuple(popt1))
+ # plt.plot(blas_n, func(blas_n, *popt2), 'r-', label='fit MM: a=%5.5f, b=%5.10f' % tuple(popt2))
+
+ plt.legend()
+
+
+# test%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+if __name__ == '__main__':
+ plot_c_res(1, 4096)
+
+
+ # plot(8)
+ # n = np.logspace(1,10,10,base=2,dtype=(np.int))
+ # n = np.arange(1,50,2)
+ A = np.random.randint(-10, 10, (5,3))
+ B = np.random.randint(-10, 10, (3,5))
+
+ C = winograd2(A, B)
+ C_test = A@B
+ print(C)
+ print(C_test)
+ # print(np.equal(C, C_test))
+
+ # t_np = test_perfomance(n)
+ # C = strassen(A, B)
+ # C_test = A@B
+
+
+ # plot_c_res()
+ # def func(x, a):
+ # return x**a
+
+ # popt, pcov = curve_fit(func, n, t_np, bounds=(2, 3))
+
+
+ # plt.figure()
+ # plt.plot(n, t_np, 'b-', label='data')
+ # plt.plot(n, func(n, *popt), 'r-', label='fit: a=%5.3f' % tuple(popt))
+ # plt.xlabel('x')
+ # plt.ylabel('y')
+ # plt.legend()
+ \ No newline at end of file